Message ID | 1516298002-4618-7-git-send-email-adhemerval.zanella@linaro.org
State      | New
Series     | Refactor qsort implementation
Adhemerval Zanella wrote:

> +static inline bool
> +check_alignment (const void *base, size_t align)
> +{
> +  return _STRING_ARCH_unaligned || ((uintptr_t)base % (align - 1)) == 0;
> +}

Surely the '(align - 1)' was supposed to be 'align'. Has this been tested
on an architecture that does not allow unaligned access?

> +static inline void
> +swap_generic (void *a, void *b, size_t size)

Why is this inline? It's used only as a function pointer, and the other
functions so used are not declared inline.

> +static inline swap_t
> +select_swap_func (const void *base, size_t size)
> +{
> +  if (size == 4 && check_alignment (base, 4))
> +    return swap_u32;
> +  else if (size == 8 && check_alignment (base, 8))
> +    return swap_u64;
> +  return swap_generic;
> +}

The conditions aren't portable enough. Use something like this instead for
swap_u32, and similarly for swap_u64.

  if (size == sizeof (uint32_t) && check_alignment (base, alignof (uint32_t)))
    return swap_u32;

> +static void
> +swap_u32 (void *a, void *b, size_t size)

The pointer arguments should be declared 'void *restrict'. This can help GCC
generate better code. Similarly for the other swap functions.

> +  uint32_t tmp = *(uint32_t*) a;
> +  *(uint32_t*) a = *(uint32_t*) b;
> +  *(uint32_t*) b = tmp;

It's nicer to avoid casts when possible, as is the case here and elsewhere.
This is because casts are too powerful in C. Something like this, say:

  uint32_t *ua = a, *ub = b, tmp = *ua;
  *ua = *ub, *ub = tmp;

> +  unsigned char tmp[128];

Why 128? A comment seems called for.

> +static inline void
> +swap_generic (void *a, void *b, size_t size)
> +{
> +  unsigned char tmp[128];
> +  do
> +    {
> +      size_t s = size > sizeof (tmp) ? sizeof (tmp) : size;
> +      memcpy (tmp, a, s);
> +      a = __mempcpy (a, b, s);
> +      b = __mempcpy (b, tmp, s);
> +      size -= s;
> +    }
> +  while (size > 0);
> +}

On my platform (GCC 7.2.1 20170915 (Red Hat 7.2.1-2) x86-64) this inlined the
memcpy but not the mempcpy calls. How about something like this instead? It
should let the compiler do a better job of block-move-style operations in the
loop. If mempcpy is inlined for you, feel free to substitute it for two of
the loop's calls to memcpy.

  static void
  swap_generic (void *restrict va, void *restrict vb, size_t size)
  {
    char *a = va, *b = vb;
    enum { n = 128 }; /* Why 128?  */
    unsigned char tmp[n];
    while (size >= n)
      {
        memcpy (tmp, a, n);
        memcpy (a, b, n);
        memcpy (b, tmp, n);
        a += n;
        b += n;
        size -= n;
      }
    memcpy (tmp, a, size);
    memcpy (a, b, size);
    memcpy (b, tmp, size);
  }
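[Editorial note: for reference, a minimal self-contained sketch with the
review's suggestions above folded in, i.e. restrict-qualified parameters,
no casts in the loads and stores, and selection keyed on sizeof/alignof.
This is not the submitted patch: check_alignment here omits the
_STRING_ARCH_unaligned short-circuit and assumes a power-of-two alignment,
and swap_generic is a plain byte-wise fallback rather than the patch's
buffered version.]

  #include <stdalign.h>
  #include <stdbool.h>
  #include <stddef.h>
  #include <stdint.h>

  typedef void (*swap_t) (void *, void *, size_t);

  static bool
  check_alignment (const void *base, size_t align)
  {
    /* ALIGN is a power of two for the types used below.  */
    return ((uintptr_t) base % align) == 0;
  }

  static void
  swap_u32 (void *restrict a, void *restrict b, size_t size)
  {
    uint32_t *ua = a, *ub = b, tmp = *ua;
    *ua = *ub, *ub = tmp;
  }

  static void
  swap_u64 (void *restrict a, void *restrict b, size_t size)
  {
    uint64_t *ua = a, *ub = b, tmp = *ua;
    *ua = *ub, *ub = tmp;
  }

  static void
  swap_generic (void *restrict a, void *restrict b, size_t size)
  {
    /* Byte-wise fallback for any element size and alignment.  */
    unsigned char *ua = a, *ub = b;
    while (size-- > 0)
      {
        unsigned char tmp = *ua;
        *ua++ = *ub;
        *ub++ = tmp;
      }
  }

  static swap_t
  select_swap_func (const void *base, size_t size)
  {
    if (size == sizeof (uint32_t)
        && check_alignment (base, alignof (uint32_t)))
      return swap_u32;
    else if (size == sizeof (uint64_t)
             && check_alignment (base, alignof (uint64_t)))
      return swap_u64;
    return swap_generic;
  }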
On 22/01/2018 06:27, Paul Eggert wrote:
> Adhemerval Zanella wrote:
>> +static inline bool
>> +check_alignment (const void *base, size_t align)
>> +{
>> +  return _STRING_ARCH_unaligned || ((uintptr_t)base % (align - 1)) == 0;
>> +}
>
> Surely the '(align - 1)' was supposed to be 'align'. Has this been tested
> on an architecture that does not allow unaligned access?

Yes, I checked on a sparc64 machine. This test is similar to the Linux kernel
one at lib/sort.c.

>
>> +static inline void
>> +swap_generic (void *a, void *b, size_t size)
>
> Why is this inline? It's used only as a function pointer, and the other
> functions so used are not declared inline.

It should not be; I fixed it locally.

>
>> +static inline swap_t
>> +select_swap_func (const void *base, size_t size)
>> +{
>> +  if (size == 4 && check_alignment (base, 4))
>> +    return swap_u32;
>> +  else if (size == 8 && check_alignment (base, 8))
>> +    return swap_u64;
>> +  return swap_generic;
>> +}
>
> The conditions aren't portable enough. Use something like this instead for
> swap_u32, and similarly for swap_u64.
>
>   if (size == sizeof (uint32_t) && check_alignment (base, alignof (uint32_t)))
>     return swap_u32;

Ack, fixed locally.

>
>> +static void
>> +swap_u32 (void *a, void *b, size_t size)
>
> The pointer arguments should be declared 'void *restrict'. This can help GCC
> generate better code. Similarly for the other swap functions.
>
>> +  uint32_t tmp = *(uint32_t*) a;
>> +  *(uint32_t*) a = *(uint32_t*) b;
>> +  *(uint32_t*) b = tmp;
>
> It's nicer to avoid casts when possible, as is the case here and elsewhere.
> This is because casts are too powerful in C. Something like this, say:
>
>   uint32_t *ua = a, *ub = b, tmp = *ua;
>   *ua = *ub, *ub = tmp;

Right, I changed it to your suggestion.

>
>> +  unsigned char tmp[128];
>
> Why 128? A comment seems called for.

It is indeed an arbitrary value based on real-world usage; it covers all GCC
and Firefox usage (the largest key size GCC uses is 56 and Firefox's is 40).
I will add a comment about it.

>
>> +static inline void
>> +swap_generic (void *a, void *b, size_t size)
>> +{
>> +  unsigned char tmp[128];
>> +  do
>> +    {
>> +      size_t s = size > sizeof (tmp) ? sizeof (tmp) : size;
>> +      memcpy (tmp, a, s);
>> +      a = __mempcpy (a, b, s);
>> +      b = __mempcpy (b, tmp, s);
>> +      size -= s;
>> +    }
>> +  while (size > 0);
>> +}
>
> On my platform (GCC 7.2.1 20170915 (Red Hat 7.2.1-2) x86-64) this inlined the
> memcpy but not the mempcpy calls. How about something like this instead? It
> should let the compiler do a better job of block-move-style operations in the
> loop. If mempcpy is inlined for you, feel free to substitute it for two of
> the loop's calls to memcpy.
>
>   static void
>   swap_generic (void *restrict va, void *restrict vb, size_t size)
>   {
>     char *a = va, *b = vb;
>     enum { n = 128 }; /* Why 128?  */
>     unsigned char tmp[n];
>     while (size >= n)
>       {
>         memcpy (tmp, a, n);
>         memcpy (a, b, n);
>         memcpy (b, tmp, n);
>         a += n;
>         b += n;
>         size -= n;
>       }
>     memcpy (tmp, a, size);
>     memcpy (a, b, size);
>     memcpy (b, tmp, size);
>   }

Inlining is not always a gain for this specific code; it depends on
1. the minimum ISA the compiler uses to generate the inline variants, and
2. which ifunc variants glibc provides for mempcpy (if it is exposed for
internal calls). On recent x86_64, for instance, the mempcpy call will
dispatch to __memmove_avx_unaligned_erms, which is faster than the default
SSE2 inline variants.
Your suggestion turns out to be in fact slower (base is the current approach)
on my machine (i7-4790K):

Results for member size 32

  Sorted
  nmemb   |      base |   patched |   diff
  32      |      1184 |      1268 |   7.09
  4096    |    325865 |    333332 |   2.29
  32768   |   3331750 |   3431695 |   3.00
  524288  |  69067176 |  68805735 |  -0.38

  Repeated
  nmemb   |      base |   patched |   diff
  32      |      4813 |      5779 |  20.07
  4096    |   1624137 |   1972045 |  21.42
  32768   |  15896739 |  19705289 |  23.96
  524288  | 316328778 | 393942797 |  24.54

  MostlySorted
  nmemb   |      base |   patched |   diff
  32      |      5332 |      6198 |  16.24
  4096    |   1312703 |   1563919 |  19.14
  32768   |  12360726 |  14990070 |  21.27
  524288  | 231603294 | 283228681 |  22.29

  Unsorted
  nmemb   |      base |   patched |   diff
  32      |      6047 |      7115 |  17.66
  4096    |   1695241 |   2010943 |  18.62
  32768   |  16430388 |  19636166 |  19.51
  524288  | 329496913 | 395355847 |  19.99

In fact, if I use -fno-builtin to force the memcpy call to go through the
ifunc, I get another speedup (and it could be faster, for x86_64 at least,
if glibc were built with -mavx):

Results for member size 32

  Sorted
  nmemb   |      base |   patched |   diff
  32      |      1184 |      1240 |   4.73
  4096    |    325865 |    326596 |   0.22
  32768   |   3331750 |   3613807 |   8.47
  524288  |  69067176 |  74352201 |   7.65

  Repeated
  nmemb   |      base |   patched |   diff
  32      |      4813 |      4133 | -14.13
  4096    |   1624137 |   1707452 |   5.13
  32768   |  15896739 |  13999315 | -11.94
  524288  | 316328778 | 280461810 | -11.34

  MostlySorted
  nmemb   |      base |   patched |   diff
  32      |      5332 |      4681 | -12.21
  4096    |   1312703 |   1226684 |  -6.55
  32768   |  12360726 |  11362772 |  -8.07
  524288  | 231603294 | 212250739 |  -8.36

  Unsorted
  nmemb   |      base |   patched |   diff
  32      |      6047 |      6676 |  10.40
  4096    |   1695241 |   1492257 | -11.97
  32768   |  16430388 |  14799600 |  -9.93
  524288  | 329496913 | 303681410 |  -7.83

It might be that your approach is faster for other architectures which do not
have an ifunc mempcpy, but I do not want to over-engineer this code since most
real-world cases correspond to key sizes of 4 and 8.
On Mon, 22 Jan 2018, Adhemerval Zanella wrote:
> On 22/01/2018 06:27, Paul Eggert wrote:
> > Adhemerval Zanella wrote:
> >> +static inline bool
> >> +check_alignment (const void *base, size_t align)
> >> +{
> >> +  return _STRING_ARCH_unaligned || ((uintptr_t)base % (align - 1)) == 0;
> >> +}
> >
> > Surely the '(align - 1)' was supposed to be 'align'. Has this been tested
> > on an architecture that does not allow unaligned access?
>
> Yes, I checked on a sparc64 machine. This test is similar to the Linux kernel
> one at lib/sort.c.

But the kernel source correctly uses '&' there rather than '%'.

Alexander
> On 22 Jan 2018, at 11:46, Alexander Monakov <amonakov@ispras.ru> wrote:
>
>> On Mon, 22 Jan 2018, Adhemerval Zanella wrote:
>>> On 22/01/2018 06:27, Paul Eggert wrote:
>>> Adhemerval Zanella wrote:
>>>> +static inline bool
>>>> +check_alignment (const void *base, size_t align)
>>>> +{
>>>> +  return _STRING_ARCH_unaligned || ((uintptr_t)base % (align - 1)) == 0;
>>>> +}
>>>
>>> Surely the '(align - 1)' was supposed to be 'align'. Has this been tested
>>> on an architecture that does not allow unaligned access?
>>
>> Yes, I checked on a sparc64 machine. This test is similar to the Linux kernel
>> one at lib/sort.c.
>
> But the kernel source correctly uses '&' there rather than '%'.

Indeed, I assume I got lucky on sparc64 (the test only uses aligned buffers).
I fixed it locally.
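[Editorial note: a sketch of the corrected helper as settled in this
sub-thread, mirroring the mask test in the kernel's lib/sort.c. It assumes
align is a power of two, in which case masking with (align - 1) is the same
as testing the remainder modulo align; the glibc version would also keep the
_STRING_ARCH_unaligned short-circuit.]

  #include <stdbool.h>
  #include <stddef.h>
  #include <stdint.h>

  static inline bool
  check_alignment (const void *base, size_t align)
  {
    /* Mask with (align - 1) rather than taking the remainder of it.  */
    return ((uintptr_t) base & (align - 1)) == 0;
  }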
On 01/22/2018 02:55 AM, Adhemerval Zanella wrote:
> On 22/01/2018 06:27, Paul Eggert wrote:
>> Adhemerval Zanella wrote:
>>> +static inline bool
>>> +check_alignment (const void *base, size_t align)
>>> +{
>>> +  return _STRING_ARCH_unaligned || ((uintptr_t)base % (align - 1)) == 0;
>>> +}
>> Surely the '(align - 1)' was supposed to be 'align'. Has this been tested
>> on an architecture that does not allow unaligned access?
> Yes, I checked on a sparc64 machine. This test is similar to the Linux kernel
> one at lib/sort.c.

The Linux kernel lib/sort.c test is (((unsigned long)base & (align - 1)) == 0),
which is correct. The test above uses '% (align - 1)' instead, which is clearly
wrong; the "%" should be "&". As the tests evidently did not catch the error,
they need to be improved to catch it.

> It might be that your approach is faster for other architectures which do not
> have an ifunc mempcpy, but I do not want to over-engineer this code since most
> real-world cases correspond to key sizes of 4 and 8.

Thanks, that all makes sense.

One other question. Would it improve performance to partially evaluate qsort
for the case where the key size is that of a pointer, to allow the swap to be
done inline with four insns? I would imagine that this is the most common case.
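[Editorial note: a hypothetical sketch of the kind of test coverage asked for
above: sort the same uint32_t data from an aligned and then a deliberately
misaligned base, so both the specialized and the generic swap paths are
exercised. This is not part of the patch or the glibc test suite; all names
are made up, and the comparator copies bytes out so it makes no alignment
assumptions of its own.]

  #include <assert.h>
  #include <stdalign.h>
  #include <stdint.h>
  #include <stdlib.h>
  #include <string.h>

  static int
  cmp_u32 (const void *a, const void *b)
  {
    uint32_t x, y;
    memcpy (&x, a, sizeof x);
    memcpy (&y, b, sizeof y);
    return (x > y) - (x < y);
  }

  static void
  check_sorted (const unsigned char *p, size_t n)
  {
    for (size_t i = 1; i < n; i++)
      {
        uint32_t prev, cur;
        memcpy (&prev, p + (i - 1) * sizeof (uint32_t), sizeof prev);
        memcpy (&cur, p + i * sizeof (uint32_t), sizeof cur);
        assert (prev <= cur);
      }
  }

  int
  main (void)
  {
    enum { N = 1024 };
    /* One extra byte so the base can be shifted by one.  */
    static alignas (uint32_t) unsigned char buf[N * sizeof (uint32_t) + 1];

    for (size_t off = 0; off < 2; off++)   /* aligned, then misaligned */
      {
        unsigned char *base = buf + off;
        for (size_t i = 0; i < N; i++)
          {
            uint32_t v = (uint32_t) rand ();
            memcpy (base + i * sizeof (uint32_t), &v, sizeof v);
          }
        qsort (base, N, sizeof (uint32_t), cmp_u32);
        check_sorted (base, N);
      }
    return 0;
  }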
On 22/01/2018 15:15, Paul Eggert wrote:
> On 01/22/2018 02:55 AM, Adhemerval Zanella wrote:
>> On 22/01/2018 06:27, Paul Eggert wrote:
>>> Adhemerval Zanella wrote:
>>>> +static inline bool
>>>> +check_alignment (const void *base, size_t align)
>>>> +{
>>>> +  return _STRING_ARCH_unaligned || ((uintptr_t)base % (align - 1)) == 0;
>>>> +}
>>> Surely the '(align - 1)' was supposed to be 'align'. Has this been tested
>>> on an architecture that does not allow unaligned access?
>> Yes, I checked on a sparc64 machine. This test is similar to the Linux kernel
>> one at lib/sort.c.
>
> The Linux kernel lib/sort.c test is (((unsigned long)base & (align - 1)) == 0),
> which is correct. The test above uses '% (align - 1)' instead, which is clearly
> wrong; the "%" should be "&". As the tests evidently did not catch the error,
> they need to be improved to catch it.

Indeed, Alexander Monakov pointed out this issue in his message and I have
fixed it locally.

>
>> It might be that your approach is faster for other architectures which do not
>> have an ifunc mempcpy, but I do not want to over-engineer this code since most
>> real-world cases correspond to key sizes of 4 and 8.
>
> Thanks, that all makes sense.
>
> One other question. Would it improve performance to partially evaluate qsort
> for the case where the key size is that of a pointer, to allow the swap to be
> done inline with four insns? I would imagine that this is the most common case.

I noted that, at least for x86_64, calling a function pointer is slightly
faster than embedding the test in a switch (as the current msort does). One
option I have not tested, and which would trade code size for performance, is
to parametrize the qsort creation (as for the 7/7 patch in this set) to have
qsort_uint32_t, qsort_uint64_t, and qsort_generic, for instance (which call
the swap inline).

So we would have something like:

  void qsort (void *pbase, size_t total_elems, size_t size)
  {
    if (size == sizeof (uint32_t)
        && check_alignment (base, sizeof (uint32_t)))
      return qsort_uint32_t (pbase, total_elems, size);
    else if (size == sizeof (uint64_t)
             && check_alignment (base, sizeof (uint64_t)))
      return qsort_uint64_t (pbase, total_elems, size);
    return qsort_generic (pbase, total_elems, size);
  }
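[Editorial note: a hypothetical illustration of the parametrization idea
discussed above, not the actual 7/7 patch: a macro stamps out one sort body
per fixed element type so the swap is inlined, and a dispatcher like the one
sketched in the message above would route calls of the matching size and
alignment to them. An insertion sort keeps the sketch short; sort_uint32_t,
sort_uint64_t, and cmp_u32 are made-up names.]

  #include <inttypes.h>
  #include <stddef.h>
  #include <stdint.h>
  #include <stdio.h>

  typedef int (*cmp_t) (const void *, const void *, void *);

  /* Define one sort body per fixed element TYPE; the swap is inlined and
     compiles to a handful of loads and stores.  */
  #define DEFINE_SORT_FIXED(NAME, TYPE)                                       \
  static void                                                                 \
  NAME (void *pbase, size_t n, cmp_t cmp, void *arg)                          \
  {                                                                           \
    TYPE *base = pbase;                                                       \
    for (size_t i = 1; i < n; i++)                                            \
      for (size_t j = i; j > 0 && cmp (&base[j], &base[j - 1], arg) < 0; j--) \
        {                                                                     \
          TYPE tmp = base[j];                                                 \
          base[j] = base[j - 1];                                              \
          base[j - 1] = tmp;                                                  \
        }                                                                     \
  }

  DEFINE_SORT_FIXED (sort_uint32_t, uint32_t)
  DEFINE_SORT_FIXED (sort_uint64_t, uint64_t)

  static int
  cmp_u32 (const void *a, const void *b, void *arg)
  {
    const uint32_t *x = a, *y = b;
    return (*x > *y) - (*x < *y);
  }

  int
  main (void)
  {
    uint32_t v[] = { 4, 1, 3, 2 };
    sort_uint32_t (v, 4, cmp_u32, NULL);
    for (int i = 0; i < 4; i++)
      printf ("%" PRIu32 "\n", v[i]);
    return 0;
  }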
On 01/22/2018 09:48 AM, Adhemerval Zanella wrote:
> One option I have not tested, and which would trade code size for performance,
> is to parametrize the qsort creation (as for the 7/7 patch in this set) to have
> qsort_uint32_t, qsort_uint64_t, and qsort_generic, for instance (which call
> the swap inline).
>
> So we would have something like:
>
>   void qsort (void *pbase, size_t total_elems, size_t size)
>   {
>     if (size == sizeof (uint32_t)
>         && check_alignment (base, sizeof (uint32_t)))
>       return qsort_uint32_t (pbase, total_elems, size);
>     else if (size == sizeof (uint64_t)
>              && check_alignment (base, sizeof (uint64_t)))
>       return qsort_uint64_t (pbase, total_elems, size);
>     return qsort_generic (pbase, total_elems, size);
>   }

Yes, that's the option I was thinking of, except I was thinking that the first
test should be "if (size == sizeof (void *) && check_alignment (base, alignof
(void *))) return qsort_voidptr (pbase, total_elems, size);" because sorting
arrays of pointers is the most common case. (Also, check_alignment's argument
should use alignof, not sizeof.)
On 22/01/2018 16:29, Paul Eggert wrote:
> On 01/22/2018 09:48 AM, Adhemerval Zanella wrote:
>> One option I have not tested, and which would trade code size for
>> performance, is to parametrize the qsort creation (as for the 7/7 patch in
>> this set) to have qsort_uint32_t, qsort_uint64_t, and qsort_generic, for
>> instance (which call the swap inline).
>>
>> So we would have something like:
>>
>>   void qsort (void *pbase, size_t total_elems, size_t size)
>>   {
>>     if (size == sizeof (uint32_t)
>>         && check_alignment (base, sizeof (uint32_t)))
>>       return qsort_uint32_t (pbase, total_elems, size);
>>     else if (size == sizeof (uint64_t)
>>              && check_alignment (base, sizeof (uint64_t)))
>>       return qsort_uint64_t (pbase, total_elems, size);
>>     return qsort_generic (pbase, total_elems, size);
>>   }
>
> Yes, that's the option I was thinking of, except I was thinking that the first
> test should be "if (size == sizeof (void *) && check_alignment (base, alignof
> (void *))) return qsort_voidptr (pbase, total_elems, size);" because sorting
> arrays of pointers is the most common case. (Also, check_alignment's argument
> should use alignof, not sizeof.)

I added the implementation and the results are slightly better:

Results for member size 8

  Sorted
  nmemb   |      base |   patched |   diff
  32      |      1173 |      1282 |   9.29
  4096    |    325485 |    332451 |   2.14
  32768   |   3232255 |   3293842 |   1.91
  524288  |  65645381 |  66182948 |   0.82

  Repeated
  nmemb   |      base |   patched |   diff
  32      |      2074 |      2034 |  -1.93
  4096    |    948339 |    913363 |  -3.69
  32768   |   8906214 |   8651378 |  -2.86
  524288  | 173498547 | 166294093 |  -4.15

  MostlySorted
  nmemb   |      base |   patched |   diff
  32      |      2211 |      2147 |  -2.89
  4096    |    757543 |    739765 |  -2.35
  32768   |   7785343 |   7570811 |  -2.76
  524288  | 133912169 | 129728791 |  -3.12

  Unsorted
  nmemb   |      base |   patched |   diff
  32      |      2219 |      2191 |  -1.26
  4096    |   1017790 |    989068 |  -2.82
  32768   |   9747216 |   9456092 |  -2.99
  524288  | 191726744 | 185012121 |  -3.50

At the cost of a larger text size and slightly more code:

  # Before
  $ size stdlib/qsort.os
     text    data     bss     dec     hex filename
     2578       0       0    2578     a12 stdlib/qsort.os

  # After
  $ size stdlib/qsort.os
     text    data     bss     dec     hex filename
     6037       0       0    6037    1795 stdlib/qsort.os

I still prefer my version, which generates a shorter text segment and also
optimizes for uint32_t.
Adhemerval Zanella wrote:
> At the cost of a larger text size and slightly more code:

Yes, that's a common tradeoff for this sort of optimization. My guess is that
most glibc users these days would like to spend 4 kB of text space to gain a
2%-or-so CPU speedup. (But it's just a guess. :-)

> I still prefer my version, which generates a shorter text segment and also
> optimizes for uint32_t.

The more-inlined version could also optimize for uint32_t. Such an optimization
should not change the machine code on platforms with 32-bit pointers (since
uint32_t has the same size and alignment restrictions as void *, and GCC should
be smart enough to figure this out) but should speed up the size-4 case on
platforms with 64-bit pointers.

Any thoughts on why the more-inlined version is a bit slower when the input is
already sorted?
On 23/01/2018 04:04, Paul Eggert wrote:
> Adhemerval Zanella wrote:
>> At the cost of a larger text size and slightly more code:
>
> Yes, that's a common tradeoff for this sort of optimization. My guess is that
> most glibc users these days would like to spend 4 kB of text space to gain a
> 2%-or-so CPU speedup. (But it's just a guess. :-)
>
>> I still prefer my version, which generates a shorter text segment and also
>> optimizes for uint32_t.
>
> The more-inlined version could also optimize for uint32_t. Such an
> optimization should not change the machine code on platforms with 32-bit
> pointers (since uint32_t has the same size and alignment restrictions as
> void *, and GCC should be smart enough to figure this out) but should speed
> up the size-4 case on platforms with 64-bit pointers.
>
> Any thoughts on why the more-inlined version is a bit slower when the input
> is already sorted?

Again, do we really need to over-engineer it? Profiling GCC shows 95% of the
total calls issued are with up to 9 elements and 92% with a key size of 8.
Firefox is somewhat more diverse, with 72% up to 17 elements and 95% with a
key size of 8.

I think that adding even more code complexity by parametrizing the qsort calls
to inline the swap operations won't really make much difference in the
aforementioned use cases. I would rather add specialized sort implementations,
such as the BSD family provides (heapsort and mergesort), to offer different
algorithms for different constraints (mergesort for a stable sort,
heapsort/mergesort to avoid quicksort's worst case). We might even extend it
to add something like introsort.
On 01/23/2018 10:28 AM, Adhemerval Zanella wrote:
> Again, do we really need to over-engineer it? Profiling GCC shows 95% of the
> total calls issued are with up to 9 elements and 92% with a key size of 8.
> Firefox is somewhat more diverse, with 72% up to 17 elements and 95% with a
> key size of 8.

You have a point. I assume these were on machines with 64-bit pointers. In
that case, why bother with a size-4 special case? Special-casing pointer-size
items should suffice.

> I would rather add specialized sort implementations, such as the BSD family
> provides (heapsort and mergesort), to offer different algorithms for
> different constraints (mergesort for a stable sort, heapsort/mergesort to
> avoid quicksort's worst case). We might even extend it to add something like
> introsort.

Each of us over-engineers in his own way (:-).
On 23/01/2018 21:37, Paul Eggert wrote:
> On 01/23/2018 10:28 AM, Adhemerval Zanella wrote:
>
>> Again, do we really need to over-engineer it? Profiling GCC shows 95% of the
>> total calls issued are with up to 9 elements and 92% with a key size of 8.
>> Firefox is somewhat more diverse, with 72% up to 17 elements and 95% with a
>> key size of 8.
>
> You have a point. I assume these were on machines with 64-bit pointers. In
> that case, why bother with a size-4 special case? Special-casing pointer-size
> items should suffice.

Yes, I just tested on x86_64, and I added the size-4 case mainly because it is
quite simple in terms of code complexity and resulting code size.

>
>> I would rather add specialized sort implementations, such as the BSD family
>> provides (heapsort and mergesort), to offer different algorithms for
>> different constraints (mergesort for a stable sort, heapsort/mergesort to
>> avoid quicksort's worst case). We might even extend it to add something like
>> introsort.
>
> Each of us over-engineers in his own way (:-).

I do think your points are fair; most usage of qsort already hits the quicksort
implementation (due to the total size of the array), and for these cases it
will see a small speedup from the swap optimization, as well as the undefined
behaviour fix for qsort_r. I think that if speed is the focus, there are other
ideas to optimize for it, like BZ#17941.
diff --git a/stdlib/qsort.c b/stdlib/qsort.c
index b3a5102..2194003 100644
--- a/stdlib/qsort.c
+++ b/stdlib/qsort.c
@@ -23,20 +23,59 @@
 #include <limits.h>
 #include <stdlib.h>
 #include <string.h>
+#include <stdbool.h>
 
-/* Byte-wise swap two items of size SIZE. */
-#define SWAP(a, b, size)                                                      \
-  do                                                                          \
-    {                                                                         \
-      size_t __size = (size);                                                 \
-      char *__a = (a), *__b = (b);                                            \
-      do                                                                      \
-        {                                                                     \
-          char __tmp = *__a;                                                  \
-          *__a++ = *__b;                                                      \
-          *__b++ = __tmp;                                                     \
-        } while (--__size > 0);                                               \
-    } while (0)
+/* Swap SIZE bytes between addresses A and B.  Helper to generic types
+   are provided as an optimization.  */
+
+typedef void (*swap_t)(void *, void *, size_t);
+
+static inline bool
+check_alignment (const void *base, size_t align)
+{
+  return _STRING_ARCH_unaligned || ((uintptr_t)base % (align - 1)) == 0;
+}
+
+static void
+swap_u32 (void *a, void *b, size_t size)
+{
+  uint32_t tmp = *(uint32_t*) a;
+  *(uint32_t*) a = *(uint32_t*) b;
+  *(uint32_t*) b = tmp;
+}
+
+static void
+swap_u64 (void *a, void *b, size_t size)
+{
+  uint64_t tmp = *(uint64_t*) a;
+  *(uint64_t*) a = *(uint64_t*) b;
+  *(uint64_t*) b = tmp;
+}
+
+static inline void
+swap_generic (void *a, void *b, size_t size)
+{
+  unsigned char tmp[128];
+  do
+    {
+      size_t s = size > sizeof (tmp) ? sizeof (tmp) : size;
+      memcpy (tmp, a, s);
+      a = __mempcpy (a, b, s);
+      b = __mempcpy (b, tmp, s);
+      size -= s;
+    }
+  while (size > 0);
+}
+
+static inline swap_t
+select_swap_func (const void *base, size_t size)
+{
+  if (size == 4 && check_alignment (base, 4))
+    return swap_u32;
+  else if (size == 8 && check_alignment (base, 8))
+    return swap_u64;
+  return swap_generic;
+}
 
 /* Discontinue quicksort algorithm when partition gets below this size.
    This particular magic number was chosen to work best on a Sun 4/260. */
@@ -96,6 +135,8 @@ __qsort_r (void *const pbase, size_t total_elems, size_t size,
     /* Avoid lossage with unsigned arithmetic below.  */
     return;
 
+  swap_t swap = select_swap_func (pbase, size);
+
   if (total_elems > MAX_THRESH)
     {
       char *lo = base_ptr;
@@ -119,13 +160,13 @@ __qsort_r (void *const pbase, size_t total_elems, size_t size,
           char *mid = lo + size * ((hi - lo) / size >> 1);
 
           if ((*cmp) ((void *) mid, (void *) lo, arg) < 0)
-            SWAP (mid, lo, size);
+            swap (mid, lo, size);
           if ((*cmp) ((void *) hi, (void *) mid, arg) < 0)
-            SWAP (mid, hi, size);
+            swap (mid, hi, size);
           else
             goto jump_over;
           if ((*cmp) ((void *) mid, (void *) lo, arg) < 0)
-            SWAP (mid, lo, size);
+            swap (mid, lo, size);
         jump_over:;
 
           left_ptr  = lo + size;
@@ -144,7 +185,7 @@ __qsort_r (void *const pbase, size_t total_elems, size_t size,
 
               if (left_ptr < right_ptr)
                 {
-                  SWAP (left_ptr, right_ptr, size);
+                  swap (left_ptr, right_ptr, size);
                   if (mid == left_ptr)
                     mid = right_ptr;
                   else if (mid == right_ptr)
@@ -216,7 +257,7 @@ __qsort_r (void *const pbase, size_t total_elems, size_t size,
 
         tmp_ptr = run_ptr;
         if (tmp_ptr != base_ptr)
-          SWAP (tmp_ptr, base_ptr, size);
+          swap (tmp_ptr, base_ptr, size);
 
   /* Insertion sort, running from left-hand-side up to right-hand-side.  */