Message ID | 20240304175902.1479562-1-adhemerval.zanella@linaro.org
State      | Superseded
Series     | riscv: Fix alignment-ignorant memcpy implementation
On 04/03/24 14:59, Adhemerval Zanella wrote:
> The memcpy optimization (commit 587a1290a1af7bee6db) has a series
> of mistakes:
>
>   - The implementation is wrong: the chunk size calculation is
>     incorrect, leading to invalid memory accesses.
>
>   - It adds ifunc support by default, so --disable-multi-arch does
>     not work as expected for riscv.
>
>   - It mixes Linux files (the memcpy ifunc selection, which requires
>     the vDSO/syscall mechanism) with generic support (the memcpy
>     optimization itself).
>
>   - There is no __libc_ifunc_impl_list, which makes testing check only
>     the selected implementation instead of all implementations
>     supported by the system.
>
> Also, there is no reason why the implementation can't be coded in C,
> since it uses only normal registers and the compiler is able to
> generate code as good as the assembly implementation.  I have not
> checked the performance, but the C implementation uses the same
> strategies, the code generated with gcc 13 seems straightforward, and
> the tail code also avoids byte operations.
>
> This patch also simplifies the bits required to enable ifunc: there
> is no need for memcopy.h, nor for Linux-specific files.
>
> Checked on riscv64 and riscv32 by explicitly enabling the function
> in __libc_ifunc_impl_list on qemu-system.
> ---
>  sysdeps/riscv/memcpy_noalignment.S            | 136 ------------------
>  .../multiarch}/memcpy-generic.c               |   8 +-
>  sysdeps/riscv/multiarch/memcpy_noalignment.c  | 100 +++++++++++++
>  sysdeps/unix/sysv/linux/riscv/Makefile        |   9 --
>  sysdeps/unix/sysv/linux/riscv/hwprobe.c       |   1 +
>  .../sysv/linux/riscv/include/sys/hwprobe.h    |   8 ++
>  .../unix/sysv/linux/riscv/multiarch/Makefile  |   7 +
>  .../linux/riscv/multiarch/ifunc-impl-list.c}  |  33 +++--
>  .../sysv/linux/riscv/multiarch}/memcpy.c      |  16 +--
>  9 files changed, 151 insertions(+), 167 deletions(-)
>  delete mode 100644 sysdeps/riscv/memcpy_noalignment.S
>  rename sysdeps/{unix/sysv/linux/riscv => riscv/multiarch}/memcpy-generic.c (87%)
>  create mode 100644 sysdeps/riscv/multiarch/memcpy_noalignment.c
>  create mode 100644 sysdeps/unix/sysv/linux/riscv/include/sys/hwprobe.h
>  create mode 100644 sysdeps/unix/sysv/linux/riscv/multiarch/Makefile
>  rename sysdeps/{riscv/memcopy.h => unix/sysv/linux/riscv/multiarch/ifunc-impl-list.c} (51%)
>  rename sysdeps/{riscv => unix/sysv/linux/riscv/multiarch}/memcpy.c (90%)
>
> [... diff trimmed down to the relevant tail hunk of
> sysdeps/riscv/multiarch/memcpy_noalignment.c; the full patch is below ...]
>
> +tail:
> +  if (n & 4)

And I forgot to add that this should be:

  if (sizeof (op_t) == sizeof (uint64_t) && n & 4)

since there is no need to enable this for riscv32.
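As a stand-alone sketch of that correction (with a placeholder op_t
typedef in place of glibc's string-optype.h definition; this is an
illustration, not the code actually posted), the corrected tail would
read roughly:

  #include <stddef.h>
  #include <stdint.h>

  typedef unsigned long op_t;   /* Placeholder for glibc's op_t.  */
  typedef uint32_t __attribute__ ((__may_alias__)) op32_t;
  typedef uint16_t __attribute__ ((__may_alias__)) op16_t;

  /* Copy the final 0 <= n < sizeof (op_t) bytes from S to D.  */
  static inline void
  copy_tail (unsigned char *d, const unsigned char *s, size_t n)
  {
    /* On riscv32 op_t is 4 bytes, so n < 4 here and this branch is
       dead; the compile-time sizeof test lets the compiler delete it
       instead of emitting an unreachable 4-byte copy.  */
    if (sizeof (op_t) == sizeof (uint64_t) && (n & 4))
      {
        *(op32_t *) d = *(op32_t *) s;
        s += sizeof (op32_t);
        d += sizeof (op32_t);
      }
    if (n & 2)
      {
        *(op16_t *) d = *(op16_t *) s;
        s += sizeof (op16_t);
        d += sizeof (op16_t);
      }
    if (n & 1)
      *d = *s;
  }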
On Mar 04 2024, Adhemerval Zanella wrote:

> Also, there is no reason why the implementation can't be coded in C,
> since it uses only normal registers and the compiler is able to
> generate code as good as the assembly implementation.  I have not
> checked the performance, but the C implementation uses the same
> strategies, the code generated with gcc 13 seems straightforward, and
> the tail code also avoids byte operations.

RISC-V is a strict-alignment target, and there is a non-zero chance
that the C compiler messes this up.
On 04/03/24 15:24, Andreas Schwab wrote:
> On Mar 04 2024, Adhemerval Zanella wrote:
>
>> Also, there is no reason why the implementation can't be coded in C,
>> since it uses only normal registers and the compiler is able to
>> generate code as good as the assembly implementation.  I have not
>> checked the performance, but the C implementation uses the same
>> strategies, the code generated with gcc 13 seems straightforward, and
>> the tail code also avoids byte operations.
>
> RISC-V is a strict-alignment target, and there is a non-zero chance
> that the C compiler messes this up.

I am aware, but there are other projects that successfully provide C
implementations (musl, for instance, where it optimizes for aligned
access), so I think that if this implementation does have any mistake,
it should not be considered a blocker for this change.
On 04/03/24 15:28, Adhemerval Zanella Netto wrote:
> On 04/03/24 15:24, Andreas Schwab wrote:
>> On Mar 04 2024, Adhemerval Zanella wrote:
>>
>>> [...]
>>
>> RISC-V is a strict-alignment target, and there is a non-zero chance
>> that the C compiler messes this up.
>
> I am aware, but there are other projects that successfully provide C
> implementations (musl, for instance, where it optimizes for aligned
> access), so I think that if this implementation does have any mistake,
> it should not be considered a blocker for this change.

Maybe we should force -mno-strict-align for the unaligned implementation.
On 04/03/24 15:30, Adhemerval Zanella Netto wrote:
> On 04/03/24 15:28, Adhemerval Zanella Netto wrote:
>> On 04/03/24 15:24, Andreas Schwab wrote:
>>> [...]
>>>
>>> RISC-V is a strict-alignment target, and there is a non-zero chance
>>> that the C compiler messes this up.
>>
>> I am aware, but there are other projects that successfully provide C
>> implementations (musl, for instance, where it optimizes for aligned
>> access), so I think that if this implementation does have any mistake,
>> it should not be considered a blocker for this change.
>
> Maybe we should force -mno-strict-align for the unaligned implementation.

And at least with GCC 13.1 I see no difference besides the
'.attribute unaligned_access, 1'.
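If the thread settles on forcing the flag, the natural place to express
it would be a per-file flag in the string subdir, following glibc's
CFLAGS-<file> convention; a hypothetical fragment, not part of the
posted patch, and subject to the -mtune caveat discussed below:

  ifeq ($(subdir),string)
  # Hypothetical: build only the unaligned variant with unaligned
  # codegen enabled.
  CFLAGS-memcpy_noalignment.c += -mno-strict-align
  endif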
On Mon, 04 Mar 2024 10:30:10 PST (-0800), adhemerval.zanella@linaro.org wrote:
> On 04/03/24 15:28, Adhemerval Zanella Netto wrote:
>> On 04/03/24 15:24, Andreas Schwab wrote:
>>> [...]
>>>
>>> RISC-V is a strict-alignment target, and there is a non-zero chance
>>> that the C compiler messes this up.
>>
>> I am aware, but there are other projects that successfully provide C
>> implementations (musl, for instance, where it optimizes for aligned
>> access), so I think that if this implementation does have any mistake,
>> it should not be considered a blocker for this change.
>
> Maybe we should force -mno-strict-align for the unaligned implementation.

I think that should do it.  We'd had it as an outstanding TODO for a
while, but nobody had gotten around to benchmarking it.

From your other email it looks like these are just broken, though, so
maybe we just move to the C ones now?  Then anyone who wants to add
assembly can benchmark it, but at least we'll have something correct.
On 3/4/24 10:30, Adhemerval Zanella Netto wrote:
> On 04/03/24 15:28, Adhemerval Zanella Netto wrote:
>> [...]
>
> Maybe we should force -mno-strict-align for the unaligned implementation.

That alone might not be sufficient.  RISC-V gcc gates alignment with an
additional cpu tune param, and the latter overrides the usual
-m[no-]strict-align: meaning you could have more strict alignment, but
not any less.  The default -mtune is rocket, which penalizes unaligned
access.
On 04/03/24 16:04, Vineet Gupta wrote:
> On 3/4/24 10:30, Adhemerval Zanella Netto wrote:
>> [...]
>> Maybe we should force -mno-strict-align for the unaligned implementation.
>
> That alone might not be sufficient.  RISC-V gcc gates alignment with an
> additional cpu tune param, and the latter overrides the usual
> -m[no-]strict-align: meaning you could have more strict alignment, but
> not any less.  The default -mtune is rocket, which penalizes unaligned
> access.

How do gcc optimization and strict-align play with __may_alias__?  Would
gcc still try to generate aligned accesses in this case?
On 3/4/24 11:07, Adhemerval Zanella Netto wrote:
> On 04/03/24 16:04, Vineet Gupta wrote:
>> That alone might not be sufficient.  RISC-V gcc gates alignment with an
>> additional cpu tune param, and the latter overrides the usual
>> -m[no-]strict-align: meaning you could have more strict alignment, but
>> not any less.  The default -mtune is rocket, which penalizes unaligned
>> access.
>
> How do gcc optimization and strict-align play with __may_alias__?  Would
> gcc still try to generate aligned accesses in this case?

I'm not sure about the semantics of this attribute.  However, the gcc
backend effectively enforces -mstrict-align for the default -mtune/-mcpu,
overriding -mno-strict-align:

  riscv_slow_unaligned_access_p = (cpu->tune_param->slow_unaligned_access
                                   || TARGET_STRICT_ALIGN);

  /* Implement TARGET_SLOW_UNALIGNED_ACCESS.  */

  static bool
  riscv_slow_unaligned_access (machine_mode, unsigned int)
  {
    return riscv_slow_unaligned_access_p;
  }
On 04/03/24 16:30, Vineet Gupta wrote:
> On 3/4/24 11:07, Adhemerval Zanella Netto wrote:
>> How do gcc optimization and strict-align play with __may_alias__?  Would
>> gcc still try to generate aligned accesses in this case?
>
> I'm not sure about the semantics of this attribute.  However, the gcc
> backend effectively enforces -mstrict-align for the default -mtune/-mcpu,
> overriding -mno-strict-align:
>
>   riscv_slow_unaligned_access_p = (cpu->tune_param->slow_unaligned_access
>                                    || TARGET_STRICT_ALIGN);
>
>   /* Implement TARGET_SLOW_UNALIGNED_ACCESS.  */
>
>   static bool
>   riscv_slow_unaligned_access (machine_mode, unsigned int)
>   {
>     return riscv_slow_unaligned_access_p;
>   }

Right, so the question is whether the compiler will really mess up the
unaligned code using __may_alias__, and whether we really should keep
the implementation as an assembly one.
On Mon, 4 Mar 2024, Adhemerval Zanella Netto wrote:

>>> How do gcc optimization and strict-align play with __may_alias__?
>>> Would gcc still try to generate aligned accesses in this case?

may_alias does not imply reduced alignment: this is necessary to avoid
penalizing correct code that forms properly aligned aliased pointers.

Like with may_alias, you can typedef a reduced-alignment type:

  typedef op_t misal_op_t __attribute__((aligned(1)));

> Right, so the question is whether the compiler will really mess up the
> unaligned code using __may_alias__, and whether we really should keep
> the implementation as an assembly one.

As always, if you lie to the compiler, you get to keep the pieces.

The solution here, if you know that a particular instruction works fine
to perform a misaligned load, and the compiler doesn't expose it, is
to make a simple asm wrapper:

  static inline
  op_t load(void *p)
  {
      typedef op_t misal_op_t __attribute__((aligned(1)));
      op_t r;
      asm("lw %0, %1" : "=r"(r) : "m"(*(misal_op_t *)p));
      return r;
  }

Alexander
On 04/03/24 17:13, Alexander Monakov wrote:
> On Mon, 4 Mar 2024, Adhemerval Zanella Netto wrote:
>
>>>> How do gcc optimization and strict-align play with __may_alias__?
>>>> Would gcc still try to generate aligned accesses in this case?
>
> may_alias does not imply reduced alignment: this is necessary to avoid
> penalizing correct code that forms properly aligned aliased pointers.
>
> Like with may_alias, you can typedef a reduced-alignment type:
>
>   typedef op_t misal_op_t __attribute__((aligned(1)));
>
>> Right, so the question is whether the compiler will really mess up the
>> unaligned code using __may_alias__, and whether we really should keep
>> the implementation as an assembly one.
>
> As always, if you lie to the compiler, you get to keep the pieces.
>
> The solution here, if you know that a particular instruction works fine
> to perform a misaligned load, and the compiler doesn't expose it, is
> to make a simple asm wrapper:
>
>   static inline
>   op_t load(void *p)
>   {
>       typedef op_t misal_op_t __attribute__((aligned(1)));
>       op_t r;
>       asm("lw %0, %1" : "=r"(r) : "m"(*(misal_op_t *)p));
>       return r;
>   }

Sigh.  It is this kind of extra hackery that leads me to just keep the
implementation as assembly.
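For completeness, the usual way to express an unaligned op_t access in
plain C (a sketch of the standard GCC packed-struct idiom, not something
proposed in the thread) avoids the inline asm at the cost of leaving the
final codegen to the backend, which is exactly the tune-param problem
discussed above:

  typedef unsigned long op_t;   /* Placeholder for glibc's op_t.  */

  /* A wrapper type that may alias anything and may be misaligned.  */
  typedef struct { op_t v; }
    __attribute__ ((__packed__, __may_alias__)) misal_op_t;

  static inline op_t
  load_unaligned (const void *p)
  {
    /* GCC knows this location may be misaligned; it emits either one
       unaligned load or a byte sequence, per the tune param.  */
    return ((const misal_op_t *) p)->v;
  }

  static inline void
  store_unaligned (void *p, op_t v)
  {
    ((misal_op_t *) p)->v = v;
  }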
diff --git a/sysdeps/riscv/memcpy_noalignment.S b/sysdeps/riscv/memcpy_noalignment.S
deleted file mode 100644
index 621f8d028f..0000000000
--- a/sysdeps/riscv/memcpy_noalignment.S
+++ /dev/null
@@ -1,136 +0,0 @@
-/* memcpy for RISC-V, ignoring buffer alignment
-   Copyright (C) 2024 Free Software Foundation, Inc.
-   This file is part of the GNU C Library.
-
-   The GNU C Library is free software; you can redistribute it and/or
-   modify it under the terms of the GNU Lesser General Public
-   License as published by the Free Software Foundation; either
-   version 2.1 of the License, or (at your option) any later version.
-
-   The GNU C Library is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   Lesser General Public License for more details.
-
-   You should have received a copy of the GNU Lesser General Public
-   License along with the GNU C Library.  If not, see
-   <https://www.gnu.org/licenses/>.  */
-
-#include <sysdep.h>
-#include <sys/asm.h>
-
-/* void *memcpy(void *, const void *, size_t) */
-ENTRY (__memcpy_noalignment)
-        move t6, a0  /* Preserve return value */
-
-        /* Bail if 0 */
-        beqz a2, 7f
-
-        /* Jump to byte copy if size < SZREG */
-        li a4, SZREG
-        bltu a2, a4, 5f
-
-        /* Round down to the nearest "page" size */
-        andi a4, a2, ~((16*SZREG)-1)
-        beqz a4, 2f
-        add a3, a1, a4
-
-        /* Copy the first word to get dest word aligned */
-        andi a5, t6, SZREG-1
-        beqz a5, 1f
-        REG_L a6, (a1)
-        REG_S a6, (t6)
-
-        /* Align dst up to a word, move src and size as well. */
-        addi t6, t6, SZREG-1
-        andi t6, t6, ~(SZREG-1)
-        sub a5, t6, a0
-        add a1, a1, a5
-        sub a2, a2, a5
-
-        /* Recompute page count */
-        andi a4, a2, ~((16*SZREG)-1)
-        beqz a4, 2f
-
-1:
-        /* Copy "pages" (chunks of 16 registers) */
-        REG_L a4, 0(a1)
-        REG_L a5, SZREG(a1)
-        REG_L a6, 2*SZREG(a1)
-        REG_L a7, 3*SZREG(a1)
-        REG_L t0, 4*SZREG(a1)
-        REG_L t1, 5*SZREG(a1)
-        REG_L t2, 6*SZREG(a1)
-        REG_L t3, 7*SZREG(a1)
-        REG_L t4, 8*SZREG(a1)
-        REG_L t5, 9*SZREG(a1)
-        REG_S a4, 0(t6)
-        REG_S a5, SZREG(t6)
-        REG_S a6, 2*SZREG(t6)
-        REG_S a7, 3*SZREG(t6)
-        REG_S t0, 4*SZREG(t6)
-        REG_S t1, 5*SZREG(t6)
-        REG_S t2, 6*SZREG(t6)
-        REG_S t3, 7*SZREG(t6)
-        REG_S t4, 8*SZREG(t6)
-        REG_S t5, 9*SZREG(t6)
-        REG_L a4, 10*SZREG(a1)
-        REG_L a5, 11*SZREG(a1)
-        REG_L a6, 12*SZREG(a1)
-        REG_L a7, 13*SZREG(a1)
-        REG_L t0, 14*SZREG(a1)
-        REG_L t1, 15*SZREG(a1)
-        addi a1, a1, 16*SZREG
-        REG_S a4, 10*SZREG(t6)
-        REG_S a5, 11*SZREG(t6)
-        REG_S a6, 12*SZREG(t6)
-        REG_S a7, 13*SZREG(t6)
-        REG_S t0, 14*SZREG(t6)
-        REG_S t1, 15*SZREG(t6)
-        addi t6, t6, 16*SZREG
-        bltu a1, a3, 1b
-        andi a2, a2, (16*SZREG)-1  /* Update count */
-
-2:
-        /* Remainder is smaller than a page, compute native word count */
-        beqz a2, 7f
-        andi a5, a2, ~(SZREG-1)
-        andi a2, a2, (SZREG-1)
-        add a3, a1, a5
-        /* Jump directly to last word if no words.  */
-        beqz a5, 4f
-
-3:
-        /* Use single native register copy */
-        REG_L a4, 0(a1)
-        addi a1, a1, SZREG
-        REG_S a4, 0(t6)
-        addi t6, t6, SZREG
-        bltu a1, a3, 3b
-
-        /* Jump directly out if no more bytes */
-        beqz a2, 7f
-
-4:
-        /* Copy the last word unaligned */
-        add a3, a1, a2
-        add a4, t6, a2
-        REG_L a5, -SZREG(a3)
-        REG_S a5, -SZREG(a4)
-        ret
-
-5:
-        /* Copy bytes when the total copy is <SZREG */
-        add a3, a1, a2
-
-6:
-        lb a4, 0(a1)
-        addi a1, a1, 1
-        sb a4, 0(t6)
-        addi t6, t6, 1
-        bltu a1, a3, 6b
-
-7:
-        ret
-
-END (__memcpy_noalignment)
diff --git a/sysdeps/unix/sysv/linux/riscv/memcpy-generic.c b/sysdeps/riscv/multiarch/memcpy-generic.c
similarity index 87%
rename from sysdeps/unix/sysv/linux/riscv/memcpy-generic.c
rename to sysdeps/riscv/multiarch/memcpy-generic.c
index f06f4bda15..4235d33859 100644
--- a/sysdeps/unix/sysv/linux/riscv/memcpy-generic.c
+++ b/sysdeps/riscv/multiarch/memcpy-generic.c
@@ -18,7 +18,9 @@
 
 #include <string.h>
 
-extern __typeof (memcpy) __memcpy_generic;
-hidden_proto (__memcpy_generic)
-
+#if IS_IN(libc)
+# define MEMCPY __memcpy_generic
+# undef libc_hidden_builtin_def
+# define libc_hidden_builtin_def(x)
+#endif
 #include <string/memcpy.c>
diff --git a/sysdeps/riscv/multiarch/memcpy_noalignment.c b/sysdeps/riscv/multiarch/memcpy_noalignment.c
new file mode 100644
index 0000000000..9552ae45fc
--- /dev/null
+++ b/sysdeps/riscv/multiarch/memcpy_noalignment.c
@@ -0,0 +1,100 @@
+/* Copy memory to memory until the specified number of bytes.
+   Copyright (C) 2024 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <string.h>
+#include <stdint.h>
+#include <string-optype.h>
+
+/* memcpy optimization for chips with fast unaligned support
+   (RISCV_HWPROBE_MISALIGNED_FAST).  */
+
+#define OPSIZ (sizeof (op_t))
+
+typedef uint32_t __attribute__ ((__may_alias__)) op32_t;
+typedef uint16_t __attribute__ ((__may_alias__)) op16_t;
+
+void *
+__memcpy_noalignment (void *dest, const void *src, size_t n)
+{
+  unsigned char *d = dest;
+  const unsigned char *s = src;
+
+  if (__glibc_unlikely (n == 0))
+    return dest;
+
+  if (n < OPSIZ)
+    goto tail;
+
+#define COPY_WORD(__d, __s, __i) \
+  *(op_t *)(__d + (__i * OPSIZ)) = *(op_t *)(__s + (__i * OPSIZ))
+
+  /* Copy the first op_t to get dest word aligned.  */
+  COPY_WORD (d, s, 0);
+  size_t off = OPSIZ - (uintptr_t)d % OPSIZ;
+  d += off;
+  s += off;
+  n -= off;
+
+  /* Copy chunks of 16 registers.  */
+  enum { block_size = 16 * OPSIZ };
+
+  for (; n >= block_size; s += block_size, d += block_size, n -= block_size)
+    {
+
+      COPY_WORD (d, s, 0);
+      COPY_WORD (d, s, 1);
+      COPY_WORD (d, s, 2);
+      COPY_WORD (d, s, 3);
+      COPY_WORD (d, s, 4);
+      COPY_WORD (d, s, 5);
+      COPY_WORD (d, s, 6);
+      COPY_WORD (d, s, 7);
+      COPY_WORD (d, s, 8);
+      COPY_WORD (d, s, 9);
+      COPY_WORD (d, s, 10);
+      COPY_WORD (d, s, 11);
+      COPY_WORD (d, s, 12);
+      COPY_WORD (d, s, 13);
+      COPY_WORD (d, s, 14);
+      COPY_WORD (d, s, 15);
+    }
+
+  /* Remainder of chunks, issue a op_t copy until n < OPSIZ.  */
+  for (; n >= OPSIZ; s += OPSIZ, d += OPSIZ, n -= OPSIZ)
+    COPY_WORD (d, s, 0);
+
+tail:
+  if (n & 4)
+    {
+      *(op32_t *)(d) = *(op32_t *)s;
+      s += sizeof (op32_t);
+      d += sizeof (op32_t);
+    }
+
+  if (n & 2)
+    {
+      *(op16_t *)(d) = *(op16_t *)s;
+      s += sizeof (op16_t);
+      d += sizeof (op16_t);
+    }
+
+  if (n & 1)
+    *d = *s;
+
+  return dest;
+}
diff --git a/sysdeps/unix/sysv/linux/riscv/Makefile b/sysdeps/unix/sysv/linux/riscv/Makefile
index 398ff7418b..04abf226ad 100644
--- a/sysdeps/unix/sysv/linux/riscv/Makefile
+++ b/sysdeps/unix/sysv/linux/riscv/Makefile
@@ -15,15 +15,6 @@ ifeq ($(subdir),stdlib)
 gen-as-const-headers += ucontext_i.sym
 endif
 
-ifeq ($(subdir),string)
-sysdep_routines += \
-  memcpy \
-  memcpy-generic \
-  memcpy_noalignment \
-  # sysdep_routines
-
-endif
-
 abi-variants := ilp32 ilp32d lp64 lp64d
 
 ifeq (,$(filter $(default-abi),$(abi-variants)))
diff --git a/sysdeps/unix/sysv/linux/riscv/hwprobe.c b/sysdeps/unix/sysv/linux/riscv/hwprobe.c
index e64c159eb3..9159045478 100644
--- a/sysdeps/unix/sysv/linux/riscv/hwprobe.c
+++ b/sysdeps/unix/sysv/linux/riscv/hwprobe.c
@@ -34,3 +34,4 @@ int __riscv_hwprobe (struct riscv_hwprobe *pairs, size_t pair_count,
   /* Negate negative errno values to match pthreads API.  */
   return -r;
 }
+libc_hidden_def (__riscv_hwprobe)
diff --git a/sysdeps/unix/sysv/linux/riscv/include/sys/hwprobe.h b/sysdeps/unix/sysv/linux/riscv/include/sys/hwprobe.h
new file mode 100644
index 0000000000..cce91c1b53
--- /dev/null
+++ b/sysdeps/unix/sysv/linux/riscv/include/sys/hwprobe.h
@@ -0,0 +1,8 @@
+#ifndef _SYS_HWPROBE_H
+# include_next <sys/hwprobe.h>
+
+#ifndef _ISOMAC
+libc_hidden_proto (__riscv_hwprobe)
+#endif
+
+#endif
diff --git a/sysdeps/unix/sysv/linux/riscv/multiarch/Makefile b/sysdeps/unix/sysv/linux/riscv/multiarch/Makefile
new file mode 100644
index 0000000000..7e567cb95c
--- /dev/null
+++ b/sysdeps/unix/sysv/linux/riscv/multiarch/Makefile
@@ -0,0 +1,7 @@
+ifeq ($(subdir),string)
+sysdep_routines += \
+  memcpy \
+  memcpy-generic \
+  memcpy_noalignment \
+  # sysdep_routines
+endif
diff --git a/sysdeps/riscv/memcopy.h b/sysdeps/unix/sysv/linux/riscv/multiarch/ifunc-impl-list.c
similarity index 51%
rename from sysdeps/riscv/memcopy.h
rename to sysdeps/unix/sysv/linux/riscv/multiarch/ifunc-impl-list.c
index 27675964b0..9f806d7a9e 100644
--- a/sysdeps/riscv/memcopy.h
+++ b/sysdeps/unix/sysv/linux/riscv/multiarch/ifunc-impl-list.c
@@ -1,4 +1,4 @@
-/* memcopy.h -- definitions for memory copy functions.  RISC-V version.
+/* Enumerate available IFUNC implementations of a function.  RISCV version.
    Copyright (C) 2024 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
@@ -16,11 +16,28 @@
    License along with the GNU C Library; if not, see
    <https://www.gnu.org/licenses/>.  */
 
-#include <sysdeps/generic/memcopy.h>
+#include <ifunc-impl-list.h>
+#include <string.h>
+#include <sys/hwprobe.h>
 
-/* Redefine the generic memcpy implementation to __memcpy_generic, so
-   the memcpy ifunc can select between generic and special versions.
-   In rtld, don't bother with all the ifunciness.  */
-#if IS_IN (libc)
-#define MEMCPY __memcpy_generic
-#endif
+size_t
+__libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+                        size_t max)
+{
+  size_t i = max;
+
+  bool fast_unaligned = false;
+
+  struct riscv_hwprobe pair = { .key = RISCV_HWPROBE_KEY_CPUPERF_0 };
+  if (__riscv_hwprobe (&pair, 1, 0, NULL, 0) == 0
+      && (pair.value & RISCV_HWPROBE_MISALIGNED_MASK)
+         == RISCV_HWPROBE_MISALIGNED_FAST)
+    fast_unaligned = true;
+
+  IFUNC_IMPL (i, name, memcpy,
+              IFUNC_IMPL_ADD (array, i, memcpy, fast_unaligned,
+                              __memcpy_noalignment)
+              IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_generic))
+
+  return 0;
+}
diff --git a/sysdeps/riscv/memcpy.c b/sysdeps/unix/sysv/linux/riscv/multiarch/memcpy.c
similarity index 90%
rename from sysdeps/riscv/memcpy.c
rename to sysdeps/unix/sysv/linux/riscv/multiarch/memcpy.c
index 20f9548c44..51d8ace858 100644
--- a/sysdeps/riscv/memcpy.c
+++ b/sysdeps/unix/sysv/linux/riscv/multiarch/memcpy.c
@@ -28,8 +28,6 @@
 # include <riscv-ifunc.h>
 # include <sys/hwprobe.h>
 
-# define INIT_ARCH()
-
 extern __typeof (__redirect_memcpy) __libc_memcpy;
 
 extern __typeof (__redirect_memcpy) __memcpy_generic attribute_hidden;
@@ -38,14 +36,9 @@ extern __typeof (__redirect_memcpy) __memcpy_noalignment attribute_hidden;
 static inline __typeof (__redirect_memcpy) *
 select_memcpy_ifunc (uint64_t dl_hwcap, __riscv_hwprobe_t hwprobe_func)
 {
-  unsigned long long int value;
-
-  INIT_ARCH ();
-
-  if (__riscv_hwprobe_one (hwprobe_func, RISCV_HWPROBE_KEY_CPUPERF_0, &value) != 0)
-    return __memcpy_generic;
-
-  if ((value & RISCV_HWPROBE_MISALIGNED_MASK) == RISCV_HWPROBE_MISALIGNED_FAST)
+  unsigned long long int v;
+  if (__riscv_hwprobe_one (hwprobe_func, RISCV_HWPROBE_KEY_CPUPERF_0, &v) == 0
+      && (v & RISCV_HWPROBE_MISALIGNED_MASK) == RISCV_HWPROBE_MISALIGNED_FAST)
     return __memcpy_noalignment;
 
   return __memcpy_generic;
@@ -59,5 +52,6 @@ strong_alias (__libc_memcpy, memcpy);
 __hidden_ver1 (memcpy, __GI_memcpy, __redirect_memcpy)
   __attribute__ ((visibility ("hidden"))) __attribute_copy__ (memcpy);
 # endif
-
+#else
+# include <string/memcpy.c>
 #endif
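For reference, the RISCV_HWPROBE_KEY_CPUPERF_0 query that both
select_memcpy_ifunc and __libc_ifunc_impl_list perform above can be
reproduced in a small user program; a sketch assuming a glibc that
exports the <sys/hwprobe.h> interface this patch builds on:

  #include <stdio.h>
  #include <sys/hwprobe.h>

  int
  main (void)
  {
    struct riscv_hwprobe pair = { .key = RISCV_HWPROBE_KEY_CPUPERF_0 };

    /* Same call the ifunc selector makes: one pair, all CPUs, no flags.  */
    if (__riscv_hwprobe (&pair, 1, 0, NULL, 0) == 0
        && (pair.value & RISCV_HWPROBE_MISALIGNED_MASK)
           == RISCV_HWPROBE_MISALIGNED_FAST)
      puts ("misaligned access is fast: __memcpy_noalignment is eligible");
    else
      puts ("misaligned access slow/unknown: __memcpy_generic is used");
    return 0;
  }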