| Message ID | 20221219154119.154045458@infradead.org |
|---|---|
| State | New |
| Series | Introduce cmpxchg128() -- aka. the demise of cmpxchg_double() |
On Mon, Dec 19, 2022 at 04:35:30PM +0100, Peter Zijlstra wrote: > For all architectures that currently support cmpxchg_double() > implement the cmpxchg128() family of functions that is basically the > same but with a saner interface. > > Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org> > --- > arch/arm64/include/asm/atomic_ll_sc.h | 38 +++++++++++++++++++++++ > arch/arm64/include/asm/atomic_lse.h | 33 +++++++++++++++++++- > arch/arm64/include/asm/cmpxchg.h | 26 ++++++++++++++++ > arch/s390/include/asm/cmpxchg.h | 33 ++++++++++++++++++++ > arch/x86/include/asm/cmpxchg_32.h | 3 + > arch/x86/include/asm/cmpxchg_64.h | 55 +++++++++++++++++++++++++++++++++- > 6 files changed, 185 insertions(+), 3 deletions(-) > > --- a/arch/arm64/include/asm/atomic_ll_sc.h > +++ b/arch/arm64/include/asm/atomic_ll_sc.h > @@ -326,6 +326,44 @@ __CMPXCHG_DBL( , , , ) > __CMPXCHG_DBL(_mb, dmb ish, l, "memory") > > #undef __CMPXCHG_DBL > + > +union __u128_halves { > + u128 full; > + struct { > + u64 low, high; > + }; > +}; > + > +#define __CMPXCHG128(name, mb, rel, cl) \ > +static __always_inline u128 \ > +__ll_sc__cmpxchg128##name(volatile u128 *ptr, u128 old, u128 new) \ > +{ \ > + union __u128_halves r, o = { .full = (old) }, \ > + n = { .full = (new) }; \ > + \ > + asm volatile("// __cmpxchg128" #name "\n" \ > + " prfm pstl1strm, %2\n" \ > + "1: ldxp %0, %1, %2\n" \ > + " eor %3, %0, %3\n" \ > + " eor %4, %1, %4\n" \ > + " orr %3, %4, %3\n" \ > + " cbnz %3, 2f\n" \ > + " st" #rel "xp %w3, %5, %6, %2\n" \ > + " cbnz %w3, 1b\n" \ > + " " #mb "\n" \ > + "2:" \ > + : "=&r" (r.low), "=&r" (r.high), "+Q" (*(unsigned long *)ptr) \ > + : "r" (o.low), "r" (o.high), "r" (n.low), "r" (n.high) \ > + : cl); \ > + \ > + return r.full; \ > +} > + > +__CMPXCHG128( , , , ) > +__CMPXCHG128(_mb, dmb ish, l, "memory") > + > +#undef __CMPXCHG128 > + > #undef K > > #endif /* __ASM_ATOMIC_LL_SC_H */ > --- a/arch/arm64/include/asm/atomic_lse.h > +++ b/arch/arm64/include/asm/atomic_lse.h > @@ -151,7 +151,7 @@ __lse_atomic64_fetch_##op##name(s64 i, a > " " #asm_op #mb " %[i], %[old], %[v]" \ > : [v] "+Q" (v->counter), \ > [old] "=r" (old) \ > - : [i] "r" (i) \ > + : [i] "r" (i) \ > : cl); \ > \ > return old; \ > @@ -324,4 +324,35 @@ __CMPXCHG_DBL(_mb, al, "memory") > > #undef __CMPXCHG_DBL > > +#define __CMPXCHG128(name, mb, cl...) \ > +static __always_inline u128 \ > +__lse__cmpxchg128##name(volatile u128 *ptr, u128 old, u128 new) \ > +{ \ > + union __u128_halves r, o = { .full = (old) }, \ > + n = { .full = (new) }; \ > + register unsigned long x0 asm ("x0") = o.low; \ > + register unsigned long x1 asm ("x1") = o.high; \ > + register unsigned long x2 asm ("x2") = n.low; \ > + register unsigned long x3 asm ("x3") = n.high; \ > + register unsigned long x4 asm ("x4") = (unsigned long)ptr; \ > + \ > + asm volatile( \ > + __LSE_PREAMBLE \ > + " casp" #mb "\t%[old1], %[old2], %[new1], %[new2], %[v]\n"\ > + : [old1] "+&r" (x0), [old2] "+&r" (x1), \ > + [v] "+Q" (*(unsigned long *)ptr) \ > + : [new1] "r" (x2), [new2] "r" (x3), [ptr] "r" (x4), \ Issue #1: the line below can be removed, otherwise.. 
> + [oldval1] "r" (r.low), [oldval2] "r" (r.high) \ warning: ./arch/arm64/include/asm/atomic_lse.h: In function '__lse__cmpxchg128_mb': ./arch/arm64/include/asm/atomic_lse.h:309:27: warning: 'r.<U97b8>.low' is used uninitialized [-Wuninitialized] 309 | [oldval1] "r" (r.low), [oldval2] "r" (r.high) > + : cl); \ > + \ > + r.low = x0; r.high = x1; \ > + \ > + return r.full; \ > +} > + > +__CMPXCHG128( , ) > +__CMPXCHG128(_mb, al, "memory") > + > +#undef __CMPXCHG128 > + > #endif /* __ASM_ATOMIC_LSE_H */ > --- a/arch/arm64/include/asm/cmpxchg.h > +++ b/arch/arm64/include/asm/cmpxchg.h > @@ -147,6 +147,19 @@ __CMPXCHG_DBL(_mb) > > #undef __CMPXCHG_DBL > > +#define __CMPXCHG128(name) \ > +static inline long __cmpxchg128##name(volatile u128 *ptr, \ Issue #2: this should be static inline u128 __cmpxchg128##name(..) because cmpxchg* needs to return the old value. Regards, Boqun > + u128 old, u128 new) \ > +{ \ > + return __lse_ll_sc_body(_cmpxchg128##name, \ > + ptr, old, new); \ > +} > + > +__CMPXCHG128( ) > +__CMPXCHG128(_mb) > + > +#undef __CMPXCHG128 > + > #define __CMPXCHG_GEN(sfx) \ > static __always_inline unsigned long __cmpxchg##sfx(volatile void *ptr, \ > unsigned long old, \ > @@ -229,6 +242,19 @@ __CMPXCHG_GEN(_mb) > __ret; \ > }) > > +/* cmpxchg128 */ > +#define system_has_cmpxchg128() 1 > + > +#define arch_cmpxchg128(ptr, o, n) \ > +({ \ > + __cmpxchg128_mb((ptr), (o), (n)); \ > +}) > + > +#define arch_cmpxchg128_local(ptr, o, n) \ > +({ \ > + __cmpxchg128((ptr), (o), (n)); \ > +}) > + > #define __CMPWAIT_CASE(w, sfx, sz) \ > static inline void __cmpwait_case_##sz(volatile void *ptr, \ > unsigned long val) \ > --- a/arch/s390/include/asm/cmpxchg.h > +++ b/arch/s390/include/asm/cmpxchg.h > @@ -201,4 +201,37 @@ static __always_inline int __cmpxchg_dou > (unsigned long)(n1), (unsigned long)(n2)); \ > }) > > +#define system_has_cmpxchg128() 1 > + > +static __always_inline u128 arch_cmpxchg128(volatile u128 *ptr, u128 old, u128 new) > +{ > + asm volatile( > + " cdsg %[old],%[new],%[ptr]\n" > + : [old] "+&d" (old) > + : [new] "d" (new), > + [ptr] "QS" (*(unsigned long *)ptr) > + : "memory", "cc"); > + return old; > +} > + > +static __always_inline bool arch_try_cmpxchg128(volatile u128 *ptr, u128 *oldp, u128 new) > +{ > + u128 old = *oldp; > + int cc; > + > + asm volatile( > + " cdsg %[old],%[new],%[ptr]\n" > + " ipm %[cc]\n" > + " srl %[cc],28\n" > + : [cc] "=&d" (cc), [old] "+&d" (old) > + : [new] "d" (new), > + [ptr] "QS" (*(unsigned long *)ptr) > + : "memory", "cc"); > + > + if (unlikely(!cc)) > + *oldp = old; > + > + return likely(cc); > +} > + > #endif /* __ASM_CMPXCHG_H */ > --- a/arch/x86/include/asm/cmpxchg_32.h > +++ b/arch/x86/include/asm/cmpxchg_32.h > @@ -103,6 +103,7 @@ static inline bool __try_cmpxchg64(volat > > #endif > > -#define system_has_cmpxchg_double() boot_cpu_has(X86_FEATURE_CX8) > +#define system_has_cmpxchg_double() boot_cpu_has(X86_FEATURE_CX8) > +#define system_has_cmpxchg64() boot_cpu_has(X86_FEATURE_CX8) > > #endif /* _ASM_X86_CMPXCHG_32_H */ > --- a/arch/x86/include/asm/cmpxchg_64.h > +++ b/arch/x86/include/asm/cmpxchg_64.h > @@ -20,6 +20,59 @@ > arch_try_cmpxchg((ptr), (po), (n)); \ > }) > > -#define system_has_cmpxchg_double() boot_cpu_has(X86_FEATURE_CX16) > +union __u128_halves { > + u128 full; > + struct { > + u64 low, high; > + }; > +}; > + > +static __always_inline u128 arch_cmpxchg128(volatile u128 *ptr, u128 old, u128 new) > +{ > + union __u128_halves o = { .full = old, }, n = { .full = new, }; > + > + asm volatile(LOCK_PREFIX "cmpxchg16b %[ptr]" > + 
: [ptr] "+m" (*ptr), > + "+a" (o.low), "+d" (o.high) > + : "b" (n.low), "c" (n.high) > + : "memory"); > + > + return o.full; > +} > + > +static __always_inline u128 arch_cmpxchg128_local(volatile u128 *ptr, u128 old, u128 new) > +{ > + union __u128_halves o = { .full = old, }, n = { .full = new, }; > + > + asm volatile("cmpxchg16b %[ptr]" > + : [ptr] "+m" (*ptr), > + "+a" (o.low), "+d" (o.high) > + : "b" (n.low), "c" (n.high) > + : "memory"); > + > + return o.full; > +} > + > +static __always_inline bool arch_try_cmpxchg128(volatile u128 *ptr, u128 *old, u128 new) > +{ > + union __u128_halves o = { .full = *old, }, n = { .full = new, }; > + bool ret; > + > + asm volatile(LOCK_PREFIX "cmpxchg16b %[ptr]" > + CC_SET(e) > + : CC_OUT(e) (ret), > + [ptr] "+m" (*ptr), > + "+a" (o.low), "+d" (o.high) > + : "b" (n.low), "c" (n.high) > + : "memory"); > + > + if (unlikely(!ret)) > + *old = o.full; > + > + return likely(ret); > +} > + > +#define system_has_cmpxchg_double() boot_cpu_has(X86_FEATURE_CX16) > +#define system_has_cmpxchg128() boot_cpu_has(X86_FEATURE_CX16) > > #endif /* _ASM_X86_CMPXCHG_64_H */ > >
On Wed, Dec 21, 2022 at 05:25:20PM -0800, Boqun Feng wrote: > > +#define __CMPXCHG128(name, mb, cl...) \ > > +static __always_inline u128 \ > > +__lse__cmpxchg128##name(volatile u128 *ptr, u128 old, u128 new) \ > > +{ \ > > + union __u128_halves r, o = { .full = (old) }, \ > > + n = { .full = (new) }; \ > > + register unsigned long x0 asm ("x0") = o.low; \ > > + register unsigned long x1 asm ("x1") = o.high; \ > > + register unsigned long x2 asm ("x2") = n.low; \ > > + register unsigned long x3 asm ("x3") = n.high; \ > > + register unsigned long x4 asm ("x4") = (unsigned long)ptr; \ > > + \ > > + asm volatile( \ > > + __LSE_PREAMBLE \ > > + " casp" #mb "\t%[old1], %[old2], %[new1], %[new2], %[v]\n"\ > > + : [old1] "+&r" (x0), [old2] "+&r" (x1), \ > > + [v] "+Q" (*(unsigned long *)ptr) \ > > + : [new1] "r" (x2), [new2] "r" (x3), [ptr] "r" (x4), \ > > Issue #1: the line below can be removed, otherwise.. > > > + [oldval1] "r" (r.low), [oldval2] "r" (r.high) \ > > warning: > > ./arch/arm64/include/asm/atomic_lse.h: In function '__lse__cmpxchg128_mb': > ./arch/arm64/include/asm/atomic_lse.h:309:27: warning: 'r.<U97b8>.low' is used uninitialized [-Wuninitialized] > 309 | [oldval1] "r" (r.low), [oldval2] "r" (r.high) > > > > + : cl); \ > > + \ > > + r.low = x0; r.high = x1; \ > > + \ > > + return r.full; \ > > +} > > + > > +__CMPXCHG128( , ) > > +__CMPXCHG128(_mb, al, "memory") > > + > > +#undef __CMPXCHG128 > > + > > #endif /* __ASM_ATOMIC_LSE_H */ > > --- a/arch/arm64/include/asm/cmpxchg.h > > +++ b/arch/arm64/include/asm/cmpxchg.h > > @@ -147,6 +147,19 @@ __CMPXCHG_DBL(_mb) > > > > #undef __CMPXCHG_DBL > > > > +#define __CMPXCHG128(name) \ > > +static inline long __cmpxchg128##name(volatile u128 *ptr, \ > > Issue #2: this should be > > static inline u128 __cmpxchg128##name(..) > > because cmpxchg* needs to return the old value. Duh.. fixed both. Pushed out to queue/core/wip-u128. I'll probably continue all this in two weeks (yay xmas break!).
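For context on Issue #2: the whole point of cmpxchg*() returning the old value is the classic compare-and-swap retry loop. A minimal sketch, assuming the u128 type and the arch_cmpxchg128() helper added by this patch (the wrapper function itself is illustrative, not part of the series):

static inline u128 u128_add_return(volatile u128 *p, u64 inc)
{
	u128 old = *p;

	for (;;) {
		u128 new = old + inc;
		/* returns the value found at *p; equal to 'old' only if
		 * our new value was actually stored */
		u128 prev = arch_cmpxchg128(p, old, new);

		if (prev == old)
			return new;
		old = prev;	/* lost the race: retry with the fresh value */
	}
}

Had the return type been long, as in the hunk Boqun flagged, the high half of the returned snapshot would be lost and the comparison above could spuriously succeed.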
On Tue, Jan 03, 2023 at 02:03:37PM +0000, Mark Rutland wrote: > On Tue, Jan 03, 2023 at 01:25:35PM +0000, Mark Rutland wrote: > > On Tue, Dec 20, 2022 at 12:08:16PM +0100, Peter Zijlstra wrote: > > > On Mon, Dec 19, 2022 at 12:07:25PM -0800, Boqun Feng wrote: > > > > On Mon, Dec 19, 2022 at 04:35:30PM +0100, Peter Zijlstra wrote: > > > > > For all architectures that currently support cmpxchg_double() > > > > > implement the cmpxchg128() family of functions that is basically the > > > > > same but with a saner interface. > > > > > > > > > > Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org> > > > > > --- > > > > > arch/arm64/include/asm/atomic_ll_sc.h | 38 +++++++++++++++++++++++ > > > > > arch/arm64/include/asm/atomic_lse.h | 33 +++++++++++++++++++- > > > > > arch/arm64/include/asm/cmpxchg.h | 26 ++++++++++++++++ > > > > > arch/s390/include/asm/cmpxchg.h | 33 ++++++++++++++++++++ > > > > > arch/x86/include/asm/cmpxchg_32.h | 3 + > > > > > arch/x86/include/asm/cmpxchg_64.h | 55 +++++++++++++++++++++++++++++++++- > > > > > 6 files changed, 185 insertions(+), 3 deletions(-) > > > > > > > > > > --- a/arch/arm64/include/asm/atomic_ll_sc.h > > > > > +++ b/arch/arm64/include/asm/atomic_ll_sc.h > > > > > @@ -326,6 +326,44 @@ __CMPXCHG_DBL( , , , ) > > > > > __CMPXCHG_DBL(_mb, dmb ish, l, "memory") > > > > > > > > > > #undef __CMPXCHG_DBL > > > > > + > > > > > +union __u128_halves { > > > > > + u128 full; > > > > > + struct { > > > > > + u64 low, high; > > > > > + }; > > > > > +}; > > > > > + > > > > > +#define __CMPXCHG128(name, mb, rel, cl) \ > > > > > +static __always_inline u128 \ > > > > > +__ll_sc__cmpxchg128##name(volatile u128 *ptr, u128 old, u128 new) \ > > > > > +{ \ > > > > > + union __u128_halves r, o = { .full = (old) }, \ > > > > > + n = { .full = (new) }; \ > > > > > + \ > > > > > + asm volatile("// __cmpxchg128" #name "\n" \ > > > > > + " prfm pstl1strm, %2\n" \ > > > > > + "1: ldxp %0, %1, %2\n" \ > > > > > + " eor %3, %0, %3\n" \ > > > > > + " eor %4, %1, %4\n" \ > > > > > + " orr %3, %4, %3\n" \ > > > > > + " cbnz %3, 2f\n" \ > > > > > + " st" #rel "xp %w3, %5, %6, %2\n" \ > > > > > + " cbnz %w3, 1b\n" \ > > > > > + " " #mb "\n" \ > > > > > + "2:" \ > > > > > + : "=&r" (r.low), "=&r" (r.high), "+Q" (*(unsigned long *)ptr) \ > > > > > > > > I wonder whether we should use "(*(u128 *)ptr)" instead of "(*(unsigned > > > > long *) ptr)"? Because compilers may think only 64bit value pointed by > > > > "ptr" gets modified, and they are allowed to do "useful" optimization. > > > > > > In this I've copied the existing cmpxchg_double() code; I'll have to let > > > the arch folks speak here, I've no clue. > > > > We definitely need to ensure the compiler sees we poke the whole thing, or it > > can get this horribly wrong, so that is a latent bug. > > > > See commit: > > > > fee960bed5e857eb ("arm64: xchg: hazard against entire exchange variable") > > > > ... for examples of GCC being clever, where I overlooked the *_double() cases. > Using __uint128_t instead, e.g. 
> > diff --git a/arch/arm64/include/asm/atomic_ll_sc.h b/arch/arm64/include/asm/atomic_ll_sc.h > index 0890e4f568fb7..cbb3d961123b1 100644 > --- a/arch/arm64/include/asm/atomic_ll_sc.h > +++ b/arch/arm64/include/asm/atomic_ll_sc.h > @@ -315,7 +315,7 @@ __ll_sc__cmpxchg_double##name(unsigned long old1, \ > " cbnz %w0, 1b\n" \ > " " #mb "\n" \ > "2:" \ > - : "=&r" (tmp), "=&r" (ret), "+Q" (*(unsigned long *)ptr) \ > + : "=&r" (tmp), "=&r" (ret), "+Q" (*(__uint128_t *)ptr) \ > : "r" (old1), "r" (old2), "r" (new1), "r" (new2) \ > : cl); \ > \ > diff --git a/arch/arm64/include/asm/atomic_lse.h b/arch/arm64/include/asm/atomic_lse.h > index 52075e93de6c0..a94d6dacc0292 100644 > --- a/arch/arm64/include/asm/atomic_lse.h > +++ b/arch/arm64/include/asm/atomic_lse.h > @@ -311,7 +311,7 @@ __lse__cmpxchg_double##name(unsigned long old1, \ > " eor %[old2], %[old2], %[oldval2]\n" \ > " orr %[old1], %[old1], %[old2]" \ > : [old1] "+&r" (x0), [old2] "+&r" (x1), \ > - [v] "+Q" (*(unsigned long *)ptr) \ > + [v] "+Q" (*(__uint128_t *)ptr) \ > : [new1] "r" (x2), [new2] "r" (x3), [ptr] "r" (x4), \ > [oldval1] "r" (oldval1), [oldval2] "r" (oldval2) \ > : cl); \ > > ... makes GCC much happier: > ... I'll go check whether clang is happy with that, and how far back that can > go, otherwise we'll need to blat the high half with a separate constaint that > (ideally) doesn't end up allocating a pointless address register. Hmm... from the commit history it looks like GCC prior to 5.1 might not be happy with that, but that *might* just be if we actually do arithmetic on the value, and we might be ok just using it for memroy effects. I can't currently get such an old GCC to run on my machines so I haven't been able to check. I'll dig into this a bit more tomorrow, but it looks like the options (for a backport-suitable fix) will be: (a) use a __uint128_t input+output, as above, if we're lucky (b) introduce a second 64-bit input+output for the high half (likely a "+o") (c) use a full memory clobber for ancient compilers. Mark.
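The hazard Mark refers to, stripped to its essentials (arm64 GCC/clang inline asm, not kernel code): with the unsigned long cast the asm only declares the low 8 bytes of the object as read/written, so the compiler is free to cache the high half across it, while the __uint128_t form covers the whole 16-byte object.

static inline void touch_low64_only(__uint128_t *p)
{
	/* only bytes 0..7 of *p are declared as read/written */
	asm volatile("" : "+Q" (*(unsigned long *)p));
}

static inline void touch_all128(__uint128_t *p)
{
	/* the full 16-byte object is declared as read/written */
	asm volatile("" : "+Q" (*p));
}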
On Tue, Jan 03, 2023 at 05:50:00PM +0100, Arnd Bergmann wrote: > On Tue, Jan 3, 2023, at 17:19, Mark Rutland wrote: > > On Tue, Jan 03, 2023 at 02:03:37PM +0000, Mark Rutland wrote: > >> On Tue, Jan 03, 2023 at 01:25:35PM +0000, Mark Rutland wrote: > >> > On Tue, Dec 20, 2022 at 12:08:16PM +0100, Peter Zijlstra wrote: > > >> ... makes GCC much happier: > > > >> ... I'll go check whether clang is happy with that, and how far back that can > >> go, otherwise we'll need to blat the high half with a separate constaint that > >> (ideally) doesn't end up allocating a pointless address register. > > > > Hmm... from the commit history it looks like GCC prior to 5.1 might not be > > happy with that, but that *might* just be if we actually do arithmetic on the > > value, and we might be ok just using it for memroy effects. I can't currently > > get such an old GCC to run on my machines so I haven't been able to check. > > gcc-5.1 is the oldest (barely) supported compiler, the minimum was > last raised from gcc-4.9 in linux-5.15. If only gcc-4.9 and older are > affected, we're good on mainline but may still want a fix for stable > kernels. Yup; I just wanted something that would easily backport to stable, at least as far as linux-4.9.y (where I couldn't find the minimum GCC version when I looked yesterday). > I checked that the cross-compiler binaries from [1] still work, but I noticed > that this version is missing the native aarch64-to-aarch64 compiler (x86 to > aarch64 and vice versa are there), and you need to install libmpfr4 [2] > as a dependency. The newer compilers (6.5.0 and up) don't have these problems. I was trying the old kernel.org crosstool binaries, but I was either missing a library (or I have an incompatible version) on my x86_64 host. I'll have another look today -- thanks for the pointers! Mark. > Arnd > > [1] https://mirrors.edge.kernel.org/pub/tools/crosstool/files/bin/arm64/5.5.0/ > [2] http://ftp.uk.debian.org/debian/pool/main/m/mpfr4/libmpfr4_3.1.5-1_arm64.deb
On Wed, Jan 04, 2023 at 11:36:18AM +0000, Mark Rutland wrote: > On Tue, Jan 03, 2023 at 05:50:00PM +0100, Arnd Bergmann wrote: > > On Tue, Jan 3, 2023, at 17:19, Mark Rutland wrote: > > > On Tue, Jan 03, 2023 at 02:03:37PM +0000, Mark Rutland wrote: > > >> On Tue, Jan 03, 2023 at 01:25:35PM +0000, Mark Rutland wrote: > > >> > On Tue, Dec 20, 2022 at 12:08:16PM +0100, Peter Zijlstra wrote: > > > > >> ... makes GCC much happier: > > > > > >> ... I'll go check whether clang is happy with that, and how far back that can > > >> go, otherwise we'll need to blat the high half with a separate constaint that > > >> (ideally) doesn't end up allocating a pointless address register. > > > > > > Hmm... from the commit history it looks like GCC prior to 5.1 might not be > > > happy with that, but that *might* just be if we actually do arithmetic on the > > > value, and we might be ok just using it for memroy effects. I can't currently > > > get such an old GCC to run on my machines so I haven't been able to check. > > > > gcc-5.1 is the oldest (barely) supported compiler, the minimum was > > last raised from gcc-4.9 in linux-5.15. If only gcc-4.9 and older are > > affected, we're good on mainline but may still want a fix for stable > > kernels. > > Yup; I just wanted something that would easily backport to stable, at least as > far as linux-4.9.y (where I couldn't find the minimum GCC version when I looked > yesterday). I'd missed that we backported commit: dca5244d2f5b94f1 ("compiler.h: Raise minimum version of GCC to 5.1 for arm64") ... all the way back to v4.4.y, so we can assume v5.1 even in stable. The earliest toolchain I could get running was GCC 4.8.5, and that was happy with the __uint128_t cast for the asm, Looking back through the history, the reason for the GCC 5.1 check was that prior to GCC 5.1 GCC would output library calls for arithmetic on 128-bit types, as noted in commit: fb8722735f50cd51 ("arm64: support __int128 on gcc 5+") ... but since we're not doing any actual manipulation of the value, that should be fine. I'll go write a commit message and send that out as a fix. > > I checked that the cross-compiler binaries from [1] still work, but I noticed > > that this version is missing the native aarch64-to-aarch64 compiler (x86 to > > aarch64 and vice versa are there), and you need to install libmpfr4 [2] > > as a dependency. The newer compilers (6.5.0 and up) don't have these problems. > > I was trying the old kernel.org crosstool binaries, but I was either missing a > library (or I have an incompatible version) on my x86_64 host. I'll have > another look today -- thanks for the pointers! It turns out I'd just missed that at some point the prefix used by the kernel.org cross compilers changed from: aarch64-linux-gnu- to: aarch64-linux- ... and I'd become so used to the latter that I was trying to invoke a binary that didn't exist. With the older prefix I could use the kernel.org GCC 4.8.5 without issue. Thanks, Mark.
Hi Peter, On Mon, Dec 19, 2022 at 04:35:30PM +0100, Peter Zijlstra wrote: > For all architectures that currently support cmpxchg_double() > implement the cmpxchg128() family of functions that is basically the > same but with a saner interface. > > Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org> I tried giving this a go, specifically your queue core/wip-u128 branch with HEAD commit c05419246aa69cd3, but it locked up at boot. I spotted a couple of problems there which also apply here, noted below with suggested fixes. > --- > arch/arm64/include/asm/atomic_ll_sc.h | 38 +++++++++++++++++++++++ > arch/arm64/include/asm/atomic_lse.h | 33 +++++++++++++++++++- > arch/arm64/include/asm/cmpxchg.h | 26 ++++++++++++++++ > arch/s390/include/asm/cmpxchg.h | 33 ++++++++++++++++++++ > arch/x86/include/asm/cmpxchg_32.h | 3 + > arch/x86/include/asm/cmpxchg_64.h | 55 +++++++++++++++++++++++++++++++++- > 6 files changed, 185 insertions(+), 3 deletions(-) > > --- a/arch/arm64/include/asm/atomic_ll_sc.h > +++ b/arch/arm64/include/asm/atomic_ll_sc.h > @@ -326,6 +326,44 @@ __CMPXCHG_DBL( , , , ) > __CMPXCHG_DBL(_mb, dmb ish, l, "memory") > > #undef __CMPXCHG_DBL > + > +union __u128_halves { > + u128 full; > + struct { > + u64 low, high; > + }; > +}; > + > +#define __CMPXCHG128(name, mb, rel, cl) \ > +static __always_inline u128 \ > +__ll_sc__cmpxchg128##name(volatile u128 *ptr, u128 old, u128 new) \ > +{ \ > + union __u128_halves r, o = { .full = (old) }, \ > + n = { .full = (new) }; \ > + \ > + asm volatile("// __cmpxchg128" #name "\n" \ > + " prfm pstl1strm, %2\n" \ > + "1: ldxp %0, %1, %2\n" \ > + " eor %3, %0, %3\n" \ > + " eor %4, %1, %4\n" \ > + " orr %3, %4, %3\n" \ > + " cbnz %3, 2f\n" \ These lines clobber %3 and %4, but per below, those are input operands, and so this blows up. > + " st" #rel "xp %w3, %5, %6, %2\n" \ > + " cbnz %w3, 1b\n" \ > + " " #mb "\n" \ > + "2:" \ > + : "=&r" (r.low), "=&r" (r.high), "+Q" (*(unsigned long *)ptr) \ > + : "r" (o.low), "r" (o.high), "r" (n.low), "r" (n.high) \ > + : cl); \ > + \ > + return r.full; \ > +} > + > +__CMPXCHG128( , , , ) > +__CMPXCHG128(_mb, dmb ish, l, "memory") > + > +#undef __CMPXCHG128 I think we can do this simpler and more clearly if we use the u128 operand directly, with the 'H' modifier to get at the high register of the pair: | #define __CMPXCHG128(name, mb, rel, cl...) \ | static __always_inline u128 \ | __ll_sc__cmpxchg128##name(volatile u128 *ptr, u128 old, u128 new) \ | { \ | u128 ret; \ | unsigned int tmp; \ | \ | asm volatile("// __cmpxchg128" #name "\n" \ | " prfm pstl1strm, %[v]\n" \ | "1: ldxp %[ret], %H[ret], %[v]\n" \ | " cmp %[ret], %[old]\n" \ | " ccmp %H[ret], %H[old], #4, ne\n" \ | " b.ne 2f\n" \ | " st" #rel "xp %w[tmp], %[new], %H[new], %[v]\n" \ | " cbnz %w[tmp], 1b\n" \ | " " #mb "\n" \ | "2:" \ | : [ret] "=&r" (ret), \ | [tmp] "=&r" (tmp), \ | [v] "+Q" (*ptr) \ | : [old] "r" (old), \ | [new] "r" (new) \ | : "cc" , ##cl); \ | \ | return ret; \ | } | | __CMPXCHG128( , , ) | __CMPXCHG128(_mb, dmb ish, l, "memory") | | #undef __CMPXCHG128 Note: I've used CMP and CCMP to simplify the equality check, which clobbers the flags/condition-codes ("cc"), but requires two fewer GPRs. I'm assuming that's the better tradeoff here. The existing cmpxchg_double() code clobbers the loaded value as part of checking whether it was equal, but to be able to preserve the value and be able to replay the loop (which for hilarious LL/SC reasons *must* be in asm), we can't do the same here. I've boot-tested the suggestion with GCC 12.1.0. 
> + > #undef K > > #endif /* __ASM_ATOMIC_LL_SC_H */ > --- a/arch/arm64/include/asm/atomic_lse.h > +++ b/arch/arm64/include/asm/atomic_lse.h > @@ -151,7 +151,7 @@ __lse_atomic64_fetch_##op##name(s64 i, a > " " #asm_op #mb " %[i], %[old], %[v]" \ > : [v] "+Q" (v->counter), \ > [old] "=r" (old) \ > - : [i] "r" (i) \ > + : [i] "r" (i) \ > : cl); \ > \ > return old; \ Spurious whitespace change? > @@ -324,4 +324,35 @@ __CMPXCHG_DBL(_mb, al, "memory") > > #undef __CMPXCHG_DBL > > +#define __CMPXCHG128(name, mb, cl...) \ > +static __always_inline u128 \ > +__lse__cmpxchg128##name(volatile u128 *ptr, u128 old, u128 new) \ > +{ \ > + union __u128_halves r, o = { .full = (old) }, \ > + n = { .full = (new) }; \ > + register unsigned long x0 asm ("x0") = o.low; \ > + register unsigned long x1 asm ("x1") = o.high; \ > + register unsigned long x2 asm ("x2") = n.low; \ > + register unsigned long x3 asm ("x3") = n.high; \ > + register unsigned long x4 asm ("x4") = (unsigned long)ptr; \ > + \ > + asm volatile( \ > + __LSE_PREAMBLE \ > + " casp" #mb "\t%[old1], %[old2], %[new1], %[new2], %[v]\n"\ > + : [old1] "+&r" (x0), [old2] "+&r" (x1), \ > + [v] "+Q" (*(unsigned long *)ptr) \ > + : [new1] "r" (x2), [new2] "r" (x3), [ptr] "r" (x4), \ > + [oldval1] "r" (r.low), [oldval2] "r" (r.high) \ > + : cl); \ > + \ > + r.low = x0; r.high = x1; \ > + \ > + return r.full; \ > +} > + > +__CMPXCHG128( , ) > +__CMPXCHG128(_mb, al, "memory") > + > +#undef __CMPXCHG128 Similarly, I'd suggest: | #define __CMPXCHG128(name, mb, cl...) \ | static __always_inline u128 \ | __lse__cmpxchg128##name(volatile u128 *ptr, u128 old, u128 new) \ | { \ | asm volatile( \ | __LSE_PREAMBLE \ | " casp" #mb "\t%[old], %H[old], %[new], %H[new], %[v]\n" \ | : [old] "+&r" (old), \ | [v] "+Q" (*(u128 *)ptr) \ | : [new] "r" (new) \ | : cl); \ | \ | return old; \ | } | | __CMPXCHG128( , ) | __CMPXCHG128(_mb, al, "memory") | | #undef __CMPXCHG128 Thanks, Mark.
On Mon, Jan 09, 2023 at 06:50:24PM +0000, Mark Rutland wrote: > Similarly, I'd suggest: > > | #define __CMPXCHG128(name, mb, cl...) \ > | static __always_inline u128 \ > | __lse__cmpxchg128##name(volatile u128 *ptr, u128 old, u128 new) \ > | { \ > | asm volatile( \ > | __LSE_PREAMBLE \ > | " casp" #mb "\t%[old], %H[old], %[new], %H[new], %[v]\n" \ > | : [old] "+&r" (old), \ > | [v] "+Q" (*(u128 *)ptr) \ > | : [new] "r" (new) \ > | : cl); \ > | \ > | return old; \ > | } > | > | __CMPXCHG128( , ) > | __CMPXCHG128(_mb, al, "memory") > | > | #undef __CMPXCHG128 clang-16 seems to hate on this like: arch/arm64/include/asm/atomic_lse.h:342:1: warning: value size does not match register size specified by the constraint and modifier [-Wasm-operand-widths] arch/arm64/include/asm/atomic_lse.h:334:17: note: expanded from macro '__CMPXCHG128' : [old] "+&r" (old), \ ^ (much the same for the ll_sc version; if you want the full build thing, holler and I'll bounce you the robot mail).
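Distilled, the construct clang-16 objects to is a 128-bit variable bound to an "r" constraint and then addressed with the %H operand modifier, which names the other register of the consecutive pair (holding the upper 64 bits on little-endian). A standalone sketch of the pattern, separate from the patch:

static inline u64 u128_high64(__uint128_t v)
{
	u64 hi;

	/* %1 is the lower-numbered register of v's pair, %H1 the other one */
	asm("mov %0, %H1" : "=r" (hi) : "r" (v));
	return hi;
}

GCC accepts this and allocates a register pair for the 128-bit operand; clang-16 instead warns that the 128-bit value does not match the 64-bit register implied by the constraint and modifier, which is the warning quoted above.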
--- a/arch/arm64/include/asm/atomic_ll_sc.h +++ b/arch/arm64/include/asm/atomic_ll_sc.h @@ -326,6 +326,44 @@ __CMPXCHG_DBL( , , , ) __CMPXCHG_DBL(_mb, dmb ish, l, "memory") #undef __CMPXCHG_DBL + +union __u128_halves { + u128 full; + struct { + u64 low, high; + }; +}; + +#define __CMPXCHG128(name, mb, rel, cl) \ +static __always_inline u128 \ +__ll_sc__cmpxchg128##name(volatile u128 *ptr, u128 old, u128 new) \ +{ \ + union __u128_halves r, o = { .full = (old) }, \ + n = { .full = (new) }; \ + \ + asm volatile("// __cmpxchg128" #name "\n" \ + " prfm pstl1strm, %2\n" \ + "1: ldxp %0, %1, %2\n" \ + " eor %3, %0, %3\n" \ + " eor %4, %1, %4\n" \ + " orr %3, %4, %3\n" \ + " cbnz %3, 2f\n" \ + " st" #rel "xp %w3, %5, %6, %2\n" \ + " cbnz %w3, 1b\n" \ + " " #mb "\n" \ + "2:" \ + : "=&r" (r.low), "=&r" (r.high), "+Q" (*(unsigned long *)ptr) \ + : "r" (o.low), "r" (o.high), "r" (n.low), "r" (n.high) \ + : cl); \ + \ + return r.full; \ +} + +__CMPXCHG128( , , , ) +__CMPXCHG128(_mb, dmb ish, l, "memory") + +#undef __CMPXCHG128 + #undef K #endif /* __ASM_ATOMIC_LL_SC_H */ --- a/arch/arm64/include/asm/atomic_lse.h +++ b/arch/arm64/include/asm/atomic_lse.h @@ -151,7 +151,7 @@ __lse_atomic64_fetch_##op##name(s64 i, a " " #asm_op #mb " %[i], %[old], %[v]" \ : [v] "+Q" (v->counter), \ [old] "=r" (old) \ - : [i] "r" (i) \ + : [i] "r" (i) \ : cl); \ \ return old; \ @@ -324,4 +324,35 @@ __CMPXCHG_DBL(_mb, al, "memory") #undef __CMPXCHG_DBL +#define __CMPXCHG128(name, mb, cl...) \ +static __always_inline u128 \ +__lse__cmpxchg128##name(volatile u128 *ptr, u128 old, u128 new) \ +{ \ + union __u128_halves r, o = { .full = (old) }, \ + n = { .full = (new) }; \ + register unsigned long x0 asm ("x0") = o.low; \ + register unsigned long x1 asm ("x1") = o.high; \ + register unsigned long x2 asm ("x2") = n.low; \ + register unsigned long x3 asm ("x3") = n.high; \ + register unsigned long x4 asm ("x4") = (unsigned long)ptr; \ + \ + asm volatile( \ + __LSE_PREAMBLE \ + " casp" #mb "\t%[old1], %[old2], %[new1], %[new2], %[v]\n"\ + : [old1] "+&r" (x0), [old2] "+&r" (x1), \ + [v] "+Q" (*(unsigned long *)ptr) \ + : [new1] "r" (x2), [new2] "r" (x3), [ptr] "r" (x4), \ + [oldval1] "r" (r.low), [oldval2] "r" (r.high) \ + : cl); \ + \ + r.low = x0; r.high = x1; \ + \ + return r.full; \ +} + +__CMPXCHG128( , ) +__CMPXCHG128(_mb, al, "memory") + +#undef __CMPXCHG128 + #endif /* __ASM_ATOMIC_LSE_H */ --- a/arch/arm64/include/asm/cmpxchg.h +++ b/arch/arm64/include/asm/cmpxchg.h @@ -147,6 +147,19 @@ __CMPXCHG_DBL(_mb) #undef __CMPXCHG_DBL +#define __CMPXCHG128(name) \ +static inline long __cmpxchg128##name(volatile u128 *ptr, \ + u128 old, u128 new) \ +{ \ + return __lse_ll_sc_body(_cmpxchg128##name, \ + ptr, old, new); \ +} + +__CMPXCHG128( ) +__CMPXCHG128(_mb) + +#undef __CMPXCHG128 + #define __CMPXCHG_GEN(sfx) \ static __always_inline unsigned long __cmpxchg##sfx(volatile void *ptr, \ unsigned long old, \ @@ -229,6 +242,19 @@ __CMPXCHG_GEN(_mb) __ret; \ }) +/* cmpxchg128 */ +#define system_has_cmpxchg128() 1 + +#define arch_cmpxchg128(ptr, o, n) \ +({ \ + __cmpxchg128_mb((ptr), (o), (n)); \ +}) + +#define arch_cmpxchg128_local(ptr, o, n) \ +({ \ + __cmpxchg128((ptr), (o), (n)); \ +}) + #define __CMPWAIT_CASE(w, sfx, sz) \ static inline void __cmpwait_case_##sz(volatile void *ptr, \ unsigned long val) \ --- a/arch/s390/include/asm/cmpxchg.h +++ b/arch/s390/include/asm/cmpxchg.h @@ -201,4 +201,37 @@ static __always_inline int __cmpxchg_dou (unsigned long)(n1), (unsigned long)(n2)); \ }) +#define system_has_cmpxchg128() 1 + +static 
__always_inline u128 arch_cmpxchg128(volatile u128 *ptr, u128 old, u128 new) +{ + asm volatile( + " cdsg %[old],%[new],%[ptr]\n" + : [old] "+&d" (old) + : [new] "d" (new), + [ptr] "QS" (*(unsigned long *)ptr) + : "memory", "cc"); + return old; +} + +static __always_inline bool arch_try_cmpxchg128(volatile u128 *ptr, u128 *oldp, u128 new) +{ + u128 old = *oldp; + int cc; + + asm volatile( + " cdsg %[old],%[new],%[ptr]\n" + " ipm %[cc]\n" + " srl %[cc],28\n" + : [cc] "=&d" (cc), [old] "+&d" (old) + : [new] "d" (new), + [ptr] "QS" (*(unsigned long *)ptr) + : "memory", "cc"); + + if (unlikely(!cc)) + *oldp = old; + + return likely(cc); +} + #endif /* __ASM_CMPXCHG_H */ --- a/arch/x86/include/asm/cmpxchg_32.h +++ b/arch/x86/include/asm/cmpxchg_32.h @@ -103,6 +103,7 @@ static inline bool __try_cmpxchg64(volat #endif -#define system_has_cmpxchg_double() boot_cpu_has(X86_FEATURE_CX8) +#define system_has_cmpxchg_double() boot_cpu_has(X86_FEATURE_CX8) +#define system_has_cmpxchg64() boot_cpu_has(X86_FEATURE_CX8) #endif /* _ASM_X86_CMPXCHG_32_H */ --- a/arch/x86/include/asm/cmpxchg_64.h +++ b/arch/x86/include/asm/cmpxchg_64.h @@ -20,6 +20,59 @@ arch_try_cmpxchg((ptr), (po), (n)); \ }) -#define system_has_cmpxchg_double() boot_cpu_has(X86_FEATURE_CX16) +union __u128_halves { + u128 full; + struct { + u64 low, high; + }; +}; + +static __always_inline u128 arch_cmpxchg128(volatile u128 *ptr, u128 old, u128 new) +{ + union __u128_halves o = { .full = old, }, n = { .full = new, }; + + asm volatile(LOCK_PREFIX "cmpxchg16b %[ptr]" + : [ptr] "+m" (*ptr), + "+a" (o.low), "+d" (o.high) + : "b" (n.low), "c" (n.high) + : "memory"); + + return o.full; +} + +static __always_inline u128 arch_cmpxchg128_local(volatile u128 *ptr, u128 old, u128 new) +{ + union __u128_halves o = { .full = old, }, n = { .full = new, }; + + asm volatile("cmpxchg16b %[ptr]" + : [ptr] "+m" (*ptr), + "+a" (o.low), "+d" (o.high) + : "b" (n.low), "c" (n.high) + : "memory"); + + return o.full; +} + +static __always_inline bool arch_try_cmpxchg128(volatile u128 *ptr, u128 *old, u128 new) +{ + union __u128_halves o = { .full = *old, }, n = { .full = new, }; + bool ret; + + asm volatile(LOCK_PREFIX "cmpxchg16b %[ptr]" + CC_SET(e) + : CC_OUT(e) (ret), + [ptr] "+m" (*ptr), + "+a" (o.low), "+d" (o.high) + : "b" (n.low), "c" (n.high) + : "memory"); + + if (unlikely(!ret)) + *old = o.full; + + return likely(ret); +} + +#define system_has_cmpxchg_double() boot_cpu_has(X86_FEATURE_CX16) +#define system_has_cmpxchg128() boot_cpu_has(X86_FEATURE_CX16) #endif /* _ASM_X86_CMPXCHG_64_H */
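The arch_try_cmpxchg128() flavour above follows the existing try_cmpxchg() convention: it returns whether the exchange happened and, on failure, writes the value it found back through the old-value pointer, so callers can retry without an extra load. A usage sketch under that assumption (the surrounding helper is invented for illustration):

static inline u128 pair_bump_high(volatile u128 *slot)
{
	u128 old = *slot;
	u128 new;

	do {
		/* recompute from the latest snapshot; on failure 'old' has
		 * already been refreshed by arch_try_cmpxchg128() */
		new = old + ((u128)1 << 64);
	} while (!arch_try_cmpxchg128(slot, &old, new));

	return new;
}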
For all architectures that currently support cmpxchg_double() implement the
cmpxchg128() family of functions that is basically the same but with a saner
interface.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 arch/arm64/include/asm/atomic_ll_sc.h | 38 +++++++++++++++++++++++
 arch/arm64/include/asm/atomic_lse.h   | 33 +++++++++++++++++++-
 arch/arm64/include/asm/cmpxchg.h      | 26 ++++++++++++++++
 arch/s390/include/asm/cmpxchg.h       | 33 ++++++++++++++++++++
 arch/x86/include/asm/cmpxchg_32.h     |  3 +
 arch/x86/include/asm/cmpxchg_64.h     | 55 +++++++++++++++++++++++++++++++++-
 6 files changed, 185 insertions(+), 3 deletions(-)
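The "saner interface" boils down to the shape of the call: cmpxchg_double() takes two adjacent machine words as separate pointer/old/new arguments and returns a success flag, while the cmpxchg128() family operates on a single naturally aligned u128 and returns the previous value. A rough side-by-side sketch (the struct and helper names are invented for illustration; arch_cmpxchg128() is the helper added by this patch):

struct pair {
	u64 lo;
	u64 hi;
} __aligned(16);

/* existing interface: two pointers, two old values, two new values */
static inline bool update_old_style(struct pair *p, u64 old_lo, u64 old_hi,
				    u64 new_lo, u64 new_hi)
{
	return cmpxchg_double(&p->lo, &p->hi, old_lo, old_hi, new_lo, new_hi);
}

/* new interface: a single u128, with the previous contents returned */
static inline bool update_new_style(u128 *p, u128 old, u128 new)
{
	return arch_cmpxchg128(p, old, new) == old;
}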