Message ID | 2c06427f1180cf408a3e9750de3040dde0afe2ea.1406301772.git.ian.campbell@citrix.com
---|---
State | Accepted
Commit | 65e9f0660697e895ada7999a4e9822c716bc4ec7
Hi Ian,

On 07/25/2014 04:22 PM, Ian Campbell wrote:
> bitops, cmpxchg, atomics: Import:
> c32ffce ARM: 7984/1: prefetch: add prefetchw invocations for barriered atomics

Compared to Linux, we don't have specific prefetch* helpers. We directly
use the compiler builtin ones. Shouldn't we import the ARM specific
helpers to gain in performance?

Regards,

> Author: Will Deacon <will.deacon@arm.com>
> Signed-off-by: Will Deacon <will.deacon@arm.com>
> Signed-off-by: Russell King <rmk+kernel@arm.linux.org.uk>
>
> atomics: In addition to the above import:
> db38ee8 ARM: 7983/1: atomics: implement a better __atomic_add_unless for v6+
> Author: Will Deacon <will.deacon@arm.com>
> Signed-off-by: Will Deacon <will.deacon@arm.com>
> Signed-off-by: Russell King <rmk+kernel@arm.linux.org.uk>
>
> spinlocks: We have diverged from Linux, so no updates but note this in the README.
>
> mem* and str*: Import:
> d98b90e ARM: 7990/1: asm: rename logical shift macros push pull into lspush lspull
> Author: Victor Kamensky <victor.kamensky@linaro.org>
> Suggested-by: Will Deacon <will.deacon@arm.com>
> Signed-off-by: Victor Kamensky <victor.kamensky@linaro.org>
> Acked-by: Nicolas Pitre <nico@linaro.org>
> Signed-off-by: Russell King <rmk+kernel@arm.linux.org.uk>
>
> For some reason str* were mentioned under mem* in the README, fix.
>
> libgcc: No changes, update baseline
>
> Signed-off-by: Ian Campbell <ian.campbell@citrix.com>
> ---
>  xen/arch/arm/README.LinuxPrimitives    | 17 +++++++--------
>  xen/arch/arm/arm32/lib/assembler.h     |  8 +++----
>  xen/arch/arm/arm32/lib/bitops.h        |  5 +++++
>  xen/arch/arm/arm32/lib/copy_template.S | 36 ++++++++++++++++----------------
>  xen/arch/arm/arm32/lib/memmove.S       | 36 ++++++++++++++++----------------
>  xen/include/asm-arm/arm32/atomic.h     | 32 ++++++++++++++++++++++++++++
>  xen/include/asm-arm/arm32/cmpxchg.h    |  5 +++++
>  7 files changed, 90 insertions(+), 49 deletions(-)
>
> [...]
On Fri, 2014-07-25 at 16:42 +0100, Julien Grall wrote:
> Hi Ian,
>
> On 07/25/2014 04:22 PM, Ian Campbell wrote:
> > bitops, cmpxchg, atomics: Import:
> > c32ffce ARM: 7984/1: prefetch: add prefetchw invocations for barriered atomics
>
> Compared to Linux, we don't have specific prefetch* helpers. We directly
> use the compiler builtin ones. Shouldn't we import the ARM specific
> helpers to gain in performance?

My binaries are full of pld instructions where I think I would expect
them, so it seems like the compiler builtin ones are sufficient.

I suspect the Linux define is there to cope with older compilers or
something.

Ian.
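For context, the builtin-based helpers being discussed are thin wrappers around GCC's __builtin_prefetch. A minimal sketch of such wrappers is shown below; the macro names and guards are illustrative assumptions rather than a verbatim copy of Xen's common prefetch header:

    /* Minimal sketch (assumption: not Xen's actual header contents).
     * __builtin_prefetch(addr, rw) takes a read/write hint: rw = 0 prefetches
     * for read, rw = 1 prefetches for write, which is how the compiler can
     * choose between pld and pldw on ARMv7 cores with the MP extensions. */
    #ifndef ARCH_HAS_PREFETCH
    #define prefetch(x)  __builtin_prefetch(x, 0)
    #endif

    #ifndef ARCH_HAS_PREFETCHW
    #define prefetchw(x) __builtin_prefetch(x, 1)
    #endif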
On 07/25/2014 04:48 PM, Ian Campbell wrote:
> On Fri, 2014-07-25 at 16:42 +0100, Julien Grall wrote:
>> Hi Ian,
>>
>> On 07/25/2014 04:22 PM, Ian Campbell wrote:
>>> bitops, cmpxchg, atomics: Import:
>>> c32ffce ARM: 7984/1: prefetch: add prefetchw invocations for barriered atomics
>>
>> Compared to Linux, we don't have specific prefetch* helpers. We directly
>> use the compiler builtin ones. Shouldn't we import the ARM specific
>> helpers to gain in performance?
>
> My binaries are full of pld instructions where I think I would expect
> them, so it seems like the compiler builtin ones are sufficient.
>
> I suspect the Linux define is there to cope with older compilers or
> something.

If so:

Acked-by: Julien Grall <julien.grall@linaro.org>

Regards,
On Fri, 2014-07-25 at 16:48 +0100, Julien Grall wrote:
> On 07/25/2014 04:48 PM, Ian Campbell wrote:
> > On Fri, 2014-07-25 at 16:42 +0100, Julien Grall wrote:
> >> Hi Ian,
> >>
> >> On 07/25/2014 04:22 PM, Ian Campbell wrote:
> >>> bitops, cmpxchg, atomics: Import:
> >>> c32ffce ARM: 7984/1: prefetch: add prefetchw invocations for barriered atomics
> >>
> >> Compared to Linux, we don't have specific prefetch* helpers. We directly
> >> use the compiler builtin ones. Shouldn't we import the ARM specific
> >> helpers to gain in performance?
> >
> > My binaries are full of pld instructions where I think I would expect
> > them, so it seems like the compiler builtin ones are sufficient.
> >
> > I suspect the Linux define is there to cope with older compilers or
> > something.
>
> If so:

The compiled output is very different if I use the arch specific
explicit variants. The explicit variant generates (lots) more pldw and
(somewhat) fewer pld. I've no idea what this means...

Note that the builtins presumably let gcc reason about whether preloads
are needed, whereas the explicit variants do not. I'm not sure how that
results in fewer pld with the explicit variant though! (unless it's
doing some sort of peephole optimisation and throwing them away?)

I've no idea what the right answer is.

How about we take the updates for now and revisit the question of
builtin vs explicit prefetches some other time?

> Acked-by: Julien Grall <julien.grall@linaro.org>
>
> Regards,
On 07/25/2014 05:03 PM, Ian Campbell wrote:
> On Fri, 2014-07-25 at 16:48 +0100, Julien Grall wrote:
>> On 07/25/2014 04:48 PM, Ian Campbell wrote:
>>> On Fri, 2014-07-25 at 16:42 +0100, Julien Grall wrote:
>>>> Hi Ian,
>>>>
>>>> On 07/25/2014 04:22 PM, Ian Campbell wrote:
>>>>> bitops, cmpxchg, atomics: Import:
>>>>> c32ffce ARM: 7984/1: prefetch: add prefetchw invocations for barriered atomics
>>>>
>>>> Compared to Linux, we don't have specific prefetch* helpers. We directly
>>>> use the compiler builtin ones. Shouldn't we import the ARM specific
>>>> helpers to gain in performance?
>>>
>>> My binaries are full of pld instructions where I think I would expect
>>> them, so it seems like the compiler builtin ones are sufficient.
>>>
>>> I suspect the Linux define is there to cope with older compilers or
>>> something.
>>
>> If so:
>
> The compiled output is very different if I use the arch specific
> explicit variants. The explicit variant generates (lots) more pldw and
> (somewhat) fewer pld. I've no idea what this means...

It looks like pldw is defined for ARMv7 with the MP extensions.

AFAIU, pldw is used to signal that we will likely write to this address.
I guess we use the prefetch* helpers more often for writes to memory.

> Note that the builtins presumably let gcc reason about whether preloads
> are needed, whereas the explicit variants do not. I'm not sure how that
> results in fewer pld with the explicit variant though! (unless it's
> doing some sort of peephole optimisation and throwing them away?)
>
> I've no idea what the right answer is.
>
> How about we take the updates for now and revisit the question of
> builtin vs explicit prefetches some other time?

I'm fine with it. You can keep the ack for this patch.

Regards,
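For comparison, an explicit write-prefetch helper of the kind being discussed could look roughly like the sketch below. This is not the Linux implementation from c32ffce verbatim: that helper is guarded for ARMv7 plus SMP and alternates between pldw and a fallback at runtime, in the same spirit as the ALT_SMP/ALT_UP pair in the bitops.h hunk of this patch.

    /* Illustrative sketch only: unconditionally emit pldw (prefetch for
     * write) on ARMv7 with the MP extensions. The "p" constraint with %a0
     * passes the address straight through to the instruction. */
    static inline void prefetchw(const void *ptr)
    {
        __asm__ __volatile__(
            ".arch_extension mp\n"
            "pldw\t%a0"
            :: "p" (ptr));
    }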
On Fri, 2014-07-25 at 17:17 +0100, Julien Grall wrote:
> On 07/25/2014 05:03 PM, Ian Campbell wrote:
> > On Fri, 2014-07-25 at 16:48 +0100, Julien Grall wrote:
> >> On 07/25/2014 04:48 PM, Ian Campbell wrote:
> >>> On Fri, 2014-07-25 at 16:42 +0100, Julien Grall wrote:
> >>>> Hi Ian,
> >>>>
> >>>> On 07/25/2014 04:22 PM, Ian Campbell wrote:
> >>>>> bitops, cmpxchg, atomics: Import:
> >>>>> c32ffce ARM: 7984/1: prefetch: add prefetchw invocations for barriered atomics
> >>>>
> >>>> Compared to Linux, we don't have specific prefetch* helpers. We directly
> >>>> use the compiler builtin ones. Shouldn't we import the ARM specific
> >>>> helpers to gain in performance?
> >>>
> >>> My binaries are full of pld instructions where I think I would expect
> >>> them, so it seems like the compiler builtin ones are sufficient.
> >>>
> >>> I suspect the Linux define is there to cope with older compilers or
> >>> something.
> >>
> >> If so:
> >
> > The compiled output is very different if I use the arch specific
> > explicit variants. The explicit variant generates (lots) more pldw and
> > (somewhat) fewer pld. I've no idea what this means...
>
> It looks like pldw is defined for ARMv7 with the MP extensions.
>
> AFAIU, pldw is used to signal that we will likely write to this address.

Oh, I know *that*. What I couldn't explain is why the builtins should
generate 181 pld's and 6 pldw's (total 187) while the explicit ones
generate 127 pld's and 93 pldw's (total 220) for the exact same code
base.

Perhaps we simply use prefetchw too often in our code in gcc's opinion
so it elides some of them. Or perhaps the volatile in the explicit
version stops gcc from making other optimisations so there's simply
more occasions where the prefetching is needed.

The difference in the write prefetches is pretty stark though, 6 vs 93.

Ian.
diff --git a/xen/arch/arm/README.LinuxPrimitives b/xen/arch/arm/README.LinuxPrimitives
index 69eeb70..7e15b04 100644
--- a/xen/arch/arm/README.LinuxPrimitives
+++ b/xen/arch/arm/README.LinuxPrimitives
@@ -65,7 +65,7 @@ linux/arch/arm64/lib/copy_page.S        unused in Xen
 arm32
 =====================================================================
 
-bitops: last sync @ v3.14-rc7 (last commit: b7ec699)
+bitops: last sync @ v3.16-rc6 (last commit: c32ffce0f66e)
 
 linux/arch/arm/lib/bitops.h             xen/arch/arm/arm32/lib/bitops.h
 linux/arch/arm/lib/changebit.S          xen/arch/arm/arm32/lib/changebit.S
@@ -83,13 +83,13 @@ done
 
 ---------------------------------------------------------------------
 
-cmpxchg: last sync @ v3.14-rc7 (last commit: 775ebcc)
+cmpxchg: last sync @ v3.16-rc6 (last commit: c32ffce0f66e)
 
 linux/arch/arm/include/asm/cmpxchg.h    xen/include/asm-arm/arm32/cmpxchg.h
 
 ---------------------------------------------------------------------
 
-atomics: last sync @ v3.14-rc7 (last commit: aed3a4e)
+atomics: last sync @ v3.16-rc6 (last commit: 030d0178bdbd)
 
 linux/arch/arm/include/asm/atomic.h     xen/include/asm-arm/arm32/atomic.h
 
@@ -99,6 +99,8 @@ spinlocks: last sync: 15e7e5c1ebf5
 
 linux/arch/arm/include/asm/spinlock.h   xen/include/asm-arm/arm32/spinlock.h
 
+*** Linux has switched to ticket locks but we still use bitlocks.
+
 resync to v3.14-rc7:
 
 7c8746a ARM: 7955/1: spinlock: ensure we have a compiler barrier before sev
@@ -111,7 +113,7 @@ resync to v3.14-rc7:
 
 ---------------------------------------------------------------------
 
-mem*: last sync @ v3.14-rc7 (last commit: 418df63a)
+mem*: last sync @ v3.16-rc6 (last commit: d98b90ea22b0)
 
 linux/arch/arm/lib/copy_template.S      xen/arch/arm/arm32/lib/copy_template.S
 linux/arch/arm/lib/memchr.S             xen/arch/arm/arm32/lib/memchr.S
@@ -120,9 +122,6 @@ linux/arch/arm/lib/memmove.S            xen/arch/arm/arm32/lib/memmove.S
 linux/arch/arm/lib/memset.S             xen/arch/arm/arm32/lib/memset.S
 linux/arch/arm/lib/memzero.S            xen/arch/arm/arm32/lib/memzero.S
 
-linux/arch/arm/lib/strchr.S             xen/arch/arm/arm32/lib/strchr.S
-linux/arch/arm/lib/strrchr.S            xen/arch/arm/arm32/lib/strrchr.S
-
 for i in copy_template.S memchr.S memcpy.S memmove.S memset.S \
          memzero.S ; do
     diff -u linux/arch/arm/lib/$i xen/arch/arm/arm32/lib/$i
@@ -130,7 +129,7 @@ done
 
 ---------------------------------------------------------------------
 
-str*: last sync @ v3.13-rc7 (last commit: 93ed397)
+str*: last sync @ v3.16-rc6 (last commit: d98b90ea22b0)
 
 linux/arch/arm/lib/strchr.S             xen/arch/arm/arm32/lib/strchr.S
 linux/arch/arm/lib/strrchr.S            xen/arch/arm/arm32/lib/strrchr.S
@@ -145,7 +144,7 @@ clear_page == memset
 
 ---------------------------------------------------------------------
 
-libgcc: last sync @ v3.14-rc7 (last commit: 01885bc)
+libgcc: last sync @ v3.16-rc6 (last commit: 01885bc)
 
 linux/arch/arm/lib/lib1funcs.S          xen/arch/arm/arm32/lib/lib1funcs.S
 linux/arch/arm/lib/lshrdi3.S            xen/arch/arm/arm32/lib/lshrdi3.S
diff --git a/xen/arch/arm/arm32/lib/assembler.h b/xen/arch/arm/arm32/lib/assembler.h
index f8d4b3a..6de2638 100644
--- a/xen/arch/arm/arm32/lib/assembler.h
+++ b/xen/arch/arm/arm32/lib/assembler.h
@@ -36,8 +36,8 @@
  * Endian independent macros for shifting bytes within registers.
  */
 #ifndef __ARMEB__
-#define pull            lsr
-#define push            lsl
+#define lspull          lsr
+#define lspush          lsl
 #define get_byte_0      lsl #0
 #define get_byte_1      lsr #8
 #define get_byte_2      lsr #16
@@ -47,8 +47,8 @@
 #define put_byte_2      lsl #16
 #define put_byte_3      lsl #24
 #else
-#define pull            lsl
-#define push            lsr
+#define lspull          lsl
+#define lspush          lsr
 #define get_byte_0      lsr #24
 #define get_byte_1      lsr #16
 #define get_byte_2      lsr #8
diff --git a/xen/arch/arm/arm32/lib/bitops.h b/xen/arch/arm/arm32/lib/bitops.h
index 25784c3..a167c2d 100644
--- a/xen/arch/arm/arm32/lib/bitops.h
+++ b/xen/arch/arm/arm32/lib/bitops.h
@@ -37,6 +37,11 @@ UNWIND( .fnstart )
         add r1, r1, r0, lsl #2  @ Get word offset
         mov r3, r2, lsl r3      @ create mask
         smp_dmb
+#if __LINUX_ARM_ARCH__ >= 7 && defined(CONFIG_SMP)
+        .arch_extension mp
+        ALT_SMP(W(pldw) [r1])
+        ALT_UP(W(nop))
+#endif
 1:      ldrex   r2, [r1]
         ands    r0, r2, r3      @ save old value of bit
         \instr  r2, r2, r3      @ toggle bit
diff --git a/xen/arch/arm/arm32/lib/copy_template.S b/xen/arch/arm/arm32/lib/copy_template.S
index 805e3f8..3bc8eb8 100644
--- a/xen/arch/arm/arm32/lib/copy_template.S
+++ b/xen/arch/arm/arm32/lib/copy_template.S
@@ -197,24 +197,24 @@
 
 12:     PLD(    pld     [r1, #124]      )
 13:             ldr4w   r1, r4, r5, r6, r7, abort=19f
-               mov     r3, lr, pull #\pull
+               mov     r3, lr, lspull #\pull
                subs    r2, r2, #32
                ldr4w   r1, r8, r9, ip, lr, abort=19f
-               orr     r3, r3, r4, push #\push
-               mov     r4, r4, pull #\pull
-               orr     r4, r4, r5, push #\push
-               mov     r5, r5, pull #\pull
-               orr     r5, r5, r6, push #\push
-               mov     r6, r6, pull #\pull
-               orr     r6, r6, r7, push #\push
-               mov     r7, r7, pull #\pull
-               orr     r7, r7, r8, push #\push
-               mov     r8, r8, pull #\pull
-               orr     r8, r8, r9, push #\push
-               mov     r9, r9, pull #\pull
-               orr     r9, r9, ip, push #\push
-               mov     ip, ip, pull #\pull
-               orr     ip, ip, lr, push #\push
+               orr     r3, r3, r4, lspush #\push
+               mov     r4, r4, lspull #\pull
+               orr     r4, r4, r5, lspush #\push
+               mov     r5, r5, lspull #\pull
+               orr     r5, r5, r6, lspush #\push
+               mov     r6, r6, lspull #\pull
+               orr     r6, r6, r7, lspush #\push
+               mov     r7, r7, lspull #\pull
+               orr     r7, r7, r8, lspush #\push
+               mov     r8, r8, lspull #\pull
+               orr     r8, r8, r9, lspush #\push
+               mov     r9, r9, lspull #\pull
+               orr     r9, r9, ip, lspush #\push
+               mov     ip, ip, lspull #\pull
+               orr     ip, ip, lr, lspush #\push
                str8w   r0, r3, r4, r5, r6, r7, r8, r9, ip, , abort=19f
                bge     12b
        PLD(    cmn     r2, #96 )
@@ -225,10 +225,10 @@
 14:             ands    ip, r2, #28
                beq     16f
 
-15:            mov     r3, lr, pull #\pull
+15:            mov     r3, lr, lspull #\pull
                ldr1w   r1, lr, abort=21f
                subs    ip, ip, #4
-               orr     r3, r3, lr, push #\push
+               orr     r3, r3, lr, lspush #\push
                str1w   r0, r3, abort=21f
                bgt     15b
        CALGN(  cmp     r2, #0  )
diff --git a/xen/arch/arm/arm32/lib/memmove.S b/xen/arch/arm/arm32/lib/memmove.S
index 4e142b8..18634c3 100644
--- a/xen/arch/arm/arm32/lib/memmove.S
+++ b/xen/arch/arm/arm32/lib/memmove.S
@@ -148,24 +148,24 @@ ENTRY(memmove)
 
 12:     PLD(    pld     [r1, #-128]     )
 13:             ldmdb   r1!, {r7, r8, r9, ip}
-               mov     lr, r3, push #\push
+               mov     lr, r3, lspush #\push
                subs    r2, r2, #32
                ldmdb   r1!, {r3, r4, r5, r6}
-               orr     lr, lr, ip, pull #\pull
-               mov     ip, ip, push #\push
-               orr     ip, ip, r9, pull #\pull
-               mov     r9, r9, push #\push
-               orr     r9, r9, r8, pull #\pull
-               mov     r8, r8, push #\push
-               orr     r8, r8, r7, pull #\pull
-               mov     r7, r7, push #\push
-               orr     r7, r7, r6, pull #\pull
-               mov     r6, r6, push #\push
-               orr     r6, r6, r5, pull #\pull
-               mov     r5, r5, push #\push
-               orr     r5, r5, r4, pull #\pull
-               mov     r4, r4, push #\push
-               orr     r4, r4, r3, pull #\pull
+               orr     lr, lr, ip, lspull #\pull
+               mov     ip, ip, lspush #\push
+               orr     ip, ip, r9, lspull #\pull
+               mov     r9, r9, lspush #\push
+               orr     r9, r9, r8, lspull #\pull
+               mov     r8, r8, lspush #\push
+               orr     r8, r8, r7, lspull #\pull
+               mov     r7, r7, lspush #\push
+               orr     r7, r7, r6, lspull #\pull
+               mov     r6, r6, lspush #\push
+               orr     r6, r6, r5, lspull #\pull
+               mov     r5, r5, lspush #\push
+               orr     r5, r5, r4, lspull #\pull
+               mov     r4, r4, lspush #\push
+               orr     r4, r4, r3, lspull #\pull
                stmdb   r0!, {r4 - r9, ip, lr}
                bge     12b
        PLD(    cmn     r2, #96 )
@@ -176,10 +176,10 @@ ENTRY(memmove)
 14:             ands    ip, r2, #28
                beq     16f
 
-15:            mov     lr, r3, push #\push
+15:            mov     lr, r3, lspush #\push
                ldr     r3, [r1, #-4]!
                subs    ip, ip, #4
-               orr     lr, lr, r3, pull #\pull
+               orr     lr, lr, r3, lspull #\pull
                str     lr, [r0, #-4]!
                bgt     15b
        CALGN(  cmp     r2, #0  )
diff --git a/xen/include/asm-arm/arm32/atomic.h b/xen/include/asm-arm/arm32/atomic.h
index 3d601d1..7ec712f 100644
--- a/xen/include/asm-arm/arm32/atomic.h
+++ b/xen/include/asm-arm/arm32/atomic.h
@@ -39,6 +39,7 @@ static inline int atomic_add_return(int i, atomic_t *v)
         int result;
 
         smp_mb();
+        prefetchw(&v->counter);
 
         __asm__ __volatile__("@ atomic_add_return\n"
 "1:     ldrex   %0, [%3]\n"
@@ -78,6 +79,7 @@ static inline int atomic_sub_return(int i, atomic_t *v)
         int result;
 
         smp_mb();
+        prefetchw(&v->counter);
 
         __asm__ __volatile__("@ atomic_sub_return\n"
 "1:     ldrex   %0, [%3]\n"
@@ -100,6 +102,7 @@ static inline int atomic_cmpxchg(atomic_t *ptr, int old, int new)
         unsigned long res;
 
         smp_mb();
+        prefetchw(&ptr->counter);
 
         do {
                 __asm__ __volatile__("@ atomic_cmpxchg\n"
@@ -117,6 +120,35 @@ static inline int atomic_cmpxchg(atomic_t *ptr, int old, int new)
         return oldval;
 }
 
+static inline int __atomic_add_unless(atomic_t *v, int a, int u)
+{
+        int oldval, newval;
+        unsigned long tmp;
+
+        smp_mb();
+        prefetchw(&v->counter);
+
+        __asm__ __volatile__ ("@ atomic_add_unless\n"
+"1:     ldrex   %0, [%4]\n"
+"       teq     %0, %5\n"
+"       beq     2f\n"
+"       add     %1, %0, %6\n"
+"       strex   %2, %1, [%4]\n"
+"       teq     %2, #0\n"
+"       bne     1b\n"
+"2:"
+        : "=&r" (oldval), "=&r" (newval), "=&r" (tmp), "+Qo" (v->counter)
+        : "r" (&v->counter), "r" (u), "r" (a)
+        : "cc");
+
+        if (oldval != u)
+                smp_mb();
+
+        return oldval;
+}
+
+#define atomic_xchg(v, new) (xchg(&((v)->counter), new))
+
 #define atomic_inc(v)           atomic_add(1, v)
 #define atomic_dec(v)           atomic_sub(1, v)
 
diff --git a/xen/include/asm-arm/arm32/cmpxchg.h b/xen/include/asm-arm/arm32/cmpxchg.h
index 9a511f2..03e0bed 100644
--- a/xen/include/asm-arm/arm32/cmpxchg.h
+++ b/xen/include/asm-arm/arm32/cmpxchg.h
@@ -1,6 +1,8 @@
 #ifndef __ASM_ARM32_CMPXCHG_H
 #define __ASM_ARM32_CMPXCHG_H
 
+#include <xen/prefetch.h>
+
 extern void __bad_xchg(volatile void *, int);
 
 static inline unsigned long __xchg(unsigned long x, volatile void *ptr, int size)
@@ -9,6 +11,7 @@ static inline unsigned long __xchg(unsigned long x, volatile void *ptr, int size
         unsigned int tmp;
 
         smp_mb();
+        prefetchw((const void *)ptr);
 
         switch (size) {
         case 1:
@@ -56,6 +59,8 @@ static always_inline unsigned long __cmpxchg(
 {
         unsigned long oldval, res;
 
+        prefetchw((const void *)ptr);
+
         switch (size) {
         case 1:
                 do {