Message ID | 20190222192703.18177-2-adhemerval.zanella@linaro.org |
---|---|
State | New |
Headers | show |
Series | [1/4] Add single-thread.h header | expand |
Ping. On 22/02/2019 16:27, Adhemerval Zanella wrote: > This patch optimizes both __lll_lock_wait_private and __lll_lock_wait > by issuing only one lll_futex_wait. Since it is defined as an inlined > syscall and inlined syscalls are defined using inlined assembly the > compiler usually can not see both calls are equal and optimize > accordingly. > > On aarch64 the resulting binary is change from: > > 0000000000000060 <__lll_lock_wait>: > 60: 2a0103e5 mov w5, w1 > 64: b9400001 ldr w1, [x0] > 68: aa0003e4 mov x4, x0 > 6c: 7100083f cmp w1, #0x2 > 70: 540000e1 b.ne 8c <__lll_lock_wait+0x2c> // b.any > 74: 521900a1 eor w1, w5, #0x80 > 78: d2800042 mov x2, #0x2 // #2 > 7c: 93407c21 sxtw x1, w1 > 80: d2800003 mov x3, #0x0 // #0 > 84: d2800c48 mov x8, #0x62 // #98 > 88: d4000001 svc #0x0 > 8c: 521900a5 eor w5, w5, #0x80 > 90: 52800046 mov w6, #0x2 // #2 > 94: 93407ca5 sxtw x5, w5 > 98: 14000008 b b8 <__lll_lock_wait+0x58> > 9c: d503201f nop > a0: aa0403e0 mov x0, x4 > a4: aa0503e1 mov x1, x5 > a8: d2800042 mov x2, #0x2 // #2 > ac: d2800003 mov x3, #0x0 // #0 > b0: d2800c48 mov x8, #0x62 // #98 > b4: d4000001 svc #0x0 > b8: 885ffc80 ldaxr w0, [x4] > bc: 88017c86 stxr w1, w6, [x4] > c0: 35ffffc1 cbnz w1, b8 <__lll_lock_wait+0x58> > c4: 35fffee0 cbnz w0, a0 <__lll_lock_wait+0x40> > c8: d65f03c0 ret > > To: > > 0000000000000048 <__lll_lock_wait>: > 48: aa0003e4 mov x4, x0 > 4c: 2a0103e5 mov w5, w1 > 50: b9400000 ldr w0, [x0] > 54: 7100081f cmp w0, #0x2 > 58: 540000c0 b.eq 70 <__lll_lock_wait+0x28> // b.none > 5c: 52800041 mov w1, #0x2 // #2 > 60: 885ffc80 ldaxr w0, [x4] > 64: 88027c81 stxr w2, w1, [x4] > 68: 35ffffc2 cbnz w2, 60 <__lll_lock_wait+0x18> > 6c: 34000120 cbz w0, 90 <__lll_lock_wait+0x48> > 70: 521900a1 eor w1, w5, #0x80 > 74: aa0403e0 mov x0, x4 > 78: 93407c21 sxtw x1, w1 > 7c: d2800042 mov x2, #0x2 // #2 > 80: d2800003 mov x3, #0x0 // #0 > 84: d2800c48 mov x8, #0x62 // #98 > 88: d4000001 svc #0x0 > 8c: 17fffff4 b 5c <__lll_lock_wait+0x14> > 90: d65f03c0 ret > > I see similar changes on powerpc and other architectures. It also aligns > with x86_64 implementation by adding the systemtap probes. > > Checker on aarch64-linux-gnu. > > * nptl/lowlevellock.c (__lll_lock_wait, __lll_lock_wait_private): > Optimize futex call and add systemtap probe. > --- > nptl/lowlevellock.c | 31 +++++++++++++++++++------------ > 1 file changed, 19 insertions(+), 12 deletions(-) > > diff --git a/nptl/lowlevellock.c b/nptl/lowlevellock.c > index 5eaa3807ea..47548ff121 100644 > --- a/nptl/lowlevellock.c > +++ b/nptl/lowlevellock.c > @@ -17,20 +17,23 @@ > License along with the GNU C Library; if not, see > <http://www.gnu.org/licenses/>. */ > > -#include <errno.h> > #include <sysdep.h> > #include <lowlevellock.h> > -#include <sys/time.h> > #include <atomic.h> > +#include <stap-probe.h> > > void > __lll_lock_wait_private (int *futex) > { > - if (*futex == 2) > - lll_futex_wait (futex, 2, LLL_PRIVATE); /* Wait if *futex == 2. */ > - > - while (atomic_exchange_acq (futex, 2) != 0) > - lll_futex_wait (futex, 2, LLL_PRIVATE); /* Wait if *futex == 2. */ > + if (atomic_load_relaxed (futex) == 2) > + goto futex; > + > + while (atomic_exchange_acquire (futex, 2) != 0) > + { > + futex: > + LIBC_PROBE (lll_lock_wait_private, 1, futex); > + lll_futex_wait (futex, 2, LLL_PRIVATE); /* Wait if *futex == 2. */ > + } > } > > > @@ -39,10 +42,14 @@ __lll_lock_wait_private (int *futex) > void > __lll_lock_wait (int *futex, int private) > { > - if (*futex == 2) > - lll_futex_wait (futex, 2, private); /* Wait if *futex == 2. */ > - > - while (atomic_exchange_acq (futex, 2) != 0) > - lll_futex_wait (futex, 2, private); /* Wait if *futex == 2. */ > + if (atomic_load_relaxed (futex) == 2) > + goto futex; > + > + while (atomic_exchange_acquire (futex, 2) != 0) > + { > + futex: > + LIBC_PROBE (lll_lock_wait, 1, futex); > + lll_futex_wait (futex, 2, private); /* Wait if *futex == 2. */ > + } > } > #endif >
Ping (x2). The third and forth part are already approved. On 09/04/2019 09:11, Adhemerval Zanella wrote: > Ping. > > On 22/02/2019 16:27, Adhemerval Zanella wrote: >> This patch optimizes both __lll_lock_wait_private and __lll_lock_wait >> by issuing only one lll_futex_wait. Since it is defined as an inlined >> syscall and inlined syscalls are defined using inlined assembly the >> compiler usually can not see both calls are equal and optimize >> accordingly. >> >> On aarch64 the resulting binary is change from: >> >> 0000000000000060 <__lll_lock_wait>: >> 60: 2a0103e5 mov w5, w1 >> 64: b9400001 ldr w1, [x0] >> 68: aa0003e4 mov x4, x0 >> 6c: 7100083f cmp w1, #0x2 >> 70: 540000e1 b.ne 8c <__lll_lock_wait+0x2c> // b.any >> 74: 521900a1 eor w1, w5, #0x80 >> 78: d2800042 mov x2, #0x2 // #2 >> 7c: 93407c21 sxtw x1, w1 >> 80: d2800003 mov x3, #0x0 // #0 >> 84: d2800c48 mov x8, #0x62 // #98 >> 88: d4000001 svc #0x0 >> 8c: 521900a5 eor w5, w5, #0x80 >> 90: 52800046 mov w6, #0x2 // #2 >> 94: 93407ca5 sxtw x5, w5 >> 98: 14000008 b b8 <__lll_lock_wait+0x58> >> 9c: d503201f nop >> a0: aa0403e0 mov x0, x4 >> a4: aa0503e1 mov x1, x5 >> a8: d2800042 mov x2, #0x2 // #2 >> ac: d2800003 mov x3, #0x0 // #0 >> b0: d2800c48 mov x8, #0x62 // #98 >> b4: d4000001 svc #0x0 >> b8: 885ffc80 ldaxr w0, [x4] >> bc: 88017c86 stxr w1, w6, [x4] >> c0: 35ffffc1 cbnz w1, b8 <__lll_lock_wait+0x58> >> c4: 35fffee0 cbnz w0, a0 <__lll_lock_wait+0x40> >> c8: d65f03c0 ret >> >> To: >> >> 0000000000000048 <__lll_lock_wait>: >> 48: aa0003e4 mov x4, x0 >> 4c: 2a0103e5 mov w5, w1 >> 50: b9400000 ldr w0, [x0] >> 54: 7100081f cmp w0, #0x2 >> 58: 540000c0 b.eq 70 <__lll_lock_wait+0x28> // b.none >> 5c: 52800041 mov w1, #0x2 // #2 >> 60: 885ffc80 ldaxr w0, [x4] >> 64: 88027c81 stxr w2, w1, [x4] >> 68: 35ffffc2 cbnz w2, 60 <__lll_lock_wait+0x18> >> 6c: 34000120 cbz w0, 90 <__lll_lock_wait+0x48> >> 70: 521900a1 eor w1, w5, #0x80 >> 74: aa0403e0 mov x0, x4 >> 78: 93407c21 sxtw x1, w1 >> 7c: d2800042 mov x2, #0x2 // #2 >> 80: d2800003 mov x3, #0x0 // #0 >> 84: d2800c48 mov x8, #0x62 // #98 >> 88: d4000001 svc #0x0 >> 8c: 17fffff4 b 5c <__lll_lock_wait+0x14> >> 90: d65f03c0 ret >> >> I see similar changes on powerpc and other architectures. It also aligns >> with x86_64 implementation by adding the systemtap probes. >> >> Checker on aarch64-linux-gnu. >> >> * nptl/lowlevellock.c (__lll_lock_wait, __lll_lock_wait_private): >> Optimize futex call and add systemtap probe. >> --- >> nptl/lowlevellock.c | 31 +++++++++++++++++++------------ >> 1 file changed, 19 insertions(+), 12 deletions(-) >> >> diff --git a/nptl/lowlevellock.c b/nptl/lowlevellock.c >> index 5eaa3807ea..47548ff121 100644 >> --- a/nptl/lowlevellock.c >> +++ b/nptl/lowlevellock.c >> @@ -17,20 +17,23 @@ >> License along with the GNU C Library; if not, see >> <http://www.gnu.org/licenses/>. */ >> >> -#include <errno.h> >> #include <sysdep.h> >> #include <lowlevellock.h> >> -#include <sys/time.h> >> #include <atomic.h> >> +#include <stap-probe.h> >> >> void >> __lll_lock_wait_private (int *futex) >> { >> - if (*futex == 2) >> - lll_futex_wait (futex, 2, LLL_PRIVATE); /* Wait if *futex == 2. */ >> - >> - while (atomic_exchange_acq (futex, 2) != 0) >> - lll_futex_wait (futex, 2, LLL_PRIVATE); /* Wait if *futex == 2. */ >> + if (atomic_load_relaxed (futex) == 2) >> + goto futex; >> + >> + while (atomic_exchange_acquire (futex, 2) != 0) >> + { >> + futex: >> + LIBC_PROBE (lll_lock_wait_private, 1, futex); >> + lll_futex_wait (futex, 2, LLL_PRIVATE); /* Wait if *futex == 2. */ >> + } >> } >> >> >> @@ -39,10 +42,14 @@ __lll_lock_wait_private (int *futex) >> void >> __lll_lock_wait (int *futex, int private) >> { >> - if (*futex == 2) >> - lll_futex_wait (futex, 2, private); /* Wait if *futex == 2. */ >> - >> - while (atomic_exchange_acq (futex, 2) != 0) >> - lll_futex_wait (futex, 2, private); /* Wait if *futex == 2. */ >> + if (atomic_load_relaxed (futex) == 2) >> + goto futex; >> + >> + while (atomic_exchange_acquire (futex, 2) != 0) >> + { >> + futex: >> + LIBC_PROBE (lll_lock_wait, 1, futex); >> + lll_futex_wait (futex, 2, private); /* Wait if *futex == 2. */ >> + } >> } >> #endif >>
On 5/9/19 2:25 PM, Adhemerval Zanella wrote: > Ping (x2). The third and forth part are already approved. This looks good to me. Reviewed-by: Carlos O'Donell <carlos@redhat.com> > On 09/04/2019 09:11, Adhemerval Zanella wrote: >> Ping. >> >> On 22/02/2019 16:27, Adhemerval Zanella wrote: >>> This patch optimizes both __lll_lock_wait_private and __lll_lock_wait >>> by issuing only one lll_futex_wait. Since it is defined as an inlined >>> syscall and inlined syscalls are defined using inlined assembly the >>> compiler usually can not see both calls are equal and optimize >>> accordingly. Right. >>> >>> On aarch64 the resulting binary is change from: >>> >>> 0000000000000060 <__lll_lock_wait>: >>> 60: 2a0103e5 mov w5, w1 >>> 64: b9400001 ldr w1, [x0] >>> 68: aa0003e4 mov x4, x0 >>> 6c: 7100083f cmp w1, #0x2 >>> 70: 540000e1 b.ne 8c <__lll_lock_wait+0x2c> // b.any >>> 74: 521900a1 eor w1, w5, #0x80 >>> 78: d2800042 mov x2, #0x2 // #2 >>> 7c: 93407c21 sxtw x1, w1 >>> 80: d2800003 mov x3, #0x0 // #0 >>> 84: d2800c48 mov x8, #0x62 // #98 >>> 88: d4000001 svc #0x0 >>> 8c: 521900a5 eor w5, w5, #0x80 >>> 90: 52800046 mov w6, #0x2 // #2 >>> 94: 93407ca5 sxtw x5, w5 >>> 98: 14000008 b b8 <__lll_lock_wait+0x58> >>> 9c: d503201f nop >>> a0: aa0403e0 mov x0, x4 >>> a4: aa0503e1 mov x1, x5 >>> a8: d2800042 mov x2, #0x2 // #2 >>> ac: d2800003 mov x3, #0x0 // #0 >>> b0: d2800c48 mov x8, #0x62 // #98 >>> b4: d4000001 svc #0x0 >>> b8: 885ffc80 ldaxr w0, [x4] >>> bc: 88017c86 stxr w1, w6, [x4] >>> c0: 35ffffc1 cbnz w1, b8 <__lll_lock_wait+0x58> >>> c4: 35fffee0 cbnz w0, a0 <__lll_lock_wait+0x40> >>> c8: d65f03c0 ret >>> >>> To: >>> >>> 0000000000000048 <__lll_lock_wait>: >>> 48: aa0003e4 mov x4, x0 >>> 4c: 2a0103e5 mov w5, w1 >>> 50: b9400000 ldr w0, [x0] >>> 54: 7100081f cmp w0, #0x2 >>> 58: 540000c0 b.eq 70 <__lll_lock_wait+0x28> // b.none >>> 5c: 52800041 mov w1, #0x2 // #2 >>> 60: 885ffc80 ldaxr w0, [x4] >>> 64: 88027c81 stxr w2, w1, [x4] >>> 68: 35ffffc2 cbnz w2, 60 <__lll_lock_wait+0x18> >>> 6c: 34000120 cbz w0, 90 <__lll_lock_wait+0x48> >>> 70: 521900a1 eor w1, w5, #0x80 >>> 74: aa0403e0 mov x0, x4 >>> 78: 93407c21 sxtw x1, w1 >>> 7c: d2800042 mov x2, #0x2 // #2 >>> 80: d2800003 mov x3, #0x0 // #0 >>> 84: d2800c48 mov x8, #0x62 // #98 >>> 88: d4000001 svc #0x0 >>> 8c: 17fffff4 b 5c <__lll_lock_wait+0x14> >>> 90: d65f03c0 ret >>> Nice! >>> I see similar changes on powerpc and other architectures. It also aligns >>> with x86_64 implementation by adding the systemtap probes. >>> >>> Checker on aarch64-linux-gnu. >>> >>> * nptl/lowlevellock.c (__lll_lock_wait, __lll_lock_wait_private): >>> Optimize futex call and add systemtap probe. >>> --- >>> nptl/lowlevellock.c | 31 +++++++++++++++++++------------ >>> 1 file changed, 19 insertions(+), 12 deletions(-) >>> >>> diff --git a/nptl/lowlevellock.c b/nptl/lowlevellock.c >>> index 5eaa3807ea..47548ff121 100644 >>> --- a/nptl/lowlevellock.c >>> +++ b/nptl/lowlevellock.c >>> @@ -17,20 +17,23 @@ >>> License along with the GNU C Library; if not, see >>> <http://www.gnu.org/licenses/>. */ >>> >>> -#include <errno.h> >>> #include <sysdep.h> >>> #include <lowlevellock.h> >>> -#include <sys/time.h> >>> #include <atomic.h> >>> +#include <stap-probe.h> OK. >>> >>> void >>> __lll_lock_wait_private (int *futex) >>> { >>> - if (*futex == 2) >>> - lll_futex_wait (futex, 2, LLL_PRIVATE); /* Wait if *futex == 2. */ >>> - >>> - while (atomic_exchange_acq (futex, 2) != 0) >>> - lll_futex_wait (futex, 2, LLL_PRIVATE); /* Wait if *futex == 2. */ OK. >>> + if (atomic_load_relaxed (futex) == 2) >>> + goto futex; >>> + >>> + while (atomic_exchange_acquire (futex, 2) != 0) >>> + { >>> + futex: >>> + LIBC_PROBE (lll_lock_wait_private, 1, futex); >>> + lll_futex_wait (futex, 2, LLL_PRIVATE); /* Wait if *futex == 2. */ OK. >>> + } >>> } >>> >>> >>> @@ -39,10 +42,14 @@ __lll_lock_wait_private (int *futex) >>> void >>> __lll_lock_wait (int *futex, int private) >>> { >>> - if (*futex == 2) >>> - lll_futex_wait (futex, 2, private); /* Wait if *futex == 2. */ >>> - >>> - while (atomic_exchange_acq (futex, 2) != 0) >>> - lll_futex_wait (futex, 2, private); /* Wait if *futex == 2. */ >>> + if (atomic_load_relaxed (futex) == 2) >>> + goto futex; OK. >>> + >>> + while (atomic_exchange_acquire (futex, 2) != 0) >>> + { >>> + futex: >>> + LIBC_PROBE (lll_lock_wait, 1, futex); >>> + lll_futex_wait (futex, 2, private); /* Wait if *futex == 2. */ OK. >>> + } >>> } >>> #endif >>> -- Cheers, Carlos.
diff --git a/nptl/lowlevellock.c b/nptl/lowlevellock.c index 5eaa3807ea..47548ff121 100644 --- a/nptl/lowlevellock.c +++ b/nptl/lowlevellock.c @@ -17,20 +17,23 @@ License along with the GNU C Library; if not, see <http://www.gnu.org/licenses/>. */ -#include <errno.h> #include <sysdep.h> #include <lowlevellock.h> -#include <sys/time.h> #include <atomic.h> +#include <stap-probe.h> void __lll_lock_wait_private (int *futex) { - if (*futex == 2) - lll_futex_wait (futex, 2, LLL_PRIVATE); /* Wait if *futex == 2. */ - - while (atomic_exchange_acq (futex, 2) != 0) - lll_futex_wait (futex, 2, LLL_PRIVATE); /* Wait if *futex == 2. */ + if (atomic_load_relaxed (futex) == 2) + goto futex; + + while (atomic_exchange_acquire (futex, 2) != 0) + { + futex: + LIBC_PROBE (lll_lock_wait_private, 1, futex); + lll_futex_wait (futex, 2, LLL_PRIVATE); /* Wait if *futex == 2. */ + } } @@ -39,10 +42,14 @@ __lll_lock_wait_private (int *futex) void __lll_lock_wait (int *futex, int private) { - if (*futex == 2) - lll_futex_wait (futex, 2, private); /* Wait if *futex == 2. */ - - while (atomic_exchange_acq (futex, 2) != 0) - lll_futex_wait (futex, 2, private); /* Wait if *futex == 2. */ + if (atomic_load_relaxed (futex) == 2) + goto futex; + + while (atomic_exchange_acquire (futex, 2) != 0) + { + futex: + LIBC_PROBE (lll_lock_wait, 1, futex); + lll_futex_wait (futex, 2, private); /* Wait if *futex == 2. */ + } } #endif