Message ID | 1417101181-13500-6-git-send-email-ola.liljedahl@linaro.org
---|---
State | Accepted
Commit | 1c6711ed1d2c58c131fa10a858ac498c8f591553
On Thu, Nov 27, 2014 at 10:13 AM, Ola Liljedahl <ola.liljedahl@linaro.org> wrote:
> Signed-off-by: Ola Liljedahl <ola.liljedahl@linaro.org>
> Reviewed-by: Bill Fischofer <bill.fischofer@linaro.org>
> ---
> (This document/code contribution attached is provided under the terms of
> agreement LES-LTM-21309)
>
> Use definitions from odp_atomic_internal.h.
On 1 December 2014 at 13:30, Savolainen, Petri (NSN - FI/Espoo) <petri.savolainen@nsn.com> wrote:
> Hi,
>
> This change degrades ticketlock performance. The cur_ticket variable does
> not need atomic add functionality, since it's written only from within the
> lock. Only correct barrier enforcement is needed.

cur_ticket needs an atomic *store*, but yes, it does not need the whole RMW
add operation to be atomic. That was a bit excessive of me.

A better way to implement this would be:

    uint32_t cur = _odp_atomic_u32_load_mm(&ticketlock->cur_ticket,
                                           _ODP_MEMMODEL_RLX);
    _odp_atomic_u32_store_mm(&ticketlock->cur_ticket, cur + 1,
                             _ODP_MEMMODEL_RLS);

which compiles down to a plain load/add/store on x86-64:

    0000000000000030 <odp_ticketlock_unlock>:
      30: 8b 47 04    mov    0x4(%rdi),%eax
      33: 83 c0 01    add    $0x1,%eax
      36: 89 47 04    mov    %eax,0x4(%rdi)
      39: c3          retq

I was saving performance optimizations for later. On ARM there is a lot you
can do with WFE/SEV (wait for event/signal event), which avoids unnecessary
spinning on atomic variables and keeps the waiters from interfering with the
lock owner: a waiter that keeps loading a flag wants shared ownership of the
cache line, while the owner likely wants exclusive ownership in order to
update it, so the owner's store will often stall if other threads keep
loading the lock variable. Moving the lock variables to another cache line
avoids this problem but introduces another potential cache miss (when
accessing the protected data), so it is not necessarily better in all cases.

-- Ola

> Here's odp_example cycle counts from the tip of the repo (single thread,
> but multi-thread figures follow the same pattern).
>
> Thread 1 starts on core 1
> [1] alloc_sng       alloc+free   61 cycles, 18 ns
> [1] alloc_multi     alloc+free   38 cycles, 11 ns
> [1] poll_queue      enq+deq     139 cycles, 41 ns
> [1] sched_one_s_lo  enq+deq     626 cycles, 184 ns
> [1] sched_____s_lo  enq+deq     625 cycles, 184 ns
> [1] sched_one_m_lo  enq+deq     620 cycles, 182 ns
> [1] sched_____m_lo  enq+deq     619 cycles, 182 ns
> [1] sched_multi_lo  enq+deq     165 cycles, 48 ns
> [1] sched_one_s_hi  enq+deq     331 cycles, 97 ns
> [1] sched_____s_hi  enq+deq     331 cycles, 97 ns
> [1] sched_one_m_hi  enq+deq     325 cycles, 96 ns
> [1] sched_____m_hi  enq+deq     325 cycles, 95 ns
> [1] sched_multi_hi  enq+deq      90 cycles, 26 ns
> Thread 1 exits
> ODP example complete
>
> And here's the same after removing this patch from the tip.
>
> Thread 1 starts on core 1
> [1] alloc_sng       alloc+free   61 cycles, 18 ns
> [1] alloc_multi     alloc+free   35 cycles, 10 ns
> [1] poll_queue      enq+deq      86 cycles, 25 ns
> [1] sched_one_s_lo  enq+deq     404 cycles, 119 ns
> [1] sched_____s_lo  enq+deq     404 cycles, 119 ns
> [1] sched_one_m_lo  enq+deq     403 cycles, 119 ns
> [1] sched_____m_lo  enq+deq     405 cycles, 119 ns
> [1] sched_multi_lo  enq+deq     112 cycles, 33 ns
> [1] sched_one_s_hi  enq+deq     216 cycles, 63 ns
> [1] sched_____s_hi  enq+deq     218 cycles, 64 ns
> [1] sched_one_m_hi  enq+deq     211 cycles, 62 ns
> [1] sched_____m_hi  enq+deq     212 cycles, 62 ns
> [1] sched_multi_hi  enq+deq      67 cycles, 19 ns
> Thread 1 exits
> ODP example complete
>
> Both ways are functionally correct, but atomic add functionality is not
> needed here and degrades performance, at least on my x86 system.
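For illustration, the relaxed-load + release-store unlock Ola suggests above, written with plain C11 atomics instead of the odp_atomic_internal.h wrappers, would look roughly like the sketch below. The type and function names are mine, not ODP's; this is only a sketch of the memory-ordering idea under discussion.

```c
#include <stdatomic.h>
#include <stdint.h>

typedef struct {
	_Atomic uint32_t next_ticket; /* next ticket to hand out */
	_Atomic uint32_t cur_ticket;  /* ticket currently being served */
} ticketlock_t;

static inline void ticketlock_lock(ticketlock_t *tl)
{
	/* An atomic RMW is required here: many threads take tickets. */
	uint32_t ticket = atomic_fetch_add_explicit(&tl->next_ticket, 1,
						    memory_order_relaxed);

	/* Acquire load so the critical section cannot float above it. */
	while (atomic_load_explicit(&tl->cur_ticket,
				    memory_order_acquire) != ticket)
		; /* spin */
}

static inline void ticketlock_unlock(ticketlock_t *tl)
{
	/* Only the lock owner writes cur_ticket, so a relaxed load plus
	 * a release store is enough; no atomic add needed (Petri's point). */
	uint32_t cur = atomic_load_explicit(&tl->cur_ticket,
					    memory_order_relaxed);
	atomic_store_explicit(&tl->cur_ticket, cur + 1, memory_order_release);
}
```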
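On the WFE/SEV remark above: a rough sketch of what a wait-for-event spin might look like on AArch64. This is my own illustration, not ODP code; the inline assembly is an assumption about one reasonable encoding of the standard LDAXR/WFE pattern (LDAXR arms the exclusive monitor on the cache line; a store by another core clears the monitor and generates the event that wakes WFE).

```c
#include <stdatomic.h>
#include <stdint.h>

/* Spin until *addr == expected without hammering the cache line. */
static inline void wait_until_equal_u32(_Atomic uint32_t *addr,
					uint32_t expected)
{
#if defined(__aarch64__)
	uint32_t val;
	do {
		/* Load-acquire exclusive: arms the monitor for WFE. */
		__asm__ volatile("ldaxr %w0, [%1]"
				 : "=&r" (val)
				 : "r" (addr)
				 : "memory");
		if (val == expected)
			break;
		__asm__ volatile("wfe" ::: "memory"); /* sleep until event */
	} while (1);
#else
	/* Portable fallback: plain acquire-load spin. */
	while (atomic_load_explicit(addr, memory_order_acquire) != expected)
		;
#endif
}
```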
Isn't that a question of implementation rather than API definition? The APIs are useful. It's the responsibility of each implementation to realize those APIs optimally for its platform. The alternative is to push that responsibility onto the application, which defeats the portability goals of ODP.

On Mon, Dec 1, 2014 at 8:14 AM, Savolainen, Petri (NSN - FI/Espoo) <petri.savolainen@nsn.com> wrote:
> Hi,
>
> This is why lock (and lock-free algorithm) implementation abstraction has
> low value. It's hard to find a single abstraction that is optimal for all
> HW architectures - a few C lines arranged differently may cause a major
> performance impact. E.g. the kernel ticketlock for x86 packs both those
> variables into the same word so that it can load both with a single xadd
> instruction ... how to abstract that?
>
> -Petri
odp_atomic_internal is using the GCC primitives (for now) for linux-generic. Those can be changed/refined as needed. The functions themselves are useful.

We need to distinguish between APIs and implementations. We shouldn't be arguing for or against an API based on the virtues or deficiencies of a specific implementation of that API. If the API is good, then optimizing its implementation is a separate issue that should be handled as a bug fix.

On Mon, Dec 1, 2014 at 8:20 AM, Savolainen, Petri (NSN - FI/Espoo) <petri.savolainen@nsn.com> wrote:
> Implementation abstraction == usage of odp_atomic_internal.h in the
> implementation.
>
> -Petri
On 1 December 2014 at 14:14, Savolainen, Petri (NSN - FI/Espoo) <petri.savolainen@nsn.com> wrote:
> Hi,
>
> This is why lock (and lock-free algorithm) implementation abstraction has
> low value.

I can't agree with that. Using a higher-level abstraction and programming model makes it much easier to write correct code. Only when you have correct function should you start optimizing.

> It's hard to find a single abstraction that is optimal for all HW
> architectures - a few C lines arranged differently may cause a major
> performance impact. E.g. the kernel ticketlock for x86 packs both those
> variables into the same word so that it can load both with a single xadd
> instruction ... how to abstract that?

I have looked at this implementation as well and was considering benchmarking it later. You challenge me to find an abstraction that makes it possible to implement this efficiently using C11-style atomics. Challenge accepted.

I think the bigger challenge is to find implementations that scale nicely and avoid congesting the interconnect with loads and invalidates when a number of threads are competing for the same lock.

-- Ola
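For reference, the kernel trick Petri describes could look roughly like the sketch below in C11 atomics: both tickets live in one 32-bit word (high half = next ticket, low half = now serving), so a single fetch-and-add both takes a ticket and reads the current holder. This is my own illustration of the idea, not the kernel's or ODP's code, and it glosses over ticket wrap-around (the real kernel unlock uses a halfword store on the serving field rather than an RMW, partly for that reason).

```c
#include <stdatomic.h>
#include <stdint.h>

typedef struct {
	_Atomic uint32_t tickets; /* [31:16] next ticket, [15:0] now serving */
} packed_ticketlock_t;

static inline void packed_lock(packed_ticketlock_t *tl)
{
	/* One xadd-style RMW takes a ticket AND reads the serving field. */
	uint32_t old = atomic_fetch_add_explicit(&tl->tickets, 1u << 16,
						 memory_order_acquire);
	uint16_t my_ticket = (uint16_t)(old >> 16);

	/* Low half of the word is the ticket currently being served. */
	while ((uint16_t)old != my_ticket)
		old = atomic_load_explicit(&tl->tickets,
					   memory_order_acquire);
}

static inline void packed_unlock(packed_ticketlock_t *tl)
{
	/* Bump the serving half; see the wrap-around caveat above. */
	atomic_fetch_add_explicit(&tl->tickets, 1, memory_order_release);
}
```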
On 1 December 2014 at 14:25, Bill Fischofer <bill.fischofer@linaro.org> wrote:
> odp_atomic_internal is using the GCC primitives (for now) for
> linux-generic. Those can be changed/refined as needed. The functions
> themselves are useful.
>
> We need to distinguish between APIs and implementations. We shouldn't be
> arguing for or against an API based on the virtues or deficiencies of a
> specific implementation of that API. If the API is good, then optimizing
> its implementation is a separate issue that should be handled as a bug fix.

The problem was not even the implementation of this API. I used an atomic RMW operation where none was needed (only an atomic store was needed), and that is not optimal for some architectures (e.g. x86). But I wasn't attempting to optimize for performance, just optimizing for correctness.

-- Ola
I agree with that approach. Getting things correct first is the most important; getting the wrong answer quickly isn't terribly useful. :)

On Mon, Dec 1, 2014 at 8:43 AM, Ola Liljedahl <ola.liljedahl@linaro.org> wrote:
> The problem was not even the implementation of this API. I used an atomic
> RMW operation where none was needed (only an atomic store was needed), and
> that is not optimal for some architectures (e.g. x86). But I wasn't
> attempting to optimize for performance, just optimizing for correctness.
>
> -- Ola
```diff
diff --git a/platform/linux-generic/include/api/odp_ticketlock.h b/platform/linux-generic/include/api/odp_ticketlock.h
index a3b3ab5..619816e 100644
--- a/platform/linux-generic/include/api/odp_ticketlock.h
+++ b/platform/linux-generic/include/api/odp_ticketlock.h
@@ -31,7 +31,7 @@ extern "C" {
  */
 typedef struct odp_ticketlock_t {
 	odp_atomic_u32_t next_ticket; /**< @private Next ticket */
-	volatile uint32_t cur_ticket; /**< @private Current ticket */
+	odp_atomic_u32_t cur_ticket;  /**< @private Current ticket */
 } odp_ticketlock_t;
 
 
diff --git a/platform/linux-generic/odp_ticketlock.c b/platform/linux-generic/odp_ticketlock.c
index e60f526..6c5e74e 100644
--- a/platform/linux-generic/odp_ticketlock.c
+++ b/platform/linux-generic/odp_ticketlock.c
@@ -6,6 +6,7 @@
 
 #include <odp_ticketlock.h>
 #include <odp_atomic.h>
+#include <odp_atomic_internal.h>
 #include <odp_sync.h>
 #include <odp_spin_internal.h>
 
@@ -13,7 +14,7 @@
 void odp_ticketlock_init(odp_ticketlock_t *ticketlock)
 {
 	odp_atomic_init_u32(&ticketlock->next_ticket, 0);
-	ticketlock->cur_ticket = 0;
+	odp_atomic_init_u32(&ticketlock->cur_ticket, 0);
 }
 
 
@@ -23,18 +24,15 @@ void odp_ticketlock_lock(odp_ticketlock_t *ticketlock)
 
 	ticket = odp_atomic_fetch_inc_u32(&ticketlock->next_ticket);
 
-	while (ticket != ticketlock->cur_ticket)
+	while (ticket != _odp_atomic_u32_load_mm(&ticketlock->cur_ticket,
+						 _ODP_MEMMODEL_ACQ))
 		odp_spin();
-
-	__atomic_thread_fence(__ATOMIC_ACQUIRE);
 }
 
 
 void odp_ticketlock_unlock(odp_ticketlock_t *ticketlock)
 {
-	__atomic_thread_fence(__ATOMIC_RELEASE);
-
-	ticketlock->cur_ticket++;
+	_odp_atomic_u32_add_mm(&ticketlock->cur_ticket, 1, _ODP_MEMMODEL_RLS);
 
 #if defined __OCTEON__
 	odp_sync_stores(); /* SYNCW to flush write buffer */
@@ -44,6 +42,6 @@ void odp_ticketlock_unlock(odp_ticketlock_t *ticketlock)
 
 int odp_ticketlock_is_locked(odp_ticketlock_t *ticketlock)
 {
-	return ticketlock->cur_ticket !=
+	return odp_atomic_load_u32(&ticketlock->cur_ticket) !=
 		odp_atomic_load_u32(&ticketlock->next_ticket);
 }
```
Signed-off-by: Ola Liljedahl <ola.liljedahl@linaro.org>
---
(This document/code contribution attached is provided under the terms of
agreement LES-LTM-21309)

Use definitions from odp_atomic_internal.h.

 platform/linux-generic/include/api/odp_ticketlock.h |  2 +-
 platform/linux-generic/odp_ticketlock.c             | 14 ++++++--------
 2 files changed, 7 insertions(+), 9 deletions(-)
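For context, a minimal usage sketch of the public API this patch touches. The function names come from the diff above; the worker scenario itself is made up for illustration.

```c
#include <odp_ticketlock.h>
#include <stdint.h>

static odp_ticketlock_t lock;
static uint64_t shared_counter; /* protected by 'lock' */

void setup(void)
{
	odp_ticketlock_init(&lock);
}

void worker(void)
{
	odp_ticketlock_lock(&lock);   /* take a ticket, wait for our turn */
	shared_counter++;             /* critical section */
	odp_ticketlock_unlock(&lock); /* serve the next ticket */
}
```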