
[ODP/PATCH,v1] Look ma, no barriers! C11 memory model

Message ID 1413409619-13489-1-git-send-email-ola.liljedahl@linaro.org
State New

Commit Message

Ola Liljedahl Oct. 15, 2014, 9:46 p.m. UTC
Signed-off-by: Ola Liljedahl <ola.liljedahl@linaro.org>
---
Implementation of a C11-based memory model for atomic operations.
Attempts to remove all explicit memory barriers (odp_sync_stores) from code
that implements multithreaded synchronization primitives (e.g. locks,
barriers). Rewrote such primitives to use the new atomic operations.
Optimized support for ARMv6/v7, ARMv8 (aarch64), x86_64 and MIPS64/OCTEON.
Other architectures fall back to the GCC __sync builtins, which often include
unnecessarily heavy barrier/sync operations (always sequentially consistent).
Fixed race conditions in odp_barrier_sync() (non-atomic wrap of the counter)
and in odp_ring enqueue/dequeue (a release barrier is needed but only a
compiler barrier was present).
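
For reference, the _rlx/_acq/_rls suffixes denote C11 memory orders (relaxed,
load-acquire, store-release). A minimal sketch of the intended semantics in
terms of C11 <stdatomic.h> (illustrative only; the patch implements the
operations with per-architecture inline assembly, not <stdatomic.h>):

    #include <stdatomic.h>
    #include <stdint.h>

    /* Hypothetical C11 equivalents of the new ODP operations */
    static inline uint32_t load_rlx(atomic_uint *p)
    {
            /* Atomicity only, no ordering constraints */
            return atomic_load_explicit(p, memory_order_relaxed);
    }

    static inline uint32_t load_acq(atomic_uint *p)
    {
            /* Later accesses cannot move before this load */
            return atomic_load_explicit(p, memory_order_acquire);
    }

    static inline void store_rls(atomic_uint *p, uint32_t v)
    {
            /* Earlier accesses cannot move after this store */
            atomic_store_explicit(p, v, memory_order_release);
    }

    static inline uint32_t fetch_add_rlx(atomic_uint *p, uint32_t incr)
    {
            /* Relaxed atomic read-modify-write */
            return atomic_fetch_add_explicit(p, incr, memory_order_relaxed);
    }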

 example/generator/odp_generator.c                  |  43 +-
 example/ipsec/odp_ipsec.c                          |   2 +-
 example/odp_example/odp_example.c                  |   2 +-
 example/timer/odp_timer_test.c                     |   2 +-
 helper/include/odph_ring.h                         |   8 +-
 platform/linux-generic/include/api/odp_atomic.h    | 820 ++++++++++++---------
 platform/linux-generic/include/api/odp_barrier.h   |  10 +-
 platform/linux-generic/include/api/odp_rwlock.h    |  20 +-
 .../linux-generic/include/api/odp_ticketlock.h     |   4 +-
 .../linux-generic/include/odp_buffer_internal.h    |   2 +-
 platform/linux-generic/odp_barrier.c               |  43 +-
 platform/linux-generic/odp_buffer.c                |   3 +-
 platform/linux-generic/odp_crypto.c                |   4 +-
 platform/linux-generic/odp_queue.c                 |   7 +-
 platform/linux-generic/odp_ring.c                  |  86 ++-
 platform/linux-generic/odp_rwlock.c                |  46 +-
 platform/linux-generic/odp_thread.c                |   6 +-
 platform/linux-generic/odp_ticketlock.c            |  27 +-
 platform/linux-generic/odp_timer.c                 |  17 +-
 test/api_test/odp_atomic_test.c                    | 126 +---
 test/api_test/odp_atomic_test.h                    |   9 -
 21 files changed, 651 insertions(+), 636 deletions(-)
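
As background for the odp_ring fix: publishing ring entries requires a
store-release of the producer tail, paired with a load-acquire on the
consumer side; a compiler barrier alone does not order the stores on weakly
ordered CPUs (ARM, MIPS). A simplified sketch using the new operations
(hypothetical spsc_ring_t/publish/consume names, not the patched code itself):

    #include <odp_atomic.h>
    #include <odp_spin_internal.h>

    #define RING_SIZE 256u

    typedef struct {
            void *slot[RING_SIZE];
            odp_atomic32_t tail; /* Entries below tail are published */
    } spsc_ring_t;

    static void publish(spsc_ring_t *r, uint32_t idx, void *obj)
    {
            r->slot[idx % RING_SIZE] = obj; /* Plain store of the entry */
            /* Store-release: the entry write above cannot be reordered
             * after the tail update; pairs with the consumer's
             * load-acquire below */
            odp_atomic32_store_rls(&r->tail, idx + 1);
    }

    static void *consume(spsc_ring_t *r, uint32_t idx)
    {
            /* Load-acquire: the entry read below observes the
             * producer's write */
            while (odp_atomic32_load_acq(&r->tail) <= idx)
                    odp_spin(); /* Busy-wait until entry is published */
            return r->slot[idx % RING_SIZE];
    }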

Comments

Bill Fischofer Oct. 15, 2014, 11:18 p.m. UTC | #1
Why the odd suffixes (_rlx, _rls, _acq) on these routines? If they are
intended to be generic, wouldn't omitting them be better?

On Wed, Oct 15, 2014 at 4:46 PM, Ola Liljedahl <ola.liljedahl@linaro.org>
wrote:

> Signed-off-by: Ola Liljedahl <ola.liljedahl@linaro.org>
> ---
> Implementation of C11-based memory model for atomic operations.
> Attempt to remove all explicit memory barriers (odp_sync_stores) from code that
> implements multithreaded synchronization primitives (e.g. locks, barriers).
> Rewrote such primitives to use the new atomic operations.
> Optimized support for ARMv6/v7, ARMv8(aarch64), x86_64, MIPS64/OCTEON
> Other architectures will fall back to GCC __sync builtins which often include
> unnecessarily heavy barrier/sync operations (always sequentially consistent).
> Fixed race conditions in odp_barrier_sync() (non-atomic wrap of counter) and
> odp_ring enqueue/dequeue (need release barrier but only had compiler barrier).
>
>  example/generator/odp_generator.c                  |  43 +-
>  example/ipsec/odp_ipsec.c                          |   2 +-
>  example/odp_example/odp_example.c                  |   2 +-
>  example/timer/odp_timer_test.c                     |   2 +-
>  helper/include/odph_ring.h                         |   8 +-
>  platform/linux-generic/include/api/odp_atomic.h    | 820 ++++++++++++---------
>  platform/linux-generic/include/api/odp_barrier.h   |  10 +-
>  platform/linux-generic/include/api/odp_rwlock.h    |  20 +-
>  .../linux-generic/include/api/odp_ticketlock.h     |   4 +-
>  .../linux-generic/include/odp_buffer_internal.h    |   2 +-
>  platform/linux-generic/odp_barrier.c               |  43 +-
>  platform/linux-generic/odp_buffer.c                |   3 +-
>  platform/linux-generic/odp_crypto.c                |   4 +-
>  platform/linux-generic/odp_queue.c                 |   7 +-
>  platform/linux-generic/odp_ring.c                  |  86 ++-
>  platform/linux-generic/odp_rwlock.c                |  46 +-
>  platform/linux-generic/odp_thread.c                |   6 +-
>  platform/linux-generic/odp_ticketlock.c            |  27 +-
>  platform/linux-generic/odp_timer.c                 |  17 +-
>  test/api_test/odp_atomic_test.c                    | 126 +---
>  test/api_test/odp_atomic_test.h                    |   9 -
>  21 files changed, 651 insertions(+), 636 deletions(-)
>
> diff --git a/example/generator/odp_generator.c b/example/generator/odp_generator.c
> index eb8b340..cf2d77b 100644
> --- a/example/generator/odp_generator.c
> +++ b/example/generator/odp_generator.c
> @@ -62,10 +62,10 @@ typedef struct {
>   * counters
>  */
>  static struct {
> -       odp_atomic_u64_t seq;   /**< ip seq to be send */
> -       odp_atomic_u64_t ip;    /**< ip packets */
> -       odp_atomic_u64_t udp;   /**< udp packets */
> -       odp_atomic_u64_t icmp;  /**< icmp packets */
> +       odp_atomic64_t seq;     /**< ip seq to be send */
> +       odp_atomic64_t ip;      /**< ip packets */
> +       odp_atomic64_t udp;     /**< udp packets */
> +       odp_atomic64_t icmp;    /**< icmp packets */
>  } counters;
>
>  /** * Thread specific arguments
> @@ -201,7 +201,7 @@ static void pack_udp_pkt(odp_buffer_t obuf)
>         ip->tot_len = odp_cpu_to_be_16(args->appl.payload + ODPH_UDPHDR_LEN +
>                                        ODPH_IPV4HDR_LEN);
>         ip->proto = ODPH_IPPROTO_UDP;
> -       seq = odp_atomic_fetch_add_u64(&counters.seq, 1) % 0xFFFF;
> +       seq = odp_atomic64_fetch_add_rlx(&counters.seq, 1) % 0xFFFF;
>         ip->id = odp_cpu_to_be_16(seq);
>         ip->chksum = 0;
>         odph_ipv4_csum_update(pkt);
> @@ -258,7 +258,7 @@ static void pack_icmp_pkt(odp_buffer_t obuf)
>         ip->tot_len = odp_cpu_to_be_16(args->appl.payload + ODPH_ICMPHDR_LEN +
>                                        ODPH_IPV4HDR_LEN);
>         ip->proto = ODPH_IPPROTO_ICMP;
> -       seq = odp_atomic_fetch_add_u64(&counters.seq, 1) % 0xffff;
> +       seq = odp_atomic64_fetch_add_rlx(&counters.seq, 1) % 0xffff;
>         ip->id = odp_cpu_to_be_16(seq);
>         ip->chksum = 0;
>         odph_ipv4_csum_update(pkt);
> @@ -334,13 +334,15 @@ static void *gen_send_thread(void *arg)
>                 }
>
>                 if (args->appl.interval != 0) {
> +                       uint64_t seq = odp_atomic64_load_rlx(&counters.seq);
>                         printf("  [%02i] send pkt no:%ju seq %ju\n",
> -                              thr, counters.seq, counters.seq%0xffff);
> +                              thr, seq, seq%0xffff);
>                         /* TODO use odp timer */
>                         usleep(args->appl.interval * 1000);
>                 }
> -               if (args->appl.number != -1 && counters.seq
> -                   >= (unsigned int)args->appl.number) {
> +               if (args->appl.number != -1 &&
> +                   odp_atomic64_load_rlx(&counters.seq) >=
> +                   (unsigned int)args->appl.number) {
>                         break;
>                 }
>         }
> @@ -348,7 +350,8 @@ static void *gen_send_thread(void *arg)
>         /* receive number of reply pks until timeout */
>         if (args->appl.mode == APPL_MODE_PING && args->appl.number > 0) {
>                 while (args->appl.timeout >= 0) {
> -                       if (counters.icmp >= (unsigned int)args->appl.number)
> +                       if (odp_atomic64_load_rlx(&counters.icmp) >=
> +                           (unsigned int)args->appl.number)
>                                 break;
>                         /* TODO use odp timer */
>                         sleep(1);
> @@ -358,10 +361,12 @@ static void *gen_send_thread(void *arg)
>
>         /* print info */
>         if (args->appl.mode == APPL_MODE_UDP) {
> -               printf("  [%02i] total send: %ju\n", thr, counters.seq);
> +               printf("  [%02i] total send: %ju\n", thr,
> +                      odp_atomic64_load_rlx(&counters.seq));
>         } else if (args->appl.mode == APPL_MODE_PING) {
>                 printf("  [%02i] total send: %ju total receive: %ju\n",
> -                      thr, counters.seq, counters.icmp);
> +                      thr, odp_atomic64_load_rlx(&counters.seq),
> +                      odp_atomic64_load_rlx(&counters.icmp));
>         }
>         return arg;
>  }
> @@ -395,7 +400,7 @@ static void print_pkts(int thr, odp_packet_t pkt_tbl[], unsigned len)
>                 if (!odp_packet_inflag_ipv4(pkt))
>                         continue;
>
> -               odp_atomic_inc_u64(&counters.ip);
> +               odp_atomic64_add_rlx(&counters.ip, 1);
>                 rlen += sprintf(msg, "receive Packet proto:IP ");
>                 buf = odp_buffer_addr(odp_buffer_from_packet(pkt));
>                 ip = (odph_ipv4hdr_t *)(buf + odp_packet_l3_offset(pkt));
> @@ -405,7 +410,7 @@ static void print_pkts(int thr, odp_packet_t pkt_tbl[], unsigned len)
>
>                 /* udp */
>                 if (ip->proto == ODPH_IPPROTO_UDP) {
> -                       odp_atomic_inc_u64(&counters.udp);
> +                       odp_atomic64_add_rlx(&counters.udp, 1);
>                         udp = (odph_udphdr_t *)(buf + offset);
>                         rlen += sprintf(msg + rlen, "UDP payload %d ",
>                                         odp_be_to_cpu_16(udp->length) -
> @@ -417,7 +422,7 @@ static void print_pkts(int thr, odp_packet_t pkt_tbl[], unsigned len)
>                         icmp = (odph_icmphdr_t *)(buf + offset);
>                         /* echo reply */
>                         if (icmp->type == ICMP_ECHOREPLY) {
> -                               odp_atomic_inc_u64(&counters.icmp);
> +                               odp_atomic64_add_rlx(&counters.icmp, 1);
> +                               memcpy(&tvsend, buf + offset + ODPH_ICMPHDR_LEN,
>                                        sizeof(struct timeval));
>                                 /* TODO This should be changed to use an
> @@ -530,10 +535,10 @@ int main(int argc, char *argv[])
>         }
>
>         /* init counters */
> -       odp_atomic_init_u64(&counters.seq);
> -       odp_atomic_init_u64(&counters.ip);
> -       odp_atomic_init_u64(&counters.udp);
> -       odp_atomic_init_u64(&counters.icmp);
> +       odp_atomic64_store_rlx(&counters.seq, 0);
> +       odp_atomic64_store_rlx(&counters.ip, 0);
> +       odp_atomic64_store_rlx(&counters.udp, 0);
> +       odp_atomic64_store_rlx(&counters.icmp, 0);
>
>         /* Reserve memory for args from shared mem */
>         shm = odp_shm_reserve("shm_args", sizeof(args_t),
> diff --git a/example/ipsec/odp_ipsec.c b/example/ipsec/odp_ipsec.c
> index 2f2dc19..76c27d0 100644
> --- a/example/ipsec/odp_ipsec.c
> +++ b/example/ipsec/odp_ipsec.c
> @@ -1223,7 +1223,7 @@ main(int argc, char *argv[])
>         printf("Num worker threads: %i\n", num_workers);
>
>         /* Create a barrier to synchronize thread startup */
> -       odp_barrier_init_count(&sync_barrier, num_workers);
> +       odp_barrier_init(&sync_barrier, num_workers);
>
>         /*
>          * By default core #0 runs Linux kernel background tasks.
> diff --git a/example/odp_example/odp_example.c b/example/odp_example/odp_example.c
> index 0e9aa3d..c473395 100644
> --- a/example/odp_example/odp_example.c
> +++ b/example/odp_example/odp_example.c
> @@ -1120,7 +1120,7 @@ int main(int argc, char *argv[])
>         odp_shm_print_all();
>
>         /* Barrier to sync test case execution */
> -       odp_barrier_init_count(&globals->barrier, num_workers);
> +       odp_barrier_init(&globals->barrier, num_workers);
>
>         if (args.proc_mode) {
>                 int ret;
> diff --git a/example/timer/odp_timer_test.c b/example/timer/odp_timer_test.c
> index 78b2ae2..dfbeae9 100644
> --- a/example/timer/odp_timer_test.c
> +++ b/example/timer/odp_timer_test.c
> @@ -372,7 +372,7 @@ int main(int argc, char *argv[])
>         printf("\n");
>
>         /* Barrier to sync test case execution */
> -       odp_barrier_init_count(&test_barrier, num_workers);
> +       odp_barrier_init(&test_barrier, num_workers);
>
>         /* Create and launch worker threads */
>         odph_linux_pthread_create(thread_tbl, num_workers, first_core,
> diff --git a/helper/include/odph_ring.h b/helper/include/odph_ring.h
> index 76c1db8..5e78b34 100644
> --- a/helper/include/odph_ring.h
> +++ b/helper/include/odph_ring.h
> @@ -138,8 +138,8 @@ typedef struct odph_ring {
>                 uint32_t sp_enqueue;     /* True, if single producer. */
>                 uint32_t size;           /* Size of ring. */
>                 uint32_t mask;           /* Mask (size-1) of ring. */
> -               uint32_t head;          /* Producer head. */
> -               uint32_t tail;          /* Producer tail. */
> +               odp_atomic32_t head;    /* Producer head. */
> +               odp_atomic32_t tail;    /* Producer tail. */
>         } prod ODP_ALIGNED_CACHE;
>
>         /** @private Consumer */
> @@ -147,8 +147,8 @@ typedef struct odph_ring {
>                 uint32_t sc_dequeue;     /* True, if single consumer. */
>                 uint32_t size;           /* Size of the ring. */
>                 uint32_t mask;           /* Mask (size-1) of ring. */
> -               uint32_t head;          /* Consumer head. */
> -               uint32_t tail;          /* Consumer tail. */
> +               odp_atomic32_t head;    /* Consumer head. */
> +               odp_atomic32_t tail;    /* Consumer tail. */
>         } cons ODP_ALIGNED_CACHE;
>
>         /** @private Memory space of ring starts here. */
> diff --git a/platform/linux-generic/include/api/odp_atomic.h b/platform/linux-generic/include/api/odp_atomic.h
> index 0cc4cf4..89f183c 100644
> --- a/platform/linux-generic/include/api/odp_atomic.h
> +++ b/platform/linux-generic/include/api/odp_atomic.h
> @@ -4,463 +4,559 @@
>   * SPDX-License-Identifier:     BSD-3-Clause
>   */
>
> -
>  /**
>   * @file
>   *
> - * ODP atomic operations
> + * ODP atomic types and operations, semantically a subset of C11 atomics.
> + * Scalar variable wrapped in a struct to avoid accessing scalar directly
> + * without using the required access functions.
> + * Atomic functions must be used to operate on atomic variables!
>   */
>
>  #ifndef ODP_ATOMIC_H_
>  #define ODP_ATOMIC_H_
>
> +#include <stdint.h>
> +#include <odp_align.h>
> +#include <odp_hints.h>
> +
>  #ifdef __cplusplus
>  extern "C" {
>  #endif
>
> -
> -#include <odp_std_types.h>
> -
> -
> -/**
> - * Atomic integer
> - */
> -typedef volatile int32_t odp_atomic_int_t;
> -
>  /**
> - * Atomic unsigned integer 64 bits
> + * 32-bit (unsigned) atomic type
>   */
> -typedef volatile uint64_t odp_atomic_u64_t;
> +typedef struct {
> +       uint32_t v; /**< Actual storage for the atomic variable */
> +} odp_atomic32_t
> +ODP_ALIGNED(sizeof(uint32_t)); /* Enforce alignment! */
>
>  /**
> - * Atomic unsigned integer 32 bits
> + * 64-bit (unsigned) atomic type
>   */
> -typedef volatile uint32_t odp_atomic_u32_t;
> -
> +typedef struct {
> +       uint64_t v; /**< Actual storage for the atomic variable */
> +} odp_atomic64_t
> +ODP_ALIGNED(sizeof(uint64_t)); /* Enforce alignment! */
>
> -/**
> - * Initialize atomic integer
> - *
> - * @param ptr    An integer atomic variable
> - *
> - * @note The operation is not synchronized with other threads
> - */
> -static inline void odp_atomic_init_int(odp_atomic_int_t *ptr)
> -{
> -       *ptr = 0;
> -}
> -
> -/**
> - * Load value of atomic integer
> - *
> - * @param ptr    An atomic variable
> - *
> - * @return atomic integer value
> - *
> - * @note The operation is not synchronized with other threads
> - */
> -static inline int odp_atomic_load_int(odp_atomic_int_t *ptr)
> -{
> -       return *ptr;
> -}
>
> +/*****************************************************************************
> + * Just a few helpers
> + *****************************************************************************/
>
> -/**
> - * Store value to atomic integer
> - *
> - * @param ptr        An atomic variable
> - * @param new_value  Store new_value to a variable
> - *
> - * @note The operation is not synchronized with other threads
> - */
> -static inline void odp_atomic_store_int(odp_atomic_int_t *ptr, int new_value)
> -{
> -       *ptr = new_value;
> -}
> -
> -/**
> - * Fetch and add atomic integer
> - *
> - * @param ptr    An atomic variable
> - * @param value  A value to be added to the variable
> - *
> - * @return Value of the variable before the operation
> - */
> -static inline int odp_atomic_fetch_add_int(odp_atomic_int_t *ptr, int value)
> -{
> -       return __sync_fetch_and_add(ptr, value);
> -}
> -
> -/**
> - * Fetch and subtract atomic integer
> - *
> - * @param ptr    An atomic integer variable
> - * @param value  A value to be subtracted from the variable
> - *
> - * @return Value of the variable before the operation
> - */
> -static inline int odp_atomic_fetch_sub_int(odp_atomic_int_t *ptr, int value)
> -{
> -       return __sync_fetch_and_sub(ptr, value);
> -}
> -
> -/**
> - * Fetch and increment atomic integer by 1
> - *
> - * @param ptr    An atomic variable
> - *
> - * @return Value of the variable before the operation
> - */
> -static inline int odp_atomic_fetch_inc_int(odp_atomic_int_t *ptr)
> -{
> -       return odp_atomic_fetch_add_int(ptr, 1);
> -}
> -
> -/**
> - * Increment atomic integer by 1
> - *
> - * @param ptr    An atomic variable
> - *
> - */
> -static inline void odp_atomic_inc_int(odp_atomic_int_t *ptr)
> -{
> -       odp_atomic_fetch_add_int(ptr, 1);
> -}
> -
> -/**
> - * Fetch and decrement atomic integer by 1
> - *
> - * @param ptr    An atomic int variable
> - *
> - * @return Value of the variable before the operation
> - */
> -static inline int odp_atomic_fetch_dec_int(odp_atomic_int_t *ptr)
> -{
> -       return odp_atomic_fetch_sub_int(ptr, 1);
> -}
> -
> -/**
> - * Decrement atomic integer by 1
> - *
> - * @param ptr    An atomic variable
> - *
> - */
> -static inline void odp_atomic_dec_int(odp_atomic_int_t *ptr)
> -{
> -       odp_atomic_fetch_sub_int(ptr, 1);
> -}
> +#ifdef __OCTEON__
> +/* OCTEON Write Memory Barrier */
> +#define COMPILER_HW_BARRIER() __asm __volatile( \
> +       /* Double syncw to work around errata */ \
> +       ".set push\n\t.set arch=octeon\n\tsyncw\n\tsyncw\n\t.set pop" \
> +       : : : "memory")
> +/* syncw is also used to flush the write buffer which makes stores visible
> + * quicker which should be beneficial to release operations */
> +#define OCTEON_FLUSH() __asm __volatile( \
> +       ".set push\n\t.set arch=octeon\n\tsyncw\n\t.set pop" \
> +       : : : "memory")
> +#else
> +/* __sync_synchronize() generates the right insn for ARMv6t2 and ARMv7-a */
> +/** Compiler and hardware full memory barrier */
> +#define COMPILER_HW_BARRIER() __sync_synchronize()
> +/** Flush write buffer on OCTEON */
> +#define OCTEON_FLUSH() (void)0
> +#endif
>
> -/**
> - * Initialize atomic uint32
> - *
> - * @param ptr    An atomic variable
> - *
> - * @note The operation is not synchronized with other threads
> - */
> -static inline void odp_atomic_init_u32(odp_atomic_u32_t *ptr)
> -{
> -       *ptr = 0;
> -}
> +/** Compiler memory barrier */
> +#define COMPILER_BARRIER() __asm __volatile("" : : : "memory")
>
> -/**
> - * Load value of atomic uint32
> - *
> - * @param ptr    An atomic variable
> - *
> - * @return atomic uint32 value
> - *
> - * @note The operation is not synchronized with other threads
> - */
> -static inline uint32_t odp_atomic_load_u32(odp_atomic_u32_t *ptr)
> -{
> -       return *ptr;
> -}
>
> +/*****************************************************************************
> + * Operations on 32-bit atomics
> + * odp_atomic32_load_rlx
> + * odp_atomic32_store_rlx
> + * odp_atomic32_load_acq
> + * odp_atomic32_store_rls
> + * odp_atomic32_cmp_and_swap_rlx - return old value
> + * odp_atomic32_fetch_add_rlx - return old value
> + * odp_atomic32_fetch_add_rls - return old value
> + * odp_atomic32_add_rlx - no return value
> + * odp_atomic32_add_rls - no return value
> + *****************************************************************************/
>
>  /**
> - * Store value to atomic uint32
> + * Relaxed atomic load of 32-bit atomic variable
> + * @note Relaxed memory model, no barriers.
>   *
> - * @param ptr        An atomic variable
> - * @param new_value  Store new_value to a variable
> + * @param ptr   Pointer to a 32-bit atomic variable
>   *
> - * @note The operation is not synchronized with other threads
> + * @return Value of the variable
>   */
> -static inline void odp_atomic_store_u32(odp_atomic_u32_t *ptr,
> -                                       uint32_t new_value)
> +static inline uint32_t odp_atomic32_load_rlx(const odp_atomic32_t *ptr)
>  {
> -       *ptr = new_value;
> +       uint32_t val;
> +       COMPILER_BARRIER();
> +       /* Read of aligned word is atomic */
> +       val = ptr->v;
> +       COMPILER_BARRIER();
> +       return val;
>  }
>
>  /**
> - * Fetch and add atomic uint32
> - *
> - * @param ptr    An atomic variable
> - * @param value  A value to be added to the variable
> + * Relaxed atomic store of 32-bit atomic variable
> + * @note Relaxed memory model, no barriers.
>   *
> - * @return Value of the variable before the operation
> + * @param ptr   Pointer to a 32-bit atomic variable
> + * @param val   Value to write to the variable
>   */
> -static inline uint32_t odp_atomic_fetch_add_u32(odp_atomic_u32_t *ptr,
> -                                               uint32_t value)
> +static inline void odp_atomic32_store_rlx(odp_atomic32_t *ptr, uint32_t val)
>  {
> -       return __sync_fetch_and_add(ptr, value);
> +       COMPILER_BARRIER();
> +       /* Write of aligned word is atomic */
> +       ptr->v = val;
> +       COMPILER_BARRIER();
>  }
>
>  /**
> - * Fetch and subtract uint32
> + * Atomic load-acquire of 32-bit atomic variable
> + * @note SC-load-acquire barrier, later accesses cannot move before
> + * the load-acquire access.
>   *
> - * @param ptr    An atomic variable
> - * @param value  A value to be sub to the variable
> + * @param ptr   Pointer to a 32-bit atomic variable
>   *
> - * @return Value of the variable before the operation
> + * @return Value of the variable
>   */
> -static inline uint32_t odp_atomic_fetch_sub_u32(odp_atomic_u32_t *ptr,
> -                                               uint32_t value)
> +static inline uint32_t odp_atomic32_load_acq(const odp_atomic32_t *ptr)
>  {
> -       return __sync_fetch_and_sub(ptr, value);
> +#if defined __aarch64__
> +       uint32_t val;
> +       __asm __volatile("ldar %w0, [%1]"
> +                : "=&r"(val)
> +                                : "r"(&ptr->v)
> +                                : "memory");
> +       return val;
> +#elif defined __arm__  || defined __mips64__ || defined __x86_64__
> +       /* Read of aligned word is atomic */
> +       uint32_t val = ptr->v;
> +       /* To prevent later accesses from moving up */
> +       /* FIXME: Herb Sutter claims HW barrier not needed on x86? */
> +       COMPILER_HW_BARRIER();
> +       return val;
> +#else
> +#warning odp_atomic32_load_acq() may not be efficiently implemented
> +       /* Assume read of aligned word is atomic */
> +       uint32_t val = ptr->v;
> +       /* To prevent later accesses from moving up */
> +       COMPILER_HW_BARRIER();
> +       return val;
> +#endif
>  }
>
>  /**
> - * Fetch and increment atomic uint32 by 1
> - *
> - * @param ptr    An atomic variable
> - *
> - * @return Value of the variable before the operation
> - */
> -#if defined __OCTEON__
> -
> -static inline uint32_t odp_atomic_fetch_inc_u32(odp_atomic_u32_t *ptr)
> -{
> -       uint32_t ret;
> -
> -       __asm__ __volatile__ ("syncws");
> -       __asm__ __volatile__ ("lai %0,(%2)" : "=r" (ret), "+m" (ptr) :
> -                             "r" (ptr));
> -
> -       return ret;
> -}
> -
> + * Atomic store-release of 32-bit atomic variable
> + * @note SC-store-release barrier, earlier accesses cannot move after
> + * store-release access.
> + *
> + * @param ptr  Pointer to a 32-bit atomic variable
> + * @param val  Value to write to the atomic variable
> + */
> +static inline void odp_atomic32_store_rls(odp_atomic32_t *ptr, uint32_t val)
> +{
> +#if defined __arm__ /* A32/T32 ISA */
> +       /* Compiler and HW barrier to prevent earlier accesses from moving
> +        * down */
> +       COMPILER_HW_BARRIER();
> +       /* Write of aligned word is atomic */
> +       ptr->v = val;
> +       /* Compiler and HW barrier to prevent this store from moving down after
> +        * a later load-acquire and thus create overlapping critical sections.
> +        * Herb Sutter thinks this is needed */
> +       COMPILER_HW_BARRIER();
> +#elif defined __aarch64__
> +       __asm __volatile("stlr %w0, [%1]"
> +                :
> +                : "r"(val), "r"(&ptr->v)
> +                                : "memory");
> +#elif defined __mips64__
> +       /* Compiler and HW barrier to prevent earlier accesses from moving
> +        * down */
> +       COMPILER_HW_BARRIER();
> +       /* Write of aligned word is atomic */
> +       ptr->v = val;
> +       /* Compiler and HW barrier to prevent this store from moving down after
> +        * a later load-acquire and thus create overlapping critical sections.
> +        * Herb Sutter thinks this is needed */
> +       COMPILER_HW_BARRIER();
> +#elif defined __x86_64__
> +       /* This is actually an atomic exchange operation */
> +       /* Generates good code on x86_64 */
> +       (void)__sync_lock_test_and_set(&ptr->v, val);
>  #else
> -
> -static inline uint32_t odp_atomic_fetch_inc_u32(odp_atomic_u32_t *ptr)
> -{
> -       return odp_atomic_fetch_add_u32(ptr, 1);
> -}
> -
> +#warning odp_atomic32_store_rls() may not be efficiently implemented
> +       /* This is actually an atomic exchange operation */
> +       (void)__sync_lock_test_and_set(&ptr->v, val);
>  #endif
> -
> -/**
> - * Increment atomic uint32 by 1
> - *
> - * @param ptr    An atomic variable
> - *
> - */
> -static inline void odp_atomic_inc_u32(odp_atomic_u32_t *ptr)
> -{
> -       odp_atomic_fetch_add_u32(ptr, 1);
>  }
>
> -/**
> - * Fetch and decrement uint32 by 1
> - *
> - * @param ptr    An atomic variable
> - *
> - * @return Value of the variable before the operation
> - */
> -static inline uint32_t odp_atomic_fetch_dec_u32(odp_atomic_u32_t *ptr)
> -{
> -       return odp_atomic_fetch_sub_u32(ptr, 1);
> -}
>
>  /**
> - * Decrement atomic uint32 by 1
> - *
> - * @param ptr    An atomic variable
> - *
> - */
> -static inline void odp_atomic_dec_u32(odp_atomic_u32_t *ptr)
> -{
> -       odp_atomic_fetch_sub_u32(ptr, 1);
> + * Atomic compare and swap of 32-bit atomic variable
> + * @note Relaxed memory model, no barriers.
> + * @note Not compare-and-set! Caller should compare return value with expected
> + * parameter to check if swap operation succeeded.
> + *
> + * @param ptr  Pointer to a 32-bit atomic variable
> + * @param exp  Expected old value
> + * @param val  New value
> + * @return Actual old value, if different from 'exp' then swap failed
> + */
> +static inline uint32_t
> +odp_atomic32_cmp_and_swap_rlx(odp_atomic32_t *ptr,
> +                             uint32_t exp,
> +                             uint32_t val)
> +{
> +#if defined __arm__ /* A32/T32 ISA */
> +       uint32_t old;
> +       int status;
> +       do {
> +               __asm __volatile("ldrex %0, [%1]"
> +                : "=&r"(old)
> +                                        : "r"(&ptr->v)
> +                                        : "memory");
> +               if (odp_unlikely(old != exp)) {
> +                       /* Value has changed, can't proceed */
> +                       /* Clear exclusive access monitor */
> +                       __asm __volatile("clrex");
> +                       break;
> +               }
> +               /* Current value is as expected, attempt to write new value */
> +               __asm __volatile("strex %0, %1, [%2]"
> +                : "=&r"(status)
> +                                        : "r"(val), "r"(&ptr->v)
> +                                        : "memory");
> +               /* Restart the loop so we can re-read the previous value */
> +       } while (odp_unlikely(status != 0));
> +       return old;
> +#elif defined __aarch64__
> +       uint32_t old;
> +       int status;
> +       do {
> +               __asm __volatile("ldxr %w0, [%1]"
> +                : "=&r"(old)
> +                                        : "r"(&ptr->v)
> +                                        : "memory");
> +               if (odp_unlikely(old != exp)) {
> +                       /* Value has changed, can't proceed */
> +                       /* Clear exclusive access monitor */
> +                       __asm __volatile("clrex");
> +                       break;
> +               }
> +               /* Current value is as expected, attempt to write new value */
> +               __asm __volatile("stxr %w0, %w1, [%2]"
> +                : "=&r"(status)
> +                                        : "r"(val), "r"(&ptr->v)
> +                                        : "memory");
> +               /* Restart the loop so we can re-read the previous value */
> +       } while (odp_unlikely(status != 0));
> +       return old;
> +#elif defined __mips64__
> +       uint32_t old, new_val;
> +       do {
> +               __asm __volatile("llw %0, [%1]"
> +                : "=&r"(old)
> +                                        : "r"(&ptr->v)
> +                                        : "memory");
> +               if (odp_unlikely(old != exp)) {
> +                       /* Value has changed, can't proceed */
> +                       break;
> +               }
> +               /* Current value is as expected, attempt to write new value */
> +               new_val = val;
> +               __asm __volatile("scw %0, [%1]"
> +                : "+&r"(new_val)
> +                                        : "r"(&ptr->v)
> +                                        : "memory");
> +       } while (odp_unlikely(new_val == 0));
> +       return old;
> +#elif defined __x86_64__
> +       /* Generates good code on x86_64 */
> +       return __sync_val_compare_and_swap(&ptr->v, exp, val);
> +#else
> +#warning odp_atomic32_cmp_and_swap_rlx() may not be efficiently implemented
> +       return __sync_val_compare_and_swap(&ptr->v, exp, val);
> +#endif
>  }
>
>  /**
> - * Atomic compare and set for 32bit
> - *
> - * @param dst destination location into which the value will be written.
> - * @param exp expected value.
> - * @param src new value.
> - * @return Non-zero on success; 0 on failure.
> - */
> -static inline int
> -odp_atomic_cmpset_u32(odp_atomic_u32_t *dst, uint32_t exp, uint32_t src)
> -{
> -       return __sync_bool_compare_and_swap(dst, exp, src);
> + * Atomic fetch and add to 32-bit atomic variable
> + * @note Relaxed memory model, no barriers.
> + * @note A - B <=> A + (-B)
> + *
> + * @param ptr   Pointer to a 32-bit atomic variable
> + * @param incr  The value to be added to the atomic variable
> + *
> + * @return Value of the atomic variable before the addition
> + */
> +static inline uint32_t odp_atomic32_fetch_add_rlx(odp_atomic32_t *ptr,
> +               uint32_t incr)
> +{
> +#if defined __arm__ /* A32/T32 ISA */
> +       uint32_t old_val, new_val;
> +       int status;
> +       do {
> +               __asm __volatile("ldrex %0, [%1]"
> +                : "=&r"(old_val)
> +                                        : "r"(&ptr->v)
> +                                        : "memory");
> +               new_val = old_val + incr;
> +               __asm __volatile("strex %0, %1, [%2]"
> +                : "=&r"(status)
> +                                        : "r"(new_val), "r"(&ptr->v)
> +                                        : "memory");
> +       } while (odp_unlikely(status != 0));
> +       return old_val;
> +#elif defined __aarch64__
> +       uint32_t old_val, new_val;
> +       int status;
> +       do {
> +               __asm __volatile("ldxr %w0, [%1]"
> +                : "=&r"(old_val)
> +                                        : "r"(&ptr->v)
> +                                        : "memory");
> +               new_val = old_val + incr;
> +               __asm __volatile("stxr %w0, %w1, [%2]"
> +                : "=&r"(status)
> +                                        : "r"(new_val), "r"(&ptr->v)
> +                                        : "memory");
> +       } while (odp_unlikely(status != 0));
> +       return old_val;
> +#elif defined __mips64__
> +       uint32_t old_val, new_val;
> +       do {
> +               __asm __volatile("llw %0, [%1]"
> +                : "=&r"(old_val)
> +                                        : "r"(&ptr->v)
> +                                        : "memory");
> +               new_val = old_val + incr;
> +               __asm __volatile("scw %0, [%1]"
> +                : "+&r"(new_val)
> +                                        : "r"(&ptr->v)
> +                                        : "memory");
> +       } while (odp_unlikely(new_val == 0));
> +       return old_val;
> +#elif defined __x86_64__
> +       /* Generates good code on x86_64 */
> +       return __sync_fetch_and_add(&ptr->v, incr);
> +#else
> +#warning odp_atomic32_fetch_add_rlx() may not be efficiently implemented
> +       return __sync_fetch_and_add(&ptr->v, incr);
> +#endif
>  }
>
>  /**
> - * Initialize atomic uint64
> + * Atomic fetch and add to 32-bit atomic variable
> + * @note Sequentially consistent memory model, barriers before and after the
> + * operation.
> + * @note A - B <=> A + (-B)
>   *
> - * @param ptr    An atomic variable
> + * @param ptr   Pointer to a 32-bit atomic variable
> + * @param incr  The value to be added to the atomic variable
>   *
> - * @note The operation is not synchronized with other threads
> + * @return Value of the atomic variable before the addition
>   */
> -static inline void odp_atomic_init_u64(odp_atomic_u64_t *ptr)
> +static inline uint32_t odp_atomic32_fetch_add_rls(odp_atomic32_t *ptr,
> +               uint32_t incr)
>  {
> -       *ptr = 0;
> +#if defined __arm__ /* A32/T32 ISA */
> +       COMPILER_HW_BARRIER();
> +       return odp_atomic32_fetch_add_rlx(ptr, incr);
> +#elif defined __aarch64__
> +       /* We basically get acquire/release semantics */
> +       return __sync_fetch_and_add(&ptr->v, incr);
> +#elif defined __mips64__
> +       uint32_t old;
> +       COMPILER_HW_BARRIER();
> +       old = odp_atomic32_fetch_add_rlx(ptr, incr);
> +       OCTEON_FLUSH();
> +       return old;
> +#elif defined __x86_64__
> +       /* Generates good code on x86_64 */
> +       return __sync_fetch_and_add(&ptr->v, incr);
> +#else
> +#warning odp_atomic32_fetch_add_rls() may not be efficiently implemented
> +       return __sync_fetch_and_add(&ptr->v, incr);
> +#endif
>  }
>
>  /**
> - * Load value of atomic uint64
> - *
> - * @param ptr    An atomic variable
> + * Atomic add to 32-bit atomic variable
> + * @note Relaxed memory model, no barriers.
>   *
> - * @return atomic uint64 value
> - *
> - * @note The operation is not synchronized with other threads
> + * @param ptr   Pointer to a 32-bit atomic variable
> + * @param incr  The value to be added to the atomic variable
>   */
> -static inline uint64_t odp_atomic_load_u64(odp_atomic_u64_t *ptr)
> +static inline void odp_atomic32_add_rlx(odp_atomic32_t *ptr,
> +                                       uint32_t incr)
>  {
> -       return *ptr;
> +       /* Use odp_atomic32_fetch_add_rlx() for now */
> +       (void)odp_atomic32_fetch_add_rlx(ptr, incr);
>  }
>
>  /**
> - * Store value to atomic uint64
> - *
> - * @param ptr        An atomic variable
> - * @param new_value  Store new_value to a variable
> + * Atomic add to 32-bit atomic variable
> + * @note Sequentially consistent memory model, barriers before and after the
> + * operation.
>   *
> - * @note The operation is not synchronized with other threads
> + * @param ptr   Pointer to a 32-bit atomic variable
> + * @param incr  The value to be added to the atomic variable
>   */
> -static inline void odp_atomic_store_u64(odp_atomic_u64_t *ptr,
> -                                       uint64_t new_value)
> +static inline void odp_atomic32_add_rls(odp_atomic32_t *ptr, uint32_t incr)
>  {
> -       *ptr = new_value;
> +       /* Use odp_atomic32_fetch_add_rls() for now */
> +       (void)odp_atomic32_fetch_add_rls(ptr, incr);
>  }
>
> -/**
> - * Add atomic uint64
> - *
> - * @param ptr    An atomic variable
> - * @param value  A value to be added to the variable
> - *
> - */
> -static inline void odp_atomic_add_u64(odp_atomic_u64_t *ptr, uint64_t value)
> -{
> -       __sync_fetch_and_add(ptr, value);
> -}
>
> +/*****************************************************************************
> + * Operations on 64-bit atomics
> + * odp_atomic64_load_rlx
> + * odp_atomic64_store_rlx
> + * odp_atomic64_fetch_add_rlx
> + * odp_atomic64_add_rlx
> + *****************************************************************************/
>
>  /**
> - * Fetch and add atomic uint64
> + * Relaxed atomic load of 64-bit atomic variable
> + * @note Relaxed memory model, no barriers.
>   *
> - * @param ptr    An atomic variable
> - * @param value  A value to be added to the variable
> + * @param ptr   Pointer to a 64-bit atomic variable
>   *
> - * @return Value of the variable before the operation
> + * @return Value of the atomic variable
>   */
> -
> -#if defined __powerpc__ && !defined __powerpc64__
> -static inline uint64_t odp_atomic_fetch_add_u64(odp_atomic_u64_t *ptr,
> -                                               uint64_t value)
> +static inline uint64_t odp_atomic64_load_rlx(odp_atomic64_t *ptr)
>  {
> -       return __sync_fetch_and_add((odp_atomic_u32_t *)ptr,
> -                                   (uint32_t)value);
> -}
> +#if defined __arm__ /* A32/T32 ISA */
> +       uint64_t val;
> +       __asm __volatile("ldrexd %0, %H0, [%1]\n\t"
> +                        "clrex" /* Clear exclusive access monitor */
> +                : "=&r"(val)
> +                                : "r"(&ptr->v)
> +                                : );
> +       return val;
> +#elif defined __x86_64__ || defined __aarch64__ || defined __mips64__
> +       /* Read of aligned quad/double word is atomic */
> +       return ptr->v;
>  #else
> -static inline uint64_t odp_atomic_fetch_add_u64(odp_atomic_u64_t *ptr,
> -                                               uint64_t value)
> -{
> -       return __sync_fetch_and_add(ptr, value);
> -}
> +#warning odp_atomic64_load_rlx() may not be efficiently implemented
> +       return __sync_fetch_and_or(&ptr->v, 0);
>  #endif
> -/**
> - * Subtract atomic uint64
> - *
> - * @param ptr    An atomic variable
> - * @param value  A value to be subtracted from the variable
> - *
> - */
> -static inline void odp_atomic_sub_u64(odp_atomic_u64_t *ptr, uint64_t value)
> -{
> -       __sync_fetch_and_sub(ptr, value);
>  }
>
>  /**
> - * Fetch and subtract atomic uint64
> - *
> - * @param ptr    An atomic variable
> - * @param value  A value to be subtracted from the variable
> - *
> - * @return Value of the variable before the operation
> - */
> -#if defined __powerpc__ && !defined __powerpc64__
> -static inline uint64_t odp_atomic_fetch_sub_u64(odp_atomic_u64_t *ptr,
> -                                               uint64_t value)
> -{
> -       return __sync_fetch_and_sub((odp_atomic_u32_t *)ptr,
> -                                   (uint32_t)value);
> -}
> + * Relaxed atomic store of 64-bit atomic variable
> + * @note Relaxed memory model, no barriers.
> + *
> + * @param ptr  Pointer to a 64-bit atomic variable
> + * @param val  Value to write to the atomic variable
> + */
> +static inline void odp_atomic64_store_rlx(odp_atomic64_t *ptr,
> +               uint64_t val)
> +{
> +#if defined __arm__ /* A32/T32 ISA */
> +       uint64_t old_val;
> +       int status;
> +       do {
> +               /* Read atomic variable exclusively so we can write to it
> +                * later */
> +               __asm __volatile("ldrexd %0, %H0, [%1]"
> +                : "=&r"(old_val)
> +                                        : "r"(&ptr->v)
> +                                        : "memory");
> +               (void)old_val; /* Ignore old value */
> +               /* Attempt to write the new value */
> +               __asm __volatile("strexd %0, %1, %H1, [%2]"
> +                : "=&r"(status)
> +                                        : "r"(val), "r"(&ptr->v)
> +                                        : "memory");
> +       } while (odp_unlikely(status != 0)); /* Retry until write succeeds */
> +#elif defined __x86_64__ || defined __aarch64__ || defined __mips64__
> +       /* Write of aligned quad/double word is atomic */
> +       ptr->v = val;
>  #else
> -static inline uint64_t odp_atomic_fetch_sub_u64(odp_atomic_u64_t *ptr,
> -                                               uint64_t value)
> -{
> -       return __sync_fetch_and_sub(ptr, value);
> -}
> +#warning odp_atomic64_store_rlx() may not be efficiently implemented
> +       /* This is actually an atomic exchange operation */
> +       (void)__sync_lock_test_and_set(&ptr->v, val);
>  #endif
> -/**
> - * Fetch and increment atomic uint64 by 1
> - *
> - * @param ptr    An atomic variable
> - *
> - * @return Value of the variable before the operation
> - */
> -static inline uint64_t odp_atomic_fetch_inc_u64(odp_atomic_u64_t *ptr)
> -{
> -       return odp_atomic_fetch_add_u64(ptr, 1);
> -}
> -
> -/**
> - * Increment atomic uint64 by 1
> - *
> - * @param ptr    An atomic variable
> - *
> - */
> -static inline void odp_atomic_inc_u64(odp_atomic_u64_t *ptr)
> -{
> -       odp_atomic_fetch_add_u64(ptr, 1);
> -}
> -
> -/**
> - * Fetch and decrement atomic uint64 by 1
> - *
> - * @param ptr    An atomic variable
> - *
> - * @return Value of the variable before the operation
> - */
> -static inline uint64_t odp_atomic_fetch_dec_u64(odp_atomic_u64_t *ptr)
> -{
> -       return odp_atomic_fetch_sub_u64(ptr, 1);
>  }
>
>  /**
> - * Decrement atomic uint64 by 1
> - *
> - * @param ptr    An atomic variable
> - *
> - */
> -static inline void odp_atomic_dec_u64(odp_atomic_u64_t *ptr)
> -{
> -       odp_atomic_fetch_sub_u64(ptr, 1);
> + * Atomic fetch and add to 64-bit atomic variable
> + * @note Relaxed memory model, no barriers.
> + *
> + * @param ptr   Pointer to a 64-bit atomic variable
> + * @param incr  The value to be added to the atomic variable
> + *
> + * @return Value of the atomic variable before the addition
> + */
> +static inline uint64_t odp_atomic64_fetch_add_rlx(odp_atomic64_t *ptr,
> +               uint64_t incr)
> +{
> +#if defined __arm__ /* A32/T32 ISA */
> +       uint64_t old_val, new_val;
> +       int status;
> +       do {
> +               __asm __volatile("ldrexd %0, %H0, [%1]"
> +                : "=&r"(old_val)
> +                                        : "r"(&ptr->v)
> +                                        : "memory");
> +               new_val = old_val + incr;
> +               __asm __volatile("strexd %0, %1, %H1, [%2]"
> +                : "=&r"(status)
> +                                        : "r"(new_val), "r"(&ptr->v)
> +                                        : "memory");
> +       } while (odp_unlikely(status != 0)); /* Retry until write succeeds */
> +       return old_val;
> +#elif defined __aarch64__
> +       uint64_t old_val, new_val;
> +       int status;
> +       do {
> +               __asm __volatile("ldxr %x0, [%1]"
> +                : "=&r"(old_val)
> +                                        : "r"(&ptr->v)
> +                                        : "memory");
> +               new_val = old_val + incr;
> +               __asm __volatile("stxr %w0, %x1, [%2]"
> +                : "=&r"(status)
> +                                        : "r"(new_val), "r"(&ptr->v)
> +                                        : "memory");
> +       } while (odp_unlikely(status != 0)); /* Retry until write succeeds */
> +       return old_val;
> +#elif defined __mips64__
> +       uint64_t old_val, new_val;
> +       do {
> +               __asm __volatile("ll %0, [%1]"
> +                : "=&r"(old_val)
> +                                        : "r"(&ptr->v)
> +                                        : "memory");
> +               new_val = old_val + incr;
> +               __asm __volatile("sc %0, [%1]"
> +                : "+&r"(new_val)
> +                                        : "r"(&ptr->v)
> +                                        : "memory");
> +       } while (odp_unlikely(new_val == 0));
> +       return old_val;
> +#elif defined __x86_64__
> +       /* Generates good code on x86_64 */
> +       return __sync_fetch_and_add(&ptr->v, incr);
> +#else
> +#warning odp_atomic64_fetch_add_rlx() may not be efficiently implemented
> +       return __sync_fetch_and_add(&ptr->v, incr);
> +#endif
>  }
>
>  /**
> - * Atomic compare and set for 64bit
> + * Atomic add to 64-bit atomic variable
> + * @note Relaxed memory model, no barriers.
>   *
> - * @param dst destination location into which the value will be written.
> - * @param exp expected value.
> - * @param src new value.
> - * @return Non-zero on success; 0 on failure.
> + * @param ptr   Pointer to a 64-bit atomic variable
> + * @param incr  The value to be added to the atomic variable
>   */
> -static inline int
> -odp_atomic_cmpset_u64(odp_atomic_u64_t *dst, uint64_t exp, uint64_t src)
> +static inline void odp_atomic64_add_rlx(odp_atomic64_t *ptr, uint64_t incr)
>  {
> -       return __sync_bool_compare_and_swap(dst, exp, src);
> +       (void)odp_atomic64_fetch_add_rlx(ptr, incr);
>  }
>
>  #ifdef __cplusplus
> diff --git a/platform/linux-generic/include/api/odp_barrier.h b/platform/linux-generic/include/api/odp_barrier.h
> index a7b3215..f8eae9a 100644
> --- a/platform/linux-generic/include/api/odp_barrier.h
> +++ b/platform/linux-generic/include/api/odp_barrier.h
> @@ -27,18 +27,18 @@ extern "C" {
>   * ODP execution barrier
>   */
>  typedef struct odp_barrier_t {
> -       int              count;  /**< @private Thread count */
> -       odp_atomic_int_t bar;    /**< @private Barrier counter */
> +       uint32_t       num_threads;  /**< @private Thread count (constant) */
> +       odp_atomic32_t in_barrier;   /**< @private Threads in barrier */
>  } odp_barrier_t;
>
>
>  /**
>   * Init barrier with thread count
>   *
> - * @param barrier    Barrier
> - * @param count      Thread count
> + * @param barrier     Barrier
> + * @param num_threads Number of threads which share the barrier
>   */
> -void odp_barrier_init_count(odp_barrier_t *barrier, int count);
> +void odp_barrier_init(odp_barrier_t *barrier, uint32_t num_threads);
>
>
>  /**
> diff --git a/platform/linux-generic/include/api/odp_rwlock.h b/platform/linux-generic/include/api/odp_rwlock.h
> index 252ebb2..ff8a9a2 100644
> --- a/platform/linux-generic/include/api/odp_rwlock.h
> +++ b/platform/linux-generic/include/api/odp_rwlock.h
> @@ -10,26 +10,30 @@
>  /**
>   * @file
>   *
> - * ODP RW Locks
> + * ODP read/write lock
> + * RW lock supports multiple concurrent reads but only one (exclusive) writer.
>   */
>
> +#include <odp_atomic.h>
> +
>  #ifdef __cplusplus
>  extern "C" {
>  #endif
>
>  /**
>   * The odp_rwlock_t type.
> - * write lock count is -1,
> - * read lock count > 0
> + * write lock is ~0U
> + * read lock count >0 && <~0U
>   */
>  typedef struct {
> -       volatile int32_t cnt; /**< -1 Write lock,
> -                               > 0 for Read lock. */
> +       odp_atomic32_t cnt; /**< == 0: unlocked,
> +                                == ~0: locked for write,
> +                                > 0 number of concurrent read locks */
>  } odp_rwlock_t;
>
>
>  /**
> - * Initialize the rwlock to an unlocked state.
> + * Initialize the rwlock to the unlocked state.
>   *
>   * @param rwlock pointer to the RW Lock.
>   */
> @@ -50,14 +54,14 @@ void odp_rwlock_read_lock(odp_rwlock_t *rwlock);
>  void odp_rwlock_read_unlock(odp_rwlock_t *rwlock);
>
>  /**
> - * Aquire a write lock.
> + * Acquire the write lock.
>   *
>   * @param rwlock pointer to a RW Lock.
>   */
>  void odp_rwlock_write_lock(odp_rwlock_t *rwlock);
>
>  /**
> - * Release a write lock.
> + * Release the write lock.
>   *
>   * @param rwlock pointer to a RW Lock.
>   */
> diff --git a/platform/linux-generic/include/api/odp_ticketlock.h b/platform/linux-generic/include/api/odp_ticketlock.h
> index 6277a18..c4b5e34 100644
> --- a/platform/linux-generic/include/api/odp_ticketlock.h
> +++ b/platform/linux-generic/include/api/odp_ticketlock.h
> @@ -27,8 +27,8 @@ extern "C" {
>   * ODP ticketlock
>   */
>  typedef struct odp_ticketlock_t {
> -       odp_atomic_u32_t  next_ticket; /**< @private Next ticket */
> -       volatile uint32_t cur_ticket;  /**< @private Current ticket */
> +       odp_atomic32_t next_ticket; /**< @private Next ticket */
> +       odp_atomic32_t cur_ticket;  /**< @private Current ticket */
>  } odp_ticketlock_t;
>
>
> diff --git a/platform/linux-generic/include/odp_buffer_internal.h b/platform/linux-generic/include/odp_buffer_internal.h
> index 2002b51..530ab96 100644
> --- a/platform/linux-generic/include/odp_buffer_internal.h
> +++ b/platform/linux-generic/include/odp_buffer_internal.h
> @@ -88,7 +88,7 @@ typedef struct odp_buffer_hdr_t {
>         uint32_t                 index;      /* buf index in the pool */
>         size_t                   size;       /* max data size */
>         size_t                   cur_offset; /* current offset */
> -       odp_atomic_int_t         ref_count;  /* reference count */
> +       odp_atomic32_t           ref_count;  /* reference count */
>         odp_buffer_scatter_t     scatter;    /* Scatter/gather list */
>         int                      type;       /* type of next header */
>         odp_buffer_pool_t        pool_hdl;   /* buffer pool handle */
> diff --git a/platform/linux-generic/odp_barrier.c b/platform/linux-generic/odp_barrier.c
> index a82b294..6c3b884 100644
> --- a/platform/linux-generic/odp_barrier.c
> +++ b/platform/linux-generic/odp_barrier.c
> @@ -8,41 +8,48 @@
>  #include <odp_sync.h>
>  #include <odp_spin_internal.h>
>
> -void odp_barrier_init_count(odp_barrier_t *barrier, int count)
> +void odp_barrier_init(odp_barrier_t *barrier, uint32_t num_threads)
>  {
> -       barrier->count = count;
> -       barrier->bar = 0;
> -       odp_sync_stores();
> +       barrier->num_threads = num_threads; /* Constant after initialisation */
> +       odp_atomic32_store_rls(&barrier->in_barrier, 0);
>  }
>
>  /*
>   * Efficient barrier_sync -
>   *
>   *   Barriers are initialized with a count of the number of callers
> - *   that must sync on the barrier before any may proceed.
> + *   that must sync on (enter) the barrier before any may proceed (exit).
>   *
>   *   To avoid race conditions and to permit the barrier to be fully
>   *   reusable, the barrier value cycles between 0..2*count-1. When
> - *   synchronizing the wasless variable simply tracks which half of
> + *   synchronizing the waslow variable simply tracks which half of
>   *   the cycle the barrier was in upon entry.  Exit is when the
>   *   barrier crosses to the other half of the cycle.
>   */
>
>  void odp_barrier_sync(odp_barrier_t *barrier)
>  {
> -       int count;
> -       int wasless;
> +       uint32_t count;
> +       bool waslow;
>
> -       odp_sync_stores();
> -       wasless = barrier->bar < barrier->count;
> -       count = odp_atomic_fetch_inc_int(&barrier->bar);
> +       /* FIXME do we need acquire barrier as well? */
> +       /* Increase threads in_barrier count, this will automatically release
> +        * the other threads when lower/upper range is switched */
> +       count = odp_atomic32_fetch_add_rls(&barrier->in_barrier, 1);
> +       /* Compute lower or higher range indicator */
> +       waslow = count < barrier->num_threads;
>
> -       if (count == 2*barrier->count-1) {
> -               barrier->bar = 0;
> -       } else {
> -               while ((barrier->bar < barrier->count) == wasless)
> -                       odp_spin();
> +       /* Check if in_barrier count has "wrapped" */
> +       if (count == 2 * barrier->num_threads - 1) {
> +               /* Manually wrap the counter */
> +               odp_atomic32_add_rls(&barrier->in_barrier,
> +                            (uint32_t)(-2*(int)barrier->num_threads));
> +               /* We don't need to wait below, return immediately */
> +               return;
> +       }
> +       /* Wait for counter to change half */
> +       while ((odp_atomic32_load_rlx(&barrier->in_barrier) <
> +              barrier->num_threads) == waslow) {
> +               odp_spin();
>         }
> -
> -       odp_mem_barrier();
>  }
> diff --git a/platform/linux-generic/odp_buffer.c b/platform/linux-generic/odp_buffer.c
> index e54e0e7..a5939f3 100644
> --- a/platform/linux-generic/odp_buffer.c
> +++ b/platform/linux-generic/odp_buffer.c
> @@ -73,7 +73,8 @@ int odp_buffer_snprint(char *str, size_t n, odp_buffer_t buf)
>         len += snprintf(&str[len], n-len,
>                         "  cur_offset   %zu\n",       hdr->cur_offset);
>         len += snprintf(&str[len], n-len,
> -                       "  ref_count    %i\n",        hdr->ref_count);
> +                       "  ref_count    %u\n",
> +                       odp_atomic32_load_rlx(&hdr->ref_count));
>         len += snprintf(&str[len], n-len,
>                         "  type         %i\n",        hdr->type);
>         len += snprintf(&str[len], n-len,
> diff --git a/platform/linux-generic/odp_crypto.c b/platform/linux-generic/odp_crypto.c
> index b37ad6b..d9fff10 100644
> --- a/platform/linux-generic/odp_crypto.c
> +++ b/platform/linux-generic/odp_crypto.c
> @@ -26,7 +26,7 @@
>  #define MAX_SESSIONS 32
>
>  typedef struct {
> -       odp_atomic_u32_t next;
> +       odp_atomic32_t   next;
>         uint32_t         max;
>         odp_crypto_generic_session_t sessions[0];
>  } odp_crypto_global_t;
> @@ -58,7 +58,7 @@ odp_crypto_generic_session_t *alloc_session(void)
>         uint32_t idx;
>         odp_crypto_generic_session_t *session = NULL;
>
> -       idx = odp_atomic_fetch_inc_u32(&global->next);
> +       idx = odp_atomic32_fetch_add_rlx(&global->next, 1);
>         if (idx < global->max) {
>                 session = &global->sessions[idx];
>                 session->index = idx;
> diff --git a/platform/linux-generic/odp_queue.c b/platform/linux-generic/odp_queue.c
> index 1318bcd..08c0d29 100644
> --- a/platform/linux-generic/odp_queue.c
> +++ b/platform/linux-generic/odp_queue.c
> @@ -214,8 +214,13 @@ int odp_queue_set_context(odp_queue_t handle, void *context)
>  {
>         queue_entry_t *queue;
>         queue = queue_to_qentry(handle);
> +       /* Setting a new queue context can be viewed as a release operation,
> +        * all writes to the context must be observable before the context
> +        * is made observable */
>         odp_sync_stores();
> -       queue->s.param.context = context;
> +       queue->s.param.context = context; /* Store-release */
> +       /* Ensure queue modification is globally visible before we return
> +        * and the application might cause the queue to be scheduled */
>         odp_sync_stores();
>         return 0;
>  }
> diff --git a/platform/linux-generic/odp_ring.c b/platform/linux-generic/odp_ring.c
> index 632aa66..d1ec825 100644
> --- a/platform/linux-generic/odp_ring.c
> +++ b/platform/linux-generic/odp_ring.c
> @@ -187,10 +187,10 @@ odph_ring_create(const char *name, unsigned count, unsigned flags)
>                 r->cons.size = count;
>                 r->prod.mask = count-1;
>                 r->cons.mask = count-1;
> -               r->prod.head = 0;
> -               r->cons.head = 0;
> -               r->prod.tail = 0;
> -               r->cons.tail = 0;
> +               odp_atomic32_store_rlx(&r->prod.head, 0);
> +               odp_atomic32_store_rlx(&r->cons.head, 0);
> +               odp_atomic32_store_rlx(&r->prod.tail, 0);
> +               odp_atomic32_store_rlx(&r->cons.tail, 0);
>
>                 TAILQ_INSERT_TAIL(&odp_ring_list, r, next);
>         } else {
> @@ -227,7 +227,7 @@ int __odph_ring_mp_do_enqueue(odph_ring_t *r, void * const *obj_table,
>         uint32_t prod_head, prod_next;
>         uint32_t cons_tail, free_entries;
>         const unsigned max = n;
> -       int success;
> +       bool ok;
>         unsigned i;
>         uint32_t mask = r->prod.mask;
>         int ret;
> @@ -237,8 +237,8 @@ int __odph_ring_mp_do_enqueue(odph_ring_t *r, void * const *obj_table,
>                 /* Reset n to the initial burst count */
>                 n = max;
>
> -               prod_head = r->prod.head;
> -               cons_tail = r->cons.tail;
> +               prod_head = odp_atomic32_load_rlx(&r->prod.head);
> +               cons_tail = odp_atomic32_load_acq(&r->cons.tail);
>                 /* The subtraction is done between two unsigned 32bits value
>                  * (the result is always modulo 32 bits even if we have
>                  * prod_head > cons_tail). So 'free_entries' is always between 0
> @@ -259,13 +259,13 @@ int __odph_ring_mp_do_enqueue(odph_ring_t *r, void * const *obj_table,
>                 }
>
>                 prod_next = prod_head + n;
> -               success = odp_atomic_cmpset_u32(&r->prod.head, prod_head,
> -                                             prod_next);
> -       } while (odp_unlikely(success == 0));
> +               ok = odp_atomic32_cmp_and_swap_rlx(&r->prod.head,
> +                                                  prod_head,
> +                                                  prod_next) == prod_head;
> +       } while (odp_unlikely(!ok));
>
>         /* write entries in ring */
>         ENQUEUE_PTRS();
> -       odp_mem_barrier();
>
>         /* if we exceed the watermark */
>         if (odp_unlikely(((mask + 1) - free_entries + n) > r->prod.watermark)) {
> @@ -279,10 +279,10 @@ int __odph_ring_mp_do_enqueue(odph_ring_t *r, void * const *obj_table,
>          * If there are other enqueues in progress that preceded us,
>          * we need to wait for them to complete
>          */
> -       while (odp_unlikely(r->prod.tail != prod_head))
> +       while (odp_unlikely(odp_atomic32_load_rlx(&r->prod.tail) != prod_head))
>                 odp_spin();
>
> -       r->prod.tail = prod_next;
> +       odp_atomic32_store_rls(&r->prod.tail, prod_next);
>         return ret;
>  }
>
> @@ -298,8 +298,8 @@ int __odph_ring_sp_do_enqueue(odph_ring_t *r, void * const *obj_table,
>         uint32_t mask = r->prod.mask;
>         int ret;
>
> -       prod_head = r->prod.head;
> -       cons_tail = r->cons.tail;
> +       prod_head = odp_atomic32_load_rlx(&r->prod.head);
> +       cons_tail = odp_atomic32_load_acq(&r->cons.tail);
>         /* The subtraction is done between two unsigned 32bits value
>          * (the result is always modulo 32 bits even if we have
>          * prod_head > cons_tail). So 'free_entries' is always between 0
> @@ -320,11 +320,10 @@ int __odph_ring_sp_do_enqueue(odph_ring_t *r, void * const *obj_table,
>         }
>
>         prod_next = prod_head + n;
> -       r->prod.head = prod_next;
> +       odp_atomic32_store_rlx(&r->prod.head, prod_next);
>
>         /* write entries in ring */
>         ENQUEUE_PTRS();
> -       odp_mem_barrier();
>
>         /* if we exceed the watermark */
>         if (odp_unlikely(((mask + 1) - free_entries + n) > r->prod.watermark)) {
> @@ -334,7 +333,7 @@ int __odph_ring_sp_do_enqueue(odph_ring_t *r, void * const *obj_table,
>                 ret = (behavior == ODPH_RING_QUEUE_FIXED) ? 0 : n;
>         }
>
> -       r->prod.tail = prod_next;
> +       odp_atomic32_store_rls(&r->prod.tail, prod_next);
>         return ret;
>  }
>
> @@ -348,7 +347,7 @@ int __odph_ring_mc_do_dequeue(odph_ring_t *r, void **obj_table,
>         uint32_t cons_head, prod_tail;
>         uint32_t cons_next, entries;
>         const unsigned max = n;
> -       int success;
> +       bool ok;
>         unsigned i;
>         uint32_t mask = r->prod.mask;
>
> @@ -357,8 +356,8 @@ int __odph_ring_mc_do_dequeue(odph_ring_t *r, void **obj_table,
>                 /* Restore n as it may change every loop */
>                 n = max;
>
> -               cons_head = r->cons.head;
> -               prod_tail = r->prod.tail;
> +               cons_head = odp_atomic32_load_rlx(&r->cons.head);
> +               prod_tail = odp_atomic32_load_acq(&r->prod.tail);
>                 /* The subtraction is done between two unsigned 32bits value
>                  * (the result is always modulo 32 bits even if we have
>                  * cons_head > prod_tail). So 'entries' is always between 0
> @@ -378,22 +377,22 @@ int __odph_ring_mc_do_dequeue(odph_ring_t *r, void **obj_table,
>                 }
>
>                 cons_next = cons_head + n;
> -               success = odp_atomic_cmpset_u32(&r->cons.head, cons_head,
> -                                             cons_next);
> -       } while (odp_unlikely(success == 0));
> +               ok = odp_atomic32_cmp_and_swap_rlx(&r->cons.head,
> +                                                  cons_head,
> +                                                  cons_next) == cons_head;
> +       } while (odp_unlikely(!ok));
>
>         /* copy in table */
>         DEQUEUE_PTRS();
> -       odp_mem_barrier();
>
>         /*
>          * If there are other dequeues in progress that preceded us,
>          * we need to wait for them to complete
>          */
> -       while (odp_unlikely(r->cons.tail != cons_head))
> +       while (odp_unlikely(odp_atomic32_load_rlx(&r->cons.tail) != cons_head))
>                 odp_spin();
>
> -       r->cons.tail = cons_next;
> +       odp_atomic32_store_rls(&r->cons.tail, cons_next);
>
>         return behavior == ODPH_RING_QUEUE_FIXED ? 0 : n;
>  }
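
The acquire/release pairing is what lets the explicit odp_mem_barrier() go
away here: the consumer's load-acquire of prod.tail synchronizes with the
producer's store-release of the same variable, so the slot writes done by
ENQUEUE_PTRS() are visible before DEQUEUE_PTRS() reads them. A condensed
single-producer/single-consumer sketch of that handshake (illustrative only;
'slot' stands in for the ring storage):

    extern void *slot[];                /* stand-in for the ring array */

    void produce(odph_ring_t *r, void *obj)
    {
        uint32_t head = odp_atomic32_load_rlx(&r->prod.head);
        slot[head & r->prod.mask] = obj;           /* write the entry */
        odp_atomic32_store_rlx(&r->prod.head, head + 1);
        /* Release: the entry write cannot move below this store */
        odp_atomic32_store_rls(&r->prod.tail, head + 1);
    }

    void *consume(odph_ring_t *r)
    {
        uint32_t head = odp_atomic32_load_rlx(&r->cons.head);
        /* Acquire: pairs with the producer's store-release */
        if (odp_atomic32_load_acq(&r->prod.tail) == head)
            return NULL;                           /* ring is empty */
        void *obj = slot[head & r->cons.mask];     /* entry is visible */
        odp_atomic32_store_rlx(&r->cons.head, head + 1);
        odp_atomic32_store_rls(&r->cons.tail, head + 1);
        return obj;
    }
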
> @@ -409,8 +408,8 @@ int __odph_ring_sc_do_dequeue(odph_ring_t *r, void **obj_table,
>         unsigned i;
>         uint32_t mask = r->prod.mask;
>
> -       cons_head = r->cons.head;
> -       prod_tail = r->prod.tail;
> +       cons_head = odp_atomic32_load_rlx(&r->cons.head);
> +       prod_tail = odp_atomic32_load_acq(&r->prod.tail);
>         /* The subtraction is done between two unsigned 32bits value
>          * (the result is always modulo 32 bits even if we have
>          * cons_head > prod_tail). So 'entries' is always between 0
> @@ -429,13 +428,12 @@ int __odph_ring_sc_do_dequeue(odph_ring_t *r, void **obj_table,
>         }
>
>         cons_next = cons_head + n;
> -       r->cons.head = cons_next;
> +       odp_atomic32_store_rlx(&r->cons.head, cons_next);
>
>         /* copy in table */
>         DEQUEUE_PTRS();
> -       odp_mem_barrier();
>
> -       r->cons.tail = cons_next;
> +       odp_atomic32_store_rls(&r->cons.tail, cons_next);
>         return behavior == ODPH_RING_QUEUE_FIXED ? 0 : n;
>  }
>
> @@ -482,8 +480,8 @@ int odph_ring_sc_dequeue_bulk(odph_ring_t *r, void **obj_table, unsigned n)
>   */
>  int odph_ring_full(const odph_ring_t *r)
>  {
> -       uint32_t prod_tail = r->prod.tail;
> -       uint32_t cons_tail = r->cons.tail;
> +       uint32_t prod_tail = odp_atomic32_load_rlx(&r->prod.tail);
> +       uint32_t cons_tail = odp_atomic32_load_rlx(&r->cons.tail);
>         return (((cons_tail - prod_tail - 1) & r->prod.mask) == 0);
>  }
>
> @@ -492,8 +490,8 @@ int odph_ring_full(const odph_ring_t *r)
>   */
>  int odph_ring_empty(const odph_ring_t *r)
>  {
> -       uint32_t prod_tail = r->prod.tail;
> -       uint32_t cons_tail = r->cons.tail;
> +       uint32_t prod_tail = odp_atomic32_load_rlx(&r->prod.tail);
> +       uint32_t cons_tail = odp_atomic32_load_rlx(&r->cons.tail);
>         return !!(cons_tail == prod_tail);
>  }
>
> @@ -502,8 +500,8 @@ int odph_ring_empty(const odph_ring_t *r)
>   */
>  unsigned odph_ring_count(const odph_ring_t *r)
>  {
> -       uint32_t prod_tail = r->prod.tail;
> -       uint32_t cons_tail = r->cons.tail;
> +       uint32_t prod_tail = odp_atomic32_load_rlx(&r->prod.tail);
> +       uint32_t cons_tail = odp_atomic32_load_rlx(&r->cons.tail);
>         return (prod_tail - cons_tail) & r->prod.mask;
>  }
>
> @@ -512,8 +510,8 @@ unsigned odph_ring_count(const odph_ring_t *r)
>   */
>  unsigned odph_ring_free_count(const odph_ring_t *r)
>  {
> -       uint32_t prod_tail = r->prod.tail;
> -       uint32_t cons_tail = r->cons.tail;
> +       uint32_t prod_tail = odp_atomic32_load_rlx(&r->prod.tail);
> +       uint32_t cons_tail = odp_atomic32_load_rlx(&r->cons.tail);
>         return (cons_tail - prod_tail - 1) & r->prod.mask;
>  }
>
> @@ -523,10 +521,10 @@ void odph_ring_dump(const odph_ring_t *r)
>         ODP_DBG("ring <%s>@%p\n", r->name, r);
>         ODP_DBG("  flags=%x\n", r->flags);
>         ODP_DBG("  size=%"PRIu32"\n", r->prod.size);
> -       ODP_DBG("  ct=%"PRIu32"\n", r->cons.tail);
> -       ODP_DBG("  ch=%"PRIu32"\n", r->cons.head);
> -       ODP_DBG("  pt=%"PRIu32"\n", r->prod.tail);
> -       ODP_DBG("  ph=%"PRIu32"\n", r->prod.head);
> +       ODP_DBG("  ct=%"PRIu32"\n", odp_atomic32_load_rlx(&r->cons.tail));
> +       ODP_DBG("  ch=%"PRIu32"\n", odp_atomic32_load_rlx(&r->cons.head));
> +       ODP_DBG("  pt=%"PRIu32"\n", odp_atomic32_load_rlx(&r->prod.tail));
> +       ODP_DBG("  ph=%"PRIu32"\n", odp_atomic32_load_rlx(&r->prod.head));
>         ODP_DBG("  used=%u\n", odph_ring_count(r));
>         ODP_DBG("  avail=%u\n", odph_ring_free_count(r));
>         if (r->prod.watermark == r->prod.size)
> diff --git a/platform/linux-generic/odp_rwlock.c b/platform/linux-generic/odp_rwlock.c
> index 11c8dd7..ba0a7ca 100644
> --- a/platform/linux-generic/odp_rwlock.c
> +++ b/platform/linux-generic/odp_rwlock.c
> @@ -4,58 +4,56 @@
>   * SPDX-License-Identifier:     BSD-3-Clause
>   */
>
> +#include <stdbool.h>
>  #include <odp_atomic.h>
>  #include <odp_rwlock.h>
> -
>  #include <odp_spin_internal.h>
>
>  void odp_rwlock_init(odp_rwlock_t *rwlock)
>  {
> -       rwlock->cnt = 0;
> +       odp_atomic32_store_rlx(&rwlock->cnt, 0);
>  }
>
>  void odp_rwlock_read_lock(odp_rwlock_t *rwlock)
>  {
> -       int32_t cnt;
> -       int  is_locked = 0;
> -
> -       while (is_locked == 0) {
> -               cnt = rwlock->cnt;
> +       bool gotit;
> +       do {
> +               uint32_t cnt = odp_atomic32_load_acq(&rwlock->cnt);
>                 /* waiting for read lock */
> -               if (cnt < 0) {
> +               if ((int32_t)cnt < 0) {
>                         odp_spin();
>                         continue;
>                 }
> -               is_locked = odp_atomic_cmpset_u32(
> -                                       (volatile uint32_t *)&rwlock->cnt,
> -                                             cnt, cnt + 1);
> -       }
> +               /* Attempt to take another read lock */
> +               gotit = odp_atomic32_cmp_and_swap_rlx(&rwlock->cnt,
> +                                                     cnt, cnt + 1) == cnt;
> +       } while (!gotit);
>  }
>
>  void odp_rwlock_read_unlock(odp_rwlock_t *rwlock)
>  {
> -       odp_atomic_dec_u32((odp_atomic_u32_t *)(intptr_t)&rwlock->cnt);
> +       /* Release one read lock by subtracting 1 */
> +       odp_atomic32_add_rls(&rwlock->cnt, (uint32_t)-1);
>  }
>
>  void odp_rwlock_write_lock(odp_rwlock_t *rwlock)
>  {
> -       int32_t cnt;
> -       int is_locked = 0;
> -
> -       while (is_locked == 0) {
> -               cnt = rwlock->cnt;
> -               /* lock aquired, wait */
> +       bool gotit;
> +       do {
> +               uint32_t cnt = odp_atomic32_load_acq(&rwlock->cnt);
>                 if (cnt != 0) {
> +                       /* Lock is busy */
>                         odp_spin();
>                         continue;
>                 }
> -               is_locked = odp_atomic_cmpset_u32(
> -                                       (volatile uint32_t *)&rwlock->cnt,
> -                                             0, -1);
> -       }
> +               /* Attempt to take write lock */
> +               gotit = odp_atomic32_cmp_and_swap_rlx(&rwlock->cnt, 0,
> +                                                     (uint32_t)-1) == 0;
> +       } while (!gotit);
>  }
>
>  void odp_rwlock_write_unlock(odp_rwlock_t *rwlock)
>  {
> -       odp_atomic_inc_u32((odp_atomic_u32_t *)(intptr_t)&rwlock->cnt);
> +       /* Release the write lock by adding 1 */
> +       odp_atomic32_add_rls(&rwlock->cnt, 1);
>  }
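
For comparison, the reader path above in plain C11 (sketch only; note that
C11 would put the acquire on the successful CAS, which the load_acq plus
relaxed-CAS combination used by the patch approximates):

    #include <stdatomic.h>

    static atomic_uint cnt; /* 0 = free, >0 = readers, (unsigned)-1 = writer */

    void read_lock(void)
    {
        unsigned int old = atomic_load_explicit(&cnt, memory_order_relaxed);
        for (;;) {
            if ((int)old < 0) {  /* writer holds the lock, spin */
                old = atomic_load_explicit(&cnt, memory_order_relaxed);
                continue;
            }
            /* Acquire on success orders the critical section after this */
            if (atomic_compare_exchange_weak_explicit(&cnt, &old, old + 1,
                                                      memory_order_acquire,
                                                      memory_order_relaxed))
                return;
        }
    }

    void read_unlock(void)
    {
        /* Release: the critical section cannot leak past the unlock */
        atomic_fetch_sub_explicit(&cnt, 1, memory_order_release);
    }
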
> diff --git a/platform/linux-generic/odp_thread.c b/platform/linux-generic/odp_thread.c
> index b869b27..569b235 100644
> --- a/platform/linux-generic/odp_thread.c
> +++ b/platform/linux-generic/odp_thread.c
> @@ -31,7 +31,7 @@ typedef struct {
>
>  typedef struct {
>         thread_state_t   thr[ODP_CONFIG_MAX_THREADS];
> -       odp_atomic_int_t num;
> +       odp_atomic32_t   num;
>
>  } thread_globals_t;
>
> @@ -67,7 +67,7 @@ static int thread_id(void)
>         int id;
>         int cpu;
>
> -       id = odp_atomic_fetch_add_int(&thread_globals->num, 1);
> +       id = (int)odp_atomic32_fetch_add_rlx(&thread_globals->num, 1);
>
>         if (id >= ODP_CONFIG_MAX_THREADS) {
>                 ODP_ERR("Too many threads\n");
> @@ -77,7 +77,7 @@ static int thread_id(void)
>         cpu = sched_getcpu();
>
>         if (cpu < 0) {
> -               ODP_ERR("getcpu failed\n");
> +               ODP_ERR("sched_getcpu failed\n");
>                 return -1;
>         }
>
> diff --git a/platform/linux-generic/odp_ticketlock.c b/platform/linux-generic/odp_ticketlock.c
> index be5b885..cadc0e0 100644
> --- a/platform/linux-generic/odp_ticketlock.c
> +++ b/platform/linux-generic/odp_ticketlock.c
> @@ -12,9 +12,8 @@
>
>  void odp_ticketlock_init(odp_ticketlock_t *ticketlock)
>  {
> -       ticketlock->next_ticket = 0;
> -       ticketlock->cur_ticket  = 0;
> -       odp_sync_stores();
> +       odp_atomic32_store_rlx(&ticketlock->next_ticket, 0);
> +       odp_atomic32_store_rlx(&ticketlock->cur_ticket, 0);
>  }
>
>
> @@ -22,30 +21,14 @@ void odp_ticketlock_lock(odp_ticketlock_t *ticketlock)
>  {
>         uint32_t ticket;
>
> -       ticket = odp_atomic_fetch_inc_u32(&ticketlock->next_ticket);
> +       ticket = odp_atomic32_fetch_add_rlx(&ticketlock->next_ticket, 1);
>
> -       while (ticket != ticketlock->cur_ticket)
> +       while (ticket != odp_atomic32_load_acq(&ticketlock->cur_ticket))
>                 odp_spin();
> -
> -       odp_mem_barrier();
>  }
>
>
>  void odp_ticketlock_unlock(odp_ticketlock_t *ticketlock)
>  {
> -       odp_sync_stores();
> -
> -       ticketlock->cur_ticket++;
> -
> -#if defined __OCTEON__
> -       odp_sync_stores();
> -#else
> -       odp_mem_barrier();
> -#endif
> -}
> -
> -
> -int odp_ticketlock_is_locked(odp_ticketlock_t *ticketlock)
> -{
> -       return ticketlock->cur_ticket != ticketlock->next_ticket;
> +       odp_atomic32_add_rls(&ticketlock->cur_ticket, 1);
>  }
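
The same ticketlock expressed in plain C11, to make the _rlx/_acq/_rls
mapping explicit (sketch, not part of the patch):

    #include <stdatomic.h>

    typedef struct {
        atomic_uint next_ticket;
        atomic_uint cur_ticket;
    } ticketlock_t;

    void lock(ticketlock_t *t)
    {
        /* Relaxed: taking a ticket needs atomicity, not ordering */
        unsigned int ticket = atomic_fetch_add_explicit(&t->next_ticket, 1,
                                                        memory_order_relaxed);
        /* Acquire: the critical section cannot move above this load */
        while (atomic_load_explicit(&t->cur_ticket,
                                    memory_order_acquire) != ticket)
            ;  /* spin */
    }

    void unlock(ticketlock_t *t)
    {
        /* Release: the critical section cannot move below this add */
        atomic_fetch_add_explicit(&t->cur_ticket, 1, memory_order_release);
    }
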
> diff --git a/platform/linux-generic/odp_timer.c b/platform/linux-generic/odp_timer.c
> index 313c713..938429f 100644
> --- a/platform/linux-generic/odp_timer.c
> +++ b/platform/linux-generic/odp_timer.c
> @@ -32,8 +32,8 @@ typedef struct {
>
>  typedef struct {
>         int               allocated;
> -       volatile int      active;
> -       volatile uint64_t cur_tick;
> +       odp_atomic32_t    active;
> +       odp_atomic64_t    cur_tick;
>         timer_t           timerid;
>         odp_timer_t       timer_hdl;
>         odp_buffer_pool_t pool;
> @@ -150,16 +150,14 @@ static void notify_function(union sigval sigval)
>
>         timer = sigval.sival_ptr;
>
> -       if (timer->active == 0) {
> +       if (odp_atomic32_load_rlx(&timer->active) == 0) {
>                 ODP_DBG("Timer (%u) not active\n", timer->timer_hdl);
>                 return;
>         }
>
>         /* ODP_DBG("Tick\n"); */
>
> -       cur_tick = timer->cur_tick++;
> -
> -       odp_sync_stores();
> +       cur_tick = odp_atomic64_fetch_add_rlx(&timer->cur_tick, 1);
>
>         tick = &timer->tick[cur_tick % MAX_TICKS];
>
> @@ -318,8 +316,7 @@ odp_timer_t odp_timer_create(const char *name, odp_buffer_pool_t pool,
>                 timer->tick[i].list = NULL;
>         }
>
> -       timer->active = 1;
> -       odp_sync_stores();
> +       odp_atomic32_store_rls(&timer->active, 1);
>
>         timer_start(timer);
>
> @@ -340,7 +337,7 @@ odp_timer_tmo_t odp_timer_absolute_tmo(odp_timer_t timer_hdl, uint64_t tmo_tick,
>         id = (int)timer_hdl - 1;
>         timer = &odp_timer.timer[id];
>
> -       cur_tick = timer->cur_tick;
> +       cur_tick = odp_atomic64_load_rlx(&timer->cur_tick);
>         if (tmo_tick <= cur_tick) {
>                 ODP_DBG("timeout too close\n");
>                 return ODP_TIMER_TMO_INVALID;
> @@ -416,7 +413,7 @@ uint64_t odp_timer_current_tick(odp_timer_t timer_hdl)
>         uint32_t id;
>
>         id = timer_hdl - 1;
> -       return odp_timer.timer[id].cur_tick;
> +       return odp_atomic64_load_rlx(&odp_timer.timer[id].cur_tick);
>  }
>
>  odp_timeout_t odp_timeout_from_buffer(odp_buffer_t buf)
> diff --git a/test/api_test/odp_atomic_test.c b/test/api_test/odp_atomic_test.c
> index 9019d4f..4d27b32 100644
> --- a/test/api_test/odp_atomic_test.c
> +++ b/test/api_test/odp_atomic_test.c
> @@ -10,17 +10,14 @@
>  #include <odp_common.h>
>  #include <odp_atomic_test.h>
>
> -static odp_atomic_int_t a32;
> -static odp_atomic_u32_t a32u;
> -static odp_atomic_u64_t a64u;
> +static odp_atomic32_t a32u;
> +static odp_atomic64_t a64u;
>
> -static odp_atomic_int_t numthrds;
> +static odp_barrier_t barrier;
>
>  static const char * const test_name[] = {
>         "dummy",
>         "test atomic basic ops add/sub/inc/dec",
> -       "test atomic inc/dec of signed word",
> -       "test atomic add/sub of signed word",
>         "test atomic inc/dec of unsigned word",
>         "test atomic add/sub of unsigned word",
>         "test atomic inc/dec of unsigned double word",
> @@ -31,39 +28,29 @@ static struct timeval tv0[MAX_WORKERS], tv1[MAX_WORKERS];
>
>  static void usage(void)
>  {
> -       printf("\n./odp_atomic -t <testcase> -n <num of pthread>,\n\n"
> +       printf("\n./odp_atomic -t <testcase> -n <num of threads>\n\n"
>                "\t<testcase> is\n"
>                "\t\t1 - Test mix(does inc,dec,add,sub on 32/64 bit)\n"
> -              "\t\t2 - Test inc dec of signed word\n"
> -              "\t\t3 - Test add sub of signed word\n"
> -              "\t\t4 - Test inc dec of unsigned word\n"
> -              "\t\t5 - Test add sub of unsigned word\n"
> -              "\t\t6 - Test inc dec of double word\n"
> -              "\t\t7 - Test add sub of double word\n"
> -              "\t<num of pthread> is optional\n"
> -              "\t\t<1 - 31> - no of pthreads to start\n"
> +              "\t\t2 - Test inc dec of unsigned word\n"
> +              "\t\t3 - Test add sub of unsigned word\n"
> +              "\t\t4 - Test inc dec of double word\n"
> +              "\t\t5 - Test add sub of double word\n"
> +              "\t<num of thread> is optional\n"
> +              "\t\t<1 - 31> - no of threads to start\n"
>                "\t\tif user doesn't specify this option, then\n"
> -              "\t\tno of pthreads created is equivalent to no of cores\n"
> +              "\t\tno of threads created is equivalent to no of cores\n"
>                "\t\tavailable in the system\n"
>                "\tExample usage:\n"
>                "\t\t./odp_atomic -t 2\n"
>                "\t\t./odp_atomic -t 3 -n 12\n");
>  }
>
> -void test_atomic_inc_32(void)
> -{
> -       int i;
> -
> -       for (i = 0; i < CNT; i++)
> -               odp_atomic_inc_int(&a32);
> -}
> -
>  void test_atomic_inc_u32(void)
>  {
>         int i;
>
>         for (i = 0; i < CNT; i++)
> -               odp_atomic_inc_u32(&a32u);
> +               odp_atomic32_add_rlx(&a32u, 1);
>  }
>
>  void test_atomic_inc_64(void)
> @@ -71,15 +58,7 @@ void test_atomic_inc_64(void)
>         int i;
>
>         for (i = 0; i < CNT; i++)
> -               odp_atomic_inc_u64(&a64u);
> -}
> -
> -void test_atomic_dec_32(void)
> -{
> -       int i;
> -
> -       for (i = 0; i < CNT; i++)
> -               odp_atomic_dec_int(&a32);
> +               odp_atomic64_add_rlx(&a64u, 1);
>  }
>
>  void test_atomic_dec_u32(void)
> @@ -87,7 +66,7 @@ void test_atomic_dec_u32(void)
>         int i;
>
>         for (i = 0; i < CNT; i++)
> -               odp_atomic_dec_u32(&a32u);
> +               odp_atomic32_add_rlx(&a32u, (uint32_t)-1);
>  }
>
>  void test_atomic_dec_64(void)
> @@ -95,15 +74,7 @@ void test_atomic_dec_64(void)
>         int i;
>
>         for (i = 0; i < CNT; i++)
> -               odp_atomic_dec_u64(&a64u);
> -}
> -
> -void test_atomic_add_32(void)
> -{
> -       int i;
> -
> -       for (i = 0; i < (CNT / ADD_SUB_CNT); i++)
> -               odp_atomic_fetch_add_int(&a32, ADD_SUB_CNT);
> +               odp_atomic64_add_rlx(&a64u, (uint64_t)-1);
>  }
>
>  void test_atomic_add_u32(void)
> @@ -111,7 +82,7 @@ void test_atomic_add_u32(void)
>         int i;
>
>         for (i = 0; i < (CNT / ADD_SUB_CNT); i++)
> -               odp_atomic_fetch_add_u32(&a32u, ADD_SUB_CNT);
> +               odp_atomic32_fetch_add_rlx(&a32u, ADD_SUB_CNT);
>  }
>
>  void test_atomic_add_64(void)
> @@ -119,15 +90,7 @@ void test_atomic_add_64(void)
>         int i;
>
>         for (i = 0; i < (CNT / ADD_SUB_CNT); i++)
> -               odp_atomic_fetch_add_u64(&a64u, ADD_SUB_CNT);
> -}
> -
> -void test_atomic_sub_32(void)
> -{
> -       int i;
> -
> -       for (i = 0; i < (CNT / ADD_SUB_CNT); i++)
> -               odp_atomic_fetch_sub_int(&a32, ADD_SUB_CNT);
> +               odp_atomic64_fetch_add_rlx(&a64u, ADD_SUB_CNT);
>  }
>
>  void test_atomic_sub_u32(void)
> @@ -135,7 +98,7 @@ void test_atomic_sub_u32(void)
>         int i;
>
>         for (i = 0; i < (CNT / ADD_SUB_CNT); i++)
> -               odp_atomic_fetch_sub_u32(&a32u, ADD_SUB_CNT);
> +               odp_atomic32_fetch_add_rlx(&a32u, -ADD_SUB_CNT);
>  }
>
>  void test_atomic_sub_64(void)
> @@ -143,19 +106,7 @@ void test_atomic_sub_64(void)
>         int i;
>
>         for (i = 0; i < (CNT / ADD_SUB_CNT); i++)
> -               odp_atomic_fetch_sub_u64(&a64u, ADD_SUB_CNT);
> -}
> -
> -void test_atomic_inc_dec_32(void)
> -{
> -       test_atomic_inc_32();
> -       test_atomic_dec_32();
> -}
> -
> -void test_atomic_add_sub_32(void)
> -{
> -       test_atomic_add_32();
> -       test_atomic_sub_32();
> +               odp_atomic64_fetch_add_rlx(&a64u, -ADD_SUB_CNT);
>  }
>
>  void test_atomic_inc_dec_u32(void)
> @@ -188,11 +139,6 @@ void test_atomic_add_sub_64(void)
>   */
>  void test_atomic_basic(void)
>  {
> -       test_atomic_inc_32();
> -       test_atomic_dec_32();
> -       test_atomic_add_32();
> -       test_atomic_sub_32();
> -
>         test_atomic_inc_u32();
>         test_atomic_dec_u32();
>         test_atomic_add_u32();
> @@ -206,31 +152,24 @@ void test_atomic_basic(void)
>
>  void test_atomic_init(void)
>  {
> -       odp_atomic_init_int(&a32);
> -       odp_atomic_init_u32(&a32u);
> -       odp_atomic_init_u64(&a64u);
> +       odp_atomic32_store_rlx(&a32u, 0);
> +       odp_atomic64_store_rlx(&a64u, 0);
>  }
>
>  void test_atomic_store(void)
>  {
> -       odp_atomic_store_int(&a32, S32_INIT_VAL);
> -       odp_atomic_store_u32(&a32u, U32_INIT_VAL);
> -       odp_atomic_store_u64(&a64u, U64_INIT_VAL);
> +       odp_atomic32_store_rlx(&a32u, U32_INIT_VAL);
> +       odp_atomic64_store_rlx(&a64u, U64_INIT_VAL);
>  }
>
>  int test_atomic_validate(void)
>  {
> -       if (odp_atomic_load_int(&a32) != S32_INIT_VAL) {
> -               ODP_ERR("Atomic signed 32 usual functions failed\n");
> -               return -1;
> -       }
> -
> -       if (odp_atomic_load_u32(&a32u) != U32_INIT_VAL) {
> +       if (odp_atomic32_load_rlx(&a32u) != U32_INIT_VAL) {
>                 ODP_ERR("Atomic u32 usual functions failed\n");
>                 return -1;
>         }
>
> -       if (odp_atomic_load_u64(&a64u) != U64_INIT_VAL) {
> +       if (odp_atomic64_load_rlx(&a64u) != U64_INIT_VAL) {
>                 ODP_ERR("Atomic u64 usual functions failed\n");
>                 return -1;
>         }
> @@ -247,11 +186,8 @@ static void *run_thread(void *arg)
>
>         ODP_DBG("Thread %i starts\n", thr);
>
> -       odp_atomic_inc_int(&numthrds);
> -
> -       /* Wait here until all pthreads are created */
> -       while (*(volatile int *)&numthrds < parg->numthrds)
> -               ;
> +       /* Wait here until all threads have arrived */
> +       odp_barrier_sync(&barrier);
>
>         gettimeofday(&tv0[thr], NULL);
>
> @@ -259,12 +195,6 @@ static void *run_thread(void *arg)
>         case TEST_MIX:
>                 test_atomic_basic();
>                 break;
> -       case TEST_INC_DEC_S32:
> -               test_atomic_inc_dec_32();
> -               break;
> -       case TEST_ADD_SUB_S32:
> -               test_atomic_add_sub_32();
> -               break;
>         case TEST_INC_DEC_U32:
>                 test_atomic_inc_dec_u32();
>                 break;
> @@ -327,7 +257,6 @@ int main(int argc, char *argv[])
>         if (pthrdnum == 0)
>                 pthrdnum = odp_sys_core_count();
>
> -       odp_atomic_init_int(&numthrds);
>         test_atomic_init();
>         test_atomic_store();
>
> @@ -342,6 +271,7 @@ int main(int argc, char *argv[])
>                 usage();
>                 goto err_exit;
>         }
> +       odp_barrier_init(&barrier, pthrdnum);
>         odp_test_thread_create(run_thread, &thrdarg);
>
>         odp_test_thread_exit(&thrdarg);
> diff --git a/test/api_test/odp_atomic_test.h b/test/api_test/odp_atomic_test.h
> index 7814da5..aaa9d34 100644
> --- a/test/api_test/odp_atomic_test.h
> +++ b/test/api_test/odp_atomic_test.h
> @@ -18,14 +18,11 @@
>  #define ADD_SUB_CNT    5
>
>  #define        CNT 500000
> -#define        S32_INIT_VAL    (1UL << 10)
>  #define        U32_INIT_VAL    (1UL << 10)
>  #define        U64_INIT_VAL    (1ULL << 33)
>
>  typedef enum {
>         TEST_MIX = 1, /* Must be first test case num */
> -       TEST_INC_DEC_S32,
> -       TEST_ADD_SUB_S32,
>         TEST_INC_DEC_U32,
>         TEST_ADD_SUB_U32,
>         TEST_INC_DEC_64,
> @@ -34,16 +31,10 @@ typedef enum {
>  } odp_test_atomic_t;
>
>
> -void test_atomic_inc_dec_32(void);
> -void test_atomic_add_sub_32(void);
>  void test_atomic_inc_dec_u32(void);
>  void test_atomic_add_sub_u32(void);
>  void test_atomic_inc_dec_64(void);
>  void test_atomic_add_sub_64(void);
> -void test_atomic_inc_32(void);
> -void test_atomic_dec_32(void);
> -void test_atomic_add_32(void);
> -void test_atomic_sub_32(void);
>  void test_atomic_inc_u32(void);
>  void test_atomic_dec_u32(void);
>  void test_atomic_add_u32(void);
> --
> 1.9.1
>
>
> _______________________________________________
> lng-odp mailing list
> lng-odp@lists.linaro.org
> http://lists.linaro.org/mailman/listinfo/lng-odp
>
Mike Holmes Oct. 16, 2014, 1:38 a.m. UTC | #2
On 15 October 2014 19:18, Bill Fischofer <bill.fischofer@linaro.org> wrote:

> Why the odd suffixes (_rlx, _rls, _acq) on these routines?  If they are
> intended to be generic wouldn't omitting these be better?
>
> On Wed, Oct 15, 2014 at 4:46 PM, Ola Liljedahl <ola.liljedahl@linaro.org>
> wrote:
>
>> Signed-off-by: Ola Liljedahl <ola.liljedahl@linaro.org>
>> ---
>> Implementation of C11-based memory model for atomic operations.
>> Attempt to remove all explicit memory barriers (odp_sync_stores) from
>> code that
>> implements multithreaded synchronization primitives (e.g. locks,
>> barriers).
>> Rewrote such primitives to use the new atomic operations.
>> Optimized support for ARMv6/v7, ARMv8(aarch64), x86_64, MIPS64/OCTEON
>> Other architectures will fall back to GCC __sync builtins which often
>> include
>> unnecessarily heavy barrier/sync operations (always sequentially
>> consistent).
>>
>
Are these fallbacks also 100% supported by LLVM?
The ODP linux-generic implementation is already compiler-crippled and not
C-standard compliant, with Variable Length Arrays in Structs (VLAIS).
The Linux kernel is still trying to fix those GCCisms:
http://lkml.iu.edu/hypermail/linux/kernel/1410.1/03100.html

If this works with std=c99 --pedantic etc. for both gcc and llvm, then from
your introduction it looks like it would be an improvement. Is there any
form of benchmark to show that it helps?
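
Even something trivial would do, e.g. N threads hammering one ticketlock,
built once against the old implementation and once against the new one and
timed with `time` (sketch only, plain pthreads; the 4-thread count and
iteration count are arbitrary, and numbers will be platform-dependent):

    #include <pthread.h>
    #include <stdio.h>
    #include <stdint.h>
    #include <odp.h>

    #define ITERS 1000000

    static odp_ticketlock_t lock;
    static uint64_t shared;  /* protected by 'lock' */

    static void *worker(void *arg)
    {
        int i;
        for (i = 0; i < ITERS; i++) {
            odp_ticketlock_lock(&lock);
            shared++;
            odp_ticketlock_unlock(&lock);
        }
        return arg;
    }

    int main(void)
    {
        pthread_t thr[4];
        int i;
        odp_ticketlock_init(&lock);
        for (i = 0; i < 4; i++)
            pthread_create(&thr[i], NULL, worker, NULL);
        for (i = 0; i < 4; i++)
            pthread_join(thr[i], NULL);
        printf("shared = %llu\n", (unsigned long long)shared);
        return 0;
    }
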


> Fixed race conditions in odp_barrier_sync() (non-atomic wrap of counter)
>> and
>> odp_ring enqueue/dequeue (need release barrier but only had compiler
>> barrier).
>>
>>  example/generator/odp_generator.c                  |  43 +-
>>  example/ipsec/odp_ipsec.c                          |   2 +-
>>  example/odp_example/odp_example.c                  |   2 +-
>>  example/timer/odp_timer_test.c                     |   2 +-
>>  helper/include/odph_ring.h                         |   8 +-
>>  platform/linux-generic/include/api/odp_atomic.h    | 820 ++++++++++++---------
>>  platform/linux-generic/include/api/odp_barrier.h   |  10 +-
>>  platform/linux-generic/include/api/odp_rwlock.h    |  20 +-
>>  .../linux-generic/include/api/odp_ticketlock.h     |   4 +-
>>  .../linux-generic/include/odp_buffer_internal.h    |   2 +-
>>  platform/linux-generic/odp_barrier.c               |  43 +-
>>  platform/linux-generic/odp_buffer.c                |   3 +-
>>  platform/linux-generic/odp_crypto.c                |   4 +-
>>  platform/linux-generic/odp_queue.c                 |   7 +-
>>  platform/linux-generic/odp_ring.c                  |  86 ++-
>>  platform/linux-generic/odp_rwlock.c                |  46 +-
>>  platform/linux-generic/odp_thread.c                |   6 +-
>>  platform/linux-generic/odp_ticketlock.c            |  27 +-
>>  platform/linux-generic/odp_timer.c                 |  17 +-
>>  test/api_test/odp_atomic_test.c                    | 126 +---
>>  test/api_test/odp_atomic_test.h                    |   9 -
>>  21 files changed, 651 insertions(+), 636 deletions(-)
>>
>> diff --git a/example/generator/odp_generator.c b/example/generator/odp_generator.c
>> index eb8b340..cf2d77b 100644
>> --- a/example/generator/odp_generator.c
>> +++ b/example/generator/odp_generator.c
>> @@ -62,10 +62,10 @@ typedef struct {
>>   * counters
>>  */
>>  static struct {
>> -       odp_atomic_u64_t seq;   /**< ip seq to be send */
>> -       odp_atomic_u64_t ip;    /**< ip packets */
>> -       odp_atomic_u64_t udp;   /**< udp packets */
>> -       odp_atomic_u64_t icmp;  /**< icmp packets */
>> +       odp_atomic64_t seq;     /**< ip seq to be send */
>> +       odp_atomic64_t ip;      /**< ip packets */
>> +       odp_atomic64_t udp;     /**< udp packets */
>> +       odp_atomic64_t icmp;    /**< icmp packets */
>>  } counters;
>>
>>  /** * Thread specific arguments
>> @@ -201,7 +201,7 @@ static void pack_udp_pkt(odp_buffer_t obuf)
>>         ip->tot_len = odp_cpu_to_be_16(args->appl.payload + ODPH_UDPHDR_LEN +
>>                                        ODPH_IPV4HDR_LEN);
>>         ip->proto = ODPH_IPPROTO_UDP;
>> -       seq = odp_atomic_fetch_add_u64(&counters.seq, 1) % 0xFFFF;
>> +       seq = odp_atomic64_fetch_add_rlx(&counters.seq, 1) % 0xFFFF;
>>         ip->id = odp_cpu_to_be_16(seq);
>>         ip->chksum = 0;
>>         odph_ipv4_csum_update(pkt);
>> @@ -258,7 +258,7 @@ static void pack_icmp_pkt(odp_buffer_t obuf)
>>         ip->tot_len = odp_cpu_to_be_16(args->appl.payload + ODPH_ICMPHDR_LEN +
>>                                        ODPH_IPV4HDR_LEN);
>>         ip->proto = ODPH_IPPROTO_ICMP;
>> -       seq = odp_atomic_fetch_add_u64(&counters.seq, 1) % 0xffff;
>> +       seq = odp_atomic64_fetch_add_rlx(&counters.seq, 1) % 0xffff;
>>         ip->id = odp_cpu_to_be_16(seq);
>>         ip->chksum = 0;
>>         odph_ipv4_csum_update(pkt);
>> @@ -334,13 +334,15 @@ static void *gen_send_thread(void *arg)
>>                 }
>>
>>                 if (args->appl.interval != 0) {
>> +                       uint64_t seq = odp_atomic64_load_rlx(&counters.seq);
>>                         printf("  [%02i] send pkt no:%ju seq %ju\n",
>> -                              thr, counters.seq, counters.seq%0xffff);
>> +                              thr, seq, seq%0xffff);
>>                         /* TODO use odp timer */
>>                         usleep(args->appl.interval * 1000);
>>                 }
>> -               if (args->appl.number != -1 && counters.seq
>> -                   >= (unsigned int)args->appl.number) {
>> +               if (args->appl.number != -1 &&
>> +                   odp_atomic64_load_rlx(&counters.seq) >=
>> +                   (unsigned int)args->appl.number) {
>>                         break;
>>                 }
>>         }
>> @@ -348,7 +350,8 @@ static void *gen_send_thread(void *arg)
>>         /* receive number of reply pks until timeout */
>>         if (args->appl.mode == APPL_MODE_PING && args->appl.number > 0) {
>>                 while (args->appl.timeout >= 0) {
>> -                       if (counters.icmp >= (unsigned int)args->appl.number)
>> +                       if (odp_atomic64_load_rlx(&counters.icmp) >=
>> +                           (unsigned int)args->appl.number)
>>                                 break;
>>                         /* TODO use odp timer */
>>                         sleep(1);
>> @@ -358,10 +361,12 @@ static void *gen_send_thread(void *arg)
>>
>>         /* print info */
>>         if (args->appl.mode == APPL_MODE_UDP) {
>> -               printf("  [%02i] total send: %ju\n", thr, counters.seq);
>> +               printf("  [%02i] total send: %ju\n", thr,
>> +                      odp_atomic64_load_rlx(&counters.seq));
>>         } else if (args->appl.mode == APPL_MODE_PING) {
>>                 printf("  [%02i] total send: %ju total receive: %ju\n",
>> -                      thr, counters.seq, counters.icmp);
>> +                      thr, odp_atomic64_load_rlx(&counters.seq),
>> +                      odp_atomic64_load_rlx(&counters.icmp));
>>         }
>>         return arg;
>>  }
>> @@ -395,7 +400,7 @@ static void print_pkts(int thr, odp_packet_t pkt_tbl[], unsigned len)
>>                 if (!odp_packet_inflag_ipv4(pkt))
>>                         continue;
>>
>> -               odp_atomic_inc_u64(&counters.ip);
>> +               odp_atomic64_add_rlx(&counters.ip, 1);
>>                 rlen += sprintf(msg, "receive Packet proto:IP ");
>>                 buf = odp_buffer_addr(odp_buffer_from_packet(pkt));
>>                 ip = (odph_ipv4hdr_t *)(buf + odp_packet_l3_offset(pkt));
>> @@ -405,7 +410,7 @@ static void print_pkts(int thr, odp_packet_t pkt_tbl[], unsigned len)
>>
>>                 /* udp */
>>                 if (ip->proto == ODPH_IPPROTO_UDP) {
>> -                       odp_atomic_inc_u64(&counters.udp);
>> +                       odp_atomic64_add_rlx(&counters.udp, 1);
>>                         udp = (odph_udphdr_t *)(buf + offset);
>>                         rlen += sprintf(msg + rlen, "UDP payload %d ",
>>                                         odp_be_to_cpu_16(udp->length) -
>> @@ -417,7 +422,7 @@ static void print_pkts(int thr, odp_packet_t pkt_tbl[], unsigned len)
>>                         icmp = (odph_icmphdr_t *)(buf + offset);
>>                         /* echo reply */
>>                         if (icmp->type == ICMP_ECHOREPLY) {
>> -                               odp_atomic_inc_u64(&counters.icmp);
>> +                               odp_atomic64_add_rlx(&counters.icmp, 1);
>>                                 memcpy(&tvsend, buf + offset + ODPH_ICMPHDR_LEN,
>>                                        sizeof(struct timeval));
>>                                 /* TODO This should be changed to use an
>> @@ -530,10 +535,10 @@ int main(int argc, char *argv[])
>>         }
>>
>>         /* init counters */
>> -       odp_atomic_init_u64(&counters.seq);
>> -       odp_atomic_init_u64(&counters.ip);
>> -       odp_atomic_init_u64(&counters.udp);
>> -       odp_atomic_init_u64(&counters.icmp);
>> +       odp_atomic64_store_rlx(&counters.seq, 0);
>> +       odp_atomic64_store_rlx(&counters.ip, 0);
>> +       odp_atomic64_store_rlx(&counters.udp, 0);
>> +       odp_atomic64_store_rlx(&counters.icmp, 0);
>>
>>         /* Reserve memory for args from shared mem */
>>         shm = odp_shm_reserve("shm_args", sizeof(args_t),
>> diff --git a/example/ipsec/odp_ipsec.c b/example/ipsec/odp_ipsec.c
>> index 2f2dc19..76c27d0 100644
>> --- a/example/ipsec/odp_ipsec.c
>> +++ b/example/ipsec/odp_ipsec.c
>> @@ -1223,7 +1223,7 @@ main(int argc, char *argv[])
>>         printf("Num worker threads: %i\n", num_workers);
>>
>>         /* Create a barrier to synchronize thread startup */
>> -       odp_barrier_init_count(&sync_barrier, num_workers);
>> +       odp_barrier_init(&sync_barrier, num_workers);
>>
>>         /*
>>          * By default core #0 runs Linux kernel background tasks.
>> diff --git a/example/odp_example/odp_example.c b/example/odp_example/odp_example.c
>> index 0e9aa3d..c473395 100644
>> --- a/example/odp_example/odp_example.c
>> +++ b/example/odp_example/odp_example.c
>> @@ -1120,7 +1120,7 @@ int main(int argc, char *argv[])
>>         odp_shm_print_all();
>>
>>         /* Barrier to sync test case execution */
>> -       odp_barrier_init_count(&globals->barrier, num_workers);
>> +       odp_barrier_init(&globals->barrier, num_workers);
>>
>>         if (args.proc_mode) {
>>                 int ret;
>> diff --git a/example/timer/odp_timer_test.c b/example/timer/odp_timer_test.c
>> index 78b2ae2..dfbeae9 100644
>> --- a/example/timer/odp_timer_test.c
>> +++ b/example/timer/odp_timer_test.c
>> @@ -372,7 +372,7 @@ int main(int argc, char *argv[])
>>         printf("\n");
>>
>>         /* Barrier to sync test case execution */
>> -       odp_barrier_init_count(&test_barrier, num_workers);
>> +       odp_barrier_init(&test_barrier, num_workers);
>>
>>         /* Create and launch worker threads */
>>         odph_linux_pthread_create(thread_tbl, num_workers, first_core,
>> diff --git a/helper/include/odph_ring.h b/helper/include/odph_ring.h
>> index 76c1db8..5e78b34 100644
>> --- a/helper/include/odph_ring.h
>> +++ b/helper/include/odph_ring.h
>> @@ -138,8 +138,8 @@ typedef struct odph_ring {
>>                 uint32_t sp_enqueue;     /* True, if single producer. */
>>                 uint32_t size;           /* Size of ring. */
>>                 uint32_t mask;           /* Mask (size-1) of ring. */
>> -               uint32_t head;          /* Producer head. */
>> -               uint32_t tail;          /* Producer tail. */
>> +               odp_atomic32_t head;    /* Producer head. */
>> +               odp_atomic32_t tail;    /* Producer tail. */
>>         } prod ODP_ALIGNED_CACHE;
>>
>>         /** @private Consumer */
>> @@ -147,8 +147,8 @@ typedef struct odph_ring {
>>                 uint32_t sc_dequeue;     /* True, if single consumer. */
>>                 uint32_t size;           /* Size of the ring. */
>>                 uint32_t mask;           /* Mask (size-1) of ring. */
>> -               uint32_t head;          /* Consumer head. */
>> -               uint32_t tail;          /* Consumer tail. */
>> +               odp_atomic32_t head;    /* Consumer head. */
>> +               odp_atomic32_t tail;    /* Consumer tail. */
>>         } cons ODP_ALIGNED_CACHE;
>>
>>         /** @private Memory space of ring starts here. */
>> diff --git a/platform/linux-generic/include/api/odp_atomic.h b/platform/linux-generic/include/api/odp_atomic.h
>> index 0cc4cf4..89f183c 100644
>> --- a/platform/linux-generic/include/api/odp_atomic.h
>> +++ b/platform/linux-generic/include/api/odp_atomic.h
>> @@ -4,463 +4,559 @@
>>   * SPDX-License-Identifier:     BSD-3-Clause
>>   */
>>
>> -
>>  /**
>>   * @file
>>   *
>> - * ODP atomic operations
>> + * ODP atomic types and operations, semantically a subset of C11 atomics.
>> + * Scalar variable wrapped in a struct to avoid accessing scalar directly
>> + * without using the required access functions.
>> + * Atomic functions must be used to operate on atomic variables!
>>   */
>>
>>  #ifndef ODP_ATOMIC_H_
>>  #define ODP_ATOMIC_H_
>>
>> +#include <stdint.h>
>> +#include <odp_align.h>
>> +#include <odp_hints.h>
>> +
>>  #ifdef __cplusplus
>>  extern "C" {
>>  #endif
>>
>> -
>> -#include <odp_std_types.h>
>> -
>> -
>> -/**
>> - * Atomic integer
>> - */
>> -typedef volatile int32_t odp_atomic_int_t;
>> -
>>  /**
>> - * Atomic unsigned integer 64 bits
>> + * 32-bit (unsigned) atomic type
>>   */
>> -typedef volatile uint64_t odp_atomic_u64_t;
>> +typedef struct {
>> +       uint32_t v; /**< Actual storage for the atomic variable */
>> +} odp_atomic32_t
>> +ODP_ALIGNED(sizeof(uint32_t)); /* Enforce alignment! */
>>
>>  /**
>> - * Atomic unsigned integer 32 bits
>> + * 64-bit (unsigned) atomic type
>>   */
>> -typedef volatile uint32_t odp_atomic_u32_t;
>> -
>> +typedef struct {
>> +       uint64_t v; /**< Actual storage for the atomic variable */
>> +} odp_atomic64_t
>> +ODP_ALIGNED(sizeof(uint64_t)); /* Enforce alignment! */
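
The struct wrapper is what turns accidental direct use into a compile-time
error instead of a silent non-atomic access; e.g. (sketch):

    odp_atomic32_t a;
    uint32_t x = a;                          /* error: incompatible types */
    uint32_t y = odp_atomic32_load_rlx(&a);  /* OK: forced through the API */
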
>>
>> -/**
>> - * Initialize atomic integer
>> - *
>> - * @param ptr    An integer atomic variable
>> - *
>> - * @note The operation is not synchronized with other threads
>> - */
>> -static inline void odp_atomic_init_int(odp_atomic_int_t *ptr)
>> -{
>> -       *ptr = 0;
>> -}
>> -
>> -/**
>> - * Load value of atomic integer
>> - *
>> - * @param ptr    An atomic variable
>> - *
>> - * @return atomic integer value
>> - *
>> - * @note The operation is not synchronized with other threads
>> - */
>> -static inline int odp_atomic_load_int(odp_atomic_int_t *ptr)
>> -{
>> -       return *ptr;
>> -}
>>
>> +/*****************************************************************************
>> + * Just a few helpers
>> + *****************************************************************************/
>>
>> -/**
>> - * Store value to atomic integer
>> - *
>> - * @param ptr        An atomic variable
>> - * @param new_value  Store new_value to a variable
>> - *
>> - * @note The operation is not synchronized with other threads
>> - */
>> -static inline void odp_atomic_store_int(odp_atomic_int_t *ptr, int new_value)
>> -{
>> -       *ptr = new_value;
>> -}
>> -
>> -/**
>> - * Fetch and add atomic integer
>> - *
>> - * @param ptr    An atomic variable
>> - * @param value  A value to be added to the variable
>> - *
>> - * @return Value of the variable before the operation
>> - */
>> -static inline int odp_atomic_fetch_add_int(odp_atomic_int_t *ptr, int value)
>> -{
>> -       return __sync_fetch_and_add(ptr, value);
>> -}
>> -
>> -/**
>> - * Fetch and subtract atomic integer
>> - *
>> - * @param ptr    An atomic integer variable
>> - * @param value  A value to be subtracted from the variable
>> - *
>> - * @return Value of the variable before the operation
>> - */
>> -static inline int odp_atomic_fetch_sub_int(odp_atomic_int_t *ptr, int value)
>> -{
>> -       return __sync_fetch_and_sub(ptr, value);
>> -}
>> -
>> -/**
>> - * Fetch and increment atomic integer by 1
>> - *
>> - * @param ptr    An atomic variable
>> - *
>> - * @return Value of the variable before the operation
>> - */
>> -static inline int odp_atomic_fetch_inc_int(odp_atomic_int_t *ptr)
>> -{
>> -       return odp_atomic_fetch_add_int(ptr, 1);
>> -}
>> -
>> -/**
>> - * Increment atomic integer by 1
>> - *
>> - * @param ptr    An atomic variable
>> - *
>> - */
>> -static inline void odp_atomic_inc_int(odp_atomic_int_t *ptr)
>> -{
>> -       odp_atomic_fetch_add_int(ptr, 1);
>> -}
>> -
>> -/**
>> - * Fetch and decrement atomic integer by 1
>> - *
>> - * @param ptr    An atomic int variable
>> - *
>> - * @return Value of the variable before the operation
>> - */
>> -static inline int odp_atomic_fetch_dec_int(odp_atomic_int_t *ptr)
>> -{
>> -       return odp_atomic_fetch_sub_int(ptr, 1);
>> -}
>> -
>> -/**
>> - * Decrement atomic integer by 1
>> - *
>> - * @param ptr    An atomic variable
>> - *
>> - */
>> -static inline void odp_atomic_dec_int(odp_atomic_int_t *ptr)
>> -{
>> -       odp_atomic_fetch_sub_int(ptr, 1);
>> -}
>> +#ifdef __OCTEON__
>> +/* OCTEON Write Memory Barrier */
>> +#define COMPILER_HW_BARRIER() __asm __volatile( \
>> +       /* Double syncw to work around errata */ \
>> +       ".set push\n\t.set arch=octeon\n\tsyncw\n\tsyncw\n\t.set pop" \
>> +       : : : "memory")
>> +/* syncw is also used to flush the write buffer which makes stores visible
>> + * quicker which should be beneficial to release operations */
>> +#define OCTEON_FLUSH() __asm __volatile( \
>> +       ".set push\n\t.set arch=octeon\n\tsyncw\n\t.set pop" \
>> +       : : : "memory")
>> +#else
>> +/* __sync_synchronize() generates the right insn for ARMv6t2 and ARMv7-a */
>> +/** Compiler and hardware full memory barrier */
>> +#define COMPILER_HW_BARRIER() __sync_synchronize()
>> +/** Flush write buffer on OCTEON */
>> +#define OCTEON_FLUSH() (void)0
>> +#endif
>>
>> -/**
>> - * Initialize atomic uint32
>> - *
>> - * @param ptr    An atomic variable
>> - *
>> - * @note The operation is not synchronized with other threads
>> - */
>> -static inline void odp_atomic_init_u32(odp_atomic_u32_t *ptr)
>> -{
>> -       *ptr = 0;
>> -}
>> +/** Compiler memory barrier */
>> +#define COMPILER_BARRIER() __asm __volatile("" : : : "memory")
>>
>> -/**
>> - * Load value of atomic uint32
>> - *
>> - * @param ptr    An atomic variable
>> - *
>> - * @return atomic uint32 value
>> - *
>> - * @note The operation is not synchronized with other threads
>> - */
>> -static inline uint32_t odp_atomic_load_u32(odp_atomic_u32_t *ptr)
>> -{
>> -       return *ptr;
>> -}
>>
>> +/*****************************************************************************
>> + * Operations on 32-bit atomics
>> + * odp_atomic32_load_rlx
>> + * odp_atomic32_store_rlx
>> + * odp_atomic32_load_acq
>> + * odp_atomic32_store_rls
>> + * odp_atomic32_cmp_and_swap_rlx - return old value
>> + * odp_atomic32_fetch_add_rlx - return old value
>> + * odp_atomic32_fetch_add_rls - return old value
>> + * odp_atomic32_add_rlx - no return value
>> + * odp_atomic32_add_rls - no return value
>> + *****************************************************************************/
>>
>>  /**
>> - * Store value to atomic uint32
>> + * Relaxed atomic load of 32-bit atomic variable
>> + * @note Relaxed memory model, no barriers.
>>   *
>> - * @param ptr        An atomic variable
>> - * @param new_value  Store new_value to a variable
>> + * @param ptr   Pointer to a 32-bit atomic variable
>>   *
>> - * @note The operation is not synchronized with other threads
>> + * @return Value of the variable
>>   */
>> -static inline void odp_atomic_store_u32(odp_atomic_u32_t *ptr,
>> -                                       uint32_t new_value)
>> +static inline uint32_t odp_atomic32_load_rlx(const odp_atomic32_t *ptr)
>>  {
>> -       *ptr = new_value;
>> +       uint32_t val;
>> +       COMPILER_BARRIER();
>> +       /* Read of aligned word is atomic */
>> +       val = ptr->v;
>> +       COMPILER_BARRIER();
>> +       return val;
>>  }
>>
>>  /**
>> - * Fetch and add atomic uint32
>> - *
>> - * @param ptr    An atomic variable
>> - * @param value  A value to be added to the variable
>> + * Relaxed atomic store of 32-bit atomic variable
>> + * @note Relaxed memory model, no barriers.
>>   *
>> - * @return Value of the variable before the operation
>> + * @param ptr   Pointer to a 32-bit atomic variable
>> + * @param val   Value to write to the variable
>>   */
>> -static inline uint32_t odp_atomic_fetch_add_u32(odp_atomic_u32_t *ptr,
>> -                                               uint32_t value)
>> +static inline void odp_atomic32_store_rlx(odp_atomic32_t *ptr, uint32_t val)
>>  {
>> -       return __sync_fetch_and_add(ptr, value);
>> +       COMPILER_BARRIER();
>> +       /* Write of aligned word is atomic */
>> +       ptr->v = val;
>> +       COMPILER_BARRIER();
>>  }
>>
>>  /**
>> - * Fetch and subtract uint32
>> + * Atomic load-acquire of 32-bit atomic variable
>> + * @note SC-load-acquire barrier, later accesses cannot move before
>> + * the load-acquire access.
>>   *
>> - * @param ptr    An atomic variable
>> - * @param value  A value to be sub to the variable
>> + * @param ptr   Pointer to a 32-bit atomic variable
>>   *
>> - * @return Value of the variable before the operation
>> + * @return Value of the variable
>>   */
>> -static inline uint32_t odp_atomic_fetch_sub_u32(odp_atomic_u32_t *ptr,
>> -                                               uint32_t value)
>> +static inline uint32_t odp_atomic32_load_acq(const odp_atomic32_t *ptr)
>>  {
>> -       return __sync_fetch_and_sub(ptr, value);
>> +#if defined __aarch64__
>> +       uint32_t val;
>> +       __asm __volatile("ldar %w0, [%1]"
>> +                : "=&r"(val)
>> +                                : "r"(&ptr->v)
>> +                                : "memory");
>> +       return val;
>> +#elif defined __arm__  || defined __mips64__ || defined __x86_64__
>> +       /* Read of aligned word is atomic */
>> +       uint32_t val = ptr->v;
>> +       /* To prevent later accesses from moving up */
>> +       /* FIXME: Herb Sutter claims HW barrier not needed on x86? */
>> +       COMPILER_HW_BARRIER();
>> +       return val;
>> +#else
>> +#warning odp_atomic32_load_acq() may not be efficiently implemented
>> +       /* Assume read of aligned word is atomic */
>> +       uint32_t val = ptr->v;
>> +       /* To prevent later accesses from moving up */
>> +       COMPILER_HW_BARRIER();
>> +       return val;
>> +#endif
>>  }
>>
>>  /**
>> - * Fetch and increment atomic uint32 by 1
>> - *
>> - * @param ptr    An atomic variable
>> - *
>> - * @return Value of the variable before the operation
>> - */
>> -#if defined __OCTEON__
>> -
>> -static inline uint32_t odp_atomic_fetch_inc_u32(odp_atomic_u32_t *ptr)
>> -{
>> -       uint32_t ret;
>> -
>> -       __asm__ __volatile__ ("syncws");
>> -       __asm__ __volatile__ ("lai %0,(%2)" : "=r" (ret), "+m" (ptr) :
>> -                             "r" (ptr));
>> -
>> -       return ret;
>> -}
>> -
>> + * Atomic store-release of 32-bit atomic variable
>> + * @note SC-store-release barrier, earlier accesses cannot move after
>> + * store-release access.
>> + *
>> + * @param ptr  Pointer to a 32-bit atomic variable
>> + * @param val  Value to write to the atomic variable
>> + */
>> +static inline void odp_atomic32_store_rls(odp_atomic32_t *ptr, uint32_t val)
>> +{
>> +#if defined __arm__ /* A32/T32 ISA */
>> +       /* Compiler and HW barrier to prevent earlier accesses from moving
>> +        * down */
>> +       COMPILER_HW_BARRIER();
>> +       /* Write of aligned word is atomic */
>> +       ptr->v = val;
>> +       /* Compiler and HW barrier to prevent this store from moving down
>> +        * after a later load-acquire and thus create overlapping critical
>> +        * sections. Herb Sutter thinks this is needed */
>> +       COMPILER_HW_BARRIER();
>> +#elif defined __aarch64__
>> +       __asm __volatile("stlr %w0, [%1]"
>> +                :
>> +                : "r"(val), "r"(&ptr->v)
>> +                                : "memory");
>> +#elif defined __mips64__
>> +       /* Compiler and HW barrier to prevent earlier accesses from moving
>> +        * down */
>> +       COMPILER_HW_BARRIER();
>> +       /* Write of aligned word is atomic */
>> +       ptr->v = val;
>> +       /* Compiler and HW barrier to prevent this store from moving down
>> +        * after a later load-acquire and thus create overlapping critical
>> +        * sections. Herb Sutter thinks this is needed */
>> +       COMPILER_HW_BARRIER();
>> +#elif defined __x86_64__
>> +       /* This is actually an atomic exchange operation */
>> +       /* Generates good code on x86_64 */
>> +       (void)__sync_lock_test_and_set(&ptr->v, val);
>>  #else
>> -
>> -static inline uint32_t odp_atomic_fetch_inc_u32(odp_atomic_u32_t *ptr)
>> -{
>> -       return odp_atomic_fetch_add_u32(ptr, 1);
>> -}
>> -
>> +#warning odp_atomic32_store_rls() may not be efficiently implemented
>> +       /* This is actually an atomic exchange operation */
>> +       (void)__sync_lock_test_and_set(&ptr->v, val);
>>  #endif
>> -
>> -/**
>> - * Increment atomic uint32 by 1
>> - *
>> - * @param ptr    An atomic variable
>> - *
>> - */
>> -static inline void odp_atomic_inc_u32(odp_atomic_u32_t *ptr)
>> -{
>> -       odp_atomic_fetch_add_u32(ptr, 1);
>>  }
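
Together with odp_atomic32_load_acq() above, this gives the standard
message-passing guarantee; a two-thread sketch (illustrative):

    #include <assert.h>

    static int payload;          /* ordinary, non-atomic data */
    static odp_atomic32_t flag;  /* assumed initialised to 0  */

    void thread_a(void)  /* producer */
    {
        payload = 42;                      /* plain store */
        odp_atomic32_store_rls(&flag, 1);  /* release: publish payload */
    }

    void thread_b(void)  /* consumer */
    {
        if (odp_atomic32_load_acq(&flag) == 1)  /* acquire */
            assert(payload == 42);              /* guaranteed to hold */
    }
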
>>
>> -/**
>> - * Fetch and decrement uint32 by 1
>> - *
>> - * @param ptr    An atomic variable
>> - *
>> - * @return Value of the variable before the operation
>> - */
>> -static inline uint32_t odp_atomic_fetch_dec_u32(odp_atomic_u32_t *ptr)
>> -{
>> -       return odp_atomic_fetch_sub_u32(ptr, 1);
>> -}
>>
>>  /**
>> - * Decrement atomic uint32 by 1
>> - *
>> - * @param ptr    An atomic variable
>> - *
>> - */
>> -static inline void odp_atomic_dec_u32(odp_atomic_u32_t *ptr)
>> -{
>> -       odp_atomic_fetch_sub_u32(ptr, 1);
>> + * Atomic compare and swap of 32-bit atomic variable
>> + * @note Relaxed memory model, no barriers.
>> + * @note Not compare-and-set! Caller should compare return value with
>> + * expected parameter to check if swap operation succeeded.
>> + *
>> + * @param ptr  Pointer to a 32-bit atomic variable
>> + * @param exp  Expected old value
>> + * @param val  New value
>> + * @return Actual old value, if different from 'exp' then swap failed
>> + */
>> +static inline uint32_t
>> +odp_atomic32_cmp_and_swap_rlx(odp_atomic32_t *ptr,
>> +                             uint32_t exp,
>> +                             uint32_t val)
>> +{
>> +#if defined __arm__ /* A32/T32 ISA */
>> +       uint32_t old;
>> +       int status;
>> +       do {
>> +               __asm __volatile("ldrex %0, [%1]"
>> +                : "=&r"(old)
>> +                                        : "r"(&ptr->v)
>> +                                        : "memory");
>> +               if (odp_unlikely(old != exp)) {
>> +                       /* Value has changed, can't proceed */
>> +                       /* Clear exclusive access monitor */
>> +                       __asm __volatile("clrex");
>> +                       break;
>> +               }
>> +               /* Current value is as expected, attempt to write new
>> value */
>> +               __asm __volatile("strex %0, %1, [%2]"
>> +                : "=&r"(status)
>> +                                        : "r"(val), "r"(&ptr->v)
>> +                                        : "memory");
>> +               /* Restart the loop so we can re-read the previous value
>> */
>> +       } while (odp_unlikely(status != 0));
>> +       return old;
>> +#elif defined __aarch64__
>> +       uint32_t old;
>> +       int status;
>> +       do {
>> +               __asm __volatile("ldxr %w0, [%1]"
>> +                : "=&r"(old)
>> +                                        : "r"(&ptr->v)
>> +                                        : "memory");
>> +               if (odp_unlikely(old != exp)) {
>> +                       /* Value has changed, can't proceed */
>> +                       /* Clear exclusive access monitor */
>> +                       __asm __volatile("clrex");
>> +                       break;
>> +               }
>> +               /* Current value is as expected, attempt to write new
>> value */
>> +               __asm __volatile("stxr %w0, %w1, [%2]"
>> +                : "=&r"(status)
>> +                                        : "r"(val), "r"(&ptr->v)
>> +                                        : "memory");
>> +               /* Restart the loop so we can re-read the previous value
>> */
>> +       } while (odp_unlikely(status != 0));
>> +       return old;
>> +#elif defined __mips64__
>> +       uint32_t old, new_val;
>> +       do {
>> +               __asm __volatile("llw %0, [%1]"
>> +                : "=&r"(old)
>> +                                        : "r"(&ptr->v)
>> +                                        : "memory");
>> +               if (odp_unlikely(old != exp)) {
>> +                       /* Value has changed, can't proceed */
>> +                       break;
>> +               }
>> +               /* Current value is as expected, attempt to write new
>> value */
>> +               new_val = val;
>> +               __asm __volatile("scw %0, [%1]"
>> +                : "+&r"(new_val)
>> +                                        : "r"(&ptr->v)
>> +                                        : "memory");
>> +       } while (odp_unlikely(new_val == 0));
>> +       return old;
>> +#elif defined __x86_64__
>> +       /* Generates good code on x86_64 */
>> +       return __sync_val_compare_and_swap(&ptr->v, exp, val);
>> +#else
>> +#warning odp_atomic32_cmp_and_swap_rlx() may not be efficiently implemented
>> +       return __sync_val_compare_and_swap(&ptr->v, exp, val);
>> +#endif
>>  }
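
Since the function returns the old value rather than a success flag, the
retry idiom looks like this (usage sketch, illustrative helper name):

    static uint32_t atomic_inc_via_cas(odp_atomic32_t *var)
    {
        uint32_t old = odp_atomic32_load_rlx(var);
        for (;;) {
            uint32_t prev = old;
            old = odp_atomic32_cmp_and_swap_rlx(var, prev, prev + 1);
            if (old == prev)
                return prev;  /* old == exp means the swap succeeded */
            /* otherwise 'old' holds the fresh value; retry */
        }
    }
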
>>
>>  /**
>> - * Atomic compare and set for 32bit
>> - *
>> - * @param dst destination location into which the value will be written.
>> - * @param exp expected value.
>> - * @param src new value.
>> - * @return Non-zero on success; 0 on failure.
>> - */
>> -static inline int
>> -odp_atomic_cmpset_u32(odp_atomic_u32_t *dst, uint32_t exp, uint32_t src)
>> -{
>> -       return __sync_bool_compare_and_swap(dst, exp, src);
>> + * Atomic fetch and add to 32-bit atomic variable
>> + * @note Relaxed memory model, no barriers.
>> + * @note A - B <=> A + (-B)
>> + *
>> + * @param ptr   Pointer to a 32-bit atomic variable
>> + * @param incr  The value to be added to the atomic variable
>> + *
>> + * @return Value of the atomic variable before the addition
>> + */
>> +static inline uint32_t odp_atomic32_fetch_add_rlx(odp_atomic32_t *ptr,
>> +               uint32_t incr)
>> +{
>> +#if defined __arm__ /* A32/T32 ISA */
>> +       uint32_t old_val, new_val;
>> +       int status;
>> +       do {
>> +               __asm __volatile("ldrex %0, [%1]"
>> +                : "=&r"(old_val)
>> +                                        : "r"(&ptr->v)
>> +                                        : "memory");
>> +               new_val = old_val + incr;
>> +               __asm __volatile("strex %0, %1, [%2]"
>> +                : "=&r"(status)
>> +                                        : "r"(new_val), "r"(&ptr->v)
>> +                                        : "memory");
>> +       } while (odp_unlikely(status != 0));
>> +       return old_val;
>> +#elif defined __aarch64__
>> +       uint32_t old_val, new_val;
>> +       int status;
>> +       do {
>> +               __asm __volatile("ldxr %w0, [%1]"
>> +                : "=&r"(old_val)
>> +                                        : "r"(&ptr->v)
>> +                                        : "memory");
>> +               new_val = old_val + incr;
>> +               __asm __volatile("stxr %w0, %w1, [%2]"
>> +                : "=&r"(status)
>> +                                        : "r"(new_val), "r"(&ptr->v)
>> +                                        : "memory");
>> +       } while (odp_unlikely(status != 0));
>> +       return old_val;
>> +#elif defined __mips64__
>> +       uint32_t old_val, new_val;
>> +       do {
>> +               __asm __volatile("llw %0, [%1]"
>> +                : "=&r"(old_val)
>> +                                        : "r"(&ptr->v)
>> +                                        : "memory");
>> +               new_val = old_val + incr;
>> +               __asm __volatile("scw %0, [%1]"
>> +                : "+&r"(new_val)
>> +                                        : "r"(&ptr->v)
>> +                                        : "memory");
>> +       } while (odp_unlikely(new_val == 0));
>> +       return old_val;
>> +#elif defined __x86_64__
>> +       /* Generates good code on x86_64 */
>> +       return __sync_fetch_and_add(&ptr->v, incr);
>> +#else
>> +#warning odp_atomic32_fetch_add_rlx() may not be efficiently implemented
>> +       return __sync_fetch_and_add(&ptr->v, incr);
>> +#endif
>>  }
>>
>>  /**
>> - * Initialize atomic uint64
>> + * Atomic fetch and add to 32-bit atomic variable
>> + * @note Sequentially consistent memory model, barriers before and after the
>> + * operation.
>> + * @note A - B <=> A + (-B)
>>   *
>> - * @param ptr    An atomic variable
>> + * @param ptr   Pointer to a 32-bit atomic variable
>> + * @param incr  The value to be added to the atomic variable
>>   *
>> - * @note The operation is not synchronized with other threads
>> + * @return Value of the atomic variable before the addition
>>   */
>> -static inline void odp_atomic_init_u64(odp_atomic_u64_t *ptr)
>> +static inline uint32_t odp_atomic32_fetch_add_rls(odp_atomic32_t *ptr,
>> +               uint32_t incr)
>>  {
>> -       *ptr = 0;
>> +#if defined __arm__ /* A32/T32 ISA */
>> +       COMPILER_HW_BARRIER();
>> +       return odp_atomic32_fetch_add_rlx(ptr, incr);
>> +#elif defined __aarch64__
>> +       /* We basically get acquire/release semantics */
>> +       return __sync_fetch_and_add(&ptr->v, incr);
>> +#elif defined __mips64__
>> +       uint32_t old;
>> +       COMPILER_HW_BARRIER();
>> +       old = odp_atomic32_fetch_add_rlx(ptr, incr);
>> +       OCTEON_FLUSH();
>> +       return old;
>> +#elif defined __x86_64__
>> +       /* Generates good code on x86_64 */
>> +       return __sync_fetch_and_add(&ptr->v, incr);
>> +#else
>> +#warning odp_atomic32_fetch_add_rls() may not be efficiently implemented
>> +       return __sync_fetch_and_add(&ptr->v, incr);
>> +#endif
>>  }
>>
>>  /**
>> - * Load value of atomic uint64
>> - *
>> - * @param ptr    An atomic variable
>> + * Atomic add to 32-bit atomic variable
>> + * @note Relaxed memory model, no barriers.
>>   *
>> - * @return atomic uint64 value
>> - *
>> - * @note The operation is not synchronized with other threads
>> + * @param ptr   Pointer to a 32-bit atomic variable
>> + * @param incr  The value to be added to the atomic variable
>>   */
>> -static inline uint64_t odp_atomic_load_u64(odp_atomic_u64_t *ptr)
>> +static inline void odp_atomic32_add_rlx(odp_atomic32_t *ptr,
>> +                                       uint32_t incr)
>>  {
>> -       return *ptr;
>> +       /* Use odp_atomic32_fetch_add_rlx() for now */
>> +       (void)odp_atomic32_fetch_add_rlx(ptr, incr);
>>  }
>>
>>  /**
>> - * Store value to atomic uint64
>> - *
>> - * @param ptr        An atomic variable
>> - * @param new_value  Store new_value to a variable
>> + * Atomic add to 32-bit atomic variable
>> + * @note Sequentially consistent memory model, barriers before and after the
>> + * operation.
>>   *
>> - * @note The operation is not synchronized with other threads
>> + * @param ptr   Pointer to a 32-bit atomic variable
>> + * @param incr  The value to be added to the atomic variable
>>   */
>> -static inline void odp_atomic_store_u64(odp_atomic_u64_t *ptr,
>> -                                       uint64_t new_value)
>> +static inline void odp_atomic32_add_rls(odp_atomic32_t *ptr, uint32_t incr)
>>  {
>> -       *ptr = new_value;
>> +       /* Use odp_atomic32_fetch_add_rls() for now */
>> +       (void)odp_atomic32_fetch_add_rls(ptr, incr);
>>  }
>>
>> -/**
>> - * Add atomic uint64
>> - *
>> - * @param ptr    An atomic variable
>> - * @param value  A value to be added to the variable
>> - *
>> - */
>> -static inline void odp_atomic_add_u64(odp_atomic_u64_t *ptr, uint64_t value)
>> -{
>> -       __sync_fetch_and_add(ptr, value);
>> -}
>>
>> +/*****************************************************************************
>> + * Operations on 64-bit atomics
>> + * odp_atomic64_load_rlx
>> + * odp_atomic64_store_rlx
>> + * odp_atomic64_fetch_add_rlx
>> + * odp_atomic64_add_rlx
>> + *****************************************************************************/
>>
>>  /**
>> - * Fetch and add atomic uint64
>> + * Relaxed atomic load of 64-bit atomic variable
>> + * @note Relaxed memory model, no barriers.
>>   *
>> - * @param ptr    An atomic variable
>> - * @param value  A value to be added to the variable
>> + * @param ptr   Pointer to a 64-bit atomic variable
>>   *
>> - * @return Value of the variable before the operation
>> + * @return Value of the atomic variable
>>   */
>> -
>> -#if defined __powerpc__ && !defined __powerpc64__
>> -static inline uint64_t odp_atomic_fetch_add_u64(odp_atomic_u64_t *ptr,
>> -                                               uint64_t value)
>> +static inline uint64_t odp_atomic64_load_rlx(odp_atomic64_t *ptr)
>>  {
>> -       return __sync_fetch_and_add((odp_atomic_u32_t *)ptr,
>> -                                   (uint32_t)value);
>> -}
>> +#if defined __arm__ /* A32/T32 ISA */
>> +       uint64_t val;
>> +       __asm __volatile("ldrexd %0, %H0, [%1]\n\t"
>> +                        "clrex" /* Clear exclusive access monitor */
>> +                : "=&r"(val)
>> +                                : "r"(&ptr->v)
>> +                                : );
>> +       return val;
>> +#elif defined __x86_64__ || defined __aarch64__ || defined __mips64__
>> +       /* Read of aligned quad/double word is atomic */
>> +       return ptr->v;
>>  #else
>> -static inline uint64_t odp_atomic_fetch_add_u64(odp_atomic_u64_t *ptr,
>> -                                               uint64_t value)
>> -{
>> -       return __sync_fetch_and_add(ptr, value);
>> -}
>> +#warning odp_atomic64_load_rlx() may not be efficiently implemented
>> +       return __sync_fetch_and_or(&ptr->v, 0);
>>  #endif
>> -/**
>> - * Subtract atomic uint64
>> - *
>> - * @param ptr    An atomic variable
>> - * @param value  A value to be subtracted from the variable
>> - *
>> - */
>> -static inline void odp_atomic_sub_u64(odp_atomic_u64_t *ptr, uint64_t value)
>> -{
>> -       __sync_fetch_and_sub(ptr, value);
>>  }
>>
>>  /**
>> - * Fetch and subtract atomic uint64
>> - *
>> - * @param ptr    An atomic variable
>> - * @param value  A value to be subtracted from the variable
>> - *
>> - * @return Value of the variable before the operation
>> - */
>> -#if defined __powerpc__ && !defined __powerpc64__
>> -static inline uint64_t odp_atomic_fetch_sub_u64(odp_atomic_u64_t *ptr,
>> -                                               uint64_t value)
>> -{
>> -       return __sync_fetch_and_sub((odp_atomic_u32_t *)ptr,
>> -                                   (uint32_t)value);
>> -}
>> + * Relaxed atomic store of 64-bit atomic variable
>> + * @note Relaxed memory model, no barriers.
>> + *
>> + * @param ptr  Pointer to a 64-bit atomic variable
>> + * @param val  Value to write to the atomic variable
>> + */
>> +static inline void odp_atomic64_store_rlx(odp_atomic64_t *ptr,
>> +               uint64_t val)
>> +{
>> +#if defined __arm__ /* A32/T32 ISA */
>> +       uint64_t old_val;
>> +       int status;
>> +       do {
>> +               /* Read atomic variable exclusively so we can write to it
>> +                * later */
>> +               __asm __volatile("ldrexd %0, %H0, [%1]"
>> +                : "=&r"(old_val)
>> +                                        : "r"(&ptr->v)
>> +                                        : "memory");
>> +               (void)old_val; /* Ignore old value */
>> +               /* Attempt to write the new value */
>> +               __asm __volatile("strexd %0, %1, %H1, [%2]"
>> +                : "=&r"(status)
>> +                                        : "r"(val), "r"(&ptr->v)
>> +                                        : "memory");
>> +       } while (odp_unlikely(status != 0)); /* Retry until write succeeds */
>> +#elif defined __x86_64__ || defined __aarch64__ || defined __mips64__
>> +       /* Write of aligned quad/double word is atomic */
>> +       ptr->v = val;
>>  #else
>> -static inline uint64_t odp_atomic_fetch_sub_u64(odp_atomic_u64_t *ptr,
>> -                                               uint64_t value)
>> -{
>> -       return __sync_fetch_and_sub(ptr, value);
>> -}
>> +#warning odp_atomic64_store_rlx() may not be efficiently implemented
>> +       /* This is actually an atomic exchange operation */
>> +       (void)__sync_lock_test_and_set(&ptr->v, val);
>>  #endif
>> -/**
>> - * Fetch and increment atomic uint64 by 1
>> - *
>> - * @param ptr    An atomic variable
>> - *
>> - * @return Value of the variable before the operation
>> - */
>> -static inline uint64_t odp_atomic_fetch_inc_u64(odp_atomic_u64_t *ptr)
>> -{
>> -       return odp_atomic_fetch_add_u64(ptr, 1);
>> -}
>> -
>> -/**
>> - * Increment atomic uint64 by 1
>> - *
>> - * @param ptr    An atomic variable
>> - *
>> - */
>> -static inline void odp_atomic_inc_u64(odp_atomic_u64_t *ptr)
>> -{
>> -       odp_atomic_fetch_add_u64(ptr, 1);
>> -}
>> -
>> -/**
>> - * Fetch and decrement atomic uint64 by 1
>> - *
>> - * @param ptr    An atomic variable
>> - *
>> - * @return Value of the variable before the operation
>> - */
>> -static inline uint64_t odp_atomic_fetch_dec_u64(odp_atomic_u64_t *ptr)
>> -{
>> -       return odp_atomic_fetch_sub_u64(ptr, 1);
>>  }
>>
>>  /**
>> - * Decrement atomic uint64 by 1
>> - *
>> - * @param ptr    An atomic variable
>> - *
>> - */
>> -static inline void odp_atomic_dec_u64(odp_atomic_u64_t *ptr)
>> -{
>> -       odp_atomic_fetch_sub_u64(ptr, 1);
>> + * Atomic fetch and add to 64-bit atomic variable
>> + * @note Relaxed memory model, no barriers.
>> + *
>> + * @param ptr   Pointer to a 64-bit atomic variable
>> + * @param incr  The value to be added to the atomic variable
>> + *
>> + * @return Value of the atomic variable before the addition
>> + */
>> +static inline uint64_t odp_atomic64_fetch_add_rlx(odp_atomic64_t *ptr,
>> +               uint64_t incr)
>> +{
>> +#if defined __arm__ /* A32/T32 ISA */
>> +       uint64_t old_val, new_val;
>> +       int status;
>> +       do {
>> +               __asm __volatile("ldrexd %0, %H0, [%1]"
>> +                : "=&r"(old_val)
>> +                                        : "r"(&ptr->v)
>> +                                        : "memory");
>> +               new_val = old_val + incr;
>> +               __asm __volatile("strexd %0, %1, %H1, [%2]"
>> +                : "=&r"(status)
>> +                                        : "r"(new_val), "r"(&ptr->v)
>> +                                        : "memory");
>> +       } while (odp_unlikely(status != 0)); /* Retry until write succeeds */
>> +       return old_val;
>> +#elif defined __aarch64__
>> +       uint64_t old_val, new_val;
>> +       int status;
>> +       do {
>> +               __asm __volatile("ldxr %x0, [%1]"
>> +                : "=&r"(old_val)
>> +                                        : "r"(&ptr->v)
>> +                                        : "memory");
>> +               new_val = old_val + incr;
>> +               __asm __volatile("stxr %w0, %x1, [%2]"
>> +                : "=&r"(status)
>> +                                        : "r"(new_val), "r"(&ptr->v)
>> +                                        : "memory");
>> +       } while (odp_unlikely(status != 0)); /* Retry until write succeeds */
>> +       return old_val;
>> +#elif defined __mips64__
>> +       uint64_t old_val, new_val;
>> +       do {
>> +               __asm __volatile("ll %0, [%1]"
>> +                : "=&r"(old_val)
>> +                                        : "r"(&ptr->v)
>> +                                        : "memory");
>> +               new_val = old_val + incr;
>> +               __asm __volatile("sc %0, [%1]"
>> +                : "+&r"(new_val)
>> +                                        : "r"(&ptr->v)
>> +                                        : "memory");
>> +       } while (odp_unlikely(new_val == 0));
>> +       return old_val;
>> +#elif defined __x86_64__
>> +       /* Generates good code on x86_64 */
>> +       return __sync_fetch_and_add(&ptr->v, incr);
>> +#else
>> +#warning odp_atomic64_fetch_add_rlx() may not be efficiently implemented
>> +       return __sync_fetch_and_add(&ptr->v, incr);
>> +#endif
>>  }
>>
>>  /**
>> - * Atomic compare and set for 64bit
>> + * Atomic add to 64-bit atomic variable
>> + * @note Relaxed memory model, no barriers.
>>   *
>> - * @param dst destination location into which the value will be written.
>> - * @param exp expected value.
>> - * @param src new value.
>> - * @return Non-zero on success; 0 on failure.
>> + * @param ptr   Pointer to a 64-bit atomic variable
>> + * @param incr  The value to be added to the atomic variable
>>   */
>> -static inline int
>> -odp_atomic_cmpset_u64(odp_atomic_u64_t *dst, uint64_t exp, uint64_t src)
>> +static inline void odp_atomic64_add_rlx(odp_atomic64_t *ptr, uint64_t incr)
>>  {
>> -       return __sync_bool_compare_and_swap(dst, exp, src);
>> +       (void)odp_atomic64_fetch_add_rlx(ptr, incr);
>>  }
>>
>>  #ifdef __cplusplus
>> diff --git a/platform/linux-generic/include/api/odp_barrier.h b/platform/linux-generic/include/api/odp_barrier.h
>> index a7b3215..f8eae9a 100644
>> --- a/platform/linux-generic/include/api/odp_barrier.h
>> +++ b/platform/linux-generic/include/api/odp_barrier.h
>> @@ -27,18 +27,18 @@ extern "C" {
>>   * ODP execution barrier
>>   */
>>  typedef struct odp_barrier_t {
>> -       int              count;  /**< @private Thread count */
>> -       odp_atomic_int_t bar;    /**< @private Barrier counter */
>> +       uint32_t       num_threads;  /**< @private Thread count (constant) */
>> +       odp_atomic32_t in_barrier;   /**< @private Threads in barrier */
>>  } odp_barrier_t;
>>
>>
>>  /**
>>   * Init barrier with thread count
>>   *
>> - * @param barrier    Barrier
>> - * @param count      Thread count
>> + * @param barrier     Barrier
>> + * @param num_threads Number of threads which share the barrier
>>   */
>> -void odp_barrier_init_count(odp_barrier_t *barrier, int count);
>> +void odp_barrier_init(odp_barrier_t *barrier, uint32_t num_threads);
>>
>>
>>  /**
>> diff --git a/platform/linux-generic/include/api/odp_rwlock.h b/platform/linux-generic/include/api/odp_rwlock.h
>> index 252ebb2..ff8a9a2 100644
>> --- a/platform/linux-generic/include/api/odp_rwlock.h
>> +++ b/platform/linux-generic/include/api/odp_rwlock.h
>> @@ -10,26 +10,30 @@
>>  /**
>>   * @file
>>   *
>> - * ODP RW Locks
>> + * ODP read/write lock
>> + * RW lock supports multiple concurrent reads but only one (exclusive) writer.
>>   */
>>
>> +#include <odp_atomic.h>
>> +
>>  #ifdef __cplusplus
>>  extern "C" {
>>  #endif
>>
>>  /**
>>   * The odp_rwlock_t type.
>> - * write lock count is -1,
>> - * read lock count > 0
>> + * write lock is ~0U
>> + * read lock count >0 && <~0U
>>   */
>>  typedef struct {
>> -       volatile int32_t cnt; /**< -1 Write lock,
>> -                               > 0 for Read lock. */
>> +       odp_atomic32_t cnt; /**< == 0: unlocked,
>> +                                == ~0: locked for write,
>> +                                > 0 number of concurrent read locks */
>>  } odp_rwlock_t;
>>
>>
>>  /**
>> - * Initialize the rwlock to an unlocked state.
>> + * Initialize the rwlock to the unlocked state.
>>   *
>>   * @param rwlock pointer to the RW Lock.
>>   */
>> @@ -50,14 +54,14 @@ void odp_rwlock_read_lock(odp_rwlock_t *rwlock);
>>  void odp_rwlock_read_unlock(odp_rwlock_t *rwlock);
>>
>>  /**
>> - * Aquire a write lock.
>> + * Acquire the write lock.
>>   *
>>   * @param rwlock pointer to a RW Lock.
>>   */
>>  void odp_rwlock_write_lock(odp_rwlock_t *rwlock);
>>
>>  /**
>> - * Release a write lock.
>> + * Release the write lock.
>>   *
>>   * @param rwlock pointer to a RW Lock.
>>   */
>> diff --git a/platform/linux-generic/include/api/odp_ticketlock.h b/platform/linux-generic/include/api/odp_ticketlock.h
>> index 6277a18..c4b5e34 100644
>> --- a/platform/linux-generic/include/api/odp_ticketlock.h
>> +++ b/platform/linux-generic/include/api/odp_ticketlock.h
>> @@ -27,8 +27,8 @@ extern "C" {
>>   * ODP ticketlock
>>   */
>>  typedef struct odp_ticketlock_t {
>> -       odp_atomic_u32_t  next_ticket; /**< @private Next ticket */
>> -       volatile uint32_t cur_ticket;  /**< @private Current ticket */
>> +       odp_atomic32_t next_ticket; /**< @private Next ticket */
>> +       odp_atomic32_t cur_ticket;  /**< @private Current ticket */
>>  } odp_ticketlock_t;
>>
>>
>> diff --git a/platform/linux-generic/include/odp_buffer_internal.h b/platform/linux-generic/include/odp_buffer_internal.h
>> index 2002b51..530ab96 100644
>> --- a/platform/linux-generic/include/odp_buffer_internal.h
>> +++ b/platform/linux-generic/include/odp_buffer_internal.h
>> @@ -88,7 +88,7 @@ typedef struct odp_buffer_hdr_t {
>>         uint32_t                 index;      /* buf index in the pool */
>>         size_t                   size;       /* max data size */
>>         size_t                   cur_offset; /* current offset */
>> -       odp_atomic_int_t         ref_count;  /* reference count */
>> +       odp_atomic32_t           ref_count;  /* reference count */
>>         odp_buffer_scatter_t     scatter;    /* Scatter/gather list */
>>         int                      type;       /* type of next header */
>>         odp_buffer_pool_t        pool_hdl;   /* buffer pool handle */
>> diff --git a/platform/linux-generic/odp_barrier.c b/platform/linux-generic/odp_barrier.c
>> index a82b294..6c3b884 100644
>> --- a/platform/linux-generic/odp_barrier.c
>> +++ b/platform/linux-generic/odp_barrier.c
>> @@ -8,41 +8,48 @@
>>  #include <odp_sync.h>
>>  #include <odp_spin_internal.h>
>>
>> -void odp_barrier_init_count(odp_barrier_t *barrier, int count)
>> +void odp_barrier_init(odp_barrier_t *barrier, uint32_t num_threads)
>>  {
>> -       barrier->count = count;
>> -       barrier->bar = 0;
>> -       odp_sync_stores();
>> +       barrier->num_threads = num_threads; /* Constant after initialisation */
>> +       odp_atomic32_store_rls(&barrier->in_barrier, 0);
>>  }
>>
>>  /*
>>   * Efficient barrier_sync -
>>   *
>>   *   Barriers are initialized with a count of the number of callers
>> - *   that must sync on the barrier before any may proceed.
>> + *   that must sync on (enter) the barrier before any may proceed (exit).
>>   *
>>   *   To avoid race conditions and to permit the barrier to be fully
>>   *   reusable, the barrier value cycles between 0..2*count-1. When
>> - *   synchronizing the wasless variable simply tracks which half of
>> + *   synchronizing, the waslow variable simply tracks which half of
>>   *   the cycle the barrier was in upon entry.  Exit is when the
>>   *   barrier crosses to the other half of the cycle.
>>   */
>>
>>  void odp_barrier_sync(odp_barrier_t *barrier)
>>  {
>> -       int count;
>> -       int wasless;
>> +       uint32_t count;
>> +       bool waslow;
>>
>> -       odp_sync_stores();
>> -       wasless = barrier->bar < barrier->count;
>> -       count = odp_atomic_fetch_inc_int(&barrier->bar);
>> +       /* FIXME do we need acquire barrier as well? */
>> +       /* Increase threads in_barrier count, this will automatically release
>> +        * the other threads when lower/upper range is switched */
>> +       count = odp_atomic32_fetch_add_rls(&barrier->in_barrier, 1);
>> +       /* Compute lower or higher range indicator */
>> +       waslow = count < barrier->num_threads;
>>
>> -       if (count == 2*barrier->count-1) {
>> -               barrier->bar = 0;
>> -       } else {
>> -               while ((barrier->bar < barrier->count) == wasless)
>> -                       odp_spin();
>> +       /* Check if in_barrier count has "wrapped" */
>> +       if (count == 2 * barrier->num_threads - 1) {
>> +               /* Manually wrap the counter */
>> +               odp_atomic32_add_rls(&barrier->in_barrier,
>> +                                    (uint32_t)(-2*(int)barrier->num_threads));
>> +               /* We don't need to wait below, return immediately */
>> +               return;
>> +       }
>> +       /* Wait for counter to change half */
>> +       while ((odp_atomic32_load_rlx(&barrier->in_barrier) <
>> +              barrier->num_threads) == waslow) {
>> +               odp_spin();
>>         }
>> -
>> -       odp_mem_barrier();
>>  }
>> diff --git a/platform/linux-generic/odp_buffer.c b/platform/linux-generic/odp_buffer.c
>> index e54e0e7..a5939f3 100644
>> --- a/platform/linux-generic/odp_buffer.c
>> +++ b/platform/linux-generic/odp_buffer.c
>> @@ -73,7 +73,8 @@ int odp_buffer_snprint(char *str, size_t n, odp_buffer_t buf)
>>         len += snprintf(&str[len], n-len,
>>                         "  cur_offset   %zu\n",       hdr->cur_offset);
>>         len += snprintf(&str[len], n-len,
>> -                       "  ref_count    %i\n",        hdr->ref_count);
>> +                       "  ref_count    %u\n",
>> +                       odp_atomic32_load_rlx(&hdr->ref_count));
>>         len += snprintf(&str[len], n-len,
>>                         "  type         %i\n",        hdr->type);
>>         len += snprintf(&str[len], n-len,
>> diff --git a/platform/linux-generic/odp_crypto.c b/platform/linux-generic/odp_crypto.c
>> index b37ad6b..d9fff10 100644
>> --- a/platform/linux-generic/odp_crypto.c
>> +++ b/platform/linux-generic/odp_crypto.c
>> @@ -26,7 +26,7 @@
>>  #define MAX_SESSIONS 32
>>
>>  typedef struct {
>> -       odp_atomic_u32_t next;
>> +       odp_atomic32_t   next;
>>         uint32_t         max;
>>         odp_crypto_generic_session_t sessions[0];
>>  } odp_crypto_global_t;
>> @@ -58,7 +58,7 @@ odp_crypto_generic_session_t *alloc_session(void)
>>         uint32_t idx;
>>         odp_crypto_generic_session_t *session = NULL;
>>
>> -       idx = odp_atomic_fetch_inc_u32(&global->next);
>> +       idx = odp_atomic32_fetch_add_rlx(&global->next, 1);
>>         if (idx < global->max) {
>>                 session = &global->sessions[idx];
>>                 session->index = idx;
>> diff --git a/platform/linux-generic/odp_queue.c b/platform/linux-generic/odp_queue.c
>> index 1318bcd..08c0d29 100644
>> --- a/platform/linux-generic/odp_queue.c
>> +++ b/platform/linux-generic/odp_queue.c
>> @@ -214,8 +214,13 @@ int odp_queue_set_context(odp_queue_t handle, void *context)
>>  {
>>         queue_entry_t *queue;
>>         queue = queue_to_qentry(handle);
>> +       /* Setting a new queue context can be viewed as a release operation,
>> +        * all writes to the context must be observable before the context
>> +        * is made observable */
>>         odp_sync_stores();
>> -       queue->s.param.context = context;
>> +       queue->s.param.context = context; /* Store-release */
>> +       /* Ensure queue modification is globally visible before we return
>> +        * and the application might cause the queue to be scheduled */
>>         odp_sync_stores();
>>         return 0;
>>  }
>> diff --git a/platform/linux-generic/odp_ring.c b/platform/linux-generic/odp_ring.c
>> index 632aa66..d1ec825 100644
>> --- a/platform/linux-generic/odp_ring.c
>> +++ b/platform/linux-generic/odp_ring.c
>> @@ -187,10 +187,10 @@ odph_ring_create(const char *name, unsigned count, unsigned flags)
>>                 r->cons.size = count;
>>                 r->prod.mask = count-1;
>>                 r->cons.mask = count-1;
>> -               r->prod.head = 0;
>> -               r->cons.head = 0;
>> -               r->prod.tail = 0;
>> -               r->cons.tail = 0;
>> +               odp_atomic32_store_rlx(&r->prod.head, 0);
>> +               odp_atomic32_store_rlx(&r->cons.head, 0);
>> +               odp_atomic32_store_rlx(&r->prod.tail, 0);
>> +               odp_atomic32_store_rlx(&r->cons.tail, 0);
>>
>>                 TAILQ_INSERT_TAIL(&odp_ring_list, r, next);
>>         } else {
>> @@ -227,7 +227,7 @@ int __odph_ring_mp_do_enqueue(odph_ring_t *r, void * const *obj_table,
>>         uint32_t prod_head, prod_next;
>>         uint32_t cons_tail, free_entries;
>>         const unsigned max = n;
>> -       int success;
>> +       bool ok;
>>         unsigned i;
>>         uint32_t mask = r->prod.mask;
>>         int ret;
>> @@ -237,8 +237,8 @@ int __odph_ring_mp_do_enqueue(odph_ring_t *r, void * const *obj_table,
>>                 /* Reset n to the initial burst count */
>>                 n = max;
>>
>> -               prod_head = r->prod.head;
>> -               cons_tail = r->cons.tail;
>> +               prod_head = odp_atomic32_load_rlx(&r->prod.head);
>> +               cons_tail = odp_atomic32_load_acq(&r->cons.tail);
>>                 /* The subtraction is done between two unsigned 32bits value
>>                  * (the result is always modulo 32 bits even if we have
>>                  * prod_head > cons_tail). So 'free_entries' is always between 0
>> @@ -259,13 +259,13 @@ int __odph_ring_mp_do_enqueue(odph_ring_t *r, void * const *obj_table,
>>                 }
>>
>>                 prod_next = prod_head + n;
>> -               success = odp_atomic_cmpset_u32(&r->prod.head, prod_head,
>> -                                             prod_next);
>> -       } while (odp_unlikely(success == 0));
>> +               ok = odp_atomic32_cmp_and_swap_rlx(&r->prod.head,
>> +                                                  prod_head,
>> +                                                  prod_next) == prod_head;
>> +       } while (odp_unlikely(!ok));
>>
>>         /* write entries in ring */
>>         ENQUEUE_PTRS();
>> -       odp_mem_barrier();
>>
>>         /* if we exceed the watermark */
>>         if (odp_unlikely(((mask + 1) - free_entries + n) > r->prod.watermark)) {
>> @@ -279,10 +279,10 @@ int __odph_ring_mp_do_enqueue(odph_ring_t *r, void * const *obj_table,
>>          * If there are other enqueues in progress that preceeded us,
>>          * we need to wait for them to complete
>>          */
>> -       while (odp_unlikely(r->prod.tail != prod_head))
>> +       while (odp_unlikely(odp_atomic32_load_rlx(&r->prod.tail) != prod_head))
>>                 odp_spin();
>>
>> -       r->prod.tail = prod_next;
>> +       odp_atomic32_store_rls(&r->prod.tail, prod_next);
>>         return ret;
>>  }
>>
>> @@ -298,8 +298,8 @@ int __odph_ring_sp_do_enqueue(odph_ring_t *r, void * const *obj_table,
>>         uint32_t mask = r->prod.mask;
>>         int ret;
>>
>> -       prod_head = r->prod.head;
>> -       cons_tail = r->cons.tail;
>> +       prod_head = odp_atomic32_load_rlx(&r->prod.head);
>> +       cons_tail = odp_atomic32_load_acq(&r->cons.tail);
>>         /* The subtraction is done between two unsigned 32bits value
>>          * (the result is always modulo 32 bits even if we have
>>          * prod_head > cons_tail). So 'free_entries' is always between 0
>> @@ -320,11 +320,10 @@ int __odph_ring_sp_do_enqueue(odph_ring_t *r, void * const *obj_table,
>>         }
>>
>>         prod_next = prod_head + n;
>> -       r->prod.head = prod_next;
>> +       odp_atomic32_store_rlx(&r->prod.head, prod_next);
>>
>>         /* write entries in ring */
>>         ENQUEUE_PTRS();
>> -       odp_mem_barrier();
>>
>>         /* if we exceed the watermark */
>>         if (odp_unlikely(((mask + 1) - free_entries + n) > r->prod.watermark)) {
>> @@ -334,7 +333,7 @@ int __odph_ring_sp_do_enqueue(odph_ring_t *r, void * const *obj_table,
>>                 ret = (behavior == ODPH_RING_QUEUE_FIXED) ? 0 : n;
>>         }
>>
>> -       r->prod.tail = prod_next;
>> +       odp_atomic32_store_rls(&r->prod.tail, prod_next);
>>         return ret;
>>  }
>>
>> @@ -348,7 +347,7 @@ int __odph_ring_mc_do_dequeue(odph_ring_t *r, void **obj_table,
>>         uint32_t cons_head, prod_tail;
>>         uint32_t cons_next, entries;
>>         const unsigned max = n;
>> -       int success;
>> +       bool ok;
>>         unsigned i;
>>         uint32_t mask = r->prod.mask;
>>
>> @@ -357,8 +356,8 @@ int __odph_ring_mc_do_dequeue(odph_ring_t *r, void **obj_table,
>>                 /* Restore n as it may change every loop */
>>                 n = max;
>>
>> -               cons_head = r->cons.head;
>> -               prod_tail = r->prod.tail;
>> +               cons_head = odp_atomic32_load_rlx(&r->cons.head);
>> +               prod_tail = odp_atomic32_load_acq(&r->prod.tail);
>>                 /* The subtraction is done between two unsigned 32bits value
>>                  * (the result is always modulo 32 bits even if we have
>>                  * cons_head > prod_tail). So 'entries' is always between 0
>> @@ -378,22 +377,22 @@ int __odph_ring_mc_do_dequeue(odph_ring_t *r, void **obj_table,
>>                 }
>>
>>                 cons_next = cons_head + n;
>> -               success = odp_atomic_cmpset_u32(&r->cons.head, cons_head,
>> -                                             cons_next);
>> -       } while (odp_unlikely(success == 0));
>> +               ok = odp_atomic32_cmp_and_swap_rlx(&r->cons.head,
>> +                                                  cons_head,
>> +                                                  cons_next) == cons_head;
>> +       } while (odp_unlikely(!ok));
>>
>>         /* copy in table */
>>         DEQUEUE_PTRS();
>> -       odp_mem_barrier();
>>
>>         /*
>>          * If there are other dequeues in progress that preceded us,
>>          * we need to wait for them to complete
>>          */
>> -       while (odp_unlikely(r->cons.tail != cons_head))
>> +       while (odp_unlikely(odp_atomic32_load_rlx(&r->cons.tail) != cons_head))
>>                 odp_spin();
>>
>> -       r->cons.tail = cons_next;
>> +       odp_atomic32_store_rls(&r->cons.tail, cons_next);
>>
>>         return behavior == ODPH_RING_QUEUE_FIXED ? 0 : n;
>>  }
>> @@ -409,8 +408,8 @@ int __odph_ring_sc_do_dequeue(odph_ring_t *r, void **obj_table,
>>         unsigned i;
>>         uint32_t mask = r->prod.mask;
>>
>> -       cons_head = r->cons.head;
>> -       prod_tail = r->prod.tail;
>> +       cons_head = odp_atomic32_load_rlx(&r->cons.head);
>> +       prod_tail = odp_atomic32_load_acq(&r->prod.tail);
>>         /* The subtraction is done between two unsigned 32bits value
>>          * (the result is always modulo 32 bits even if we have
>>          * cons_head > prod_tail). So 'entries' is always between 0
>> @@ -429,13 +428,12 @@ int __odph_ring_sc_do_dequeue(odph_ring_t *r, void **obj_table,
>>         }
>>
>>         cons_next = cons_head + n;
>> -       r->cons.head = cons_next;
>> +       odp_atomic32_store_rlx(&r->cons.head, cons_next);
>>
>>         /* copy in table */
>>         DEQUEUE_PTRS();
>> -       odp_mem_barrier();
>>
>> -       r->cons.tail = cons_next;
>> +       odp_atomic32_store_rls(&r->cons.tail, cons_next);
>>         return behavior == ODPH_RING_QUEUE_FIXED ? 0 : n;
>>  }
>>
>> @@ -482,8 +480,8 @@ int odph_ring_sc_dequeue_bulk(odph_ring_t *r, void **obj_table, unsigned n)
>>   */
>>  int odph_ring_full(const odph_ring_t *r)
>>  {
>> -       uint32_t prod_tail = r->prod.tail;
>> -       uint32_t cons_tail = r->cons.tail;
>> +       uint32_t prod_tail = odp_atomic32_load_rlx(&r->prod.tail);
>> +       uint32_t cons_tail = odp_atomic32_load_rlx(&r->cons.tail);
>>         return (((cons_tail - prod_tail - 1) & r->prod.mask) == 0);
>>  }
>>
>> @@ -492,8 +490,8 @@ int odph_ring_full(const odph_ring_t *r)
>>   */
>>  int odph_ring_empty(const odph_ring_t *r)
>>  {
>> -       uint32_t prod_tail = r->prod.tail;
>> -       uint32_t cons_tail = r->cons.tail;
>> +       uint32_t prod_tail = odp_atomic32_load_rlx(&r->prod.tail);
>> +       uint32_t cons_tail = odp_atomic32_load_rlx(&r->cons.tail);
>>         return !!(cons_tail == prod_tail);
>>  }
>>
>> @@ -502,8 +500,8 @@ int odph_ring_empty(const odph_ring_t *r)
>>   */
>>  unsigned odph_ring_count(const odph_ring_t *r)
>>  {
>> -       uint32_t prod_tail = r->prod.tail;
>> -       uint32_t cons_tail = r->cons.tail;
>> +       uint32_t prod_tail = odp_atomic32_load_rlx(&r->prod.tail);
>> +       uint32_t cons_tail = odp_atomic32_load_rlx(&r->cons.tail);
>>         return (prod_tail - cons_tail) & r->prod.mask;
>>  }
>>
>> @@ -512,8 +510,8 @@ unsigned odph_ring_count(const odph_ring_t *r)
>>   */
>>  unsigned odph_ring_free_count(const odph_ring_t *r)
>>  {
>> -       uint32_t prod_tail = r->prod.tail;
>> -       uint32_t cons_tail = r->cons.tail;
>> +       uint32_t prod_tail = odp_atomic32_load_rlx(&r->prod.tail);
>> +       uint32_t cons_tail = odp_atomic32_load_rlx(&r->cons.tail);
>>         return (cons_tail - prod_tail - 1) & r->prod.mask;
>>  }
>>
>> @@ -523,10 +521,10 @@ void odph_ring_dump(const odph_ring_t *r)
>>         ODP_DBG("ring <%s>@%p\n", r->name, r);
>>         ODP_DBG("  flags=%x\n", r->flags);
>>         ODP_DBG("  size=%"PRIu32"\n", r->prod.size);
>> -       ODP_DBG("  ct=%"PRIu32"\n", r->cons.tail);
>> -       ODP_DBG("  ch=%"PRIu32"\n", r->cons.head);
>> -       ODP_DBG("  pt=%"PRIu32"\n", r->prod.tail);
>> -       ODP_DBG("  ph=%"PRIu32"\n", r->prod.head);
>> +       ODP_DBG("  ct=%"PRIu32"\n", odp_atomic32_load_rlx(&r->cons.tail));
>> +       ODP_DBG("  ch=%"PRIu32"\n", odp_atomic32_load_rlx(&r->cons.head));
>> +       ODP_DBG("  pt=%"PRIu32"\n", odp_atomic32_load_rlx(&r->prod.tail));
>> +       ODP_DBG("  ph=%"PRIu32"\n", odp_atomic32_load_rlx(&r->prod.head));
>>         ODP_DBG("  used=%u\n", odph_ring_count(r));
>>         ODP_DBG("  avail=%u\n", odph_ring_free_count(r));
>>         if (r->prod.watermark == r->prod.size)
>> diff --git a/platform/linux-generic/odp_rwlock.c b/platform/linux-generic/odp_rwlock.c
>> index 11c8dd7..ba0a7ca 100644
>> --- a/platform/linux-generic/odp_rwlock.c
>> +++ b/platform/linux-generic/odp_rwlock.c
>> @@ -4,58 +4,56 @@
>>   * SPDX-License-Identifier:     BSD-3-Clause
>>   */
>>
>> +#include <stdbool.h>
>>  #include <odp_atomic.h>
>>  #include <odp_rwlock.h>
>> -
>>  #include <odp_spin_internal.h>
>>
>>  void odp_rwlock_init(odp_rwlock_t *rwlock)
>>  {
>> -       rwlock->cnt = 0;
>> +       odp_atomic32_store_rlx(&rwlock->cnt, 0);
>>  }
>>
>>  void odp_rwlock_read_lock(odp_rwlock_t *rwlock)
>>  {
>> -       int32_t cnt;
>> -       int  is_locked = 0;
>> -
>> -       while (is_locked == 0) {
>> -               cnt = rwlock->cnt;
>> +       bool gotit;
>> +       do {
>> +               uint32_t cnt = odp_atomic32_load_acq(&rwlock->cnt);
>>                 /* waiting for read lock */
>> -               if (cnt < 0) {
>> +               if ((int32_t)cnt < 0) {
>>                         odp_spin();
>>                         continue;
>>                 }
>> -               is_locked = odp_atomic_cmpset_u32(
>> -                                       (volatile uint32_t *)&rwlock->cnt,
>> -                                             cnt, cnt + 1);
>> -       }
>> +               /* Attempt to take another read lock */
>> +               gotit = odp_atomic32_cmp_and_swap_rlx(&rwlock->cnt,
>> +                                                     cnt, cnt + 1) == cnt;
>> +       } while (!gotit);
>>  }
>>
>>  void odp_rwlock_read_unlock(odp_rwlock_t *rwlock)
>>  {
>> -       odp_atomic_dec_u32((odp_atomic_u32_t *)(intptr_t)&rwlock->cnt);
>> +       /* Release one read lock by subtracting 1 */
>> +       odp_atomic32_add_rls(&rwlock->cnt, (uint32_t)-1);
>>  }
>>
>>  void odp_rwlock_write_lock(odp_rwlock_t *rwlock)
>>  {
>> -       int32_t cnt;
>> -       int is_locked = 0;
>> -
>> -       while (is_locked == 0) {
>> -               cnt = rwlock->cnt;
>> -               /* lock aquired, wait */
>> +       bool gotit;
>> +       do {
>> +               uint32_t cnt = odp_atomic32_load_acq(&rwlock->cnt);
>>                 if (cnt != 0) {
>> +                       /* Lock is busy */
>>                         odp_spin();
>>                         continue;
>>                 }
>> -               is_locked = odp_atomic_cmpset_u32(
>> -                                       (volatile uint32_t *)&rwlock->cnt,
>> -                                             0, -1);
>> -       }
>> +               /* Attempt to take write lock */
>> +               gotit = odp_atomic32_cmp_and_swap_rlx(&rwlock->cnt, 0,
>> +                                                     (uint32_t)-1) == 0;
>> +       } while (!gotit);
>>  }
>>
>>  void odp_rwlock_write_unlock(odp_rwlock_t *rwlock)
>>  {
>> -       odp_atomic_inc_u32((odp_atomic_u32_t *)(intptr_t)&rwlock->cnt);
>> +       /* Release the write lock by adding 1 */
>> +       odp_atomic32_add_rls(&rwlock->cnt, 1);
>>  }
>> diff --git a/platform/linux-generic/odp_thread.c b/platform/linux-generic/odp_thread.c
>> index b869b27..569b235 100644
>> --- a/platform/linux-generic/odp_thread.c
>> +++ b/platform/linux-generic/odp_thread.c
>> @@ -31,7 +31,7 @@ typedef struct {
>>
>>  typedef struct {
>>         thread_state_t   thr[ODP_CONFIG_MAX_THREADS];
>> -       odp_atomic_int_t num;
>> +       odp_atomic32_t   num;
>>
>>  } thread_globals_t;
>>
>> @@ -67,7 +67,7 @@ static int thread_id(void)
>>         int id;
>>         int cpu;
>>
>> -       id = odp_atomic_fetch_add_int(&thread_globals->num, 1);
>> +       id = (int)odp_atomic32_fetch_add_rlx(&thread_globals->num, 1);
>>
>>         if (id >= ODP_CONFIG_MAX_THREADS) {
>>                 ODP_ERR("Too many threads\n");
>> @@ -77,7 +77,7 @@ static int thread_id(void)
>>         cpu = sched_getcpu();
>>
>>         if (cpu < 0) {
>> -               ODP_ERR("getcpu failed\n");
>> +               ODP_ERR("sched_getcpu failed\n");
>>                 return -1;
>>         }
>>
>> diff --git a/platform/linux-generic/odp_ticketlock.c b/platform/linux-generic/odp_ticketlock.c
>> index be5b885..cadc0e0 100644
>> --- a/platform/linux-generic/odp_ticketlock.c
>> +++ b/platform/linux-generic/odp_ticketlock.c
>> @@ -12,9 +12,8 @@
>>
>>  void odp_ticketlock_init(odp_ticketlock_t *ticketlock)
>>  {
>> -       ticketlock->next_ticket = 0;
>> -       ticketlock->cur_ticket  = 0;
>> -       odp_sync_stores();
>> +       odp_atomic32_store_rlx(&ticketlock->next_ticket, 0);
>> +       odp_atomic32_store_rlx(&ticketlock->cur_ticket, 0);
>>  }
>>
>>
>> @@ -22,30 +21,14 @@ void odp_ticketlock_lock(odp_ticketlock_t *ticketlock)
>>  {
>>         uint32_t ticket;
>>
>> -       ticket = odp_atomic_fetch_inc_u32(&ticketlock->next_ticket);
>> +       ticket = odp_atomic32_fetch_add_rlx(&ticketlock->next_ticket, 1);
>>
>> -       while (ticket != ticketlock->cur_ticket)
>> +       while (ticket != odp_atomic32_load_acq(&ticketlock->cur_ticket))
>>                 odp_spin();
>> -
>> -       odp_mem_barrier();
>>  }
>>
>>
>>  void odp_ticketlock_unlock(odp_ticketlock_t *ticketlock)
>>  {
>> -       odp_sync_stores();
>> -
>> -       ticketlock->cur_ticket++;
>> -
>> -#if defined __OCTEON__
>> -       odp_sync_stores();
>> -#else
>> -       odp_mem_barrier();
>> -#endif
>> -}
>> -
>> -
>> -int odp_ticketlock_is_locked(odp_ticketlock_t *ticketlock)
>> -{
>> -       return ticketlock->cur_ticket != ticketlock->next_ticket;
>> +       odp_atomic32_add_rls(&ticketlock->cur_ticket, 1);
>>  }
>> diff --git a/platform/linux-generic/odp_timer.c b/platform/linux-generic/odp_timer.c
>> index 313c713..938429f 100644
>> --- a/platform/linux-generic/odp_timer.c
>> +++ b/platform/linux-generic/odp_timer.c
>> @@ -32,8 +32,8 @@ typedef struct {
>>
>>  typedef struct {
>>         int               allocated;
>> -       volatile int      active;
>> -       volatile uint64_t cur_tick;
>> +       odp_atomic32_t    active;
>> +       odp_atomic64_t    cur_tick;
>>         timer_t           timerid;
>>         odp_timer_t       timer_hdl;
>>         odp_buffer_pool_t pool;
>> @@ -150,16 +150,14 @@ static void notify_function(union sigval sigval)
>>
>>         timer = sigval.sival_ptr;
>>
>> -       if (timer->active == 0) {
>> +       if (odp_atomic32_load_rlx(&timer->active) == 0) {
>>                 ODP_DBG("Timer (%u) not active\n", timer->timer_hdl);
>>                 return;
>>         }
>>
>>         /* ODP_DBG("Tick\n"); */
>>
>> -       cur_tick = timer->cur_tick++;
>> -
>> -       odp_sync_stores();
>> +       cur_tick = odp_atomic64_fetch_add_rlx(&timer->cur_tick, 1);
>>
>>         tick = &timer->tick[cur_tick % MAX_TICKS];
>>
>> @@ -318,8 +316,7 @@ odp_timer_t odp_timer_create(const char *name, odp_buffer_pool_t pool,
>>                 timer->tick[i].list = NULL;
>>         }
>>
>> -       timer->active = 1;
>> -       odp_sync_stores();
>> +       odp_atomic32_store_rls(&timer->active, 1);
>>
>>         timer_start(timer);
>>
>> @@ -340,7 +337,7 @@ odp_timer_tmo_t odp_timer_absolute_tmo(odp_timer_t timer_hdl, uint64_t tmo_tick,
>>         id = (int)timer_hdl - 1;
>>         timer = &odp_timer.timer[id];
>>
>> -       cur_tick = timer->cur_tick;
>> +       cur_tick = odp_atomic64_load_rlx(&timer->cur_tick);
>>         if (tmo_tick <= cur_tick) {
>>                 ODP_DBG("timeout too close\n");
>>                 return ODP_TIMER_TMO_INVALID;
>> @@ -416,7 +413,7 @@ uint64_t odp_timer_current_tick(odp_timer_t timer_hdl)
>>         uint32_t id;
>>
>>         id = timer_hdl - 1;
>> -       return odp_timer.timer[id].cur_tick;
>> +       return odp_atomic64_load_rlx(&odp_timer.timer[id].cur_tick);
>>  }
>>
>>  odp_timeout_t odp_timeout_from_buffer(odp_buffer_t buf)
>> diff --git a/test/api_test/odp_atomic_test.c b/test/api_test/odp_atomic_test.c
>> index 9019d4f..4d27b32 100644
>> --- a/test/api_test/odp_atomic_test.c
>> +++ b/test/api_test/odp_atomic_test.c
>> @@ -10,17 +10,14 @@
>>  #include <odp_common.h>
>>  #include <odp_atomic_test.h>
>>
>> -static odp_atomic_int_t a32;
>> -static odp_atomic_u32_t a32u;
>> -static odp_atomic_u64_t a64u;
>> +static odp_atomic32_t a32u;
>> +static odp_atomic64_t a64u;
>>
>> -static odp_atomic_int_t numthrds;
>> +static odp_barrier_t barrier;
>>
>>  static const char * const test_name[] = {
>>         "dummy",
>>         "test atomic basic ops add/sub/inc/dec",
>> -       "test atomic inc/dec of signed word",
>> -       "test atomic add/sub of signed word",
>>         "test atomic inc/dec of unsigned word",
>>         "test atomic add/sub of unsigned word",
>>         "test atomic inc/dec of unsigned double word",
>> @@ -31,39 +28,29 @@ static struct timeval tv0[MAX_WORKERS], tv1[MAX_WORKERS];
>>
>>  static void usage(void)
>>  {
>> -       printf("\n./odp_atomic -t <testcase> -n <num of pthread>,\n\n"
>> +       printf("\n./odp_atomic -t <testcase> -n <num of threads>\n\n"
>>                "\t<testcase> is\n"
>>                "\t\t1 - Test mix(does inc,dec,add,sub on 32/64 bit)\n"
>> -              "\t\t2 - Test inc dec of signed word\n"
>> -              "\t\t3 - Test add sub of signed word\n"
>> -              "\t\t4 - Test inc dec of unsigned word\n"
>> -              "\t\t5 - Test add sub of unsigned word\n"
>> -              "\t\t6 - Test inc dec of double word\n"
>> -              "\t\t7 - Test add sub of double word\n"
>> -              "\t<num of pthread> is optional\n"
>> -              "\t\t<1 - 31> - no of pthreads to start\n"
>> +              "\t\t2 - Test inc dec of unsigned word\n"
>> +              "\t\t3 - Test add sub of unsigned word\n"
>> +              "\t\t4 - Test inc dec of double word\n"
>> +              "\t\t5 - Test add sub of double word\n"
>> +              "\t<num of thread> is optional\n"
>> +              "\t\t<1 - 31> - no of threads to start\n"
>>                "\t\tif user doesn't specify this option, then\n"
>> -              "\t\tno of pthreads created is equivalent to no of cores\n"
>> +              "\t\tno of threads created is equivalent to no of cores\n"
>>                "\t\tavailable in the system\n"
>>                "\tExample usage:\n"
>>                "\t\t./odp_atomic -t 2\n"
>>                "\t\t./odp_atomic -t 3 -n 12\n");
>>  }
>>
>> -void test_atomic_inc_32(void)
>> -{
>> -       int i;
>> -
>> -       for (i = 0; i < CNT; i++)
>> -               odp_atomic_inc_int(&a32);
>> -}
>> -
>>  void test_atomic_inc_u32(void)
>>  {
>>         int i;
>>
>>         for (i = 0; i < CNT; i++)
>> -               odp_atomic_inc_u32(&a32u);
>> +               odp_atomic32_add_rlx(&a32u, 1);
>>  }
>>
>>  void test_atomic_inc_64(void)
>> @@ -71,15 +58,7 @@ void test_atomic_inc_64(void)
>>         int i;
>>
>>         for (i = 0; i < CNT; i++)
>> -               odp_atomic_inc_u64(&a64u);
>> -}
>> -
>> -void test_atomic_dec_32(void)
>> -{
>> -       int i;
>> -
>> -       for (i = 0; i < CNT; i++)
>> -               odp_atomic_dec_int(&a32);
>> +               odp_atomic64_add_rlx(&a64u, 1);
>>  }
>>
>>  void test_atomic_dec_u32(void)
>> @@ -87,7 +66,7 @@ void test_atomic_dec_u32(void)
>>         int i;
>>
>>         for (i = 0; i < CNT; i++)
>> -               odp_atomic_dec_u32(&a32u);
>> +               odp_atomic32_add_rlx(&a32u, (uint32_t)-1);
>>  }
>>
>>  void test_atomic_dec_64(void)
>> @@ -95,15 +74,7 @@ void test_atomic_dec_64(void)
>>         int i;
>>
>>         for (i = 0; i < CNT; i++)
>> -               odp_atomic_dec_u64(&a64u);
>> -}
>> -
>> -void test_atomic_add_32(void)
>> -{
>> -       int i;
>> -
>> -       for (i = 0; i < (CNT / ADD_SUB_CNT); i++)
>> -               odp_atomic_fetch_add_int(&a32, ADD_SUB_CNT);
>> +               odp_atomic64_add_rlx(&a64u, (uint64_t)-1);
>>  }
>>
>>  void test_atomic_add_u32(void)
>> @@ -111,7 +82,7 @@ void test_atomic_add_u32(void)
>>         int i;
>>
>>         for (i = 0; i < (CNT / ADD_SUB_CNT); i++)
>> -               odp_atomic_fetch_add_u32(&a32u, ADD_SUB_CNT);
>> +               odp_atomic32_fetch_add_rlx(&a32u, ADD_SUB_CNT);
>>  }
>>
>>  void test_atomic_add_64(void)
>> @@ -119,15 +90,7 @@ void test_atomic_add_64(void)
>>         int i;
>>
>>         for (i = 0; i < (CNT / ADD_SUB_CNT); i++)
>> -               odp_atomic_fetch_add_u64(&a64u, ADD_SUB_CNT);
>> -}
>> -
>> -void test_atomic_sub_32(void)
>> -{
>> -       int i;
>> -
>> -       for (i = 0; i < (CNT / ADD_SUB_CNT); i++)
>> -               odp_atomic_fetch_sub_int(&a32, ADD_SUB_CNT);
>> +               odp_atomic64_fetch_add_rlx(&a64u, ADD_SUB_CNT);
>>  }
>>
>>  void test_atomic_sub_u32(void)
>> @@ -135,7 +98,7 @@ void test_atomic_sub_u32(void)
>>         int i;
>>
>>         for (i = 0; i < (CNT / ADD_SUB_CNT); i++)
>> -               odp_atomic_fetch_sub_u32(&a32u, ADD_SUB_CNT);
>> +               odp_atomic32_fetch_add_rlx(&a32u, -ADD_SUB_CNT);
>>  }
>>
>>  void test_atomic_sub_64(void)
>> @@ -143,19 +106,7 @@ void test_atomic_sub_64(void)
>>         int i;
>>
>>         for (i = 0; i < (CNT / ADD_SUB_CNT); i++)
>> -               odp_atomic_fetch_sub_u64(&a64u, ADD_SUB_CNT);
>> -}
>> -
>> -void test_atomic_inc_dec_32(void)
>> -{
>> -       test_atomic_inc_32();
>> -       test_atomic_dec_32();
>> -}
>> -
>> -void test_atomic_add_sub_32(void)
>> -{
>> -       test_atomic_add_32();
>> -       test_atomic_sub_32();
>> +               odp_atomic64_fetch_add_rlx(&a64u, -ADD_SUB_CNT);
>>  }
>>
>>  void test_atomic_inc_dec_u32(void)
>> @@ -188,11 +139,6 @@ void test_atomic_add_sub_64(void)
>>   */
>>  void test_atomic_basic(void)
>>  {
>> -       test_atomic_inc_32();
>> -       test_atomic_dec_32();
>> -       test_atomic_add_32();
>> -       test_atomic_sub_32();
>> -
>>         test_atomic_inc_u32();
>>         test_atomic_dec_u32();
>>         test_atomic_add_u32();
>> @@ -206,31 +152,24 @@ void test_atomic_basic(void)
>>
>>  void test_atomic_init(void)
>>  {
>> -       odp_atomic_init_int(&a32);
>> -       odp_atomic_init_u32(&a32u);
>> -       odp_atomic_init_u64(&a64u);
>> +       odp_atomic32_store_rlx(&a32u, 0);
>> +       odp_atomic64_store_rlx(&a64u, 0);
>>  }
>>
>>  void test_atomic_store(void)
>>  {
>> -       odp_atomic_store_int(&a32, S32_INIT_VAL);
>> -       odp_atomic_store_u32(&a32u, U32_INIT_VAL);
>> -       odp_atomic_store_u64(&a64u, U64_INIT_VAL);
>> +       odp_atomic32_store_rlx(&a32u, U32_INIT_VAL);
>> +       odp_atomic64_store_rlx(&a64u, U64_INIT_VAL);
>>  }
>>
>>  int test_atomic_validate(void)
>>  {
>> -       if (odp_atomic_load_int(&a32) != S32_INIT_VAL) {
>> -               ODP_ERR("Atomic signed 32 usual functions failed\n");
>> -               return -1;
>> -       }
>> -
>> -       if (odp_atomic_load_u32(&a32u) != U32_INIT_VAL) {
>> +       if (odp_atomic32_load_rlx(&a32u) != U32_INIT_VAL) {
>>                 ODP_ERR("Atomic u32 usual functions failed\n");
>>                 return -1;
>>         }
>>
>> -       if (odp_atomic_load_u64(&a64u) != U64_INIT_VAL) {
>> +       if (odp_atomic64_load_rlx(&a64u) != U64_INIT_VAL) {
>>                 ODP_ERR("Atomic u64 usual functions failed\n");
>>                 return -1;
>>         }
>> @@ -247,11 +186,8 @@ static void *run_thread(void *arg)
>>
>>         ODP_DBG("Thread %i starts\n", thr);
>>
>> -       odp_atomic_inc_int(&numthrds);
>> -
>> -       /* Wait here until all pthreads are created */
>> -       while (*(volatile int *)&numthrds < parg->numthrds)
>> -               ;
>> +       /* Wait here until all threads have arrived */
>> +       odp_barrier_sync(&barrier);
>>
>>         gettimeofday(&tv0[thr], NULL);
>>
>> @@ -259,12 +195,6 @@ static void *run_thread(void *arg)
>>         case TEST_MIX:
>>                 test_atomic_basic();
>>                 break;
>> -       case TEST_INC_DEC_S32:
>> -               test_atomic_inc_dec_32();
>> -               break;
>> -       case TEST_ADD_SUB_S32:
>> -               test_atomic_add_sub_32();
>> -               break;
>>         case TEST_INC_DEC_U32:
>>                 test_atomic_inc_dec_u32();
>>                 break;
>> @@ -327,7 +257,6 @@ int main(int argc, char *argv[])
>>         if (pthrdnum == 0)
>>                 pthrdnum = odp_sys_core_count();
>>
>> -       odp_atomic_init_int(&numthrds);
>>         test_atomic_init();
>>         test_atomic_store();
>>
>> @@ -342,6 +271,7 @@ int main(int argc, char *argv[])
>>                 usage();
>>                 goto err_exit;
>>         }
>> +       odp_barrier_init(&barrier, pthrdnum);
>>         odp_test_thread_create(run_thread, &thrdarg);
>>
>>         odp_test_thread_exit(&thrdarg);
>> diff --git a/test/api_test/odp_atomic_test.h b/test/api_test/odp_atomic_test.h
>> index 7814da5..aaa9d34 100644
>> --- a/test/api_test/odp_atomic_test.h
>> +++ b/test/api_test/odp_atomic_test.h
>> @@ -18,14 +18,11 @@
>>  #define ADD_SUB_CNT    5
>>
>>  #define        CNT 500000
>> -#define        S32_INIT_VAL    (1UL << 10)
>>  #define        U32_INIT_VAL    (1UL << 10)
>>  #define        U64_INIT_VAL    (1ULL << 33)
>>
>>  typedef enum {
>>         TEST_MIX = 1, /* Must be first test case num */
>> -       TEST_INC_DEC_S32,
>> -       TEST_ADD_SUB_S32,
>>         TEST_INC_DEC_U32,
>>         TEST_ADD_SUB_U32,
>>         TEST_INC_DEC_64,
>> @@ -34,16 +31,10 @@ typedef enum {
>>  } odp_test_atomic_t;
>>
>>
>> -void test_atomic_inc_dec_32(void);
>> -void test_atomic_add_sub_32(void);
>>  void test_atomic_inc_dec_u32(void);
>>  void test_atomic_add_sub_u32(void);
>>  void test_atomic_inc_dec_64(void);
>>  void test_atomic_add_sub_64(void);
>> -void test_atomic_inc_32(void);
>> -void test_atomic_dec_32(void);
>> -void test_atomic_add_32(void);
>> -void test_atomic_sub_32(void);
>>  void test_atomic_inc_u32(void);
>>  void test_atomic_dec_u32(void);
>>  void test_atomic_add_u32(void);
>> --
>> 1.9.1
>>
Ola Liljedahl Oct. 16, 2014, 8:42 a.m. UTC | #3
These suffixes stand for relaxed, release and acquire. They are important
concepts in the C11/C++11 atomics and memory model. It is all about what
(observable) ordering is required in multithreaded (multiprocessor)
environments: which accesses must have happens-before and happens-after
relationships with each other.

A relaxed access is independent of all other accesses and needs no
synchronization.
An acquire access denotes some type of shared resource acquisition. Loads
and stores after the acquire load must be prevented from moving up (i.e.
from being reordered before it), whether by the compiler or by the HW; this
is a half-sided barrier. Loads and stores from before the acquire are
allowed to move down.
A release access denotes the release of a shared resource. Loads and stores
before the release store must be prevented from moving down (i.e. from
being reordered after it), whether by the compiler or by the HW; this is
also a half-sided barrier. Loads and stores after the release are allowed
to move up.
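
To make this concrete, here is a minimal sketch in plain C11 <stdatomic.h>
(not the ODP wrappers; the names 'data' and 'ready' are made up for
illustration) of the classic message-passing pattern that a release/acquire
pair enables:

#include <stdatomic.h>
#include <stdint.h>

static uint32_t data;     /* ordinary (non-atomic) payload */
static atomic_uint ready; /* written with release, read with acquire */

void producer(void)
{
        data = 42; /* plain store to the payload */
        /* Release store: the store to 'data' cannot move below this */
        atomic_store_explicit(&ready, 1, memory_order_release);
}

uint32_t consumer(void)
{
        /* Acquire load: the load of 'data' cannot move above this */
        while (atomic_load_explicit(&ready, memory_order_acquire) == 0)
                ; /* spin until the flag is set */
        return data; /* guaranteed to observe 42 */
}

If the consumer's acquire load observes the value written by the producer's
release store, then everything before the release (including the store to
'data') happens-before everything after the acquire.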

Code that uses atomic variables (e.g. for implementing shared memory data
structures such as locks and rings) must know which type of atomic
operation is required. The ODP ticket lock implementation makes a good
example:
void odp_ticketlock_lock(odp_ticketlock_t *ticketlock)
{
        uint32_t ticket;

        ticket = odp_atomic32_fetch_add_rlx(&ticketlock->next_ticket, 1);
        //the ticket counter does not protect anything so incrementing it
        //can be relaxed

        while (ticket != odp_atomic32_load_acq(&ticketlock->cur_ticket))
        //acquiring the currently served position will include (a half-sided)
        //barrier so as to contain accesses from inside the critical section
                odp_spin();
}


void odp_ticketlock_unlock(odp_ticketlock_t *ticketlock)
{
        odp_atomic32_add_rls(&ticketlock->cur_ticket, 1);
        //releasing the currently served position will also include
        //(a half-sided) barrier to contain accesses from inside the
        //critical section
}
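
For comparison, a sketch of the same lock written directly against C11
<stdatomic.h> (the type and function names below are made up for
illustration, they are not the ODP definitions) maps one-to-one onto the
wrappers above:

#include <stdatomic.h>
#include <stdint.h>

typedef struct {
        atomic_uint next_ticket; /* next ticket to hand out */
        atomic_uint cur_ticket;  /* ticket currently being served */
} my_ticketlock_t;

static void my_ticketlock_lock(my_ticketlock_t *lock)
{
        /* Relaxed: the ticket counter itself protects nothing */
        unsigned int ticket = atomic_fetch_add_explicit(&lock->next_ticket,
                                                        1,
                                                        memory_order_relaxed);
        /* Acquire: critical-section accesses cannot move above this load */
        while (atomic_load_explicit(&lock->cur_ticket,
                                    memory_order_acquire) != ticket)
                ; /* spin */
}

static void my_ticketlock_unlock(my_ticketlock_t *lock)
{
        /* Release: critical-section accesses cannot move below this store */
        atomic_fetch_add_explicit(&lock->cur_ticket, 1, memory_order_release);
}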

Implementations may use barriers of some kind inside these primitive atomic
operations. Some architectures don't even need explicit barriers as they
have memory access instructions (e.g. load and store) with acquire and
release semantics. Full barriers are heavy (and semantically overkill)
and you want to avoid them if possible. Using full barriers for updates to
e.g. global statistics counters will hurt performance; such updates can
be relaxed (they still need to be atomic of course).
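
For example, a per-packet statistics counter needs atomicity but no
ordering at all. A sketch in C11 (the counter name is hypothetical):

#include <stdatomic.h>

static atomic_ullong rx_packets; /* hypothetical global statistics counter */

void count_packet(void)
{
        /* Relaxed: the increment is atomic but imposes no ordering, so no
         * full barrier is needed on any architecture */
        atomic_fetch_add_explicit(&rx_packets, 1, memory_order_relaxed);
}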

See these two good presentations by Herb Sutter of the C++ standards committee:
http://channel9.msdn.com/Shows/Going+Deep/Cpp-and-Beyond-2012-Herb-Sutter-atomic-Weapons-1-of-2
http://channel9.msdn.com/Shows/Going+Deep/Cpp-and-Beyond-2012-Herb-Sutter-atomic-Weapons-2-of-2

On 16 October 2014 01:18, Bill Fischofer <bill.fischofer@linaro.org> wrote:

> Why the odd suffixes (_rlx, _rls, _acq) on these routines?  If they are
> intended to be generic wouldn't omitting these be better?
>
> On Wed, Oct 15, 2014 at 4:46 PM, Ola Liljedahl <ola.liljedahl@linaro.org>
> wrote:
>
>> Signed-off-by: Ola Liljedahl <ola.liljedahl@linaro.org>
>> ---
>> Implementation of C11-based memory model for atomic operations.
>> Attempt to remove all explicit memory barriers (odp_sync_stores) from
>> code that
>> implements multithreaded synchronization primitives (e.g. locks,
>> barriers).
>> Rewrote such primitives to use the new atomic operations.
>> Optimized support for ARMv6/v7, ARMv8(aarch64), x86_64, MIPS64/OCTEON
>> Other architectures will fall back to GCC __sync builtins which often
>> include
>> unnecessarily heavy barrier/sync operations (always sequentially
>> consistent).
>> Fixed race conditions in odp_barrier_sync() (non-atomic wrap of counter)
>> and
>> odp_ring enqueue/dequeue (need release barrier but only had compiler
>> barrier).
>>
>>  example/generator/odp_generator.c                  |  43 +-
>>  example/ipsec/odp_ipsec.c                          |   2 +-
>>  example/odp_example/odp_example.c                  |   2 +-
>>  example/timer/odp_timer_test.c                     |   2 +-
>>  helper/include/odph_ring.h                         |   8 +-
>>  platform/linux-generic/include/api/odp_atomic.h    | 820
>> ++++++++++++---------
>>  platform/linux-generic/include/api/odp_barrier.h   |  10 +-
>>  platform/linux-generic/include/api/odp_rwlock.h    |  20 +-
>>  .../linux-generic/include/api/odp_ticketlock.h     |   4 +-
>>  .../linux-generic/include/odp_buffer_internal.h    |   2 +-
>>  platform/linux-generic/odp_barrier.c               |  43 +-
>>  platform/linux-generic/odp_buffer.c                |   3 +-
>>  platform/linux-generic/odp_crypto.c                |   4 +-
>>  platform/linux-generic/odp_queue.c                 |   7 +-
>>  platform/linux-generic/odp_ring.c                  |  86 ++-
>>  platform/linux-generic/odp_rwlock.c                |  46 +-
>>  platform/linux-generic/odp_thread.c                |   6 +-
>>  platform/linux-generic/odp_ticketlock.c            |  27 +-
>>  platform/linux-generic/odp_timer.c                 |  17 +-
>>  test/api_test/odp_atomic_test.c                    | 126 +---
>>  test/api_test/odp_atomic_test.h                    |   9 -
>>  21 files changed, 651 insertions(+), 636 deletions(-)
>>
>> diff --git a/example/generator/odp_generator.c
>> b/example/generator/odp_generator.c
>> index eb8b340..cf2d77b 100644
>> --- a/example/generator/odp_generator.c
>> +++ b/example/generator/odp_generator.c
>> @@ -62,10 +62,10 @@ typedef struct {
>>   * counters
>>  */
>>  static struct {
>> -       odp_atomic_u64_t seq;   /**< ip seq to be send */
>> -       odp_atomic_u64_t ip;    /**< ip packets */
>> -       odp_atomic_u64_t udp;   /**< udp packets */
>> -       odp_atomic_u64_t icmp;  /**< icmp packets */
>> +       odp_atomic64_t seq;     /**< ip seq to be sent */
>> +       odp_atomic64_t ip;      /**< ip packets */
>> +       odp_atomic64_t udp;     /**< udp packets */
>> +       odp_atomic64_t icmp;    /**< icmp packets */
>>  } counters;
>>
>>  /** * Thread specific arguments
>> @@ -201,7 +201,7 @@ static void pack_udp_pkt(odp_buffer_t obuf)
>>         ip->tot_len = odp_cpu_to_be_16(args->appl.payload +
>> ODPH_UDPHDR_LEN +
>>                                        ODPH_IPV4HDR_LEN);
>>         ip->proto = ODPH_IPPROTO_UDP;
>> -       seq = odp_atomic_fetch_add_u64(&counters.seq, 1) % 0xFFFF;
>> +       seq = odp_atomic64_fetch_add_rlx(&counters.seq, 1) % 0xFFFF;
>>         ip->id = odp_cpu_to_be_16(seq);
>>         ip->chksum = 0;
>>         odph_ipv4_csum_update(pkt);
>> @@ -258,7 +258,7 @@ static void pack_icmp_pkt(odp_buffer_t obuf)
>>         ip->tot_len = odp_cpu_to_be_16(args->appl.payload +
>> ODPH_ICMPHDR_LEN +
>>                                        ODPH_IPV4HDR_LEN);
>>         ip->proto = ODPH_IPPROTO_ICMP;
>> -       seq = odp_atomic_fetch_add_u64(&counters.seq, 1) % 0xffff;
>> +       seq = odp_atomic64_fetch_add_rlx(&counters.seq, 1) % 0xffff;
>>         ip->id = odp_cpu_to_be_16(seq);
>>         ip->chksum = 0;
>>         odph_ipv4_csum_update(pkt);
>> @@ -334,13 +334,15 @@ static void *gen_send_thread(void *arg)
>>                 }
>>
>>                 if (args->appl.interval != 0) {
>> +                       uint64_t seq =
>> odp_atomic64_load_rlx(&counters.seq);
>>                         printf("  [%02i] send pkt no:%ju seq %ju\n",
>> -                              thr, counters.seq, counters.seq%0xffff);
>> +                              thr, seq, seq%0xffff);
>>                         /* TODO use odp timer */
>>                         usleep(args->appl.interval * 1000);
>>                 }
>> -               if (args->appl.number != -1 && counters.seq
>> -                   >= (unsigned int)args->appl.number) {
>> +               if (args->appl.number != -1 &&
>> +                   odp_atomic64_load_rlx(&counters.seq) >=
>> +                   (unsigned int)args->appl.number) {
>>                         break;
>>                 }
>>         }
>> @@ -348,7 +350,8 @@ static void *gen_send_thread(void *arg)
>>         /* receive number of reply pks until timeout */
>>         if (args->appl.mode == APPL_MODE_PING && args->appl.number > 0) {
>>                 while (args->appl.timeout >= 0) {
>> -                       if (counters.icmp >= (unsigned
>> int)args->appl.number)
>> +                       if (odp_atomic64_load_rlx(&counters.icmp) >=
>> +                           (unsigned int)args->appl.number)
>>                                 break;
>>                         /* TODO use odp timer */
>>                         sleep(1);
>> @@ -358,10 +361,12 @@ static void *gen_send_thread(void *arg)
>>
>>         /* print info */
>>         if (args->appl.mode == APPL_MODE_UDP) {
>> -               printf("  [%02i] total send: %ju\n", thr, counters.seq);
>> +               printf("  [%02i] total send: %ju\n", thr,
>> +                      odp_atomic64_load_rlx(&counters.seq));
>>         } else if (args->appl.mode == APPL_MODE_PING) {
>>                 printf("  [%02i] total send: %ju total receive: %ju\n",
>> -                      thr, counters.seq, counters.icmp);
>> +                      thr, odp_atomic64_load_rlx(&counters.seq),
>> +                      odp_atomic64_load_rlx(&counters.icmp));
>>         }
>>         return arg;
>>  }
>> @@ -395,7 +400,7 @@ static void print_pkts(int thr, odp_packet_t
>> pkt_tbl[], unsigned len)
>>                 if (!odp_packet_inflag_ipv4(pkt))
>>                         continue;
>>
>> -               odp_atomic_inc_u64(&counters.ip);
>> +               odp_atomic64_add_rlx(&counters.ip, 1);
>>                 rlen += sprintf(msg, "receive Packet proto:IP ");
>>                 buf = odp_buffer_addr(odp_buffer_from_packet(pkt));
>>                 ip = (odph_ipv4hdr_t *)(buf + odp_packet_l3_offset(pkt));
>> @@ -405,7 +410,7 @@ static void print_pkts(int thr, odp_packet_t
>> pkt_tbl[], unsigned len)
>>
>>                 /* udp */
>>                 if (ip->proto == ODPH_IPPROTO_UDP) {
>> -                       odp_atomic_inc_u64(&counters.udp);
>> +                       odp_atomic64_add_rlx(&counters.udp, 1);
>>                         udp = (odph_udphdr_t *)(buf + offset);
>>                         rlen += sprintf(msg + rlen, "UDP payload %d ",
>>                                         odp_be_to_cpu_16(udp->length) -
>> @@ -417,7 +422,7 @@ static void print_pkts(int thr, odp_packet_t
>> pkt_tbl[], unsigned len)
>>                         icmp = (odph_icmphdr_t *)(buf + offset);
>>                         /* echo reply */
>>                         if (icmp->type == ICMP_ECHOREPLY) {
>> -                               odp_atomic_inc_u64(&counters.icmp);
>> +                               odp_atomic64_add_rlx(&counters.icmp, 1);
>>                                 memcpy(&tvsend, buf + offset +
>> ODPH_ICMPHDR_LEN,
>>                                        sizeof(struct timeval));
>>                                 /* TODO This should be changed to use an
>> @@ -530,10 +535,10 @@ int main(int argc, char *argv[])
>>         }
>>
>>         /* init counters */
>> -       odp_atomic_init_u64(&counters.seq);
>> -       odp_atomic_init_u64(&counters.ip);
>> -       odp_atomic_init_u64(&counters.udp);
>> -       odp_atomic_init_u64(&counters.icmp);
>> +       odp_atomic64_store_rlx(&counters.seq, 0);
>> +       odp_atomic64_store_rlx(&counters.ip, 0);
>> +       odp_atomic64_store_rlx(&counters.udp, 0);
>> +       odp_atomic64_store_rlx(&counters.icmp, 0);
>>
>>         /* Reserve memory for args from shared mem */
>>         shm = odp_shm_reserve("shm_args", sizeof(args_t),
>> diff --git a/example/ipsec/odp_ipsec.c b/example/ipsec/odp_ipsec.c
>> index 2f2dc19..76c27d0 100644
>> --- a/example/ipsec/odp_ipsec.c
>> +++ b/example/ipsec/odp_ipsec.c
>> @@ -1223,7 +1223,7 @@ main(int argc, char *argv[])
>>         printf("Num worker threads: %i\n", num_workers);
>>
>>         /* Create a barrier to synchronize thread startup */
>> -       odp_barrier_init_count(&sync_barrier, num_workers);
>> +       odp_barrier_init(&sync_barrier, num_workers);
>>
>>         /*
>>          * By default core #0 runs Linux kernel background tasks.
>> diff --git a/example/odp_example/odp_example.c
>> b/example/odp_example/odp_example.c
>> index 0e9aa3d..c473395 100644
>> --- a/example/odp_example/odp_example.c
>> +++ b/example/odp_example/odp_example.c
>> @@ -1120,7 +1120,7 @@ int main(int argc, char *argv[])
>>         odp_shm_print_all();
>>
>>         /* Barrier to sync test case execution */
>> -       odp_barrier_init_count(&globals->barrier, num_workers);
>> +       odp_barrier_init(&globals->barrier, num_workers);
>>
>>         if (args.proc_mode) {
>>                 int ret;
>> diff --git a/example/timer/odp_timer_test.c
>> b/example/timer/odp_timer_test.c
>> index 78b2ae2..dfbeae9 100644
>> --- a/example/timer/odp_timer_test.c
>> +++ b/example/timer/odp_timer_test.c
>> @@ -372,7 +372,7 @@ int main(int argc, char *argv[])
>>         printf("\n");
>>
>>         /* Barrier to sync test case execution */
>> -       odp_barrier_init_count(&test_barrier, num_workers);
>> +       odp_barrier_init(&test_barrier, num_workers);
>>
>>         /* Create and launch worker threads */
>>         odph_linux_pthread_create(thread_tbl, num_workers, first_core,
>> diff --git a/helper/include/odph_ring.h b/helper/include/odph_ring.h
>> index 76c1db8..5e78b34 100644
>> --- a/helper/include/odph_ring.h
>> +++ b/helper/include/odph_ring.h
>> @@ -138,8 +138,8 @@ typedef struct odph_ring {
>>                 uint32_t sp_enqueue;     /* True, if single producer. */
>>                 uint32_t size;           /* Size of ring. */
>>                 uint32_t mask;           /* Mask (size-1) of ring. */
>> -               uint32_t head;          /* Producer head. */
>> -               uint32_t tail;          /* Producer tail. */
>> +               odp_atomic32_t head;    /* Producer head. */
>> +               odp_atomic32_t tail;    /* Producer tail. */
>>         } prod ODP_ALIGNED_CACHE;
>>
>>         /** @private Consumer */
>> @@ -147,8 +147,8 @@ typedef struct odph_ring {
>>                 uint32_t sc_dequeue;     /* True, if single consumer. */
>>                 uint32_t size;           /* Size of the ring. */
>>                 uint32_t mask;           /* Mask (size-1) of ring. */
>> -               uint32_t head;          /* Consumer head. */
>> -               uint32_t tail;          /* Consumer tail. */
>> +               odp_atomic32_t head;    /* Consumer head. */
>> +               odp_atomic32_t tail;    /* Consumer tail. */
>>         } cons ODP_ALIGNED_CACHE;
>>
>>         /** @private Memory space of ring starts here. */
>> diff --git a/platform/linux-generic/include/api/odp_atomic.h
>> b/platform/linux-generic/include/api/odp_atomic.h
>> index 0cc4cf4..89f183c 100644
>> --- a/platform/linux-generic/include/api/odp_atomic.h
>> +++ b/platform/linux-generic/include/api/odp_atomic.h
>> @@ -4,463 +4,559 @@
>>   * SPDX-License-Identifier:     BSD-3-Clause
>>   */
>>
>> -
>>  /**
>>   * @file
>>   *
>> - * ODP atomic operations
>> + * ODP atomic types and operations, semantically a subset of C11 atomics.
>> + * Scalar variable wrapped in a struct to avoid accessing scalar directly
>> + * without using the required access functions.
>> + * Atomic functions must be used to operate on atomic variables!
>>   */
>>
>>  #ifndef ODP_ATOMIC_H_
>>  #define ODP_ATOMIC_H_
>>
>> +#include <stdint.h>
>> +#include <odp_align.h>
>> +#include <odp_hints.h>
>> +
>>  #ifdef __cplusplus
>>  extern "C" {
>>  #endif
>>
>> -
>> -#include <odp_std_types.h>
>> -
>> -
>> -/**
>> - * Atomic integer
>> - */
>> -typedef volatile int32_t odp_atomic_int_t;
>> -
>>  /**
>> - * Atomic unsigned integer 64 bits
>> + * 32-bit (unsigned) atomic type
>>   */
>> -typedef volatile uint64_t odp_atomic_u64_t;
>> +typedef struct {
>> +       uint32_t v; /**< Actual storage for the atomic variable */
>> +} odp_atomic32_t
>> +ODP_ALIGNED(sizeof(uint32_t)); /* Enforce alignment! */
>>
>>  /**
>> - * Atomic unsigned integer 32 bits
>> + * 64-bit (unsigned) atomic type
>>   */
>> -typedef volatile uint32_t odp_atomic_u32_t;
>> -
>> +typedef struct {
>> +       uint64_t v; /**< Actual storage for the atomic variable */
>> +} odp_atomic64_t
>> +ODP_ALIGNED(sizeof(uint64_t)); /* Enforce alignment! */
>>
>> -/**
>> - * Initialize atomic integer
>> - *
>> - * @param ptr    An integer atomic variable
>> - *
>> - * @note The operation is not synchronized with other threads
>> - */
>> -static inline void odp_atomic_init_int(odp_atomic_int_t *ptr)
>> -{
>> -       *ptr = 0;
>> -}
>> -
>> -/**
>> - * Load value of atomic integer
>> - *
>> - * @param ptr    An atomic variable
>> - *
>> - * @return atomic integer value
>> - *
>> - * @note The operation is not synchronized with other threads
>> - */
>> -static inline int odp_atomic_load_int(odp_atomic_int_t *ptr)
>> -{
>> -       return *ptr;
>> -}
>>
>> +/*****************************************************************************
>> + * Just a few helpers
>> +
>> *****************************************************************************/
>>
>> -/**
>> - * Store value to atomic integer
>> - *
>> - * @param ptr        An atomic variable
>> - * @param new_value  Store new_value to a variable
>> - *
>> - * @note The operation is not synchronized with other threads
>> - */
>> -static inline void odp_atomic_store_int(odp_atomic_int_t *ptr, int
>> new_value)
>> -{
>> -       *ptr = new_value;
>> -}
>> -
>> -/**
>> - * Fetch and add atomic integer
>> - *
>> - * @param ptr    An atomic variable
>> - * @param value  A value to be added to the variable
>> - *
>> - * @return Value of the variable before the operation
>> - */
>> -static inline int odp_atomic_fetch_add_int(odp_atomic_int_t *ptr, int
>> value)
>> -{
>> -       return __sync_fetch_and_add(ptr, value);
>> -}
>> -
>> -/**
>> - * Fetch and subtract atomic integer
>> - *
>> - * @param ptr    An atomic integer variable
>> - * @param value  A value to be subtracted from the variable
>> - *
>> - * @return Value of the variable before the operation
>> - */
>> -static inline int odp_atomic_fetch_sub_int(odp_atomic_int_t *ptr, int
>> value)
>> -{
>> -       return __sync_fetch_and_sub(ptr, value);
>> -}
>> -
>> -/**
>> - * Fetch and increment atomic integer by 1
>> - *
>> - * @param ptr    An atomic variable
>> - *
>> - * @return Value of the variable before the operation
>> - */
>> -static inline int odp_atomic_fetch_inc_int(odp_atomic_int_t *ptr)
>> -{
>> -       return odp_atomic_fetch_add_int(ptr, 1);
>> -}
>> -
>> -/**
>> - * Increment atomic integer by 1
>> - *
>> - * @param ptr    An atomic variable
>> - *
>> - */
>> -static inline void odp_atomic_inc_int(odp_atomic_int_t *ptr)
>> -{
>> -       odp_atomic_fetch_add_int(ptr, 1);
>> -}
>> -
>> -/**
>> - * Fetch and decrement atomic integer by 1
>> - *
>> - * @param ptr    An atomic int variable
>> - *
>> - * @return Value of the variable before the operation
>> - */
>> -static inline int odp_atomic_fetch_dec_int(odp_atomic_int_t *ptr)
>> -{
>> -       return odp_atomic_fetch_sub_int(ptr, 1);
>> -}
>> -
>> -/**
>> - * Decrement atomic integer by 1
>> - *
>> - * @param ptr    An atomic variable
>> - *
>> - */
>> -static inline void odp_atomic_dec_int(odp_atomic_int_t *ptr)
>> -{
>> -       odp_atomic_fetch_sub_int(ptr, 1);
>> -}
>> +#ifdef __OCTEON__
>> +/* OCTEON Write Memory Barrier */
>> +#define COMPILER_HW_BARRIER() __asm __volatile( \
>> +       /* Double syncw to work around errata */ \
>> +       ".set push\n\t.set arch=octeon\n\tsyncw\n\tsyncw\n\t.set pop" \
>> +       : : : "memory")
>> +/* syncw is also used to flush the write buffer which makes stores
>> visible
>> + * quicker which should be beneficial to release operations */
>> +#define OCTEON_FLUSH() __asm __volatile( \
>> +       ".set push\n\t.set arch=octeon\n\tsyncw\n\t.set pop" \
>> +       : : : "memory")
>> +#else
>> +/* __sync_synchronize() generates the right insn for ARMv6t2 and ARMv7-a
>> */
>> +/** Compiler and hardware full memory barrier */
>> +#define COMPILER_HW_BARRIER() __sync_synchronize()
>> +/** Flush write buffer on OCTEON */
>> +#define OCTEON_FLUSH() (void)0
>> +#endif
>>
>> -/**
>> - * Initialize atomic uint32
>> - *
>> - * @param ptr    An atomic variable
>> - *
>> - * @note The operation is not synchronized with other threads
>> - */
>> -static inline void odp_atomic_init_u32(odp_atomic_u32_t *ptr)
>> -{
>> -       *ptr = 0;
>> -}
>> +/** Compiler memory barrier */
>> +#define COMPILER_BARRIER() __asm __volatile("" : : : "memory")
>>
>> -/**
>> - * Load value of atomic uint32
>> - *
>> - * @param ptr    An atomic variable
>> - *
>> - * @return atomic uint32 value
>> - *
>> - * @note The operation is not synchronized with other threads
>> - */
>> -static inline uint32_t odp_atomic_load_u32(odp_atomic_u32_t *ptr)
>> -{
>> -       return *ptr;
>> -}
>>
>> +/*****************************************************************************
>> + * Operations on 32-bit atomics
>> + * odp_atomic32_load_rlx
>> + * odp_atomic32_store_rlx
>> + * odp_atomic32_load_acq
>> + * odp_atomic32_store_rls
>> + * odp_atomic32_cmp_and_swap_rlx - return old value
>> + * odp_atomic32_fetch_add_rlx - return old value
>> + * odp_atomic32_fetch_add_rls - return old value
>> + * odp_atomic32_add_rlx - no return value
>> + * odp_atomic32_add_rls - no return value
>> +
>> *****************************************************************************/
>>
>>  /**
>> - * Store value to atomic uint32
>> + * Relaxed atomic load of 32-bit atomic variable
>> + * @note Relaxed memory model, no barriers.
>>   *
>> - * @param ptr        An atomic variable
>> - * @param new_value  Store new_value to a variable
>> + * @param ptr   Pointer to a 32-bit atomic variable
>>   *
>> - * @note The operation is not synchronized with other threads
>> + * @return Value of the variable
>>   */
>> -static inline void odp_atomic_store_u32(odp_atomic_u32_t *ptr,
>> -                                       uint32_t new_value)
>> +static inline uint32_t odp_atomic32_load_rlx(const odp_atomic32_t *ptr)
>>  {
>> -       *ptr = new_value;
>> +       uint32_t val;
>> +       COMPILER_BARRIER();
>> +       /* Read of aligned word is atomic */
>> +       val = ptr->v;
>> +       COMPILER_BARRIER();
>> +       return val;
>>  }
>>
>>  /**
>> - * Fetch and add atomic uint32
>> - *
>> - * @param ptr    An atomic variable
>> - * @param value  A value to be added to the variable
>> + * Relaxed atomic store of 32-bit atomic variable
>> + * @note Relaxed memory model, no barriers.
>>   *
>> - * @return Value of the variable before the operation
>> + * @param ptr   Pointer to a 32-bit atomic variable
>> + * @param val   Value to write to the variable
>>   */
>> -static inline uint32_t odp_atomic_fetch_add_u32(odp_atomic_u32_t *ptr,
>> -                                               uint32_t value)
>> +static inline void odp_atomic32_store_rlx(odp_atomic32_t *ptr, uint32_t
>> val)
>>  {
>> -       return __sync_fetch_and_add(ptr, value);
>> +       COMPILER_BARRIER();
>> +       /* Write of aligned word is atomic */
>> +       ptr->v = val;
>> +       COMPILER_BARRIER();
>>  }
>>
>>  /**
>> - * Fetch and subtract uint32
>> + * Atomic load-acquire of 32-bit atomic variable
>> + * @note SC-load-acquire barrier, later accesses cannot move before
>> + * the load-acquire access.
>>   *
>> - * @param ptr    An atomic variable
>> - * @param value  A value to be sub to the variable
>> + * @param ptr   Pointer to a 32-bit atomic variable
>>   *
>> - * @return Value of the variable before the operation
>> + * @return Value of the variable
>>   */
>> -static inline uint32_t odp_atomic_fetch_sub_u32(odp_atomic_u32_t *ptr,
>> -                                               uint32_t value)
>> +static inline uint32_t odp_atomic32_load_acq(const odp_atomic32_t *ptr)
>>  {
>> -       return __sync_fetch_and_sub(ptr, value);
>> +#if defined __aarch64__
>> +       uint32_t val;
>> +       __asm __volatile("ldar %w0, [%1]"
>> +                : "=&r"(val)
>> +                                : "r"(&ptr->v)
>> +                                : "memory");
>> +       return val;
>> +#elif defined __arm__  || defined __mips64__ || defined __x86_64__
>> +       /* Read of aligned word is atomic */
>> +       uint32_t val = ptr->v;
>> +       /* To prevent later accesses from moving up */
>> +       /* FIXME: Herb Sutter claims HW barrier not needed on x86? */
>> +       COMPILER_HW_BARRIER();
>> +       return val;
>> +#else
>> +#warning odp_atomic32_load_acq() may not be efficiently implemented
>> +       /* Assume read of aligned word is atomic */
>> +       uint32_t val = ptr->v;
>> +       /* To prevent later accesses from moving up */
>> +       COMPILER_HW_BARRIER();
>> +       return val;
>> +#endif
>>  }
>>
>>  /**
>> - * Fetch and increment atomic uint32 by 1
>> - *
>> - * @param ptr    An atomic variable
>> - *
>> - * @return Value of the variable before the operation
>> - */
>> -#if defined __OCTEON__
>> -
>> -static inline uint32_t odp_atomic_fetch_inc_u32(odp_atomic_u32_t *ptr)
>> -{
>> -       uint32_t ret;
>> -
>> -       __asm__ __volatile__ ("syncws");
>> -       __asm__ __volatile__ ("lai %0,(%2)" : "=r" (ret), "+m" (ptr) :
>> -                             "r" (ptr));
>> -
>> -       return ret;
>> -}
>> -
>> + * Atomic store-release of 32-bit atomic variable
>> + * @note SC-store-release barrier, earlier accesses cannot move after
>> + * store-release access.
>> + *
>> + * @param ptr  Pointer to a 32-bit atomic variable
>> + * @param val  Value to write to the atomic variable
>> + */
>> +static inline void odp_atomic32_store_rls(odp_atomic32_t *ptr, uint32_t
>> val)
>> +{
>> +#if defined __arm__ /* A32/T32 ISA */
>> +       /* Compiler and HW barrier to prevent earlier accesses from moving
>> +        * down */
>> +       COMPILER_HW_BARRIER();
>> +       /* Write of aligned word is atomic */
>> +       ptr->v = val;
>> +       /* Compiler and HW barrier to prevent this store from moving down
>> after
>> +        * a later load-acquire and thus create overlapping critical
>> sections.
>> +        * Herb Sutter thinks this is needed */
>> +       COMPILER_HW_BARRIER();
>> +#elif defined __aarch64__
>> +       __asm __volatile("stlr %w0, [%1]"
>> +                :
>> +                : "r"(val), "r"(&ptr->v)
>> +                                : "memory");
>> +#elif defined __mips64__
>> +       /* Compiler and HW barrier to prevent earlier accesses from moving
>> +        * down */
>> +       COMPILER_HW_BARRIER();
>> +       /* Write of aligned word is atomic */
>> +       ptr->v = val;
>> +       /* Compiler and HW barrier to prevent this store from moving down
>> after
>> +        * a later load-acquire and thus create overlapping critical
>> sections.
>> +        * Herb Sutter thinks this is needed */
>> +       COMPILER_HW_BARRIER();
>> +#elif defined __x86_64__
>> +       /* This is actually an atomic exchange operation */
>> +       /* Generates good code on x86_64 */
>> +       (void)__sync_lock_test_and_set(&ptr->v, val);
>>  #else
>> -
>> -static inline uint32_t odp_atomic_fetch_inc_u32(odp_atomic_u32_t *ptr)
>> -{
>> -       return odp_atomic_fetch_add_u32(ptr, 1);
>> -}
>> -
>> +#warning odp_atomic32_store_rls() may not be efficiently implemented
>> +       /* This is actually an atomic exchange operation */
>> +       (void)__sync_lock_test_and_set(&ptr->v, val);
>>  #endif
>> -
>> -/**
>> - * Increment atomic uint32 by 1
>> - *
>> - * @param ptr    An atomic variable
>> - *
>> - */
>> -static inline void odp_atomic_inc_u32(odp_atomic_u32_t *ptr)
>> -{
>> -       odp_atomic_fetch_add_u32(ptr, 1);
>>  }
>>
>> -/**
>> - * Fetch and decrement uint32 by 1
>> - *
>> - * @param ptr    An atomic variable
>> - *
>> - * @return Value of the variable before the operation
>> - */
>> -static inline uint32_t odp_atomic_fetch_dec_u32(odp_atomic_u32_t *ptr)
>> -{
>> -       return odp_atomic_fetch_sub_u32(ptr, 1);
>> -}
>>
>>  /**
>> - * Decrement atomic uint32 by 1
>> - *
>> - * @param ptr    An atomic variable
>> - *
>> - */
>> -static inline void odp_atomic_dec_u32(odp_atomic_u32_t *ptr)
>> -{
>> -       odp_atomic_fetch_sub_u32(ptr, 1);
>> + * Atomic compare and swap of 32-bit atomic variable
>> + * @note Relaxed memory model, no barriers.
>> + * @note Not compare-and-set! Caller should compare return value with
>> expected
>> + * parameter to check if swap operation succeeded.
>> + *
>> + * @param ptr  Pointer to a 32-bit atomic variable
>> + * @param exp  Expected old value
>> + * @param val  New value
>> + * @return Actual old value, if different from 'exp' then swap failed
>> + */
>> +static inline uint32_t
>> +odp_atomic32_cmp_and_swap_rlx(odp_atomic32_t *ptr,
>> +                             uint32_t exp,
>> +                             uint32_t val)
>> +{
>> +#if defined __arm__ /* A32/T32 ISA */
>> +       uint32_t old;
>> +       int status;
>> +       do {
>> +               __asm __volatile("ldrex %0, [%1]"
>> +                : "=&r"(old)
>> +                                        : "r"(&ptr->v)
>> +                                        : "memory");
>> +               if (odp_unlikely(old != exp)) {
>> +                       /* Value has changed, can't proceed */
>> +                       /* Clear exclusive access monitor */
>> +                       __asm __volatile("clrex");
>> +                       break;
>> +               }
>> +               /* Current value is as expected, attempt to write new
>> value */
>> +               __asm __volatile("strex %0, %1, [%2]"
>> +                : "=&r"(status)
>> +                                        : "r"(val), "r"(&ptr->v)
>> +                                        : "memory");
>> +               /* Restart the loop so we can re-read the previous value
>> */
>> +       } while (odp_unlikely(status != 0));
>> +       return old;
>> +#elif defined __aarch64__
>> +       uint32_t old;
>> +       int status;
>> +       do {
>> +               __asm __volatile("ldxr %w0, [%1]"
>> +                : "=&r"(old)
>> +                                        : "r"(&ptr->v)
>> +                                        : "memory");
>> +               if (odp_unlikely(old != exp)) {
>> +                       /* Value has changed, can't proceed */
>> +                       /* Clear exclusive access monitor */
>> +                       __asm __volatile("clrex");
>> +                       break;
>> +               }
>> +               /* Current value is as expected, attempt to write new
>> value */
>> +               __asm __volatile("stxr %w0, %w1, [%2]"
>> +                : "=&r"(status)
>> +                                        : "r"(val), "r"(&ptr->v)
>> +                                        : "memory");
>> +               /* Restart the loop so we can re-read the previous value
>> */
>> +       } while (odp_unlikely(status != 0));
>> +       return old;
>> +#elif defined __mips64__
>> +       uint32_t old, new_val;
>> +       do {
>> +               __asm __volatile("llw %0, [%1]"
>> +                : "=&r"(old)
>> +                                        : "r"(&ptr->v)
>> +                                        : "memory");
>> +               if (odp_unlikely(old != exp)) {
>> +                       /* Value has changed, can't proceed */
>> +                       break;
>> +               }
>> +               /* Current value is as expected, attempt to write new
>> value */
>> +               new_val = val;
>> +               __asm __volatile("scw %0, [%1]"
>> +                : "+&r"(new_val)
>> +                                        : "r"(&ptr->v)
>> +                                        : "memory");
>> +       } while (odp_unlikely(new_val == 0));
>> +       return old;
>> +#elif defined __x86_64__
>> +       /* Generates good code on x86_64 */
>> +       return __sync_val_compare_and_swap(&ptr->v, exp, val);
>> +#else
>> +#warning odp_atomic32_cmp_and_swap_rlx() may not be efficiently
>> implemented
>> +       return __sync_val_compare_and_swap(&ptr->v, exp, val);
>> +#endif
>>  }
>>
>>  /**
>> - * Atomic compare and set for 32bit
>> - *
>> - * @param dst destination location into which the value will be written.
>> - * @param exp expected value.
>> - * @param src new value.
>> - * @return Non-zero on success; 0 on failure.
>> - */
>> -static inline int
>> -odp_atomic_cmpset_u32(odp_atomic_u32_t *dst, uint32_t exp, uint32_t src)
>> -{
>> -       return __sync_bool_compare_and_swap(dst, exp, src);
>> + * Atomic fetch and add to 32-bit atomic variable
>> + * @note Relaxed memory model, no barriers.
>> + * @note A - B <=> A + (-B)
>> + *
>> + * @param ptr   Pointer to a 32-bit atomic variable
>> + * @param incr  The value to be added to the atomic variable
>> + *
>> + * @return Value of the atomic variable before the addition
>> + */
>> +static inline uint32_t odp_atomic32_fetch_add_rlx(odp_atomic32_t *ptr,
>> +               uint32_t incr)
>> +{
>> +#if defined __arm__ /* A32/T32 ISA */
>> +       uint32_t old_val, new_val;
>> +       int status;
>> +       do {
>> +               __asm __volatile("ldrex %0, [%1]"
>> +                : "=&r"(old_val)
>> +                                        : "r"(&ptr->v)
>> +                                        : "memory");
>> +               new_val = old_val + incr;
>> +               __asm __volatile("strex %0, %1, [%2]"
>> +                : "=&r"(status)
>> +                                        : "r"(new_val), "r"(&ptr->v)
>> +                                        : "memory");
>> +       } while (odp_unlikely(status != 0));
>> +       return old_val;
>> +#elif defined __aarch64__
>> +       uint32_t old_val, new_val;
>> +       int status;
>> +       do {
>> +               __asm __volatile("ldxr %w0, [%1]"
>> +                : "=&r"(old_val)
>> +                                        : "r"(&ptr->v)
>> +                                        : "memory");
>> +               new_val = old_val + incr;
>> +               __asm __volatile("stxr %w0, %w1, [%2]"
>> +                : "=&r"(status)
>> +                                        : "r"(new_val), "r"(&ptr->v)
>> +                                        : "memory");
>> +       } while (odp_unlikely(status != 0));
>> +       return old_val;
>> +#elif defined __mips64__
>> +       uint32_t old_val, new_val;
>> +       do {
>> +               __asm __volatile("llw %0, [%1]"
>> +                : "=&r"(old_val)
>> +                                        : "r"(&ptr->v)
>> +                                        : "memory");
>> +               new_val = old_val + incr;
>> +               __asm __volatile("scw %0, [%1]"
>> +                : "+&r"(new_val)
>> +                                        : "r"(&ptr->v)
>> +                                        : "memory");
>> +       } while (odp_unlikely(new_val == 0));
>> +       return old_val;
>> +#elif defined __x86_64__
>> +       /* Generates good code on x86_64 */
>> +       return __sync_fetch_and_add(&ptr->v, incr);
>> +#else
>> +#warning odp_atomic32_fetch_add_rlx() may not be efficiently implemented
>> +       return __sync_fetch_and_add(&ptr->v, incr);
>> +#endif
>>  }
>>
>>  /**
>> - * Initialize atomic uint64
>> + * Atomic fetch and add to 32-bit atomic variable
>> + * @note Sequential consistent memory model, barriers before and after
>> the operation.
>> + * @note A - B <=> A + (-B)
>>   *
>> - * @param ptr    An atomic variable
>> + * @param ptr   Pointer to a 32-bit atomic variable
>> + * @param incr  The value to be added to the atomic variable
>>   *
>> - * @note The operation is not synchronized with other threads
>> + * @return Value of the atomic variable before the addition
>>   */
>> -static inline void odp_atomic_init_u64(odp_atomic_u64_t *ptr)
>> +static inline uint32_t odp_atomic32_fetch_add_rls(odp_atomic32_t *ptr,
>> +               uint32_t incr)
>>  {
>> -       *ptr = 0;
>> +#if defined __arm__ /* A32/T32 ISA */
>> +       COMPILER_HW_BARRIER();
>> +       return odp_atomic32_fetch_add_rlx(ptr, incr);
>> +#elif defined __aarch64__
>> +       /* We basically get acquire/release semantics */
>> +       return __sync_fetch_and_add(&ptr->v, incr);
>> +#elif defined __mips64__
>> +       uint32_t old;
>> +       COMPILER_HW_BARRIER();
>> +       old = odp_atomic32_fetch_add_rlx(ptr, incr);
>> +       OCTEON_FLUSH();
>> +       return old;
>> +#elif defined __x86_64__
>> +       /* Generates good code on x86_64 */
>> +       return __sync_fetch_and_add(&ptr->v, incr);
>> +#else
>> +#warning odp_atomic32_fetch_add_rls() may not be efficiently implemented
>> +       return __sync_fetch_and_add(&ptr->v, incr);
>> +#endif
>>  }
>>
>>  /**
>> - * Load value of atomic uint64
>> - *
>> - * @param ptr    An atomic variable
>> + * Atomic add to 32-bit atomic variable
>> + * @note Relaxed memory model, no barriers.
>>   *
>> - * @return atomic uint64 value
>> - *
>> - * @note The operation is not synchronized with other threads
>> + * @param ptr   Pointer to a 32-bit atomic variable
>> + * @param incr  The value to be added to the atomic variable
>>   */
>> -static inline uint64_t odp_atomic_load_u64(odp_atomic_u64_t *ptr)
>> +static inline void odp_atomic32_add_rlx(odp_atomic32_t *ptr,
>> +                                       uint32_t incr)
>>  {
>> -       return *ptr;
>> +       /* Use odp_atomic32_fetch_add_rlx() for now */
>> +       (void)odp_atomic32_fetch_add_rlx(ptr, incr);
>>  }
>>
>>  /**
>> - * Store value to atomic uint64
>> - *
>> - * @param ptr        An atomic variable
>> - * @param new_value  Store new_value to a variable
>> + * Atomic add to 32-bit atomic variable
>> + * @note Sequential consistent memory model, barriers before and after
>> the
>> + * operation.
>>   *
>> - * @note The operation is not synchronized with other threads
>> + * @param ptr   Pointer to a 32-bit atomic variable
>> + * @param incr  The value to be added to the atomic variable
>>   */
>> -static inline void odp_atomic_store_u64(odp_atomic_u64_t *ptr,
>> -                                       uint64_t new_value)
>> +static inline void odp_atomic32_add_rls(odp_atomic32_t *ptr, uint32_t
>> incr)
>>  {
>> -       *ptr = new_value;
>> +       /* Use odp_atomic32_fetch_add_rls() for now */
>> +       (void)odp_atomic32_fetch_add_rls(ptr, incr);
>>  }
>>
>> -/**
>> - * Add atomic uint64
>> - *
>> - * @param ptr    An atomic variable
>> - * @param value  A value to be added to the variable
>> - *
>> - */
>> -static inline void odp_atomic_add_u64(odp_atomic_u64_t *ptr, uint64_t
>> value)
>> -{
>> -       __sync_fetch_and_add(ptr, value);
>> -}
>>
>> +/*****************************************************************************
>> + * Operations on 64-bit atomics
>> + * odp_atomic64_load_rlx
>> + * odp_atomic64_store_rlx
>> + * odp_atomic64_fetch_add_rlx
>> + * odp_atomic64_add_rlx
>> +
>> *****************************************************************************/
>>
>>  /**
>> - * Fetch and add atomic uint64
>> + * Relaxed atomic load of 64-bit atomic variable
>> + * @note Relaxed memory model, no barriers.
>>   *
>> - * @param ptr    An atomic variable
>> - * @param value  A value to be added to the variable
>> + * @param ptr   Pointer to a 64-bit atomic variable
>>   *
>> - * @return Value of the variable before the operation
>> + * @return Value of the atomic variable
>>   */
>> -
>> -#if defined __powerpc__ && !defined __powerpc64__
>> -static inline uint64_t odp_atomic_fetch_add_u64(odp_atomic_u64_t *ptr,
>> -                                               uint64_t value)
>> +static inline uint64_t odp_atomic64_load_rlx(odp_atomic64_t *ptr)
>>  {
>> -       return __sync_fetch_and_add((odp_atomic_u32_t *)ptr,
>> -                                   (uint32_t)value);
>> -}
>> +#if defined __arm__ /* A32/T32 ISA */
>> +       uint64_t val;
>> +       __asm __volatile("ldrexd %0, %H0, [%1]\n\t"
>> +                        "clrex" /* Clear exclusive access monitor */
>> +                : "=&r"(val)
>> +                                : "r"(&ptr->v)
>> +                                : );
>> +       return val;
>> +#elif defined __x86_64__ || defined __aarch64__ || defined __mips64__
>> +       /* Read of aligned quad/double word is atomic */
>> +       return ptr->v;
>>  #else
>> -static inline uint64_t odp_atomic_fetch_add_u64(odp_atomic_u64_t *ptr,
>> -                                               uint64_t value)
>> -{
>> -       return __sync_fetch_and_add(ptr, value);
>> -}
>> +#warning odp_atomic64_load_rlx() may not be efficiently implemented
>> +       return __sync_fetch_and_or(&ptr->v, 0);
>>  #endif
>> -/**
>> - * Subtract atomic uint64
>> - *
>> - * @param ptr    An atomic variable
>> - * @param value  A value to be subtracted from the variable
>> - *
>> - */
>> -static inline void odp_atomic_sub_u64(odp_atomic_u64_t *ptr, uint64_t
>> value)
>> -{
>> -       __sync_fetch_and_sub(ptr, value);
>>  }
>>
>>  /**
>> - * Fetch and subtract atomic uint64
>> - *
>> - * @param ptr    An atomic variable
>> - * @param value  A value to be subtracted from the variable
>> - *
>> - * @return Value of the variable before the operation
>> - */
>> -#if defined __powerpc__ && !defined __powerpc64__
>> -static inline uint64_t odp_atomic_fetch_sub_u64(odp_atomic_u64_t *ptr,
>> -                                               uint64_t value)
>> -{
>> -       return __sync_fetch_and_sub((odp_atomic_u32_t *)ptr,
>> -                                   (uint32_t)value);
>> -}
>> + * Relaxed atomic store of 64-bit atomic variable
>> + * @note Relaxed memory model, no barriers.
>> + *
>> + * @param ptr  Pointer to a 64-bit atomic variable
>> + * @param val  Value to write to the atomic variable
>> + */
>> +static inline void odp_atomic64_store_rlx(odp_atomic64_t *ptr,
>> +               uint64_t val)
>> +{
>> +#if defined __arm__ /* A32/T32 ISA */
>> +       uint64_t old_val;
>> +       int status;
>> +       do {
>> +               /* Read atomic variable exclusively so we can write to it
>> +                * later */
>> +               __asm __volatile("ldrexd %0, %H0, [%1]"
>> +                : "=&r"(old_val)
>> +                                        : "r"(&ptr->v)
>> +                                        : "memory");
>> +               (void)old_val; /* Ignore old value */
>> +               /* Attempt to write the new value */
>> +               __asm __volatile("strexd %0, %1, %H1, [%2]"
>> +                : "=&r"(status)
>> +                                        : "r"(val), "r"(&ptr->v)
>> +                                        : "memory");
>> +       } while (odp_unlikely(status != 0)); /* Retry until write
>> succeeds */
>> +#elif defined __x86_64__ || defined __aarch64__ || defined __mips64__
>> +       /* Write of aligned quad/double word is atomic */
>> +       ptr->v = val;
>>  #else
>> -static inline uint64_t odp_atomic_fetch_sub_u64(odp_atomic_u64_t *ptr,
>> -                                               uint64_t value)
>> -{
>> -       return __sync_fetch_and_sub(ptr, value);
>> -}
>> +#warning odp_atomic64_store_rlx() may not be efficiently implemented
>> +       /* This is actually an atomic exchange operation */
>> +       (void)__sync_lock_test_and_set(&ptr->v, val);
>>  #endif
>> -/**
>> - * Fetch and increment atomic uint64 by 1
>> - *
>> - * @param ptr    An atomic variable
>> - *
>> - * @return Value of the variable before the operation
>> - */
>> -static inline uint64_t odp_atomic_fetch_inc_u64(odp_atomic_u64_t *ptr)
>> -{
>> -       return odp_atomic_fetch_add_u64(ptr, 1);
>> -}
>> -
>> -/**
>> - * Increment atomic uint64 by 1
>> - *
>> - * @param ptr    An atomic variable
>> - *
>> - */
>> -static inline void odp_atomic_inc_u64(odp_atomic_u64_t *ptr)
>> -{
>> -       odp_atomic_fetch_add_u64(ptr, 1);
>> -}
>> -
>> -/**
>> - * Fetch and decrement atomic uint64 by 1
>> - *
>> - * @param ptr    An atomic variable
>> - *
>> - * @return Value of the variable before the operation
>> - */
>> -static inline uint64_t odp_atomic_fetch_dec_u64(odp_atomic_u64_t *ptr)
>> -{
>> -       return odp_atomic_fetch_sub_u64(ptr, 1);
>>  }
>>
>>  /**
>> - * Decrement atomic uint64 by 1
>> - *
>> - * @param ptr    An atomic variable
>> - *
>> - */
>> -static inline void odp_atomic_dec_u64(odp_atomic_u64_t *ptr)
>> -{
>> -       odp_atomic_fetch_sub_u64(ptr, 1);
>> + * Atomic fetch and add to 64-bit atomic variable
>> + * @note Relaxed memory model, no barriers.
>> + *
>> + * @param ptr   Pointer to a 64-bit atomic variable
>> + * @param incr  The value to be added to the atomic variable
>> + *
>> + * @return Value of the atomic variable before the addition
>> + */
>> +static inline uint64_t odp_atomic64_fetch_add_rlx(odp_atomic64_t *ptr,
>> +               uint64_t incr)
>> +{
>> +#if defined __arm__ /* A32/T32 ISA */
>> +       uint64_t old_val, new_val;
>> +       int status;
>> +       do {
>> +               __asm __volatile("ldrexd %0, %H0, [%1]"
>> +                : "=&r"(old_val)
>> +                                        : "r"(&ptr->v)
>> +                                        : "memory");
>> +               new_val = old_val + incr;
>> +               __asm __volatile("strexd %0, %1, %H1, [%2]"
>> +                : "=&r"(status)
>> +                                        : "r"(new_val), "r"(&ptr->v)
>> +                                        : "memory");
>> +       } while (odp_unlikely(status != 0)); /* Retry until write
>> succeeds */
>> +       return old_val;
>> +#elif defined __aarch64__
>> +       uint64_t old_val, new_val;
>> +       int status;
>> +       do {
>> +               __asm __volatile("ldxr %x0, [%1]"
>> +                : "=&r"(old_val)
>> +                                        : "r"(&ptr->v)
>> +                                        : "memory");
>> +               new_val = old_val + incr;
>> +               __asm __volatile("stxr %w0, %x1, [%2]"
>> +                : "=&r"(status)
>> +                                        : "r"(new_val), "r"(&ptr->v)
>> +                                        : "memory");
>> +       } while (odp_unlikely(status != 0)); /* Retry until write
>> succeeds */
>> +       return old_val;
>> +#elif defined __mips64__
>> +       uint64_t old_val, new_val;
>> +       do {
>> +               __asm __volatile("ll %0, [%1]"
>> +                : "=&r"(old_val)
>> +                                        : "r"(&ptr->v)
>> +                                        : "memory");
>> +               new_val = old_val + incr;
>> +               __asm __volatile("sc %0, [%1]"
>> +                : "+&r"(new_val)
>> +                                        : "r"(&ptr->v)
>> +                                        : "memory");
>> +       } while (odp_unlikely(new_val == 0));
>> +       return old_val;
>> +#elif defined __x86_64__
>> +       /* Generates good code on x86_64 */
>> +       return __sync_fetch_and_add(&ptr->v, incr);
>> +#else
>> +#warning odp_atomic64_fetch_add_rlx() may not be efficiently implemented
>> +       return __sync_fetch_and_add(&ptr->v, incr);
>> +#endif
>>  }
>>
>>  /**
>> - * Atomic compare and set for 64bit
>> + * Atomic add to 64-bit atomic variable
>> + * @note Relaxed memory model, no barriers.
>>   *
>> - * @param dst destination location into which the value will be written.
>> - * @param exp expected value.
>> - * @param src new value.
>> - * @return Non-zero on success; 0 on failure.
>> + * @param ptr   Pointer to a 64-bit atomic variable
>> + * @param incr  The value to be added to the atomic variable
>>   */
>> -static inline int
>> -odp_atomic_cmpset_u64(odp_atomic_u64_t *dst, uint64_t exp, uint64_t src)
>> +static inline void odp_atomic64_add_rlx(odp_atomic64_t *ptr, uint64_t
>> incr)
>>  {
>> -       return __sync_bool_compare_and_swap(dst, exp, src);
>> +       (void)odp_atomic64_fetch_add_rlx(ptr, incr);
>>  }
>>
>>  #ifdef __cplusplus
>> diff --git a/platform/linux-generic/include/api/odp_barrier.h
>> b/platform/linux-generic/include/api/odp_barrier.h
>> index a7b3215..f8eae9a 100644
>> --- a/platform/linux-generic/include/api/odp_barrier.h
>> +++ b/platform/linux-generic/include/api/odp_barrier.h
>> @@ -27,18 +27,18 @@ extern "C" {
>>   * ODP execution barrier
>>   */
>>  typedef struct odp_barrier_t {
>> -       int              count;  /**< @private Thread count */
>> -       odp_atomic_int_t bar;    /**< @private Barrier counter */
>> +       uint32_t       num_threads;  /**< @private Thread count
>> (constant) */
>> +       odp_atomic32_t in_barrier;   /**< @private Threads in barrier */
>>  } odp_barrier_t;
>>
>>
>>  /**
>>   * Init barrier with thread count
>>   *
>> - * @param barrier    Barrier
>> - * @param count      Thread count
>> + * @param barrier     Barrier
>> + * @param num_threads Number of threads which share the barrier
>>   */
>> -void odp_barrier_init_count(odp_barrier_t *barrier, int count);
>> +void odp_barrier_init(odp_barrier_t *barrier, uint32_t num_threads);
>>
>>
>>  /**
>> diff --git a/platform/linux-generic/include/api/odp_rwlock.h
>> b/platform/linux-generic/include/api/odp_rwlock.h
>> index 252ebb2..ff8a9a2 100644
>> --- a/platform/linux-generic/include/api/odp_rwlock.h
>> +++ b/platform/linux-generic/include/api/odp_rwlock.h
>> @@ -10,26 +10,30 @@
>>  /**
>>   * @file
>>   *
>> - * ODP RW Locks
>> + * ODP read/write lock
>> + * RW lock supports multiple concurrent readers but only one (exclusive)
>> writer.
>>   */
>>
>> +#include <odp_atomic.h>
>> +
>>  #ifdef __cplusplus
>>  extern "C" {
>>  #endif
>>
>>  /**
>>   * The odp_rwlock_t type.
>> - * write lock count is -1,
>> - * read lock count > 0
>> + * write lock is ~0U
>> + * read lock count >0 && <~0U
>>   */
>>  typedef struct {
>> -       volatile int32_t cnt; /**< -1 Write lock,
>> -                               > 0 for Read lock. */
>> +       odp_atomic32_t cnt; /**< == 0: unlocked,
>> +                                == ~0: locked for write,
>> +                                > 0 number of concurrent read locks */
>>  } odp_rwlock_t;
>>
>>
>>  /**
>> - * Initialize the rwlock to an unlocked state.
>> + * Initialize the rwlock to the unlocked state.
>>   *
>>   * @param rwlock pointer to the RW Lock.
>>   */
>> @@ -50,14 +54,14 @@ void odp_rwlock_read_lock(odp_rwlock_t *rwlock);
>>  void odp_rwlock_read_unlock(odp_rwlock_t *rwlock);
>>
>>  /**
>> - * Aquire a write lock.
>> + * Acquire the write lock.
>>   *
>>   * @param rwlock pointer to a RW Lock.
>>   */
>>  void odp_rwlock_write_lock(odp_rwlock_t *rwlock);
>>
>>  /**
>> - * Release a write lock.
>> + * Release the write lock.
>>   *
>>   * @param rwlock pointer to a RW Lock.
>>   */
>> diff --git a/platform/linux-generic/include/api/odp_ticketlock.h
>> b/platform/linux-generic/include/api/odp_ticketlock.h
>> index 6277a18..c4b5e34 100644
>> --- a/platform/linux-generic/include/api/odp_ticketlock.h
>> +++ b/platform/linux-generic/include/api/odp_ticketlock.h
>> @@ -27,8 +27,8 @@ extern "C" {
>>   * ODP ticketlock
>>   */
>>  typedef struct odp_ticketlock_t {
>> -       odp_atomic_u32_t  next_ticket; /**< @private Next ticket */
>> -       volatile uint32_t cur_ticket;  /**< @private Current ticket */
>> +       odp_atomic32_t next_ticket; /**< @private Next ticket */
>> +       odp_atomic32_t cur_ticket;  /**< @private Current ticket */
>>  } odp_ticketlock_t;
>>
>>
>> diff --git a/platform/linux-generic/include/odp_buffer_internal.h
>> b/platform/linux-generic/include/odp_buffer_internal.h
>> index 2002b51..530ab96 100644
>> --- a/platform/linux-generic/include/odp_buffer_internal.h
>> +++ b/platform/linux-generic/include/odp_buffer_internal.h
>> @@ -88,7 +88,7 @@ typedef struct odp_buffer_hdr_t {
>>         uint32_t                 index;      /* buf index in the pool */
>>         size_t                   size;       /* max data size */
>>         size_t                   cur_offset; /* current offset */
>> -       odp_atomic_int_t         ref_count;  /* reference count */
>> +       odp_atomic32_t           ref_count;  /* reference count */
>>         odp_buffer_scatter_t     scatter;    /* Scatter/gather list */
>>         int                      type;       /* type of next header */
>>         odp_buffer_pool_t        pool_hdl;   /* buffer pool handle */
>> diff --git a/platform/linux-generic/odp_barrier.c
>> b/platform/linux-generic/odp_barrier.c
>> index a82b294..6c3b884 100644
>> --- a/platform/linux-generic/odp_barrier.c
>> +++ b/platform/linux-generic/odp_barrier.c
>> @@ -8,41 +8,48 @@
>>  #include <odp_sync.h>
>>  #include <odp_spin_internal.h>
>>
>> -void odp_barrier_init_count(odp_barrier_t *barrier, int count)
>> +void odp_barrier_init(odp_barrier_t *barrier, uint32_t num_threads)
>>  {
>> -       barrier->count = count;
>> -       barrier->bar = 0;
>> -       odp_sync_stores();
>> +       barrier->num_threads = num_threads; /* Constant after
>> initialisation */
>> +       odp_atomic32_store_rls(&barrier->in_barrier, 0);
>>  }
>>
>>  /*
>>   * Efficient barrier_sync -
>>   *
>>   *   Barriers are initialized with a count of the number of callers
>> - *   that must sync on the barrier before any may proceed.
>> + *   that must sync on (enter) the barrier before any may proceed (exit).
>>   *
>>   *   To avoid race conditions and to permit the barrier to be fully
>>   *   reusable, the barrier value cycles between 0..2*count-1. When
>> - *   synchronizing the wasless variable simply tracks which half of
>> + *   synchronizing the waslow variable simply tracks which half of
>>   *   the cycle the barrier was in upon entry.  Exit is when the
>>   *   barrier crosses to the other half of the cycle.
>>   */
>>
>>  void odp_barrier_sync(odp_barrier_t *barrier)
>>  {
>> -       int count;
>> -       int wasless;
>> +       uint32_t count;
>> +       bool waslow;
>>
>> -       odp_sync_stores();
>> -       wasless = barrier->bar < barrier->count;
>> -       count = odp_atomic_fetch_inc_int(&barrier->bar);
>> +       /* FIXME do we need acquire barrier as well? */
>> +       /* Increase threads in_barrier count, this will automatically
>> release
>> +        * the other threads when lower/upper range is switched */
>> +       count = odp_atomic32_fetch_add_rls(&barrier->in_barrier, 1);
>> +       /* Compute lower or higher range indicator */
>> +       waslow = count < barrier->num_threads;
>>
>> -       if (count == 2*barrier->count-1) {
>> -               barrier->bar = 0;
>> -       } else {
>> -               while ((barrier->bar < barrier->count) == wasless)
>> -                       odp_spin();
>> +       /* Check if in_barrier count has "wrapped" */
>> +       if (count == 2 * barrier->num_threads - 1) {
>> +               /* Manually wrap the counter */
>> +               odp_atomic32_add_rls(&barrier->in_barrier,
>> +
>> (uint32_t)(-2*(int)barrier->num_threads));
>> +               /* We don't need to wait below, return immediately */
>> +               return;
>> +       }
>> +       /* Wait for counter to change half */
>> +       while ((odp_atomic32_load_rlx(&barrier->in_barrier) <
>> +              barrier->num_threads) == waslow) {
>> +               odp_spin();
>>         }
>> -
>> -       odp_mem_barrier();
>>  }
>> diff --git a/platform/linux-generic/odp_buffer.c
>> b/platform/linux-generic/odp_buffer.c
>> index e54e0e7..a5939f3 100644
>> --- a/platform/linux-generic/odp_buffer.c
>> +++ b/platform/linux-generic/odp_buffer.c
>> @@ -73,7 +73,8 @@ int odp_buffer_snprint(char *str, size_t n,
>> odp_buffer_t buf)
>>         len += snprintf(&str[len], n-len,
>>                         "  cur_offset   %zu\n",       hdr->cur_offset);
>>         len += snprintf(&str[len], n-len,
>> -                       "  ref_count    %i\n",        hdr->ref_count);
>> +                       "  ref_count    %u\n",
>> +                       odp_atomic32_load_rlx(&hdr->ref_count));
>>         len += snprintf(&str[len], n-len,
>>                         "  type         %i\n",        hdr->type);
>>         len += snprintf(&str[len], n-len,
>> diff --git a/platform/linux-generic/odp_crypto.c
>> b/platform/linux-generic/odp_crypto.c
>> index b37ad6b..d9fff10 100644
>> --- a/platform/linux-generic/odp_crypto.c
>> +++ b/platform/linux-generic/odp_crypto.c
>> @@ -26,7 +26,7 @@
>>  #define MAX_SESSIONS 32
>>
>>  typedef struct {
>> -       odp_atomic_u32_t next;
>> +       odp_atomic32_t   next;
>>         uint32_t         max;
>>         odp_crypto_generic_session_t sessions[0];
>>  } odp_crypto_global_t;
>> @@ -58,7 +58,7 @@ odp_crypto_generic_session_t *alloc_session(void)
>>         uint32_t idx;
>>         odp_crypto_generic_session_t *session = NULL;
>>
>> -       idx = odp_atomic_fetch_inc_u32(&global->next);
>> +       idx = odp_atomic32_fetch_add_rlx(&global->next, 1);
>>         if (idx < global->max) {
>>                 session = &global->sessions[idx];
>>                 session->index = idx;
>> diff --git a/platform/linux-generic/odp_queue.c
>> b/platform/linux-generic/odp_queue.c
>> index 1318bcd..08c0d29 100644
>> --- a/platform/linux-generic/odp_queue.c
>> +++ b/platform/linux-generic/odp_queue.c
>> @@ -214,8 +214,13 @@ int odp_queue_set_context(odp_queue_t handle, void
>> *context)
>>  {
>>         queue_entry_t *queue;
>>         queue = queue_to_qentry(handle);
>> +       /* Setting a new queue context can be viewed as a release
>> operation,
>> +        * all writes to the context must be observable before the context
>> +        * is made observable */
>>         odp_sync_stores();
>> -       queue->s.param.context = context;
>> +       queue->s.param.context = context; /* Store-release */
>> +       /* Ensure queue modification is globally visible before we return
>> +        * and the application might cause the queue to be scheduled */
>>         odp_sync_stores();
>>         return 0;
>>  }
>> diff --git a/platform/linux-generic/odp_ring.c
>> b/platform/linux-generic/odp_ring.c
>> index 632aa66..d1ec825 100644
>> --- a/platform/linux-generic/odp_ring.c
>> +++ b/platform/linux-generic/odp_ring.c
>> @@ -187,10 +187,10 @@ odph_ring_create(const char *name, unsigned count,
>> unsigned flags)
>>                 r->cons.size = count;
>>                 r->prod.mask = count-1;
>>                 r->cons.mask = count-1;
>> -               r->prod.head = 0;
>> -               r->cons.head = 0;
>> -               r->prod.tail = 0;
>> -               r->cons.tail = 0;
>> +               odp_atomic32_store_rlx(&r->prod.head, 0);
>> +               odp_atomic32_store_rlx(&r->cons.head, 0);
>> +               odp_atomic32_store_rlx(&r->prod.tail, 0);
>> +               odp_atomic32_store_rlx(&r->cons.tail, 0);
>>
>>                 TAILQ_INSERT_TAIL(&odp_ring_list, r, next);
>>         } else {
>> @@ -227,7 +227,7 @@ int __odph_ring_mp_do_enqueue(odph_ring_t *r, void *
>> const *obj_table,
>>         uint32_t prod_head, prod_next;
>>         uint32_t cons_tail, free_entries;
>>         const unsigned max = n;
>> -       int success;
>> +       bool ok;
>>         unsigned i;
>>         uint32_t mask = r->prod.mask;
>>         int ret;
>> @@ -237,8 +237,8 @@ int __odph_ring_mp_do_enqueue(odph_ring_t *r, void *
>> const *obj_table,
>>                 /* Reset n to the initial burst count */
>>                 n = max;
>>
>> -               prod_head = r->prod.head;
>> -               cons_tail = r->cons.tail;
>> +               prod_head = odp_atomic32_load_rlx(&r->prod.head);
>> +               cons_tail = odp_atomic32_load_acq(&r->cons.tail);
>>                 /* The subtraction is done between two unsigned 32bits value
>>                  * (the result is always modulo 32 bits even if we have
>>                  * prod_head > cons_tail). So 'free_entries' is always between 0
>> @@ -259,13 +259,13 @@ int __odph_ring_mp_do_enqueue(odph_ring_t *r, void * const *obj_table,
>>                 }
>>
>>                 prod_next = prod_head + n;
>> -               success = odp_atomic_cmpset_u32(&r->prod.head, prod_head,
>> -                                             prod_next);
>> -       } while (odp_unlikely(success == 0));
>> +               ok = odp_atomic32_cmp_and_swap_rlx(&r->prod.head,
>> +                                                  prod_head,
>> +                                                  prod_next) == prod_head;
>> +       } while (odp_unlikely(!ok));
>>
>>         /* write entries in ring */
>>         ENQUEUE_PTRS();
>> -       odp_mem_barrier();
>>
>>         /* if we exceed the watermark */
>>         if (odp_unlikely(((mask + 1) - free_entries + n) > r->prod.watermark)) {
>> @@ -279,10 +279,10 @@ int __odph_ring_mp_do_enqueue(odph_ring_t *r, void * const *obj_table,
>>          * If there are other enqueues in progress that preceded us,
>>          * we need to wait for them to complete
>>          */
>> -       while (odp_unlikely(r->prod.tail != prod_head))
>> +       while (odp_unlikely(odp_atomic32_load_rlx(&r->prod.tail) != prod_head))
>>                 odp_spin();
>>
>> -       r->prod.tail = prod_next;
>> +       odp_atomic32_store_rls(&r->prod.tail, prod_next);
>>         return ret;
>>  }
>>
>> @@ -298,8 +298,8 @@ int __odph_ring_sp_do_enqueue(odph_ring_t *r, void * const *obj_table,
>>         uint32_t mask = r->prod.mask;
>>         int ret;
>>
>> -       prod_head = r->prod.head;
>> -       cons_tail = r->cons.tail;
>> +       prod_head = odp_atomic32_load_rlx(&r->prod.head);
>> +       cons_tail = odp_atomic32_load_acq(&r->cons.tail);
>>         /* The subtraction is done between two unsigned 32bits value
>>          * (the result is always modulo 32 bits even if we have
>>          * prod_head > cons_tail). So 'free_entries' is always between 0
>> @@ -320,11 +320,10 @@ int __odph_ring_sp_do_enqueue(odph_ring_t *r, void * const *obj_table,
>>         }
>>
>>         prod_next = prod_head + n;
>> -       r->prod.head = prod_next;
>> +       odp_atomic32_store_rlx(&r->prod.head, prod_next);
>>
>>         /* write entries in ring */
>>         ENQUEUE_PTRS();
>> -       odp_mem_barrier();
>>
>>         /* if we exceed the watermark */
>>         if (odp_unlikely(((mask + 1) - free_entries + n) > r->prod.watermark)) {
>> @@ -334,7 +333,7 @@ int __odph_ring_sp_do_enqueue(odph_ring_t *r, void * const *obj_table,
>>                 ret = (behavior == ODPH_RING_QUEUE_FIXED) ? 0 : n;
>>         }
>>
>> -       r->prod.tail = prod_next;
>> +       odp_atomic32_store_rls(&r->prod.tail, prod_next);
>>         return ret;
>>  }
>>
>> @@ -348,7 +347,7 @@ int __odph_ring_mc_do_dequeue(odph_ring_t *r, void **obj_table,
>>         uint32_t cons_head, prod_tail;
>>         uint32_t cons_next, entries;
>>         const unsigned max = n;
>> -       int success;
>> +       bool ok;
>>         unsigned i;
>>         uint32_t mask = r->prod.mask;
>>
>> @@ -357,8 +356,8 @@ int __odph_ring_mc_do_dequeue(odph_ring_t *r, void **obj_table,
>>                 /* Restore n as it may change every loop */
>>                 n = max;
>>
>> -               cons_head = r->cons.head;
>> -               prod_tail = r->prod.tail;
>> +               cons_head = odp_atomic32_load_rlx(&r->cons.head);
>> +               prod_tail = odp_atomic32_load_acq(&r->prod.tail);
>>                 /* The subtraction is done between two unsigned 32bits value
>>                  * (the result is always modulo 32 bits even if we have
>>                  * cons_head > prod_tail). So 'entries' is always between 0
>> @@ -378,22 +377,22 @@ int __odph_ring_mc_do_dequeue(odph_ring_t *r, void **obj_table,
>>                 }
>>
>>                 cons_next = cons_head + n;
>> -               success = odp_atomic_cmpset_u32(&r->cons.head, cons_head,
>> -                                             cons_next);
>> -       } while (odp_unlikely(success == 0));
>> +               ok = odp_atomic32_cmp_and_swap_rlx(&r->cons.head,
>> +                                                  cons_head,
>> +                                                  cons_next) == cons_head;
>> +       } while (odp_unlikely(!ok));
>>
>>         /* copy in table */
>>         DEQUEUE_PTRS();
>> -       odp_mem_barrier();
>>
>>         /*
>>          * If there are other dequeues in progress that preceded us,
>>          * we need to wait for them to complete
>>          */
>> -       while (odp_unlikely(r->cons.tail != cons_head))
>> +       while (odp_unlikely(odp_atomic32_load_rlx(&r->cons.tail) != cons_head))
>>                 odp_spin();
>>
>> -       r->cons.tail = cons_next;
>> +       odp_atomic32_store_rls(&r->cons.tail, cons_next);
>>
>>         return behavior == ODPH_RING_QUEUE_FIXED ? 0 : n;
>>  }
>> @@ -409,8 +408,8 @@ int __odph_ring_sc_do_dequeue(odph_ring_t *r, void **obj_table,
>>         unsigned i;
>>         uint32_t mask = r->prod.mask;
>>
>> -       cons_head = r->cons.head;
>> -       prod_tail = r->prod.tail;
>> +       cons_head = odp_atomic32_load_rlx(&r->cons.head);
>> +       prod_tail = odp_atomic32_load_acq(&r->prod.tail);
>>         /* The subtraction is done between two unsigned 32bits value
>>          * (the result is always modulo 32 bits even if we have
>>          * cons_head > prod_tail). So 'entries' is always between 0
>> @@ -429,13 +428,12 @@ int __odph_ring_sc_do_dequeue(odph_ring_t *r, void **obj_table,
>>         }
>>
>>         cons_next = cons_head + n;
>> -       r->cons.head = cons_next;
>> +       odp_atomic32_store_rlx(&r->cons.head, cons_next);
>>
>>         /* copy in table */
>>         DEQUEUE_PTRS();
>> -       odp_mem_barrier();
>>
>> -       r->cons.tail = cons_next;
>> +       odp_atomic32_store_rls(&r->cons.tail, cons_next);
>>         return behavior == ODPH_RING_QUEUE_FIXED ? 0 : n;
>>  }
>>
>> @@ -482,8 +480,8 @@ int odph_ring_sc_dequeue_bulk(odph_ring_t *r, void **obj_table, unsigned n)
>>   */
>>  int odph_ring_full(const odph_ring_t *r)
>>  {
>> -       uint32_t prod_tail = r->prod.tail;
>> -       uint32_t cons_tail = r->cons.tail;
>> +       uint32_t prod_tail = odp_atomic32_load_rlx(&r->prod.tail);
>> +       uint32_t cons_tail = odp_atomic32_load_rlx(&r->cons.tail);
>>         return (((cons_tail - prod_tail - 1) & r->prod.mask) == 0);
>>  }
>>
>> @@ -492,8 +490,8 @@ int odph_ring_full(const odph_ring_t *r)
>>   */
>>  int odph_ring_empty(const odph_ring_t *r)
>>  {
>> -       uint32_t prod_tail = r->prod.tail;
>> -       uint32_t cons_tail = r->cons.tail;
>> +       uint32_t prod_tail = odp_atomic32_load_rlx(&r->prod.tail);
>> +       uint32_t cons_tail = odp_atomic32_load_rlx(&r->cons.tail);
>>         return !!(cons_tail == prod_tail);
>>  }
>>
>> @@ -502,8 +500,8 @@ int odph_ring_empty(const odph_ring_t *r)
>>   */
>>  unsigned odph_ring_count(const odph_ring_t *r)
>>  {
>> -       uint32_t prod_tail = r->prod.tail;
>> -       uint32_t cons_tail = r->cons.tail;
>> +       uint32_t prod_tail = odp_atomic32_load_rlx(&r->prod.tail);
>> +       uint32_t cons_tail = odp_atomic32_load_rlx(&r->cons.tail);
>>         return (prod_tail - cons_tail) & r->prod.mask;
>>  }
>>
>> @@ -512,8 +510,8 @@ unsigned odph_ring_count(const odph_ring_t *r)
>>   */
>>  unsigned odph_ring_free_count(const odph_ring_t *r)
>>  {
>> -       uint32_t prod_tail = r->prod.tail;
>> -       uint32_t cons_tail = r->cons.tail;
>> +       uint32_t prod_tail = odp_atomic32_load_rlx(&r->prod.tail);
>> +       uint32_t cons_tail = odp_atomic32_load_rlx(&r->cons.tail);
>>         return (cons_tail - prod_tail - 1) & r->prod.mask;
>>  }
>>
>> @@ -523,10 +521,10 @@ void odph_ring_dump(const odph_ring_t *r)
>>         ODP_DBG("ring <%s>@%p\n", r->name, r);
>>         ODP_DBG("  flags=%x\n", r->flags);
>>         ODP_DBG("  size=%"PRIu32"\n", r->prod.size);
>> -       ODP_DBG("  ct=%"PRIu32"\n", r->cons.tail);
>> -       ODP_DBG("  ch=%"PRIu32"\n", r->cons.head);
>> -       ODP_DBG("  pt=%"PRIu32"\n", r->prod.tail);
>> -       ODP_DBG("  ph=%"PRIu32"\n", r->prod.head);
>> +       ODP_DBG("  ct=%"PRIu32"\n", odp_atomic32_load_rlx(&r->cons.tail));
>> +       ODP_DBG("  ch=%"PRIu32"\n", odp_atomic32_load_rlx(&r->cons.head));
>> +       ODP_DBG("  pt=%"PRIu32"\n", odp_atomic32_load_rlx(&r->prod.tail));
>> +       ODP_DBG("  ph=%"PRIu32"\n", odp_atomic32_load_rlx(&r->prod.head));
>>         ODP_DBG("  used=%u\n", odph_ring_count(r));
>>         ODP_DBG("  avail=%u\n", odph_ring_free_count(r));
>>         if (r->prod.watermark == r->prod.size)
>> diff --git a/platform/linux-generic/odp_rwlock.c b/platform/linux-generic/odp_rwlock.c
>> index 11c8dd7..ba0a7ca 100644
>> --- a/platform/linux-generic/odp_rwlock.c
>> +++ b/platform/linux-generic/odp_rwlock.c
>> @@ -4,58 +4,56 @@
>>   * SPDX-License-Identifier:     BSD-3-Clause
>>   */
>>
>> +#include <stdbool.h>
>>  #include <odp_atomic.h>
>>  #include <odp_rwlock.h>
>> -
>>  #include <odp_spin_internal.h>
>>
>>  void odp_rwlock_init(odp_rwlock_t *rwlock)
>>  {
>> -       rwlock->cnt = 0;
>> +       odp_atomic32_store_rlx(&rwlock->cnt, 0);
>>  }
>>
>>  void odp_rwlock_read_lock(odp_rwlock_t *rwlock)
>>  {
>> -       int32_t cnt;
>> -       int  is_locked = 0;
>> -
>> -       while (is_locked == 0) {
>> -               cnt = rwlock->cnt;
>> +       bool gotit;
>> +       do {
>> +               uint32_t cnt = odp_atomic32_load_acq(&rwlock->cnt);
>>                 /* waiting for read lock */
>> -               if (cnt < 0) {
>> +               if ((int32_t)cnt < 0) {
>>                         odp_spin();
>>                         continue;
>>                 }
>> -               is_locked = odp_atomic_cmpset_u32(
>> -                                       (volatile uint32_t *)&rwlock->cnt,
>> -                                             cnt, cnt + 1);
>> -       }
>> +               /* Attempt to take another read lock */
>> +               gotit = odp_atomic32_cmp_and_swap_rlx(&rwlock->cnt,
>> +                                                     cnt, cnt + 1) == cnt;
>> +       } while (!gotit);
>>  }
>>
>>  void odp_rwlock_read_unlock(odp_rwlock_t *rwlock)
>>  {
>> -       odp_atomic_dec_u32((odp_atomic_u32_t *)(intptr_t)&rwlock->cnt);
>> +       /* Release one read lock by subtracting 1 */
>> +       odp_atomic32_add_rls(&rwlock->cnt, (uint32_t)-1);
>>  }
>>
>>  void odp_rwlock_write_lock(odp_rwlock_t *rwlock)
>>  {
>> -       int32_t cnt;
>> -       int is_locked = 0;
>> -
>> -       while (is_locked == 0) {
>> -               cnt = rwlock->cnt;
>> -               /* lock aquired, wait */
>> +       bool gotit;
>> +       do {
>> +               uint32_t cnt = odp_atomic32_load_acq(&rwlock->cnt);
>>                 if (cnt != 0) {
>> +                       /* Lock is busy */
>>                         odp_spin();
>>                         continue;
>>                 }
>> -               is_locked = odp_atomic_cmpset_u32(
>> -                                       (volatile uint32_t *)&rwlock->cnt,
>> -                                             0, -1);
>> -       }
>> +               /* Attempt to take write lock */
>> +               gotit = odp_atomic32_cmp_and_swap_rlx(&rwlock->cnt, 0,
>> +                                                     (uint32_t)-1) == 0;
>> +       } while (!gotit);
>>  }
>>
>>  void odp_rwlock_write_unlock(odp_rwlock_t *rwlock)
>>  {
>> -       odp_atomic_inc_u32((odp_atomic_u32_t *)(intptr_t)&rwlock->cnt);
>> +       /* Release the write lock by adding 1 */
>> +       odp_atomic32_add_rls(&rwlock->cnt, 1);
>>  }
>> diff --git a/platform/linux-generic/odp_thread.c b/platform/linux-generic/odp_thread.c
>> index b869b27..569b235 100644
>> --- a/platform/linux-generic/odp_thread.c
>> +++ b/platform/linux-generic/odp_thread.c
>> @@ -31,7 +31,7 @@ typedef struct {
>>
>>  typedef struct {
>>         thread_state_t   thr[ODP_CONFIG_MAX_THREADS];
>> -       odp_atomic_int_t num;
>> +       odp_atomic32_t   num;
>>
>>  } thread_globals_t;
>>
>> @@ -67,7 +67,7 @@ static int thread_id(void)
>>         int id;
>>         int cpu;
>>
>> -       id = odp_atomic_fetch_add_int(&thread_globals->num, 1);
>> +       id = (int)odp_atomic32_fetch_add_rlx(&thread_globals->num, 1);
>>
>>         if (id >= ODP_CONFIG_MAX_THREADS) {
>>                 ODP_ERR("Too many threads\n");
>> @@ -77,7 +77,7 @@ static int thread_id(void)
>>         cpu = sched_getcpu();
>>
>>         if (cpu < 0) {
>> -               ODP_ERR("getcpu failed\n");
>> +               ODP_ERR("sched_getcpu failed\n");
>>                 return -1;
>>         }
>>
>> diff --git a/platform/linux-generic/odp_ticketlock.c b/platform/linux-generic/odp_ticketlock.c
>> index be5b885..cadc0e0 100644
>> --- a/platform/linux-generic/odp_ticketlock.c
>> +++ b/platform/linux-generic/odp_ticketlock.c
>> @@ -12,9 +12,8 @@
>>
>>  void odp_ticketlock_init(odp_ticketlock_t *ticketlock)
>>  {
>> -       ticketlock->next_ticket = 0;
>> -       ticketlock->cur_ticket  = 0;
>> -       odp_sync_stores();
>> +       odp_atomic32_store_rlx(&ticketlock->next_ticket, 0);
>> +       odp_atomic32_store_rlx(&ticketlock->cur_ticket, 0);
>>  }
>>
>>
>> @@ -22,30 +21,14 @@ void odp_ticketlock_lock(odp_ticketlock_t *ticketlock)
>>  {
>>         uint32_t ticket;
>>
>> -       ticket = odp_atomic_fetch_inc_u32(&ticketlock->next_ticket);
>> +       ticket = odp_atomic32_fetch_add_rlx(&ticketlock->next_ticket, 1);
>>
>> -       while (ticket != ticketlock->cur_ticket)
>> +       while (ticket != odp_atomic32_load_acq(&ticketlock->cur_ticket))
>>                 odp_spin();
>> -
>> -       odp_mem_barrier();
>>  }
>>
>>
>>  void odp_ticketlock_unlock(odp_ticketlock_t *ticketlock)
>>  {
>> -       odp_sync_stores();
>> -
>> -       ticketlock->cur_ticket++;
>> -
>> -#if defined __OCTEON__
>> -       odp_sync_stores();
>> -#else
>> -       odp_mem_barrier();
>> -#endif
>> -}
>> -
>> -
>> -int odp_ticketlock_is_locked(odp_ticketlock_t *ticketlock)
>> -{
>> -       return ticketlock->cur_ticket != ticketlock->next_ticket;
>> +       odp_atomic32_add_rls(&ticketlock->cur_ticket, 1);
>>  }
>> diff --git a/platform/linux-generic/odp_timer.c b/platform/linux-generic/odp_timer.c
>> index 313c713..938429f 100644
>> --- a/platform/linux-generic/odp_timer.c
>> +++ b/platform/linux-generic/odp_timer.c
>> @@ -32,8 +32,8 @@ typedef struct {
>>
>>  typedef struct {
>>         int               allocated;
>> -       volatile int      active;
>> -       volatile uint64_t cur_tick;
>> +       odp_atomic32_t    active;
>> +       odp_atomic64_t    cur_tick;
>>         timer_t           timerid;
>>         odp_timer_t       timer_hdl;
>>         odp_buffer_pool_t pool;
>> @@ -150,16 +150,14 @@ static void notify_function(union sigval sigval)
>>
>>         timer = sigval.sival_ptr;
>>
>> -       if (timer->active == 0) {
>> +       if (odp_atomic32_load_rlx(&timer->active) == 0) {
>>                 ODP_DBG("Timer (%u) not active\n", timer->timer_hdl);
>>                 return;
>>         }
>>
>>         /* ODP_DBG("Tick\n"); */
>>
>> -       cur_tick = timer->cur_tick++;
>> -
>> -       odp_sync_stores();
>> +       cur_tick = odp_atomic64_fetch_add_rlx(&timer->cur_tick, 1);
>>
>>         tick = &timer->tick[cur_tick % MAX_TICKS];
>>
>> @@ -318,8 +316,7 @@ odp_timer_t odp_timer_create(const char *name, odp_buffer_pool_t pool,
>>                 timer->tick[i].list = NULL;
>>         }
>>
>> -       timer->active = 1;
>> -       odp_sync_stores();
>> +       odp_atomic32_store_rls(&timer->active, 1);
>>
>>         timer_start(timer);
>>
>> @@ -340,7 +337,7 @@ odp_timer_tmo_t odp_timer_absolute_tmo(odp_timer_t timer_hdl, uint64_t tmo_tick,
>>         id = (int)timer_hdl - 1;
>>         timer = &odp_timer.timer[id];
>>
>> -       cur_tick = timer->cur_tick;
>> +       cur_tick = odp_atomic64_load_rlx(&timer->cur_tick);
>>         if (tmo_tick <= cur_tick) {
>>                 ODP_DBG("timeout too close\n");
>>                 return ODP_TIMER_TMO_INVALID;
>> @@ -416,7 +413,7 @@ uint64_t odp_timer_current_tick(odp_timer_t timer_hdl)
>>         uint32_t id;
>>
>>         id = timer_hdl - 1;
>> -       return odp_timer.timer[id].cur_tick;
>> +       return odp_atomic64_load_rlx(&odp_timer.timer[id].cur_tick);
>>  }
>>
>>  odp_timeout_t odp_timeout_from_buffer(odp_buffer_t buf)
>> diff --git a/test/api_test/odp_atomic_test.c b/test/api_test/odp_atomic_test.c
>> index 9019d4f..4d27b32 100644
>> --- a/test/api_test/odp_atomic_test.c
>> +++ b/test/api_test/odp_atomic_test.c
>> @@ -10,17 +10,14 @@
>>  #include <odp_common.h>
>>  #include <odp_atomic_test.h>
>>
>> -static odp_atomic_int_t a32;
>> -static odp_atomic_u32_t a32u;
>> -static odp_atomic_u64_t a64u;
>> +static odp_atomic32_t a32u;
>> +static odp_atomic64_t a64u;
>>
>> -static odp_atomic_int_t numthrds;
>> +static odp_barrier_t barrier;
>>
>>  static const char * const test_name[] = {
>>         "dummy",
>>         "test atomic basic ops add/sub/inc/dec",
>> -       "test atomic inc/dec of signed word",
>> -       "test atomic add/sub of signed word",
>>         "test atomic inc/dec of unsigned word",
>>         "test atomic add/sub of unsigned word",
>>         "test atomic inc/dec of unsigned double word",
>> @@ -31,39 +28,29 @@ static struct timeval tv0[MAX_WORKERS], tv1[MAX_WORKERS];
>>
>>  static void usage(void)
>>  {
>> -       printf("\n./odp_atomic -t <testcase> -n <num of pthread>,\n\n"
>> +       printf("\n./odp_atomic -t <testcase> -n <num of threads>\n\n"
>>                "\t<testcase> is\n"
>>                "\t\t1 - Test mix(does inc,dec,add,sub on 32/64 bit)\n"
>> -              "\t\t2 - Test inc dec of signed word\n"
>> -              "\t\t3 - Test add sub of signed word\n"
>> -              "\t\t4 - Test inc dec of unsigned word\n"
>> -              "\t\t5 - Test add sub of unsigned word\n"
>> -              "\t\t6 - Test inc dec of double word\n"
>> -              "\t\t7 - Test add sub of double word\n"
>> -              "\t<num of pthread> is optional\n"
>> -              "\t\t<1 - 31> - no of pthreads to start\n"
>> +              "\t\t2 - Test inc dec of unsigned word\n"
>> +              "\t\t3 - Test add sub of unsigned word\n"
>> +              "\t\t4 - Test inc dec of double word\n"
>> +              "\t\t5 - Test add sub of double word\n"
>> +              "\t<num of thread> is optional\n"
>> +              "\t\t<1 - 31> - no of threads to start\n"
>>                "\t\tif user doesn't specify this option, then\n"
>> -              "\t\tno of pthreads created is equivalent to no of cores\n"
>> +              "\t\tno of threads created is equivalent to no of cores\n"
>>                "\t\tavailable in the system\n"
>>                "\tExample usage:\n"
>>                "\t\t./odp_atomic -t 2\n"
>>                "\t\t./odp_atomic -t 3 -n 12\n");
>>  }
>>
>> -void test_atomic_inc_32(void)
>> -{
>> -       int i;
>> -
>> -       for (i = 0; i < CNT; i++)
>> -               odp_atomic_inc_int(&a32);
>> -}
>> -
>>  void test_atomic_inc_u32(void)
>>  {
>>         int i;
>>
>>         for (i = 0; i < CNT; i++)
>> -               odp_atomic_inc_u32(&a32u);
>> +               odp_atomic32_add_rlx(&a32u, 1);
>>  }
>>
>>  void test_atomic_inc_64(void)
>> @@ -71,15 +58,7 @@ void test_atomic_inc_64(void)
>>         int i;
>>
>>         for (i = 0; i < CNT; i++)
>> -               odp_atomic_inc_u64(&a64u);
>> -}
>> -
>> -void test_atomic_dec_32(void)
>> -{
>> -       int i;
>> -
>> -       for (i = 0; i < CNT; i++)
>> -               odp_atomic_dec_int(&a32);
>> +               odp_atomic64_add_rlx(&a64u, 1);
>>  }
>>
>>  void test_atomic_dec_u32(void)
>> @@ -87,7 +66,7 @@ void test_atomic_dec_u32(void)
>>         int i;
>>
>>         for (i = 0; i < CNT; i++)
>> -               odp_atomic_dec_u32(&a32u);
>> +               odp_atomic32_add_rlx(&a32u, (uint32_t)-1);
>>  }
>>
>>  void test_atomic_dec_64(void)
>> @@ -95,15 +74,7 @@ void test_atomic_dec_64(void)
>>         int i;
>>
>>         for (i = 0; i < CNT; i++)
>> -               odp_atomic_dec_u64(&a64u);
>> -}
>> -
>> -void test_atomic_add_32(void)
>> -{
>> -       int i;
>> -
>> -       for (i = 0; i < (CNT / ADD_SUB_CNT); i++)
>> -               odp_atomic_fetch_add_int(&a32, ADD_SUB_CNT);
>> +               odp_atomic64_add_rlx(&a64u, (uint64_t)-1);
>>  }
>>
>>  void test_atomic_add_u32(void)
>> @@ -111,7 +82,7 @@ void test_atomic_add_u32(void)
>>         int i;
>>
>>         for (i = 0; i < (CNT / ADD_SUB_CNT); i++)
>> -               odp_atomic_fetch_add_u32(&a32u, ADD_SUB_CNT);
>> +               odp_atomic32_fetch_add_rlx(&a32u, ADD_SUB_CNT);
>>  }
>>
>>  void test_atomic_add_64(void)
>> @@ -119,15 +90,7 @@ void test_atomic_add_64(void)
>>         int i;
>>
>>         for (i = 0; i < (CNT / ADD_SUB_CNT); i++)
>> -               odp_atomic_fetch_add_u64(&a64u, ADD_SUB_CNT);
>> -}
>> -
>> -void test_atomic_sub_32(void)
>> -{
>> -       int i;
>> -
>> -       for (i = 0; i < (CNT / ADD_SUB_CNT); i++)
>> -               odp_atomic_fetch_sub_int(&a32, ADD_SUB_CNT);
>> +               odp_atomic64_fetch_add_rlx(&a64u, ADD_SUB_CNT);
>>  }
>>
>>  void test_atomic_sub_u32(void)
>> @@ -135,7 +98,7 @@ void test_atomic_sub_u32(void)
>>         int i;
>>
>>         for (i = 0; i < (CNT / ADD_SUB_CNT); i++)
>> -               odp_atomic_fetch_sub_u32(&a32u, ADD_SUB_CNT);
>> +               odp_atomic32_fetch_add_rlx(&a32u, -ADD_SUB_CNT);
>>  }
>>
>>  void test_atomic_sub_64(void)
>> @@ -143,19 +106,7 @@ void test_atomic_sub_64(void)
>>         int i;
>>
>>         for (i = 0; i < (CNT / ADD_SUB_CNT); i++)
>> -               odp_atomic_fetch_sub_u64(&a64u, ADD_SUB_CNT);
>> -}
>> -
>> -void test_atomic_inc_dec_32(void)
>> -{
>> -       test_atomic_inc_32();
>> -       test_atomic_dec_32();
>> -}
>> -
>> -void test_atomic_add_sub_32(void)
>> -{
>> -       test_atomic_add_32();
>> -       test_atomic_sub_32();
>> +               odp_atomic64_fetch_add_rlx(&a64u, -ADD_SUB_CNT);
>>  }
>>
>>  void test_atomic_inc_dec_u32(void)
>> @@ -188,11 +139,6 @@ void test_atomic_add_sub_64(void)
>>   */
>>  void test_atomic_basic(void)
>>  {
>> -       test_atomic_inc_32();
>> -       test_atomic_dec_32();
>> -       test_atomic_add_32();
>> -       test_atomic_sub_32();
>> -
>>         test_atomic_inc_u32();
>>         test_atomic_dec_u32();
>>         test_atomic_add_u32();
>> @@ -206,31 +152,24 @@ void test_atomic_basic(void)
>>
>>  void test_atomic_init(void)
>>  {
>> -       odp_atomic_init_int(&a32);
>> -       odp_atomic_init_u32(&a32u);
>> -       odp_atomic_init_u64(&a64u);
>> +       odp_atomic32_store_rlx(&a32u, 0);
>> +       odp_atomic64_store_rlx(&a64u, 0);
>>  }
>>
>>  void test_atomic_store(void)
>>  {
>> -       odp_atomic_store_int(&a32, S32_INIT_VAL);
>> -       odp_atomic_store_u32(&a32u, U32_INIT_VAL);
>> -       odp_atomic_store_u64(&a64u, U64_INIT_VAL);
>> +       odp_atomic32_store_rlx(&a32u, U32_INIT_VAL);
>> +       odp_atomic64_store_rlx(&a64u, U64_INIT_VAL);
>>  }
>>
>>  int test_atomic_validate(void)
>>  {
>> -       if (odp_atomic_load_int(&a32) != S32_INIT_VAL) {
>> -               ODP_ERR("Atomic signed 32 usual functions failed\n");
>> -               return -1;
>> -       }
>> -
>> -       if (odp_atomic_load_u32(&a32u) != U32_INIT_VAL) {
>> +       if (odp_atomic32_load_rlx(&a32u) != U32_INIT_VAL) {
>>                 ODP_ERR("Atomic u32 usual functions failed\n");
>>                 return -1;
>>         }
>>
>> -       if (odp_atomic_load_u64(&a64u) != U64_INIT_VAL) {
>> +       if (odp_atomic64_load_rlx(&a64u) != U64_INIT_VAL) {
>>                 ODP_ERR("Atomic u64 usual functions failed\n");
>>                 return -1;
>>         }
>> @@ -247,11 +186,8 @@ static void *run_thread(void *arg)
>>
>>         ODP_DBG("Thread %i starts\n", thr);
>>
>> -       odp_atomic_inc_int(&numthrds);
>> -
>> -       /* Wait here until all pthreads are created */
>> -       while (*(volatile int *)&numthrds < parg->numthrds)
>> -               ;
>> +       /* Wait here until all threads have arrived */
>> +       odp_barrier_sync(&barrier);
>>
>>         gettimeofday(&tv0[thr], NULL);
>>
>> @@ -259,12 +195,6 @@ static void *run_thread(void *arg)
>>         case TEST_MIX:
>>                 test_atomic_basic();
>>                 break;
>> -       case TEST_INC_DEC_S32:
>> -               test_atomic_inc_dec_32();
>> -               break;
>> -       case TEST_ADD_SUB_S32:
>> -               test_atomic_add_sub_32();
>> -               break;
>>         case TEST_INC_DEC_U32:
>>                 test_atomic_inc_dec_u32();
>>                 break;
>> @@ -327,7 +257,6 @@ int main(int argc, char *argv[])
>>         if (pthrdnum == 0)
>>                 pthrdnum = odp_sys_core_count();
>>
>> -       odp_atomic_init_int(&numthrds);
>>         test_atomic_init();
>>         test_atomic_store();
>>
>> @@ -342,6 +271,7 @@ int main(int argc, char *argv[])
>>                 usage();
>>                 goto err_exit;
>>         }
>> +       odp_barrier_init(&barrier, pthrdnum);
>>         odp_test_thread_create(run_thread, &thrdarg);
>>
>>         odp_test_thread_exit(&thrdarg);
>> diff --git a/test/api_test/odp_atomic_test.h b/test/api_test/odp_atomic_test.h
>> index 7814da5..aaa9d34 100644
>> --- a/test/api_test/odp_atomic_test.h
>> +++ b/test/api_test/odp_atomic_test.h
>> @@ -18,14 +18,11 @@
>>  #define ADD_SUB_CNT    5
>>
>>  #define        CNT 500000
>> -#define        S32_INIT_VAL    (1UL << 10)
>>  #define        U32_INIT_VAL    (1UL << 10)
>>  #define        U64_INIT_VAL    (1ULL << 33)
>>
>>  typedef enum {
>>         TEST_MIX = 1, /* Must be first test case num */
>> -       TEST_INC_DEC_S32,
>> -       TEST_ADD_SUB_S32,
>>         TEST_INC_DEC_U32,
>>         TEST_ADD_SUB_U32,
>>         TEST_INC_DEC_64,
>> @@ -34,16 +31,10 @@ typedef enum {
>>  } odp_test_atomic_t;
>>
>>
>> -void test_atomic_inc_dec_32(void);
>> -void test_atomic_add_sub_32(void);
>>  void test_atomic_inc_dec_u32(void);
>>  void test_atomic_add_sub_u32(void);
>>  void test_atomic_inc_dec_64(void);
>>  void test_atomic_add_sub_64(void);
>> -void test_atomic_inc_32(void);
>> -void test_atomic_dec_32(void);
>> -void test_atomic_add_32(void);
>> -void test_atomic_sub_32(void);
>>  void test_atomic_inc_u32(void);
>>  void test_atomic_dec_u32(void);
>>  void test_atomic_add_u32(void);
>> --
>> 1.9.1
>>
>>
>
>
Ola Liljedahl Oct. 16, 2014, 8:53 a.m. UTC | #4
Yes, these fallbacks should be supported by Clang/LLVM. I used clang 3.4 to
compile odp_atomic.h for different architectures (LLVM supports cross
compilation by default). Some (32-bit) architectures may not support e.g. the
__sync builtins on 64-bit data types, but this is a limitation of those
architectures. E.g. 32-bit PowerPC does not support 64-bit atomic operations;
you will get a linker error because libgcc.a will not include the necessary
helper routines. This was a problem with the original code as well. The new
API actually supports a workaround for this limitation because the atomic
data types are now structs and not just scalar variables. Such a struct could
be extended with e.g. a spin lock that protects a multi-word implementation
while still allowing atomic accesses. It wouldn't be very fast...
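
To illustrate what I mean (purely a sketch, everything below is hypothetical
and not part of the patch; it only assumes <stdint.h> and the GCC __sync
builtins), such a lock-based emulation could look roughly like this:

    /* Hypothetical fallback: emulate a 64-bit atomic on a 32-bit target
     * that lacks 64-bit atomics by pairing the value with a spin lock
     * inside the struct. All names here are invented for illustration. */
    typedef struct {
            uint64_t v;        /* the multi-word value */
            volatile int lock; /* 0 = free, 1 = taken */
    } emul_atomic64_t;

    static inline uint64_t emul_atomic64_fetch_add(emul_atomic64_t *a,
                                                   uint64_t incr)
    {
            uint64_t old;
            /* Take the lock: __sync_lock_test_and_set returns the previous
             * value and implies an acquire barrier, so spin while it was 1 */
            while (__sync_lock_test_and_set(&a->lock, 1))
                    ;
            old = a->v;
            a->v = old + incr;
            /* Drop the lock: stores 0 with release semantics */
            __sync_lock_release(&a->lock);
            return old;
    }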

Where do I add "-std=c99 -pedantic" in the ODP makefiles to try this for
all of linux-generic?
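
(Untested guess on my side, assuming the usual autotools flow applies here;
something like

    ./configure CFLAGS="-std=c99 -pedantic -O2"
    make

from the top level should apply the flags to all of linux-generic without
editing any makefile, but I have not verified this for the ODP build.)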

The true performance improvement will only come on architectures that
support acquire and release semantics natively, e.g. ARMv8. But by combing
through the code I have decreased the number of barriers (there was some
redundancy), so the code might be faster on e.g. Cortex-A15 and possibly
OCTEON as well. Counter updates, which can now be relaxed (no barriers),
should definitely be faster.


On 16 October 2014 03:38, Mike Holmes <mike.holmes@linaro.org> wrote:

>
>
> On 15 October 2014 19:18, Bill Fischofer <bill.fischofer@linaro.org>
> wrote:
>
>> Why the odd suffixes (_rlx, _rls, _acq) on these routines?  If they are
>> intended to be generic wouldn't omitting these be better?
>>
>> On Wed, Oct 15, 2014 at 4:46 PM, Ola Liljedahl <ola.liljedahl@linaro.org>
>> wrote:
>>
>>> Signed-off-by: Ola Liljedahl <ola.liljedahl@linaro.org>
>>> ---
>>> Implementation of C11-based memory model for atomic operations.
>>> Attempt to remove all explicit memory barriers (odp_sync_stores) from
>>> code that
>>> implements multithreaded synchronization primitives (e.g. locks,
>>> barriers).
>>> Rewrote such primitives to use the new atomic operations.
>>> Optimized support for ARMv6/v7, ARMv8(aarch64), x86_64, MIPS64/OCTEON
>>> Other architectures will fall back to GCC __sync builtins which often
>>> include
>>> unnecessarily heavy barrier/sync operations (always sequentially
>>> consistent).
>>>
>>
> Are these fallbacks also 100% supported by LLVM?
> The ODP linux-generic implementation is already compiler-crippled and not
> C-standard compliant, with Variable Length Arrays in Structs (VLAIS).
> The Linux kernel is still trying to fix those GCCisms:
> http://lkml.iu.edu/hypermail/linux/kernel/1410.1/03100.html
>
> If this works with -std=c99 -pedantic etc. for both gcc and llvm, then from
> your introduction it looks like it would be an improvement. Is there any
> form of benchmark to show that it helps?
>
>
>> Fixed race conditions in odp_barrier_sync() (non-atomic wrap of counter)
>>> and
>>> odp_ring enqueue/dequeue (need release barrier but only had compiler
>>> barrier).
>>>
>>>  example/generator/odp_generator.c                  |  43 +-
>>>  example/ipsec/odp_ipsec.c                          |   2 +-
>>>  example/odp_example/odp_example.c                  |   2 +-
>>>  example/timer/odp_timer_test.c                     |   2 +-
>>>  helper/include/odph_ring.h                         |   8 +-
>>>  platform/linux-generic/include/api/odp_atomic.h    | 820 ++++++++++++---------
>>>  platform/linux-generic/include/api/odp_barrier.h   |  10 +-
>>>  platform/linux-generic/include/api/odp_rwlock.h    |  20 +-
>>>  .../linux-generic/include/api/odp_ticketlock.h     |   4 +-
>>>  .../linux-generic/include/odp_buffer_internal.h    |   2 +-
>>>  platform/linux-generic/odp_barrier.c               |  43 +-
>>>  platform/linux-generic/odp_buffer.c                |   3 +-
>>>  platform/linux-generic/odp_crypto.c                |   4 +-
>>>  platform/linux-generic/odp_queue.c                 |   7 +-
>>>  platform/linux-generic/odp_ring.c                  |  86 ++-
>>>  platform/linux-generic/odp_rwlock.c                |  46 +-
>>>  platform/linux-generic/odp_thread.c                |   6 +-
>>>  platform/linux-generic/odp_ticketlock.c            |  27 +-
>>>  platform/linux-generic/odp_timer.c                 |  17 +-
>>>  test/api_test/odp_atomic_test.c                    | 126 +---
>>>  test/api_test/odp_atomic_test.h                    |   9 -
>>>  21 files changed, 651 insertions(+), 636 deletions(-)
>>>
>>> diff --git a/example/generator/odp_generator.c b/example/generator/odp_generator.c
>>> index eb8b340..cf2d77b 100644
>>> --- a/example/generator/odp_generator.c
>>> +++ b/example/generator/odp_generator.c
>>> @@ -62,10 +62,10 @@ typedef struct {
>>>   * counters
>>>  */
>>>  static struct {
>>> -       odp_atomic_u64_t seq;   /**< ip seq to be send */
>>> -       odp_atomic_u64_t ip;    /**< ip packets */
>>> -       odp_atomic_u64_t udp;   /**< udp packets */
>>> -       odp_atomic_u64_t icmp;  /**< icmp packets */
>>> +       odp_atomic64_t seq;     /**< ip seq to be send */
>>> +       odp_atomic64_t ip;      /**< ip packets */
>>> +       odp_atomic64_t udp;     /**< udp packets */
>>> +       odp_atomic64_t icmp;    /**< icmp packets */
>>>  } counters;
>>>
>>>  /** * Thread specific arguments
>>> @@ -201,7 +201,7 @@ static void pack_udp_pkt(odp_buffer_t obuf)
>>>         ip->tot_len = odp_cpu_to_be_16(args->appl.payload + ODPH_UDPHDR_LEN +
>>>                                        ODPH_IPV4HDR_LEN);
>>>         ip->proto = ODPH_IPPROTO_UDP;
>>> -       seq = odp_atomic_fetch_add_u64(&counters.seq, 1) % 0xFFFF;
>>> +       seq = odp_atomic64_fetch_add_rlx(&counters.seq, 1) % 0xFFFF;
>>>         ip->id = odp_cpu_to_be_16(seq);
>>>         ip->chksum = 0;
>>>         odph_ipv4_csum_update(pkt);
>>> @@ -258,7 +258,7 @@ static void pack_icmp_pkt(odp_buffer_t obuf)
>>>         ip->tot_len = odp_cpu_to_be_16(args->appl.payload + ODPH_ICMPHDR_LEN +
>>>                                        ODPH_IPV4HDR_LEN);
>>>         ip->proto = ODPH_IPPROTO_ICMP;
>>> -       seq = odp_atomic_fetch_add_u64(&counters.seq, 1) % 0xffff;
>>> +       seq = odp_atomic64_fetch_add_rlx(&counters.seq, 1) % 0xffff;
>>>         ip->id = odp_cpu_to_be_16(seq);
>>>         ip->chksum = 0;
>>>         odph_ipv4_csum_update(pkt);
>>> @@ -334,13 +334,15 @@ static void *gen_send_thread(void *arg)
>>>                 }
>>>
>>>                 if (args->appl.interval != 0) {
>>> +                       uint64_t seq = odp_atomic64_load_rlx(&counters.seq);
>>>                         printf("  [%02i] send pkt no:%ju seq %ju\n",
>>> -                              thr, counters.seq, counters.seq%0xffff);
>>> +                              thr, seq, seq%0xffff);
>>>                         /* TODO use odp timer */
>>>                         usleep(args->appl.interval * 1000);
>>>                 }
>>> -               if (args->appl.number != -1 && counters.seq
>>> -                   >= (unsigned int)args->appl.number) {
>>> +               if (args->appl.number != -1 &&
>>> +                   odp_atomic64_load_rlx(&counters.seq) >=
>>> +                   (unsigned int)args->appl.number) {
>>>                         break;
>>>                 }
>>>         }
>>> @@ -348,7 +350,8 @@ static void *gen_send_thread(void *arg)
>>>         /* receive number of reply pks until timeout */
>>>         if (args->appl.mode == APPL_MODE_PING && args->appl.number > 0) {
>>>                 while (args->appl.timeout >= 0) {
>>> -                       if (counters.icmp >= (unsigned int)args->appl.number)
>>> +                       if (odp_atomic64_load_rlx(&counters.icmp) >=
>>> +                           (unsigned int)args->appl.number)
>>>                                 break;
>>>                         /* TODO use odp timer */
>>>                         sleep(1);
>>> @@ -358,10 +361,12 @@ static void *gen_send_thread(void *arg)
>>>
>>>         /* print info */
>>>         if (args->appl.mode == APPL_MODE_UDP) {
>>> -               printf("  [%02i] total send: %ju\n", thr, counters.seq);
>>> +               printf("  [%02i] total send: %ju\n", thr,
>>> +                      odp_atomic64_load_rlx(&counters.seq));
>>>         } else if (args->appl.mode == APPL_MODE_PING) {
>>>                 printf("  [%02i] total send: %ju total receive: %ju\n",
>>> -                      thr, counters.seq, counters.icmp);
>>> +                      thr, odp_atomic64_load_rlx(&counters.seq),
>>> +                      odp_atomic64_load_rlx(&counters.icmp));
>>>         }
>>>         return arg;
>>>  }
>>> @@ -395,7 +400,7 @@ static void print_pkts(int thr, odp_packet_t pkt_tbl[], unsigned len)
>>>                 if (!odp_packet_inflag_ipv4(pkt))
>>>                         continue;
>>>
>>> -               odp_atomic_inc_u64(&counters.ip);
>>> +               odp_atomic64_add_rlx(&counters.ip, 1);
>>>                 rlen += sprintf(msg, "receive Packet proto:IP ");
>>>                 buf = odp_buffer_addr(odp_buffer_from_packet(pkt));
>>>                 ip = (odph_ipv4hdr_t *)(buf + odp_packet_l3_offset(pkt));
>>> @@ -405,7 +410,7 @@ static void print_pkts(int thr, odp_packet_t pkt_tbl[], unsigned len)
>>>
>>>                 /* udp */
>>>                 if (ip->proto == ODPH_IPPROTO_UDP) {
>>> -                       odp_atomic_inc_u64(&counters.udp);
>>> +                       odp_atomic64_add_rlx(&counters.udp, 1);
>>>                         udp = (odph_udphdr_t *)(buf + offset);
>>>                         rlen += sprintf(msg + rlen, "UDP payload %d ",
>>>                                         odp_be_to_cpu_16(udp->length) -
>>> @@ -417,7 +422,7 @@ static void print_pkts(int thr, odp_packet_t pkt_tbl[], unsigned len)
>>>                         icmp = (odph_icmphdr_t *)(buf + offset);
>>>                         /* echo reply */
>>>                         if (icmp->type == ICMP_ECHOREPLY) {
>>> -                               odp_atomic_inc_u64(&counters.icmp);
>>> +                               odp_atomic64_add_rlx(&counters.icmp, 1);
>>>                                 memcpy(&tvsend, buf + offset + ODPH_ICMPHDR_LEN,
>>>                                        sizeof(struct timeval));
>>>                                 /* TODO This should be changed to use an
>>> @@ -530,10 +535,10 @@ int main(int argc, char *argv[])
>>>         }
>>>
>>>         /* init counters */
>>> -       odp_atomic_init_u64(&counters.seq);
>>> -       odp_atomic_init_u64(&counters.ip);
>>> -       odp_atomic_init_u64(&counters.udp);
>>> -       odp_atomic_init_u64(&counters.icmp);
>>> +       odp_atomic64_store_rlx(&counters.seq, 0);
>>> +       odp_atomic64_store_rlx(&counters.ip, 0);
>>> +       odp_atomic64_store_rlx(&counters.udp, 0);
>>> +       odp_atomic64_store_rlx(&counters.icmp, 0);
>>>
>>>         /* Reserve memory for args from shared mem */
>>>         shm = odp_shm_reserve("shm_args", sizeof(args_t),
>>> diff --git a/example/ipsec/odp_ipsec.c b/example/ipsec/odp_ipsec.c
>>> index 2f2dc19..76c27d0 100644
>>> --- a/example/ipsec/odp_ipsec.c
>>> +++ b/example/ipsec/odp_ipsec.c
>>> @@ -1223,7 +1223,7 @@ main(int argc, char *argv[])
>>>         printf("Num worker threads: %i\n", num_workers);
>>>
>>>         /* Create a barrier to synchronize thread startup */
>>> -       odp_barrier_init_count(&sync_barrier, num_workers);
>>> +       odp_barrier_init(&sync_barrier, num_workers);
>>>
>>>         /*
>>>          * By default core #0 runs Linux kernel background tasks.
>>> diff --git a/example/odp_example/odp_example.c b/example/odp_example/odp_example.c
>>> index 0e9aa3d..c473395 100644
>>> --- a/example/odp_example/odp_example.c
>>> +++ b/example/odp_example/odp_example.c
>>> @@ -1120,7 +1120,7 @@ int main(int argc, char *argv[])
>>>         odp_shm_print_all();
>>>
>>>         /* Barrier to sync test case execution */
>>> -       odp_barrier_init_count(&globals->barrier, num_workers);
>>> +       odp_barrier_init(&globals->barrier, num_workers);
>>>
>>>         if (args.proc_mode) {
>>>                 int ret;
>>> diff --git a/example/timer/odp_timer_test.c b/example/timer/odp_timer_test.c
>>> index 78b2ae2..dfbeae9 100644
>>> --- a/example/timer/odp_timer_test.c
>>> +++ b/example/timer/odp_timer_test.c
>>> @@ -372,7 +372,7 @@ int main(int argc, char *argv[])
>>>         printf("\n");
>>>
>>>         /* Barrier to sync test case execution */
>>> -       odp_barrier_init_count(&test_barrier, num_workers);
>>> +       odp_barrier_init(&test_barrier, num_workers);
>>>
>>>         /* Create and launch worker threads */
>>>         odph_linux_pthread_create(thread_tbl, num_workers, first_core,
>>> diff --git a/helper/include/odph_ring.h b/helper/include/odph_ring.h
>>> index 76c1db8..5e78b34 100644
>>> --- a/helper/include/odph_ring.h
>>> +++ b/helper/include/odph_ring.h
>>> @@ -138,8 +138,8 @@ typedef struct odph_ring {
>>>                 uint32_t sp_enqueue;     /* True, if single producer. */
>>>                 uint32_t size;           /* Size of ring. */
>>>                 uint32_t mask;           /* Mask (size-1) of ring. */
>>> -               uint32_t head;          /* Producer head. */
>>> -               uint32_t tail;          /* Producer tail. */
>>> +               odp_atomic32_t head;    /* Producer head. */
>>> +               odp_atomic32_t tail;    /* Producer tail. */
>>>         } prod ODP_ALIGNED_CACHE;
>>>
>>>         /** @private Consumer */
>>> @@ -147,8 +147,8 @@ typedef struct odph_ring {
>>>                 uint32_t sc_dequeue;     /* True, if single consumer. */
>>>                 uint32_t size;           /* Size of the ring. */
>>>                 uint32_t mask;           /* Mask (size-1) of ring. */
>>> -               uint32_t head;          /* Consumer head. */
>>> -               uint32_t tail;          /* Consumer tail. */
>>> +               odp_atomic32_t head;    /* Consumer head. */
>>> +               odp_atomic32_t tail;    /* Consumer tail. */
>>>         } cons ODP_ALIGNED_CACHE;
>>>
>>>         /** @private Memory space of ring starts here. */
>>> diff --git a/platform/linux-generic/include/api/odp_atomic.h b/platform/linux-generic/include/api/odp_atomic.h
>>> index 0cc4cf4..89f183c 100644
>>> --- a/platform/linux-generic/include/api/odp_atomic.h
>>> +++ b/platform/linux-generic/include/api/odp_atomic.h
>>> @@ -4,463 +4,559 @@
>>>   * SPDX-License-Identifier:     BSD-3-Clause
>>>   */
>>>
>>> -
>>>  /**
>>>   * @file
>>>   *
>>> - * ODP atomic operations
>>> + * ODP atomic types and operations, semantically a subset of C11 atomics.
>>> + * Scalar variable wrapped in a struct to avoid accessing scalar directly
>>> + * without using the required access functions.
>>> + * Atomic functions must be used to operate on atomic variables!
>>>   */
>>>
>>>  #ifndef ODP_ATOMIC_H_
>>>  #define ODP_ATOMIC_H_
>>>
>>> +#include <stdint.h>
>>> +#include <odp_align.h>
>>> +#include <odp_hints.h>
>>> +
>>>  #ifdef __cplusplus
>>>  extern "C" {
>>>  #endif
>>>
>>> -
>>> -#include <odp_std_types.h>
>>> -
>>> -
>>> -/**
>>> - * Atomic integer
>>> - */
>>> -typedef volatile int32_t odp_atomic_int_t;
>>> -
>>>  /**
>>> - * Atomic unsigned integer 64 bits
>>> + * 32-bit (unsigned) atomic type
>>>   */
>>> -typedef volatile uint64_t odp_atomic_u64_t;
>>> +typedef struct {
>>> +       uint32_t v; /**< Actual storage for the atomic variable */
>>> +} odp_atomic32_t
>>> +ODP_ALIGNED(sizeof(uint32_t)); /* Enforce alignment! */
>>>
>>>  /**
>>> - * Atomic unsigned integer 32 bits
>>> + * 64-bit (unsigned) atomic type
>>>   */
>>> -typedef volatile uint32_t odp_atomic_u32_t;
>>> -
>>> +typedef struct {
>>> +       uint64_t v; /**< Actual storage for the atomic variable */
>>> +} odp_atomic64_t
>>> +ODP_ALIGNED(sizeof(uint64_t)); /* Enforce alignment! */
>>>
>>> -/**
>>> - * Initialize atomic integer
>>> - *
>>> - * @param ptr    An integer atomic variable
>>> - *
>>> - * @note The operation is not synchronized with other threads
>>> - */
>>> -static inline void odp_atomic_init_int(odp_atomic_int_t *ptr)
>>> -{
>>> -       *ptr = 0;
>>> -}
>>> -
>>> -/**
>>> - * Load value of atomic integer
>>> - *
>>> - * @param ptr    An atomic variable
>>> - *
>>> - * @return atomic integer value
>>> - *
>>> - * @note The operation is not synchronized with other threads
>>> - */
>>> -static inline int odp_atomic_load_int(odp_atomic_int_t *ptr)
>>> -{
>>> -       return *ptr;
>>> -}
>>>
>>> +/*****************************************************************************
>>> + * Just a few helpers
>>> + *****************************************************************************/
>>>
>>> -/**
>>> - * Store value to atomic integer
>>> - *
>>> - * @param ptr        An atomic variable
>>> - * @param new_value  Store new_value to a variable
>>> - *
>>> - * @note The operation is not synchronized with other threads
>>> - */
>>> -static inline void odp_atomic_store_int(odp_atomic_int_t *ptr, int new_value)
>>> -{
>>> -       *ptr = new_value;
>>> -}
>>> -
>>> -/**
>>> - * Fetch and add atomic integer
>>> - *
>>> - * @param ptr    An atomic variable
>>> - * @param value  A value to be added to the variable
>>> - *
>>> - * @return Value of the variable before the operation
>>> - */
>>> -static inline int odp_atomic_fetch_add_int(odp_atomic_int_t *ptr, int value)
>>> -{
>>> -       return __sync_fetch_and_add(ptr, value);
>>> -}
>>> -
>>> -/**
>>> - * Fetch and subtract atomic integer
>>> - *
>>> - * @param ptr    An atomic integer variable
>>> - * @param value  A value to be subtracted from the variable
>>> - *
>>> - * @return Value of the variable before the operation
>>> - */
>>> -static inline int odp_atomic_fetch_sub_int(odp_atomic_int_t *ptr, int value)
>>> -{
>>> -       return __sync_fetch_and_sub(ptr, value);
>>> -}
>>> -
>>> -/**
>>> - * Fetch and increment atomic integer by 1
>>> - *
>>> - * @param ptr    An atomic variable
>>> - *
>>> - * @return Value of the variable before the operation
>>> - */
>>> -static inline int odp_atomic_fetch_inc_int(odp_atomic_int_t *ptr)
>>> -{
>>> -       return odp_atomic_fetch_add_int(ptr, 1);
>>> -}
>>> -
>>> -/**
>>> - * Increment atomic integer by 1
>>> - *
>>> - * @param ptr    An atomic variable
>>> - *
>>> - */
>>> -static inline void odp_atomic_inc_int(odp_atomic_int_t *ptr)
>>> -{
>>> -       odp_atomic_fetch_add_int(ptr, 1);
>>> -}
>>> -
>>> -/**
>>> - * Fetch and decrement atomic integer by 1
>>> - *
>>> - * @param ptr    An atomic int variable
>>> - *
>>> - * @return Value of the variable before the operation
>>> - */
>>> -static inline int odp_atomic_fetch_dec_int(odp_atomic_int_t *ptr)
>>> -{
>>> -       return odp_atomic_fetch_sub_int(ptr, 1);
>>> -}
>>> -
>>> -/**
>>> - * Decrement atomic integer by 1
>>> - *
>>> - * @param ptr    An atomic variable
>>> - *
>>> - */
>>> -static inline void odp_atomic_dec_int(odp_atomic_int_t *ptr)
>>> -{
>>> -       odp_atomic_fetch_sub_int(ptr, 1);
>>> -}
>>> +#ifdef __OCTEON__
>>> +/* OCTEON Write Memory Barrier */
>>> +#define COMPILER_HW_BARRIER() __asm __volatile( \
>>> +       /* Double syncw to work around errata */ \
>>> +       ".set push\n\t.set arch=octeon\n\tsyncw\n\tsyncw\n\t.set pop" \
>>> +       : : : "memory")
>>> +/* syncw is also used to flush the write buffer which makes stores visible
>>> + * quicker which should be beneficial to release operations */
>>> +#define OCTEON_FLUSH() __asm __volatile( \
>>> +       ".set push\n\t.set arch=octeon\n\tsyncw\n\t.set pop" \
>>> +       : : : "memory")
>>> +#else
>>> +/* __sync_synchronize() generates the right insn for ARMv6t2 and ARMv7-a */
>>> +/** Compiler and hardware full memory barrier */
>>> +#define COMPILER_HW_BARRIER() __sync_synchronize()
>>> +/** Flush write buffer on OCTEON */
>>> +#define OCTEON_FLUSH() (void)0
>>> +#endif
>>>
>>> -/**
>>> - * Initialize atomic uint32
>>> - *
>>> - * @param ptr    An atomic variable
>>> - *
>>> - * @note The operation is not synchronized with other threads
>>> - */
>>> -static inline void odp_atomic_init_u32(odp_atomic_u32_t *ptr)
>>> -{
>>> -       *ptr = 0;
>>> -}
>>> +/** Compiler memory barrier */
>>> +#define COMPILER_BARRIER() __asm __volatile("" : : : "memory")
>>>
>>> -/**
>>> - * Load value of atomic uint32
>>> - *
>>> - * @param ptr    An atomic variable
>>> - *
>>> - * @return atomic uint32 value
>>> - *
>>> - * @note The operation is not synchronized with other threads
>>> - */
>>> -static inline uint32_t odp_atomic_load_u32(odp_atomic_u32_t *ptr)
>>> -{
>>> -       return *ptr;
>>> -}
>>>
>>> +/*****************************************************************************
>>> + * Operations on 32-bit atomics
>>> + * odp_atomic32_load_rlx
>>> + * odp_atomic32_store_rlx
>>> + * odp_atomic32_load_acq
>>> + * odp_atomic32_store_rls
>>> + * odp_atomic32_cmp_and_swap_rlx - return old value
>>> + * odp_atomic32_fetch_add_rlx - return old value
>>> + * odp_atomic32_fetch_add_rls - return old value
>>> + * odp_atomic32_add_rlx - no return value
>>> + * odp_atomic32_add_rls - no return value
>>> + *****************************************************************************/
>>>
>>>  /**
>>> - * Store value to atomic uint32
>>> + * Relaxed atomic load of 32-bit atomic variable
>>> + * @note Relaxed memory model, no barriers.
>>>   *
>>> - * @param ptr        An atomic variable
>>> - * @param new_value  Store new_value to a variable
>>> + * @param ptr   Pointer to a 32-bit atomic variable
>>>   *
>>> - * @note The operation is not synchronized with other threads
>>> + * @return Value of the variable
>>>   */
>>> -static inline void odp_atomic_store_u32(odp_atomic_u32_t *ptr,
>>> -                                       uint32_t new_value)
>>> +static inline uint32_t odp_atomic32_load_rlx(const odp_atomic32_t *ptr)
>>>  {
>>> -       *ptr = new_value;
>>> +       uint32_t val;
>>> +       COMPILER_BARRIER();
>>> +       /* Read of aligned word is atomic */
>>> +       val = ptr->v;
>>> +       COMPILER_BARRIER();
>>> +       return val;
>>>  }
>>>
>>>  /**
>>> - * Fetch and add atomic uint32
>>> - *
>>> - * @param ptr    An atomic variable
>>> - * @param value  A value to be added to the variable
>>> + * Relaxed atomic store of 32-bit atomic variable
>>> + * @note Relaxed memory model, no barriers.
>>>   *
>>> - * @return Value of the variable before the operation
>>> + * @param ptr   Pointer to a 32-bit atomic variable
>>> + * @param val   Value to write to the variable
>>>   */
>>> -static inline uint32_t odp_atomic_fetch_add_u32(odp_atomic_u32_t *ptr,
>>> -                                               uint32_t value)
>>> +static inline void odp_atomic32_store_rlx(odp_atomic32_t *ptr, uint32_t val)
>>>  {
>>> -       return __sync_fetch_and_add(ptr, value);
>>> +       COMPILER_BARRIER();
>>> +       /* Write of aligned word is atomic */
>>> +       ptr->v = val;
>>> +       COMPILER_BARRIER();
>>>  }
>>>
>>>  /**
>>> - * Fetch and subtract uint32
>>> + * Atomic load-acquire of 32-bit atomic variable
>>> + * @note SC-load-acquire barrier, later accesses cannot move before
>>> + * the load-acquire access.
>>>   *
>>> - * @param ptr    An atomic variable
>>> - * @param value  A value to be sub to the variable
>>> + * @param ptr   Pointer to a 32-bit atomic variable
>>>   *
>>> - * @return Value of the variable before the operation
>>> + * @return Value of the variable
>>>   */
>>> -static inline uint32_t odp_atomic_fetch_sub_u32(odp_atomic_u32_t *ptr,
>>> -                                               uint32_t value)
>>> +static inline uint32_t odp_atomic32_load_acq(const odp_atomic32_t *ptr)
>>>  {
>>> -       return __sync_fetch_and_sub(ptr, value);
>>> +#if defined __aarch64__
>>> +       uint32_t val;
>>> +       __asm __volatile("ldar %w0, [%1]"
>>> +                : "=&r"(val)
>>> +                                : "r"(&ptr->v)
>>> +                                : "memory");
>>> +       return val;
>>> +#elif defined __arm__  || defined __mips64__ || defined __x86_64__
>>> +       /* Read of aligned word is atomic */
>>> +       uint32_t val = ptr->v;
>>> +       /* To prevent later accesses from moving up */
>>> +       /* FIXME: Herb Sutter claims HW barrier not needed on x86? */
>>> +       COMPILER_HW_BARRIER();
>>> +       return val;
>>> +#else
>>> +#warning odp_atomic32_load_acq() may not be efficiently implemented
>>> +       /* Assume read of aligned word is atomic */
>>> +       uint32_t val = ptr->v;
>>> +       /* To prevent later accesses from moving up */
>>> +       COMPILER_HW_BARRIER();
>>> +       return val;
>>> +#endif
>>>  }
>>>
>>>  /**
>>> - * Fetch and increment atomic uint32 by 1
>>> - *
>>> - * @param ptr    An atomic variable
>>> - *
>>> - * @return Value of the variable before the operation
>>> - */
>>> -#if defined __OCTEON__
>>> -
>>> -static inline uint32_t odp_atomic_fetch_inc_u32(odp_atomic_u32_t *ptr)
>>> -{
>>> -       uint32_t ret;
>>> -
>>> -       __asm__ __volatile__ ("syncws");
>>> -       __asm__ __volatile__ ("lai %0,(%2)" : "=r" (ret), "+m" (ptr) :
>>> -                             "r" (ptr));
>>> -
>>> -       return ret;
>>> -}
>>> -
>>> + * Atomic store-release of 32-bit atomic variable
>>> + * @note SC-store-release barrier, earlier accesses cannot move after
>>> + * store-release access.
>>> + *
>>> + * @param ptr  Pointer to a 32-bit atomic variable
>>> + * @param val  Value to write to the atomic variable
>>> + */
>>> +static inline void odp_atomic32_store_rls(odp_atomic32_t *ptr, uint32_t val)
>>> +{
>>> +#if defined __arm__ /* A32/T32 ISA */
>>> +       /* Compiler and HW barrier to prevent earlier accesses from moving
>>> +        * down */
>>> +       COMPILER_HW_BARRIER();
>>> +       /* Write of aligned word is atomic */
>>> +       ptr->v = val;
>>> +       /* Compiler and HW barrier to prevent this store from moving down after
>>> +        * a later load-acquire and thus create overlapping critical sections.
>>> +        * Herb Sutter thinks this is needed */
>>> +       COMPILER_HW_BARRIER();
>>> +#elif defined __aarch64__
>>> +       __asm __volatile("stlr %w0, [%1]"
>>> +                :
>>> +                : "r"(val), "r"(&ptr->v)
>>> +                                : "memory");
>>> +#elif defined __mips64__
>>> +       /* Compiler and HW barrier to prevent earlier accesses from moving
>>> +        * down */
>>> +       COMPILER_HW_BARRIER();
>>> +       /* Write of aligned word is atomic */
>>> +       ptr->v = val;
>>> +       /* Compiler and HW barrier to prevent this store from moving down after
>>> +        * a later load-acquire and thus create overlapping critical sections.
>>> +        * Herb Sutter thinks this is needed */
>>> +       COMPILER_HW_BARRIER();
>>> +#elif defined __x86_64__
>>> +       /* This is actually an atomic exchange operation */
>>> +       /* Generates good code on x86_64 */
>>> +       (void)__sync_lock_test_and_set(&ptr->v, val);
>>>  #else
>>> -
>>> -static inline uint32_t odp_atomic_fetch_inc_u32(odp_atomic_u32_t *ptr)
>>> -{
>>> -       return odp_atomic_fetch_add_u32(ptr, 1);
>>> -}
>>> -
>>> +#warning odp_atomic32_store_rls() may not be efficiently implemented
>>> +       /* This is actually an atomic exchange operation */
>>> +       (void)__sync_lock_test_and_set(&ptr->v, val);
>>>  #endif
>>> -
>>> -/**
>>> - * Increment atomic uint32 by 1
>>> - *
>>> - * @param ptr    An atomic variable
>>> - *
>>> - */
>>> -static inline void odp_atomic_inc_u32(odp_atomic_u32_t *ptr)
>>> -{
>>> -       odp_atomic_fetch_add_u32(ptr, 1);
>>>  }
>>>
>>> -/**
>>> - * Fetch and decrement uint32 by 1
>>> - *
>>> - * @param ptr    An atomic variable
>>> - *
>>> - * @return Value of the variable before the operation
>>> - */
>>> -static inline uint32_t odp_atomic_fetch_dec_u32(odp_atomic_u32_t *ptr)
>>> -{
>>> -       return odp_atomic_fetch_sub_u32(ptr, 1);
>>> -}
>>>
>>>  /**
>>> - * Decrement atomic uint32 by 1
>>> - *
>>> - * @param ptr    An atomic variable
>>> - *
>>> - */
>>> -static inline void odp_atomic_dec_u32(odp_atomic_u32_t *ptr)
>>> -{
>>> -       odp_atomic_fetch_sub_u32(ptr, 1);
>>> + * Atomic compare and swap of 32-bit atomic variable
>>> + * @note Relaxed memory model, no barriers.
>>> + * @note Not compare-and-set! Caller should compare return value with expected
>>> + * parameter to check if swap operation succeeded.
>>> + *
>>> + * @param ptr  Pointer to a 32-bit atomic variable
>>> + * @param exp  Expected old value
>>> + * @param val  New value
>>> + * @return Actual old value; if different from 'exp' then swap failed
>>> + */
>>> +static inline uint32_t
>>> +odp_atomic32_cmp_and_swap_rlx(odp_atomic32_t *ptr,
>>> +                             uint32_t exp,
>>> +                             uint32_t val)
>>> +{
>>> +#if defined __arm__ /* A32/T32 ISA */
>>> +       uint32_t old;
>>> +       int status;
>>> +       do {
>>> +               __asm __volatile("ldrex %0, [%1]"
>>> +                : "=&r"(old)
>>> +                                        : "r"(&ptr->v)
>>> +                                        : "memory");
>>> +               if (odp_unlikely(old != exp)) {
>>> +                       /* Value has changed, can't proceed */
>>> +                       /* Clear exclusive access monitor */
>>> +                       __asm __volatile("clrex");
>>> +                       break;
>>> +               }
>>> +               /* Current value is as expected, attempt to write new value */
>>> +               __asm __volatile("strex %0, %1, [%2]"
>>> +                : "=&r"(status)
>>> +                                        : "r"(val), "r"(&ptr->v)
>>> +                                        : "memory");
>>> +               /* Restart the loop so we can re-read the previous value */
>>> +       } while (odp_unlikely(status != 0));
>>> +       return old;
>>> +#elif defined __aarch64__
>>> +       uint32_t old;
>>> +       int status;
>>> +       do {
>>> +               __asm __volatile("ldxr %w0, [%1]"
>>> +                : "=&r"(old)
>>> +                                        : "r"(&ptr->v)
>>> +                                        : "memory");
>>> +               if (odp_unlikely(old != exp)) {
>>> +                       /* Value has changed, can't proceed */
>>> +                       /* Clear exclusive access monitor */
>>> +                       __asm __volatile("clrex");
>>> +                       break;
>>> +               }
>>> +               /* Current value is as expected, attempt to write new value */
>>> +               __asm __volatile("stxr %w0, %w1, [%2]"
>>> +                : "=&r"(status)
>>> +                                        : "r"(val), "r"(&ptr->v)
>>> +                                        : "memory");
>>> +               /* Restart the loop so we can re-read the previous value */
>>> +       } while (odp_unlikely(status != 0));
>>> +       return old;
>>> +#elif defined __mips64__
>>> +       uint32_t old, new_val;
>>> +       do {
>>> +               __asm __volatile("llw %0, [%1]"
>>> +                : "=&r"(old)
>>> +                                        : "r"(&ptr->v)
>>> +                                        : "memory");
>>> +               if (odp_unlikely(old != exp)) {
>>> +                       /* Value has changed, can't proceed */
>>> +                       break;
>>> +               }
>>> +               /* Current value is as expected, attempt to write new value */
>>> +               new_val = val;
>>> +               __asm __volatile("scw %0, [%1]"
>>> +                : "+&r"(new_val)
>>> +                                        : "r"(&ptr->v)
>>> +                                        : "memory");
>>> +       } while (odp_unlikely(new_val == 0));
>>> +       return old;
>>> +#elif defined __x86_64__
>>> +       /* Generates good code on x86_64 */
>>> +       return __sync_val_compare_and_swap(&ptr->v, exp, val);
>>> +#else
>>> +#warning odp_atomic32_cmp_and_swap_rlx() may not be efficiently implemented
>>> +       return __sync_val_compare_and_swap(&ptr->v, exp, val);
>>> +#endif
>>>  }
>>>
>>>  /**
>>> - * Atomic compare and set for 32bit
>>> - *
>>> - * @param dst destination location into which the value will be written.
>>> - * @param exp expected value.
>>> - * @param src new value.
>>> - * @return Non-zero on success; 0 on failure.
>>> - */
>>> -static inline int
>>> -odp_atomic_cmpset_u32(odp_atomic_u32_t *dst, uint32_t exp, uint32_t src)
>>> -{
>>> -       return __sync_bool_compare_and_swap(dst, exp, src);
>>> + * Atomic fetch and add to 32-bit atomic variable
>>> + * @note Relaxed memory model, no barriers.
>>> + * @note A - B <=> A + (-B)
>>> + *
>>> + * @param ptr   Pointer to a 32-bit atomic variable
>>> + * @param incr  The value to be added to the atomic variable
>>> + *
>>> + * @return Value of the atomic variable before the addition
>>> + */
>>> +static inline uint32_t odp_atomic32_fetch_add_rlx(odp_atomic32_t *ptr,
>>> +               uint32_t incr)
>>> +{
>>> +#if defined __arm__ /* A32/T32 ISA */
>>> +       uint32_t old_val, new_val;
>>> +       int status;
>>> +       do {
>>> +               __asm __volatile("ldrex %0, [%1]"
>>> +                : "=&r"(old_val)
>>> +                                        : "r"(&ptr->v)
>>> +                                        : "memory");
>>> +               new_val = old_val + incr;
>>> +               __asm __volatile("strex %0, %1, [%2]"
>>> +                : "=&r"(status)
>>> +                                        : "r"(new_val), "r"(&ptr->v)
>>> +                                        : "memory");
>>> +       } while (odp_unlikely(status != 0));
>>> +       return old_val;
>>> +#elif defined __aarch64__
>>> +       uint32_t old_val, new_val;
>>> +       int status;
>>> +       do {
>>> +               __asm __volatile("ldxr %w0, [%1]"
>>> +                : "=&r"(old_val)
>>> +                                        : "r"(&ptr->v)
>>> +                                        : "memory");
>>> +               new_val = old_val + incr;
>>> +               __asm __volatile("stxr %w0, %w1, [%2]"
>>> +                : "=&r"(status)
>>> +                                        : "r"(new_val), "r"(&ptr->v)
>>> +                                        : "memory");
>>> +       } while (odp_unlikely(status != 0));
>>> +       return old_val;
>>> +#elif defined __mips64__
>>> +       uint32_t old_val, new_val;
>>> +       do {
>>> +               __asm __volatile("llw %0, [%1]"
>>> +                : "=&r"(old_val)
>>> +                                        : "r"(&ptr->v)
>>> +                                        : "memory");
>>> +               new_val = old_val + incr;
>>> +               __asm __volatile("scw %0, [%1]"
>>> +                : "+&r"(new_val)
>>> +                                        : "r"(&ptr->v)
>>> +                                        : "memory");
>>> +       } while (odp_unlikely(new_val == 0));
>>> +       return old_val;
>>> +#elif defined __x86_64__
>>> +       /* Generates good code on x86_64 */
>>> +       return __sync_fetch_and_add(&ptr->v, incr);
>>> +#else
>>> +#warning odp_atomic32_fetch_add_rlx() may not be efficiently implemented
>>> +       return __sync_fetch_and_add(&ptr->v, incr);
>>> +#endif
>>>  }
>>>
>>>  /**
>>> - * Initialize atomic uint64
>>> + * Atomic fetch and add to 32-bit atomic variable
>>> + * @note Sequentially consistent memory model, barriers before and after
>>> + * the operation.
>>> + * @note A - B <=> A + (-B)
>>>   *
>>> - * @param ptr    An atomic variable
>>> + * @param ptr   Pointer to a 32-bit atomic variable
>>> + * @param incr  The value to be added to the atomic variable
>>>   *
>>> - * @note The operation is not synchronized with other threads
>>> + * @return Value of the atomic variable before the addition
>>>   */
>>> -static inline void odp_atomic_init_u64(odp_atomic_u64_t *ptr)
>>> +static inline uint32_t odp_atomic32_fetch_add_rls(odp_atomic32_t *ptr,
>>> +               uint32_t incr)
>>>  {
>>> -       *ptr = 0;
>>> +#if defined __arm__ /* A32/T32 ISA */
>>> +       COMPILER_HW_BARRIER();
>>> +       return odp_atomic32_fetch_add_rlx(ptr, incr);
>>> +#elif defined __aarch64__
>>> +       /* We basically get acquire/release semantics */
>>> +       return __sync_fetch_and_add(&ptr->v, incr);
>>> +#elif defined __mips64__
>>> +       uint32_t old;
>>> +       COMPILER_HW_BARRIER();
>>> +       old = odp_atomic32_fetch_add_rlx(ptr, incr);
>>> +       OCTEON_FLUSH();
>>> +       return old;
>>> +#elif defined __x86_64__
>>> +       /* Generates good code on x86_64 */
>>> +       return __sync_fetch_and_add(&ptr->v, incr);
>>> +#else
>>> +#warning odp_atomic32_fetch_add_rls() may not be efficiently implemented
>>> +       return __sync_fetch_and_add(&ptr->v, incr);
>>> +#endif
>>>  }
>>>
>>>  /**
>>> - * Load value of atomic uint64
>>> - *
>>> - * @param ptr    An atomic variable
>>> + * Atomic add to 32-bit atomic variable
>>> + * @note Relaxed memory model, no barriers.
>>>   *
>>> - * @return atomic uint64 value
>>> - *
>>> - * @note The operation is not synchronized with other threads
>>> + * @param ptr   Pointer to a 32-bit atomic variable
>>> + * @param incr  The value to be added to the atomic variable
>>>   */
>>> -static inline uint64_t odp_atomic_load_u64(odp_atomic_u64_t *ptr)
>>> +static inline void odp_atomic32_add_rlx(odp_atomic32_t *ptr,
>>> +                                       uint32_t incr)
>>>  {
>>> -       return *ptr;
>>> +       /* Use odp_atomic32_fetch_add_rlx() for now */
>>> +       (void)odp_atomic32_fetch_add_rlx(ptr, incr);
>>>  }
>>>
>>>  /**
>>> - * Store value to atomic uint64
>>> - *
>>> - * @param ptr        An atomic variable
>>> - * @param new_value  Store new_value to a variable
>>> + * Atomic add to 32-bit atomic variable
>>> + * @note Sequentially consistent memory model, barriers before and after
>>> + * the operation.
>>>   *
>>> - * @note The operation is not synchronized with other threads
>>> + * @param ptr   Pointer to a 32-bit atomic variable
>>> + * @param incr  The value to be added to the atomic variable
>>>   */
>>> -static inline void odp_atomic_store_u64(odp_atomic_u64_t *ptr,
>>> -                                       uint64_t new_value)
>>> +static inline void odp_atomic32_add_rls(odp_atomic32_t *ptr, uint32_t
>>> incr)
>>>  {
>>> -       *ptr = new_value;
>>> +       /* Use odp_atomic32_fetch_add_rls() for now */
>>> +       (void)odp_atomic32_fetch_add_rls(ptr, incr);
>>>  }
>>>
>>> -/**
>>> - * Add atomic uint64
>>> - *
>>> - * @param ptr    An atomic variable
>>> - * @param value  A value to be added to the variable
>>> - *
>>> - */
>>> -static inline void odp_atomic_add_u64(odp_atomic_u64_t *ptr, uint64_t
>>> value)
>>> -{
>>> -       __sync_fetch_and_add(ptr, value);
>>> -}
>>>
>>> +/*****************************************************************************
>>> + * Operations on 64-bit atomics
>>> + * odp_atomic64_load_rlx
>>> + * odp_atomic64_store_rlx
>>> + * odp_atomic64_fetch_add_rlx
>>> + * odp_atomic64_add_rlx
>>> + *****************************************************************************/
>>>
>>>  /**
>>> - * Fetch and add atomic uint64
>>> + * Relaxed atomic load of 64-bit atomic variable
>>> + * @note Relaxed memory model, no barriers.
>>>   *
>>> - * @param ptr    An atomic variable
>>> - * @param value  A value to be added to the variable
>>> + * @param ptr   Pointer to a 64-bit atomic variable
>>>   *
>>> - * @return Value of the variable before the operation
>>> + * @return Value of the atomic variable
>>>   */
>>> -
>>> -#if defined __powerpc__ && !defined __powerpc64__
>>> -static inline uint64_t odp_atomic_fetch_add_u64(odp_atomic_u64_t *ptr,
>>> -                                               uint64_t value)
>>> +static inline uint64_t odp_atomic64_load_rlx(odp_atomic64_t *ptr)
>>>  {
>>> -       return __sync_fetch_and_add((odp_atomic_u32_t *)ptr,
>>> -                                   (uint32_t)value);
>>> -}
>>> +#if defined __arm__ /* A32/T32 ISA */
>>> +       uint64_t val;
>>> +       __asm __volatile("ldrexd %0, %H0, [%1]\n\t"
>>> +                        "clrex" /* Clear exclusive access monitor */
>>> +                : "=&r"(val)
>>> +                                : "r"(&ptr->v)
>>> +                                : );
>>> +       return val;
>>> +#elif defined __x86_64__ || defined __aarch64__ || defined __mips64__
>>> +       /* Read of aligned quad/double word is atomic */
>>> +       return ptr->v;
>>>  #else
>>> -static inline uint64_t odp_atomic_fetch_add_u64(odp_atomic_u64_t *ptr,
>>> -                                               uint64_t value)
>>> -{
>>> -       return __sync_fetch_and_add(ptr, value);
>>> -}
>>> +#warning odp_atomic64_load_rlx() may not be efficiently implemented
>>> +       return __sync_fetch_and_or(&ptr->v, 0);
>>>  #endif
>>> -/**
>>> - * Subtract atomic uint64
>>> - *
>>> - * @param ptr    An atomic variable
>>> - * @param value  A value to be subtracted from the variable
>>> - *
>>> - */
>>> -static inline void odp_atomic_sub_u64(odp_atomic_u64_t *ptr, uint64_t
>>> value)
>>> -{
>>> -       __sync_fetch_and_sub(ptr, value);
>>>  }
>>>
>>>  /**
>>> - * Fetch and subtract atomic uint64
>>> - *
>>> - * @param ptr    An atomic variable
>>> - * @param value  A value to be subtracted from the variable
>>> - *
>>> - * @return Value of the variable before the operation
>>> - */
>>> -#if defined __powerpc__ && !defined __powerpc64__
>>> -static inline uint64_t odp_atomic_fetch_sub_u64(odp_atomic_u64_t *ptr,
>>> -                                               uint64_t value)
>>> -{
>>> -       return __sync_fetch_and_sub((odp_atomic_u32_t *)ptr,
>>> -                                   (uint32_t)value);
>>> -}
>>> + * Relaxed atomic store of 64-bit atomic variable
>>> + * @note Relaxed memory model, no barriers.
>>> + *
>>> + * @param ptr  Pointer to a 64-bit atomic variable
>>> + * @param val  Value to write to the atomic variable
>>> + */
>>> +static inline void odp_atomic64_store_rlx(odp_atomic64_t *ptr,
>>> +               uint64_t val)
>>> +{
>>> +#if defined __arm__ /* A32/T32 ISA */
>>> +       uint64_t old_val;
>>> +       int status;
>>> +       do {
>>> +               /* Read atomic variable exclusively so we can write to it
>>> +                * later */
>>> +               __asm __volatile("ldrexd %0, %H0, [%1]"
>>> +                : "=&r"(old_val)
>>> +                                        : "r"(&ptr->v)
>>> +                                        : "memory");
>>> +               (void)old_val; /* Ignore old value */
>>> +               /* Attempt to write the new value */
>>> +               __asm __volatile("strexd %0, %1, %H1, [%2]"
>>> +                : "=&r"(status)
>>> +                                        : "r"(val), "r"(&ptr->v)
>>> +                                        : "memory");
>>> +       } while (odp_unlikely(status != 0)); /* Retry until write succeeds */
>>> +#elif defined __x86_64__ || defined __aarch64__ || defined __mips64__
>>> +       /* Write of aligned quad/double word is atomic */
>>> +       ptr->v = val;
>>>  #else
>>> -static inline uint64_t odp_atomic_fetch_sub_u64(odp_atomic_u64_t *ptr,
>>> -                                               uint64_t value)
>>> -{
>>> -       return __sync_fetch_and_sub(ptr, value);
>>> -}
>>> +#warning odp_atomic64_store_rlx() may not be efficiently implemented
>>> +       /* This is actually an atomic exchange operation */
>>> +       (void)__sync_lock_test_and_set(&ptr->v, val);
>>>  #endif
>>> -/**
>>> - * Fetch and increment atomic uint64 by 1
>>> - *
>>> - * @param ptr    An atomic variable
>>> - *
>>> - * @return Value of the variable before the operation
>>> - */
>>> -static inline uint64_t odp_atomic_fetch_inc_u64(odp_atomic_u64_t *ptr)
>>> -{
>>> -       return odp_atomic_fetch_add_u64(ptr, 1);
>>> -}
>>> -
>>> -/**
>>> - * Increment atomic uint64 by 1
>>> - *
>>> - * @param ptr    An atomic variable
>>> - *
>>> - */
>>> -static inline void odp_atomic_inc_u64(odp_atomic_u64_t *ptr)
>>> -{
>>> -       odp_atomic_fetch_add_u64(ptr, 1);
>>> -}
>>> -
>>> -/**
>>> - * Fetch and decrement atomic uint64 by 1
>>> - *
>>> - * @param ptr    An atomic variable
>>> - *
>>> - * @return Value of the variable before the operation
>>> - */
>>> -static inline uint64_t odp_atomic_fetch_dec_u64(odp_atomic_u64_t *ptr)
>>> -{
>>> -       return odp_atomic_fetch_sub_u64(ptr, 1);
>>>  }
>>>
>>>  /**
>>> - * Decrement atomic uint64 by 1
>>> - *
>>> - * @param ptr    An atomic variable
>>> - *
>>> - */
>>> -static inline void odp_atomic_dec_u64(odp_atomic_u64_t *ptr)
>>> -{
>>> -       odp_atomic_fetch_sub_u64(ptr, 1);
>>> + * Atomic fetch and add to 64-bit atomic variable
>>> + * @note Relaxed memory model, no barriers.
>>> + *
>>> + * @param ptr   Pointer to a 64-bit atomic variable
>>> + * @param incr  The value to be added to the atomic variable
>>> + *
>>> + * @return Value of the atomic variable before the addition
>>> + */
>>> +static inline uint64_t odp_atomic64_fetch_add_rlx(odp_atomic64_t *ptr,
>>> +               uint64_t incr)
>>> +{
>>> +#if defined __arm__ /* A32/T32 ISA */
>>> +       uint64_t old_val, new_val;
>>> +       int status;
>>> +       do {
>>> +               __asm __volatile("ldrexd %0, %H0, [%1]"
>>> +                : "=&r"(old_val)
>>> +                                        : "r"(&ptr->v)
>>> +                                        : "memory");
>>> +               new_val = old_val + incr;
>>> +               __asm __volatile("strexd %0, %1, %H1, [%2]"
>>> +                : "=&r"(status)
>>> +                                        : "r"(new_val), "r"(&ptr->v)
>>> +                                        : "memory");
>>> +       } while (odp_unlikely(status != 0)); /* Retry until write succeeds */
>>> +       return old_val;
>>> +#elif defined __aarch64__
>>> +       uint64_t old_val, new_val;
>>> +       int status;
>>> +       do {
>>> +               __asm __volatile("ldxr %x0, [%1]"
>>> +                : "=&r"(old_val)
>>> +                                        : "r"(&ptr->v)
>>> +                                        : "memory");
>>> +               new_val = old_val + incr;
>>> +               __asm __volatile("stxr %w0, %x1, [%2]"
>>> +                : "=&r"(status)
>>> +                                        : "r"(new_val), "r"(&ptr->v)
>>> +                                        : "memory");
>>> +       } while (odp_unlikely(status != 0)); /* Retry until write succeeds */
>>> +       return old_val;
>>> +#elif defined __mips64__
>>> +       uint64_t old_val, new_val;
>>> +       do {
>>> +               __asm __volatile("ll %0, [%1]"
>>> +                : "=&r"(old_val)
>>> +                                        : "r"(&ptr->v)
>>> +                                        : "memory");
>>> +               new_val = old_val + incr;
>>> +               __asm __volatile("sc %0, [%1]"
>>> +                : "+&r"(new_val)
>>> +                                        : "r"(&ptr->v)
>>> +                                        : "memory");
>>> +       } while (odp_unlikely(new_val == 0));
>>> +       return old_val;
>>> +#elif defined __x86_64__
>>> +       /* Generates good code on x86_64 */
>>> +       return __sync_fetch_and_add(&ptr->v, incr);
>>> +#else
>>> +#warning odp_atomic64_fetch_add_rlx() may not be efficiently implemented
>>> +       return __sync_fetch_and_add(&ptr->v, incr);
>>> +#endif
>>>  }
>>>
>>>  /**
>>> - * Atomic compare and set for 64bit
>>> + * Atomic add to 64-bit atomic variable
>>> + * @note Relaxed memory model, no barriers.
>>>   *
>>> - * @param dst destination location into which the value will be written.
>>> - * @param exp expected value.
>>> - * @param src new value.
>>> - * @return Non-zero on success; 0 on failure.
>>> + * @param ptr   Pointer to a 64-bit atomic variable
>>> + * @param incr  The value to be added to the atomic variable
>>>   */
>>> -static inline int
>>> -odp_atomic_cmpset_u64(odp_atomic_u64_t *dst, uint64_t exp, uint64_t src)
>>> +static inline void odp_atomic64_add_rlx(odp_atomic64_t *ptr, uint64_t
>>> incr)
>>>  {
>>> -       return __sync_bool_compare_and_swap(dst, exp, src);
>>> +       (void)odp_atomic64_fetch_add_rlx(ptr, incr);
>>>  }
>>>
>>>  #ifdef __cplusplus
>>> diff --git a/platform/linux-generic/include/api/odp_barrier.h
>>> b/platform/linux-generic/include/api/odp_barrier.h
>>> index a7b3215..f8eae9a 100644
>>> --- a/platform/linux-generic/include/api/odp_barrier.h
>>> +++ b/platform/linux-generic/include/api/odp_barrier.h
>>> @@ -27,18 +27,18 @@ extern "C" {
>>>   * ODP execution barrier
>>>   */
>>>  typedef struct odp_barrier_t {
>>> -       int              count;  /**< @private Thread count */
>>> -       odp_atomic_int_t bar;    /**< @private Barrier counter */
>>> +       uint32_t       num_threads;  /**< @private Thread count (constant) */
>>> +       odp_atomic32_t in_barrier;   /**< @private Threads in barrier */
>>>  } odp_barrier_t;
>>>
>>>
>>>  /**
>>>   * Init barrier with thread count
>>>   *
>>> - * @param barrier    Barrier
>>> - * @param count      Thread count
>>> + * @param barrier     Barrier
>>> + * @param num_threads Number of threads which share the barrier
>>>   */
>>> -void odp_barrier_init_count(odp_barrier_t *barrier, int count);
>>> +void odp_barrier_init(odp_barrier_t *barrier, uint32_t num_threads);
>>>
>>>
>>>  /**
>>> diff --git a/platform/linux-generic/include/api/odp_rwlock.h
>>> b/platform/linux-generic/include/api/odp_rwlock.h
>>> index 252ebb2..ff8a9a2 100644
>>> --- a/platform/linux-generic/include/api/odp_rwlock.h
>>> +++ b/platform/linux-generic/include/api/odp_rwlock.h
>>> @@ -10,26 +10,30 @@
>>>  /**
>>>   * @file
>>>   *
>>> - * ODP RW Locks
>>> + * ODP read/write lock
>>> + * A RW lock supports multiple concurrent readers but only one
>>> + * (exclusive) writer.
>>>   */
>>>
>>> +#include <odp_atomic.h>
>>> +
>>>  #ifdef __cplusplus
>>>  extern "C" {
>>>  #endif
>>>
>>>  /**
>>>   * The odp_rwlock_t type.
>>> - * write lock count is -1,
>>> - * read lock count > 0
>>> + * write lock is ~0U
>>> + * read lock count >0 && <~0U
>>>   */
>>>  typedef struct {
>>> -       volatile int32_t cnt; /**< -1 Write lock,
>>> -                               > 0 for Read lock. */
>>> +       odp_atomic32_t cnt; /**< == 0: unlocked,
>>> +                                == ~0: locked for write,
>>> +                                > 0 number of concurrent read locks */
>>>  } odp_rwlock_t;
>>>
>>>
>>>  /**
>>> - * Initialize the rwlock to an unlocked state.
>>> + * Initialize the rwlock to the unlocked state.
>>>   *
>>>   * @param rwlock pointer to the RW Lock.
>>>   */
>>> @@ -50,14 +54,14 @@ void odp_rwlock_read_lock(odp_rwlock_t *rwlock);
>>>  void odp_rwlock_read_unlock(odp_rwlock_t *rwlock);
>>>
>>>  /**
>>> - * Aquire a write lock.
>>> + * Acquire the write lock.
>>>   *
>>>   * @param rwlock pointer to a RW Lock.
>>>   */
>>>  void odp_rwlock_write_lock(odp_rwlock_t *rwlock);
>>>
>>>  /**
>>> - * Release a write lock.
>>> + * Release the write lock.
>>>   *
>>>   * @param rwlock pointer to a RW Lock.
>>>   */
>>> diff --git a/platform/linux-generic/include/api/odp_ticketlock.h
>>> b/platform/linux-generic/include/api/odp_ticketlock.h
>>> index 6277a18..c4b5e34 100644
>>> --- a/platform/linux-generic/include/api/odp_ticketlock.h
>>> +++ b/platform/linux-generic/include/api/odp_ticketlock.h
>>> @@ -27,8 +27,8 @@ extern "C" {
>>>   * ODP ticketlock
>>>   */
>>>  typedef struct odp_ticketlock_t {
>>> -       odp_atomic_u32_t  next_ticket; /**< @private Next ticket */
>>> -       volatile uint32_t cur_ticket;  /**< @private Current ticket */
>>> +       odp_atomic32_t next_ticket; /**< @private Next ticket */
>>> +       odp_atomic32_t cur_ticket;  /**< @private Current ticket */
>>>  } odp_ticketlock_t;
>>>
>>>
>>> diff --git a/platform/linux-generic/include/odp_buffer_internal.h
>>> b/platform/linux-generic/include/odp_buffer_internal.h
>>> index 2002b51..530ab96 100644
>>> --- a/platform/linux-generic/include/odp_buffer_internal.h
>>> +++ b/platform/linux-generic/include/odp_buffer_internal.h
>>> @@ -88,7 +88,7 @@ typedef struct odp_buffer_hdr_t {
>>>         uint32_t                 index;      /* buf index in the pool */
>>>         size_t                   size;       /* max data size */
>>>         size_t                   cur_offset; /* current offset */
>>> -       odp_atomic_int_t         ref_count;  /* reference count */
>>> +       odp_atomic32_t           ref_count;  /* reference count */
>>>         odp_buffer_scatter_t     scatter;    /* Scatter/gather list */
>>>         int                      type;       /* type of next header */
>>>         odp_buffer_pool_t        pool_hdl;   /* buffer pool handle */
>>> diff --git a/platform/linux-generic/odp_barrier.c
>>> b/platform/linux-generic/odp_barrier.c
>>> index a82b294..6c3b884 100644
>>> --- a/platform/linux-generic/odp_barrier.c
>>> +++ b/platform/linux-generic/odp_barrier.c
>>> @@ -8,41 +8,48 @@
>>>  #include <odp_sync.h>
>>>  #include <odp_spin_internal.h>
>>>
>>> -void odp_barrier_init_count(odp_barrier_t *barrier, int count)
>>> +void odp_barrier_init(odp_barrier_t *barrier, uint32_t num_threads)
>>>  {
>>> -       barrier->count = count;
>>> -       barrier->bar = 0;
>>> -       odp_sync_stores();
>>> +       barrier->num_threads = num_threads; /* Constant after initialisation */
>>> +       odp_atomic32_store_rls(&barrier->in_barrier, 0);
>>>  }
>>>
>>>  /*
>>>   * Efficient barrier_sync -
>>>   *
>>>   *   Barriers are initialized with a count of the number of callers
>>> - *   that must sync on the barrier before any may proceed.
>>> + *   that must sync on (enter) the barrier before any may proceed (exit).
>>>   *
>>>   *   To avoid race conditions and to permit the barrier to be fully
>>>   *   reusable, the barrier value cycles between 0..2*count-1. When
>>> - *   synchronizing the wasless variable simply tracks which half of
>>> + *   synchronizing the waslow variable simply tracks which half of
>>>   *   the cycle the barrier was in upon entry.  Exit is when the
>>>   *   barrier crosses to the other half of the cycle.
>>>   */
>>>
>>>  void odp_barrier_sync(odp_barrier_t *barrier)
>>>  {
>>> -       int count;
>>> -       int wasless;
>>> +       uint32_t count;
>>> +       bool waslow;
>>>
>>> -       odp_sync_stores();
>>> -       wasless = barrier->bar < barrier->count;
>>> -       count = odp_atomic_fetch_inc_int(&barrier->bar);
>>> +       /* FIXME do we need acquire barrier as well? */
>>> +       /* Increase threads in_barrier count, this will automatically
>>> +        * release the other threads when lower/upper range is switched */
>>> +       count = odp_atomic32_fetch_add_rls(&barrier->in_barrier, 1);
>>> +       /* Compute lower or higher range indicator */
>>> +       waslow = count < barrier->num_threads;
>>>
>>> -       if (count == 2*barrier->count-1) {
>>> -               barrier->bar = 0;
>>> -       } else {
>>> -               while ((barrier->bar < barrier->count) == wasless)
>>> -                       odp_spin();
>>> +       /* Check if in_barrier count has "wrapped" */
>>> +       if (count == 2 * barrier->num_threads - 1) {
>>> +               /* Manually wrap the counter */
>>> +               odp_atomic32_add_rls(&barrier->in_barrier,
>>> +                                    (uint32_t)(-2*(int)barrier->num_threads));
>>> +               /* We don't need to wait below, return immediately */
>>> +               return;
>>> +       }
>>> +       /* Wait for counter to change half */
>>> +       while ((odp_atomic32_load_rlx(&barrier->in_barrier) <
>>> +              barrier->num_threads) == waslow) {
>>> +               odp_spin();
>>>         }
>>> -
>>> -       odp_mem_barrier();
>>>  }
>>> diff --git a/platform/linux-generic/odp_buffer.c
>>> b/platform/linux-generic/odp_buffer.c
>>> index e54e0e7..a5939f3 100644
>>> --- a/platform/linux-generic/odp_buffer.c
>>> +++ b/platform/linux-generic/odp_buffer.c
>>> @@ -73,7 +73,8 @@ int odp_buffer_snprint(char *str, size_t n,
>>> odp_buffer_t buf)
>>>         len += snprintf(&str[len], n-len,
>>>                         "  cur_offset   %zu\n",       hdr->cur_offset);
>>>         len += snprintf(&str[len], n-len,
>>> -                       "  ref_count    %i\n",        hdr->ref_count);
>>> +                       "  ref_count    %u\n",
>>> +                       odp_atomic32_load_rlx(&hdr->ref_count));
>>>         len += snprintf(&str[len], n-len,
>>>                         "  type         %i\n",        hdr->type);
>>>         len += snprintf(&str[len], n-len,
>>> diff --git a/platform/linux-generic/odp_crypto.c
>>> b/platform/linux-generic/odp_crypto.c
>>> index b37ad6b..d9fff10 100644
>>> --- a/platform/linux-generic/odp_crypto.c
>>> +++ b/platform/linux-generic/odp_crypto.c
>>> @@ -26,7 +26,7 @@
>>>  #define MAX_SESSIONS 32
>>>
>>>  typedef struct {
>>> -       odp_atomic_u32_t next;
>>> +       odp_atomic32_t   next;
>>>         uint32_t         max;
>>>         odp_crypto_generic_session_t sessions[0];
>>>  } odp_crypto_global_t;
>>> @@ -58,7 +58,7 @@ odp_crypto_generic_session_t *alloc_session(void)
>>>         uint32_t idx;
>>>         odp_crypto_generic_session_t *session = NULL;
>>>
>>> -       idx = odp_atomic_fetch_inc_u32(&global->next);
>>> +       idx = odp_atomic32_fetch_add_rlx(&global->next, 1);
>>>         if (idx < global->max) {
>>>                 session = &global->sessions[idx];
>>>                 session->index = idx;
>>> diff --git a/platform/linux-generic/odp_queue.c
>>> b/platform/linux-generic/odp_queue.c
>>> index 1318bcd..08c0d29 100644
>>> --- a/platform/linux-generic/odp_queue.c
>>> +++ b/platform/linux-generic/odp_queue.c
>>> @@ -214,8 +214,13 @@ int odp_queue_set_context(odp_queue_t handle, void
>>> *context)
>>>  {
>>>         queue_entry_t *queue;
>>>         queue = queue_to_qentry(handle);
>>> +       /* Setting a new queue context can be viewed as a release operation,
>>> +        * all writes to the context must be observable before the context
>>> +        * is made observable */
>>>         odp_sync_stores();
>>> -       queue->s.param.context = context;
>>> +       queue->s.param.context = context; /* Store-release */
>>> +       /* Ensure queue modification is globally visible before we return
>>> +        * and the application might cause the queue to be scheduled */
>>>         odp_sync_stores();
>>>         return 0;
>>>  }
>>> diff --git a/platform/linux-generic/odp_ring.c
>>> b/platform/linux-generic/odp_ring.c
>>> index 632aa66..d1ec825 100644
>>> --- a/platform/linux-generic/odp_ring.c
>>> +++ b/platform/linux-generic/odp_ring.c
>>> @@ -187,10 +187,10 @@ odph_ring_create(const char *name, unsigned count,
>>> unsigned flags)
>>>                 r->cons.size = count;
>>>                 r->prod.mask = count-1;
>>>                 r->cons.mask = count-1;
>>> -               r->prod.head = 0;
>>> -               r->cons.head = 0;
>>> -               r->prod.tail = 0;
>>> -               r->cons.tail = 0;
>>> +               odp_atomic32_store_rlx(&r->prod.head, 0);
>>> +               odp_atomic32_store_rlx(&r->cons.head, 0);
>>> +               odp_atomic32_store_rlx(&r->prod.tail, 0);
>>> +               odp_atomic32_store_rlx(&r->cons.tail, 0);
>>>
>>>                 TAILQ_INSERT_TAIL(&odp_ring_list, r, next);
>>>         } else {
>>> @@ -227,7 +227,7 @@ int __odph_ring_mp_do_enqueue(odph_ring_t *r, void *
>>> const *obj_table,
>>>         uint32_t prod_head, prod_next;
>>>         uint32_t cons_tail, free_entries;
>>>         const unsigned max = n;
>>> -       int success;
>>> +       bool ok;
>>>         unsigned i;
>>>         uint32_t mask = r->prod.mask;
>>>         int ret;
>>> @@ -237,8 +237,8 @@ int __odph_ring_mp_do_enqueue(odph_ring_t *r, void *
>>> const *obj_table,
>>>                 /* Reset n to the initial burst count */
>>>                 n = max;
>>>
>>> -               prod_head = r->prod.head;
>>> -               cons_tail = r->cons.tail;
>>> +               prod_head = odp_atomic32_load_rlx(&r->prod.head);
>>> +               cons_tail = odp_atomic32_load_acq(&r->cons.tail);
>>>                 /* The subtraction is done between two unsigned 32bits value
>>>                  * (the result is always modulo 32 bits even if we have
>>>                  * prod_head > cons_tail). So 'free_entries' is always between 0
>>> @@ -259,13 +259,13 @@ int __odph_ring_mp_do_enqueue(odph_ring_t *r, void
>>> * const *obj_table,
>>>                 }
>>>
>>>                 prod_next = prod_head + n;
>>> -               success = odp_atomic_cmpset_u32(&r->prod.head, prod_head,
>>> -                                             prod_next);
>>> -       } while (odp_unlikely(success == 0));
>>> +               ok = odp_atomic32_cmp_and_swap_rlx(&r->prod.head,
>>> +                                                  prod_head,
>>> +                                                  prod_next) == prod_head;
>>> +       } while (odp_unlikely(!ok));
>>>
>>>         /* write entries in ring */
>>>         ENQUEUE_PTRS();
>>> -       odp_mem_barrier();
>>>
>>>         /* if we exceed the watermark */
>>>         if (odp_unlikely(((mask + 1) - free_entries + n) > r->prod.watermark)) {
>>> @@ -279,10 +279,10 @@ int __odph_ring_mp_do_enqueue(odph_ring_t *r, void
>>> * const *obj_table,
>>>          * If there are other enqueues in progress that preceeded us,
>>>          * we need to wait for them to complete
>>>          */
>>> -       while (odp_unlikely(r->prod.tail != prod_head))
>>> +       while (odp_unlikely(odp_atomic32_load_rlx(&r->prod.tail) != prod_head))
>>>                 odp_spin();
>>>
>>> -       r->prod.tail = prod_next;
>>> +       odp_atomic32_store_rls(&r->prod.tail, prod_next);
>>>         return ret;
>>>  }
>>>
>>> @@ -298,8 +298,8 @@ int __odph_ring_sp_do_enqueue(odph_ring_t *r, void *
>>> const *obj_table,
>>>         uint32_t mask = r->prod.mask;
>>>         int ret;
>>>
>>> -       prod_head = r->prod.head;
>>> -       cons_tail = r->cons.tail;
>>> +       prod_head = odp_atomic32_load_rlx(&r->prod.head);
>>> +       cons_tail = odp_atomic32_load_acq(&r->cons.tail);
>>>         /* The subtraction is done between two unsigned 32bits value
>>>          * (the result is always modulo 32 bits even if we have
>>>          * prod_head > cons_tail). So 'free_entries' is always between 0
>>> @@ -320,11 +320,10 @@ int __odph_ring_sp_do_enqueue(odph_ring_t *r, void
>>> * const *obj_table,
>>>         }
>>>
>>>         prod_next = prod_head + n;
>>> -       r->prod.head = prod_next;
>>> +       odp_atomic32_store_rlx(&r->prod.head, prod_next);
>>>
>>>         /* write entries in ring */
>>>         ENQUEUE_PTRS();
>>> -       odp_mem_barrier();
>>>
>>>         /* if we exceed the watermark */
>>>         if (odp_unlikely(((mask + 1) - free_entries + n) > r->prod.watermark)) {
>>> @@ -334,7 +333,7 @@ int __odph_ring_sp_do_enqueue(odph_ring_t *r, void *
>>> const *obj_table,
>>>                 ret = (behavior == ODPH_RING_QUEUE_FIXED) ? 0 : n;
>>>         }
>>>
>>> -       r->prod.tail = prod_next;
>>> +       odp_atomic32_store_rls(&r->prod.tail, prod_next);
>>>         return ret;
>>>  }
>>>
>>> @@ -348,7 +347,7 @@ int __odph_ring_mc_do_dequeue(odph_ring_t *r, void
>>> **obj_table,
>>>         uint32_t cons_head, prod_tail;
>>>         uint32_t cons_next, entries;
>>>         const unsigned max = n;
>>> -       int success;
>>> +       bool ok;
>>>         unsigned i;
>>>         uint32_t mask = r->prod.mask;
>>>
>>> @@ -357,8 +356,8 @@ int __odph_ring_mc_do_dequeue(odph_ring_t *r, void
>>> **obj_table,
>>>                 /* Restore n as it may change every loop */
>>>                 n = max;
>>>
>>> -               cons_head = r->cons.head;
>>> -               prod_tail = r->prod.tail;
>>> +               cons_head = odp_atomic32_load_rlx(&r->cons.head);
>>> +               prod_tail = odp_atomic32_load_acq(&r->prod.tail);
>>>                 /* The subtraction is done between two unsigned 32bits value
>>>                  * (the result is always modulo 32 bits even if we have
>>>                  * cons_head > prod_tail). So 'entries' is always between 0
>>> @@ -378,22 +377,22 @@ int __odph_ring_mc_do_dequeue(odph_ring_t *r, void
>>> **obj_table,
>>>                 }
>>>
>>>                 cons_next = cons_head + n;
>>> -               success = odp_atomic_cmpset_u32(&r->cons.head, cons_head,
>>> -                                             cons_next);
>>> -       } while (odp_unlikely(success == 0));
>>> +               ok = odp_atomic32_cmp_and_swap_rlx(&r->cons.head,
>>> +                                                  cons_head,
>>> +                                                  cons_next) == cons_head;
>>> +       } while (odp_unlikely(!ok));
>>>
>>>         /* copy in table */
>>>         DEQUEUE_PTRS();
>>> -       odp_mem_barrier();
>>>
>>>         /*
>>>          * If there are other dequeues in progress that preceded us,
>>>          * we need to wait for them to complete
>>>          */
>>> -       while (odp_unlikely(r->cons.tail != cons_head))
>>> +       while (odp_unlikely(odp_atomic32_load_rlx(&r->cons.tail) != cons_head))
>>>                 odp_spin();
>>>
>>> -       r->cons.tail = cons_next;
>>> +       odp_atomic32_store_rls(&r->cons.tail, cons_next);
>>>
>>>         return behavior == ODPH_RING_QUEUE_FIXED ? 0 : n;
>>>  }
>>> @@ -409,8 +408,8 @@ int __odph_ring_sc_do_dequeue(odph_ring_t *r, void
>>> **obj_table,
>>>         unsigned i;
>>>         uint32_t mask = r->prod.mask;
>>>
>>> -       cons_head = r->cons.head;
>>> -       prod_tail = r->prod.tail;
>>> +       cons_head = odp_atomic32_load_rlx(&r->cons.head);
>>> +       prod_tail = odp_atomic32_load_acq(&r->prod.tail);
>>>         /* The subtraction is done between two unsigned 32bits value
>>>          * (the result is always modulo 32 bits even if we have
>>>          * cons_head > prod_tail). So 'entries' is always between 0
>>> @@ -429,13 +428,12 @@ int __odph_ring_sc_do_dequeue(odph_ring_t *r, void
>>> **obj_table,
>>>         }
>>>
>>>         cons_next = cons_head + n;
>>> -       r->cons.head = cons_next;
>>> +       odp_atomic32_store_rlx(&r->cons.head, cons_next);
>>>
>>>         /* copy in table */
>>>         DEQUEUE_PTRS();
>>> -       odp_mem_barrier();
>>>
>>> -       r->cons.tail = cons_next;
>>> +       odp_atomic32_store_rls(&r->cons.tail, cons_next);
>>>         return behavior == ODPH_RING_QUEUE_FIXED ? 0 : n;
>>>  }
>>>
>>> @@ -482,8 +480,8 @@ int odph_ring_sc_dequeue_bulk(odph_ring_t *r, void
>>> **obj_table, unsigned n)
>>>   */
>>>  int odph_ring_full(const odph_ring_t *r)
>>>  {
>>> -       uint32_t prod_tail = r->prod.tail;
>>> -       uint32_t cons_tail = r->cons.tail;
>>> +       uint32_t prod_tail = odp_atomic32_load_rlx(&r->prod.tail);
>>> +       uint32_t cons_tail = odp_atomic32_load_rlx(&r->cons.tail);
>>>         return (((cons_tail - prod_tail - 1) & r->prod.mask) == 0);
>>>  }
>>>
>>> @@ -492,8 +490,8 @@ int odph_ring_full(const odph_ring_t *r)
>>>   */
>>>  int odph_ring_empty(const odph_ring_t *r)
>>>  {
>>> -       uint32_t prod_tail = r->prod.tail;
>>> -       uint32_t cons_tail = r->cons.tail;
>>> +       uint32_t prod_tail = odp_atomic32_load_rlx(&r->prod.tail);
>>> +       uint32_t cons_tail = odp_atomic32_load_rlx(&r->cons.tail);
>>>         return !!(cons_tail == prod_tail);
>>>  }
>>>
>>> @@ -502,8 +500,8 @@ int odph_ring_empty(const odph_ring_t *r)
>>>   */
>>>  unsigned odph_ring_count(const odph_ring_t *r)
>>>  {
>>> -       uint32_t prod_tail = r->prod.tail;
>>> -       uint32_t cons_tail = r->cons.tail;
>>> +       uint32_t prod_tail = odp_atomic32_load_rlx(&r->prod.tail);
>>> +       uint32_t cons_tail = odp_atomic32_load_rlx(&r->cons.tail);
>>>         return (prod_tail - cons_tail) & r->prod.mask;
>>>  }
>>>
>>> @@ -512,8 +510,8 @@ unsigned odph_ring_count(const odph_ring_t *r)
>>>   */
>>>  unsigned odph_ring_free_count(const odph_ring_t *r)
>>>  {
>>> -       uint32_t prod_tail = r->prod.tail;
>>> -       uint32_t cons_tail = r->cons.tail;
>>> +       uint32_t prod_tail = odp_atomic32_load_rlx(&r->prod.tail);
>>> +       uint32_t cons_tail = odp_atomic32_load_rlx(&r->cons.tail);
>>>         return (cons_tail - prod_tail - 1) & r->prod.mask;
>>>  }
>>>
>>> @@ -523,10 +521,10 @@ void odph_ring_dump(const odph_ring_t *r)
>>>         ODP_DBG("ring <%s>@%p\n", r->name, r);
>>>         ODP_DBG("  flags=%x\n", r->flags);
>>>         ODP_DBG("  size=%"PRIu32"\n", r->prod.size);
>>> -       ODP_DBG("  ct=%"PRIu32"\n", r->cons.tail);
>>> -       ODP_DBG("  ch=%"PRIu32"\n", r->cons.head);
>>> -       ODP_DBG("  pt=%"PRIu32"\n", r->prod.tail);
>>> -       ODP_DBG("  ph=%"PRIu32"\n", r->prod.head);
>>> +       ODP_DBG("  ct=%"PRIu32"\n",
>>> odp_atomic32_load_rlx(&r->cons.tail));
>>> +       ODP_DBG("  ch=%"PRIu32"\n",
>>> odp_atomic32_load_rlx(&r->cons.head));
>>> +       ODP_DBG("  pt=%"PRIu32"\n",
>>> odp_atomic32_load_rlx(&r->prod.tail));
>>> +       ODP_DBG("  ph=%"PRIu32"\n",
>>> odp_atomic32_load_rlx(&r->prod.head));
>>>         ODP_DBG("  used=%u\n", odph_ring_count(r));
>>>         ODP_DBG("  avail=%u\n", odph_ring_free_count(r));
>>>         if (r->prod.watermark == r->prod.size)
>>> diff --git a/platform/linux-generic/odp_rwlock.c
>>> b/platform/linux-generic/odp_rwlock.c
>>> index 11c8dd7..ba0a7ca 100644
>>> --- a/platform/linux-generic/odp_rwlock.c
>>> +++ b/platform/linux-generic/odp_rwlock.c
>>> @@ -4,58 +4,56 @@
>>>   * SPDX-License-Identifier:     BSD-3-Clause
>>>   */
>>>
>>> +#include <stdbool.h>
>>>  #include <odp_atomic.h>
>>>  #include <odp_rwlock.h>
>>> -
>>>  #include <odp_spin_internal.h>
>>>
>>>  void odp_rwlock_init(odp_rwlock_t *rwlock)
>>>  {
>>> -       rwlock->cnt = 0;
>>> +       odp_atomic32_store_rlx(&rwlock->cnt, 0);
>>>  }
>>>
>>>  void odp_rwlock_read_lock(odp_rwlock_t *rwlock)
>>>  {
>>> -       int32_t cnt;
>>> -       int  is_locked = 0;
>>> -
>>> -       while (is_locked == 0) {
>>> -               cnt = rwlock->cnt;
>>> +       bool gotit;
>>> +       do {
>>> +               uint32_t cnt = odp_atomic32_load_acq(&rwlock->cnt);
>>>                 /* waiting for read lock */
>>> -               if (cnt < 0) {
>>> +               if ((int32_t)cnt < 0) {
>>>                         odp_spin();
>>>                         continue;
>>>                 }
>>> -               is_locked = odp_atomic_cmpset_u32(
>>> -                                       (volatile uint32_t *)&rwlock->cnt,
>>> -                                             cnt, cnt + 1);
>>> -       }
>>> +               /* Attempt to take another read lock */
>>> +               gotit = odp_atomic32_cmp_and_swap_rlx(&rwlock->cnt,
>>> +                                                     cnt, cnt + 1) == cnt;
>>> +       } while (!gotit);
>>>  }
>>>
>>>  void odp_rwlock_read_unlock(odp_rwlock_t *rwlock)
>>>  {
>>> -       odp_atomic_dec_u32((odp_atomic_u32_t *)(intptr_t)&rwlock->cnt);
>>> +       /* Release one read lock by subtracting 1 */
>>> +       odp_atomic32_add_rls(&rwlock->cnt, (uint32_t)-1);
>>>  }
>>>
>>>  void odp_rwlock_write_lock(odp_rwlock_t *rwlock)
>>>  {
>>> -       int32_t cnt;
>>> -       int is_locked = 0;
>>> -
>>> -       while (is_locked == 0) {
>>> -               cnt = rwlock->cnt;
>>> -               /* lock aquired, wait */
>>> +       bool gotit;
>>> +       do {
>>> +               uint32_t cnt = odp_atomic32_load_acq(&rwlock->cnt);
>>>                 if (cnt != 0) {
>>> +                       /* Lock is busy */
>>>                         odp_spin();
>>>                         continue;
>>>                 }
>>> -               is_locked = odp_atomic_cmpset_u32(
>>> -                                       (volatile uint32_t *)&rwlock->cnt,
>>> -                                             0, -1);
>>> -       }
>>> +               /* Attempt to take write lock */
>>> +               gotit = odp_atomic32_cmp_and_swap_rlx(&rwlock->cnt, 0,
>>> +                                                     (uint32_t)-1) == 0;
>>> +       } while (!gotit);
>>>  }
>>>
>>>  void odp_rwlock_write_unlock(odp_rwlock_t *rwlock)
>>>  {
>>> -       odp_atomic_inc_u32((odp_atomic_u32_t *)(intptr_t)&rwlock->cnt);
>>> +       /* Release the write lock by adding 1 */
>>> +       odp_atomic32_add_rls(&rwlock->cnt, 1);
>>>  }
>>> diff --git a/platform/linux-generic/odp_thread.c
>>> b/platform/linux-generic/odp_thread.c
>>> index b869b27..569b235 100644
>>> --- a/platform/linux-generic/odp_thread.c
>>> +++ b/platform/linux-generic/odp_thread.c
>>> @@ -31,7 +31,7 @@ typedef struct {
>>>
>>>  typedef struct {
>>>         thread_state_t   thr[ODP_CONFIG_MAX_THREADS];
>>> -       odp_atomic_int_t num;
>>> +       odp_atomic32_t   num;
>>>
>>>  } thread_globals_t;
>>>
>>> @@ -67,7 +67,7 @@ static int thread_id(void)
>>>         int id;
>>>         int cpu;
>>>
>>> -       id = odp_atomic_fetch_add_int(&thread_globals->num, 1);
>>> +       id = (int)odp_atomic32_fetch_add_rlx(&thread_globals->num, 1);
>>>
>>>         if (id >= ODP_CONFIG_MAX_THREADS) {
>>>                 ODP_ERR("Too many threads\n");
>>> @@ -77,7 +77,7 @@ static int thread_id(void)
>>>         cpu = sched_getcpu();
>>>
>>>         if (cpu < 0) {
>>> -               ODP_ERR("getcpu failed\n");
>>> +               ODP_ERR("sched_getcpu failed\n");
>>>                 return -1;
>>>         }
>>>
>>> diff --git a/platform/linux-generic/odp_ticketlock.c
>>> b/platform/linux-generic/odp_ticketlock.c
>>> index be5b885..cadc0e0 100644
>>> --- a/platform/linux-generic/odp_ticketlock.c
>>> +++ b/platform/linux-generic/odp_ticketlock.c
>>> @@ -12,9 +12,8 @@
>>>
>>>  void odp_ticketlock_init(odp_ticketlock_t *ticketlock)
>>>  {
>>> -       ticketlock->next_ticket = 0;
>>> -       ticketlock->cur_ticket  = 0;
>>> -       odp_sync_stores();
>>> +       odp_atomic32_store_rlx(&ticketlock->next_ticket, 0);
>>> +       odp_atomic32_store_rlx(&ticketlock->cur_ticket, 0);
>>>  }
>>>
>>>
>>> @@ -22,30 +21,14 @@ void odp_ticketlock_lock(odp_ticketlock_t
>>> *ticketlock)
>>>  {
>>>         uint32_t ticket;
>>>
>>> -       ticket = odp_atomic_fetch_inc_u32(&ticketlock->next_ticket);
>>> +       ticket = odp_atomic32_fetch_add_rlx(&ticketlock->next_ticket, 1);
>>>
>>> -       while (ticket != ticketlock->cur_ticket)
>>> +       while (ticket != odp_atomic32_load_acq(&ticketlock->cur_ticket))
>>>                 odp_spin();
>>> -
>>> -       odp_mem_barrier();
>>>  }
>>>
>>>
>>>  void odp_ticketlock_unlock(odp_ticketlock_t *ticketlock)
>>>  {
>>> -       odp_sync_stores();
>>> -
>>> -       ticketlock->cur_ticket++;
>>> -
>>> -#if defined __OCTEON__
>>> -       odp_sync_stores();
>>> -#else
>>> -       odp_mem_barrier();
>>> -#endif
>>> -}
>>> -
>>> -
>>> -int odp_ticketlock_is_locked(odp_ticketlock_t *ticketlock)
>>> -{
>>> -       return ticketlock->cur_ticket != ticketlock->next_ticket;
>>> +       odp_atomic32_add_rls(&ticketlock->cur_ticket, 1);
>>>  }
>>> diff --git a/platform/linux-generic/odp_timer.c
>>> b/platform/linux-generic/odp_timer.c
>>> index 313c713..938429f 100644
>>> --- a/platform/linux-generic/odp_timer.c
>>> +++ b/platform/linux-generic/odp_timer.c
>>> @@ -32,8 +32,8 @@ typedef struct {
>>>
>>>  typedef struct {
>>>         int               allocated;
>>> -       volatile int      active;
>>> -       volatile uint64_t cur_tick;
>>> +       odp_atomic32_t    active;
>>> +       odp_atomic64_t    cur_tick;
>>>         timer_t           timerid;
>>>         odp_timer_t       timer_hdl;
>>>         odp_buffer_pool_t pool;
>>> @@ -150,16 +150,14 @@ static void notify_function(union sigval sigval)
>>>
>>>         timer = sigval.sival_ptr;
>>>
>>> -       if (timer->active == 0) {
>>> +       if (odp_atomic32_load_rlx(&timer->active) == 0) {
>>>                 ODP_DBG("Timer (%u) not active\n", timer->timer_hdl);
>>>                 return;
>>>         }
>>>
>>>         /* ODP_DBG("Tick\n"); */
>>>
>>> -       cur_tick = timer->cur_tick++;
>>> -
>>> -       odp_sync_stores();
>>> +       cur_tick = odp_atomic64_fetch_add_rlx(&timer->cur_tick, 1);
>>>
>>>         tick = &timer->tick[cur_tick % MAX_TICKS];
>>>
>>> @@ -318,8 +316,7 @@ odp_timer_t odp_timer_create(const char *name,
>>> odp_buffer_pool_t pool,
>>>                 timer->tick[i].list = NULL;
>>>         }
>>>
>>> -       timer->active = 1;
>>> -       odp_sync_stores();
>>> +       odp_atomic32_store_rls(&timer->active, 1);
>>>
>>>         timer_start(timer);
>>>
>>> @@ -340,7 +337,7 @@ odp_timer_tmo_t odp_timer_absolute_tmo(odp_timer_t
>>> timer_hdl, uint64_t tmo_tick,
>>>         id = (int)timer_hdl - 1;
>>>         timer = &odp_timer.timer[id];
>>>
>>> -       cur_tick = timer->cur_tick;
>>> +       cur_tick = odp_atomic64_load_rlx(&timer->cur_tick);
>>>         if (tmo_tick <= cur_tick) {
>>>                 ODP_DBG("timeout too close\n");
>>>                 return ODP_TIMER_TMO_INVALID;
>>> @@ -416,7 +413,7 @@ uint64_t odp_timer_current_tick(odp_timer_t
>>> timer_hdl)
>>>         uint32_t id;
>>>
>>>         id = timer_hdl - 1;
>>> -       return odp_timer.timer[id].cur_tick;
>>> +       return odp_atomic64_load_rlx(&odp_timer.timer[id].cur_tick);
>>>  }
>>>
>>>  odp_timeout_t odp_timeout_from_buffer(odp_buffer_t buf)
>>> diff --git a/test/api_test/odp_atomic_test.c
>>> b/test/api_test/odp_atomic_test.c
>>> index 9019d4f..4d27b32 100644
>>> --- a/test/api_test/odp_atomic_test.c
>>> +++ b/test/api_test/odp_atomic_test.c
>>> @@ -10,17 +10,14 @@
>>>  #include <odp_common.h>
>>>  #include <odp_atomic_test.h>
>>>
>>> -static odp_atomic_int_t a32;
>>> -static odp_atomic_u32_t a32u;
>>> -static odp_atomic_u64_t a64u;
>>> +static odp_atomic32_t a32u;
>>> +static odp_atomic64_t a64u;
>>>
>>> -static odp_atomic_int_t numthrds;
>>> +static odp_barrier_t barrier;
>>>
>>>  static const char * const test_name[] = {
>>>         "dummy",
>>>         "test atomic basic ops add/sub/inc/dec",
>>> -       "test atomic inc/dec of signed word",
>>> -       "test atomic add/sub of signed word",
>>>         "test atomic inc/dec of unsigned word",
>>>         "test atomic add/sub of unsigned word",
>>>         "test atomic inc/dec of unsigned double word",
>>> @@ -31,39 +28,29 @@ static struct timeval tv0[MAX_WORKERS],
>>> tv1[MAX_WORKERS];
>>>
>>>  static void usage(void)
>>>  {
>>> -       printf("\n./odp_atomic -t <testcase> -n <num of pthread>,\n\n"
>>> +       printf("\n./odp_atomic -t <testcase> -n <num of threads>\n\n"
>>>                "\t<testcase> is\n"
>>>                "\t\t1 - Test mix(does inc,dec,add,sub on 32/64 bit)\n"
>>> -              "\t\t2 - Test inc dec of signed word\n"
>>> -              "\t\t3 - Test add sub of signed word\n"
>>> -              "\t\t4 - Test inc dec of unsigned word\n"
>>> -              "\t\t5 - Test add sub of unsigned word\n"
>>> -              "\t\t6 - Test inc dec of double word\n"
>>> -              "\t\t7 - Test add sub of double word\n"
>>> -              "\t<num of pthread> is optional\n"
>>> -              "\t\t<1 - 31> - no of pthreads to start\n"
>>> +              "\t\t2 - Test inc dec of unsigned word\n"
>>> +              "\t\t3 - Test add sub of unsigned word\n"
>>> +              "\t\t4 - Test inc dec of double word\n"
>>> +              "\t\t5 - Test add sub of double word\n"
>>> +              "\t<num of threads> is optional\n"
>>> +              "\t\t<1 - 31> - no of threads to start\n"
>>>                "\t\tif user doesn't specify this option, then\n"
>>> -              "\t\tno of pthreads created is equivalent to no of cores\n"
>>> +              "\t\tno of threads created is equivalent to no of cores\n"
>>>                "\t\tavailable in the system\n"
>>>                "\tExample usage:\n"
>>>                "\t\t./odp_atomic -t 2\n"
>>>                "\t\t./odp_atomic -t 3 -n 12\n");
>>>  }
>>>
>>> -void test_atomic_inc_32(void)
>>> -{
>>> -       int i;
>>> -
>>> -       for (i = 0; i < CNT; i++)
>>> -               odp_atomic_inc_int(&a32);
>>> -}
>>> -
>>>  void test_atomic_inc_u32(void)
>>>  {
>>>         int i;
>>>
>>>         for (i = 0; i < CNT; i++)
>>> -               odp_atomic_inc_u32(&a32u);
>>> +               odp_atomic32_add_rlx(&a32u, 1);
>>>  }
>>>
>>>  void test_atomic_inc_64(void)
>>> @@ -71,15 +58,7 @@ void test_atomic_inc_64(void)
>>>         int i;
>>>
>>>         for (i = 0; i < CNT; i++)
>>> -               odp_atomic_inc_u64(&a64u);
>>> -}
>>> -
>>> -void test_atomic_dec_32(void)
>>> -{
>>> -       int i;
>>> -
>>> -       for (i = 0; i < CNT; i++)
>>> -               odp_atomic_dec_int(&a32);
>>> +               odp_atomic64_add_rlx(&a64u, 1);
>>>  }
>>>
>>>  void test_atomic_dec_u32(void)
>>> @@ -87,7 +66,7 @@ void test_atomic_dec_u32(void)
>>>         int i;
>>>
>>>         for (i = 0; i < CNT; i++)
>>> -               odp_atomic_dec_u32(&a32u);
>>> +               odp_atomic32_add_rlx(&a32u, (uint32_t)-1);
>>>  }
>>>
>>>  void test_atomic_dec_64(void)
>>> @@ -95,15 +74,7 @@ void test_atomic_dec_64(void)
>>>         int i;
>>>
>>>         for (i = 0; i < CNT; i++)
>>> -               odp_atomic_dec_u64(&a64u);
>>> -}
>>> -
>>> -void test_atomic_add_32(void)
>>> -{
>>> -       int i;
>>> -
>>> -       for (i = 0; i < (CNT / ADD_SUB_CNT); i++)
>>> -               odp_atomic_fetch_add_int(&a32, ADD_SUB_CNT);
>>> +               odp_atomic64_add_rlx(&a64u, (uint64_t)-1);
>>>  }
>>>
>>>  void test_atomic_add_u32(void)
>>> @@ -111,7 +82,7 @@ void test_atomic_add_u32(void)
>>>         int i;
>>>
>>>         for (i = 0; i < (CNT / ADD_SUB_CNT); i++)
>>> -               odp_atomic_fetch_add_u32(&a32u, ADD_SUB_CNT);
>>> +               odp_atomic32_fetch_add_rlx(&a32u, ADD_SUB_CNT);
>>>  }
>>>
>>>  void test_atomic_add_64(void)
>>> @@ -119,15 +90,7 @@ void test_atomic_add_64(void)
>>>         int i;
>>>
>>>         for (i = 0; i < (CNT / ADD_SUB_CNT); i++)
>>> -               odp_atomic_fetch_add_u64(&a64u, ADD_SUB_CNT);
>>> -}
>>> -
>>> -void test_atomic_sub_32(void)
>>> -{
>>> -       int i;
>>> -
>>> -       for (i = 0; i < (CNT / ADD_SUB_CNT); i++)
>>> -               odp_atomic_fetch_sub_int(&a32, ADD_SUB_CNT);
>>> +               odp_atomic64_fetch_add_rlx(&a64u, ADD_SUB_CNT);
>>>  }
>>>
>>>  void test_atomic_sub_u32(void)
>>> @@ -135,7 +98,7 @@ void test_atomic_sub_u32(void)
>>>         int i;
>>>
>>>         for (i = 0; i < (CNT / ADD_SUB_CNT); i++)
>>> -               odp_atomic_fetch_sub_u32(&a32u, ADD_SUB_CNT);
>>> +               odp_atomic32_fetch_add_rlx(&a32u, -ADD_SUB_CNT);
>>>  }
>>>
>>>  void test_atomic_sub_64(void)
>>> @@ -143,19 +106,7 @@ void test_atomic_sub_64(void)
>>>         int i;
>>>
>>>         for (i = 0; i < (CNT / ADD_SUB_CNT); i++)
>>> -               odp_atomic_fetch_sub_u64(&a64u, ADD_SUB_CNT);
>>> -}
>>> -
>>> -void test_atomic_inc_dec_32(void)
>>> -{
>>> -       test_atomic_inc_32();
>>> -       test_atomic_dec_32();
>>> -}
>>> -
>>> -void test_atomic_add_sub_32(void)
>>> -{
>>> -       test_atomic_add_32();
>>> -       test_atomic_sub_32();
>>> +               odp_atomic64_fetch_add_rlx(&a64u, -ADD_SUB_CNT);
>>>  }
>>>
>>>  void test_atomic_inc_dec_u32(void)
>>> @@ -188,11 +139,6 @@ void test_atomic_add_sub_64(void)
>>>   */
>>>  void test_atomic_basic(void)
>>>  {
>>> -       test_atomic_inc_32();
>>> -       test_atomic_dec_32();
>>> -       test_atomic_add_32();
>>> -       test_atomic_sub_32();
>>> -
>>>         test_atomic_inc_u32();
>>>         test_atomic_dec_u32();
>>>         test_atomic_add_u32();
>>> @@ -206,31 +152,24 @@ void test_atomic_basic(void)
>>>
>>>  void test_atomic_init(void)
>>>  {
>>> -       odp_atomic_init_int(&a32);
>>> -       odp_atomic_init_u32(&a32u);
>>> -       odp_atomic_init_u64(&a64u);
>>> +       odp_atomic32_store_rlx(&a32u, 0);
>>> +       odp_atomic64_store_rlx(&a64u, 0);
>>>  }
>>>
>>>  void test_atomic_store(void)
>>>  {
>>> -       odp_atomic_store_int(&a32, S32_INIT_VAL);
>>> -       odp_atomic_store_u32(&a32u, U32_INIT_VAL);
>>> -       odp_atomic_store_u64(&a64u, U64_INIT_VAL);
>>> +       odp_atomic32_store_rlx(&a32u, U32_INIT_VAL);
>>> +       odp_atomic64_store_rlx(&a64u, U64_INIT_VAL);
>>>  }
>>>
>>>  int test_atomic_validate(void)
>>>  {
>>> -       if (odp_atomic_load_int(&a32) != S32_INIT_VAL) {
>>> -               ODP_ERR("Atomic signed 32 usual functions failed\n");
>>> -               return -1;
>>> -       }
>>> -
>>> -       if (odp_atomic_load_u32(&a32u) != U32_INIT_VAL) {
>>> +       if (odp_atomic32_load_rlx(&a32u) != U32_INIT_VAL) {
>>>                 ODP_ERR("Atomic u32 usual functions failed\n");
>>>                 return -1;
>>>         }
>>>
>>> -       if (odp_atomic_load_u64(&a64u) != U64_INIT_VAL) {
>>> +       if (odp_atomic64_load_rlx(&a64u) != U64_INIT_VAL) {
>>>                 ODP_ERR("Atomic u64 usual functions failed\n");
>>>                 return -1;
>>>         }
>>> @@ -247,11 +186,8 @@ static void *run_thread(void *arg)
>>>
>>>         ODP_DBG("Thread %i starts\n", thr);
>>>
>>> -       odp_atomic_inc_int(&numthrds);
>>> -
>>> -       /* Wait here until all pthreads are created */
>>> -       while (*(volatile int *)&numthrds < parg->numthrds)
>>> -               ;
>>> +       /* Wait here until all threads have arrived */
>>> +       odp_barrier_sync(&barrier);
>>>
>>>         gettimeofday(&tv0[thr], NULL);
>>>
>>> @@ -259,12 +195,6 @@ static void *run_thread(void *arg)
>>>         case TEST_MIX:
>>>                 test_atomic_basic();
>>>                 break;
>>> -       case TEST_INC_DEC_S32:
>>> -               test_atomic_inc_dec_32();
>>> -               break;
>>> -       case TEST_ADD_SUB_S32:
>>> -               test_atomic_add_sub_32();
>>> -               break;
>>>         case TEST_INC_DEC_U32:
>>>                 test_atomic_inc_dec_u32();
>>>                 break;
>>> @@ -327,7 +257,6 @@ int main(int argc, char *argv[])
>>>         if (pthrdnum == 0)
>>>                 pthrdnum = odp_sys_core_count();
>>>
>>> -       odp_atomic_init_int(&numthrds);
>>>         test_atomic_init();
>>>         test_atomic_store();
>>>
>>> @@ -342,6 +271,7 @@ int main(int argc, char *argv[])
>>>                 usage();
>>>                 goto err_exit;
>>>         }
>>> +       odp_barrier_init(&barrier, pthrdnum);
>>>         odp_test_thread_create(run_thread, &thrdarg);
>>>
>>>         odp_test_thread_exit(&thrdarg);
>>> diff --git a/test/api_test/odp_atomic_test.h
>>> b/test/api_test/odp_atomic_test.h
>>> index 7814da5..aaa9d34 100644
>>> --- a/test/api_test/odp_atomic_test.h
>>> +++ b/test/api_test/odp_atomic_test.h
>>> @@ -18,14 +18,11 @@
>>>  #define ADD_SUB_CNT    5
>>>
>>>  #define        CNT 500000
>>> -#define        S32_INIT_VAL    (1UL << 10)
>>>  #define        U32_INIT_VAL    (1UL << 10)
>>>  #define        U64_INIT_VAL    (1ULL << 33)
>>>
>>>  typedef enum {
>>>         TEST_MIX = 1, /* Must be first test case num */
>>> -       TEST_INC_DEC_S32,
>>> -       TEST_ADD_SUB_S32,
>>>         TEST_INC_DEC_U32,
>>>         TEST_ADD_SUB_U32,
>>>         TEST_INC_DEC_64,
>>> @@ -34,16 +31,10 @@ typedef enum {
>>>  } odp_test_atomic_t;
>>>
>>>
>>> -void test_atomic_inc_dec_32(void);
>>> -void test_atomic_add_sub_32(void);
>>>  void test_atomic_inc_dec_u32(void);
>>>  void test_atomic_add_sub_u32(void);
>>>  void test_atomic_inc_dec_64(void);
>>>  void test_atomic_add_sub_64(void);
>>> -void test_atomic_inc_32(void);
>>> -void test_atomic_dec_32(void);
>>> -void test_atomic_add_32(void);
>>> -void test_atomic_sub_32(void);
>>>  void test_atomic_inc_u32(void);
>>>  void test_atomic_dec_u32(void);
>>>  void test_atomic_add_u32(void);
>>> --
>>> 1.9.1
>>>
>>>
>>>
>>
>>
>>
>>
>
>
> --
> *Mike Holmes*
> Linaro  Sr Technical Manager
> LNG - ODP
>
Savolainen, Petri (NSN - FI/Espoo) Oct. 16, 2014, 9:57 a.m. UTC | #5
Hi,

I think we don’t need to specify these in three different versions. It should be enough that odp_atomic_xxx is defined as “relaxed”, as it's like that already today, since we don’t promise (in API documentation) to include memory barriers into those calls. I think "relaxed" is the common case for _applications_, which would mainly modify counters through this API – and not implement synchronization data structures (like the ticketlock). If an ODP _implementation_ or _application platform_ implements such a data structure, it’s not a huge overhead to put those odp_sync_stores or compiler memory barriers there when needed. Applications would mainly use those (in thousands of places), but they would be implemented only once (in a few places).


Why not just change this …

/**
 * Fetch and add atomic uint32
 *
 * @param ptr    An atomic variable
 * @param value  A value to be added to the variable
 *
 * @return Value of the variable before the operation
 */
static inline uint32_t odp_atomic_fetch_add_u32(odp_atomic_u32_t *ptr,
						uint32_t value)
{
	return __sync_fetch_and_add(ptr, value);
}

… into this …

static inline uint32_t odp_atomic_fetch_add_u32(odp_atomic_u32_t *ptr,
						uint32_t value)
{
	return __atomic_fetch_add(ptr, value, __ATOMIC_RELAXED);
}
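
(For reference, the __atomic builtins with an explicit memory order argument are available from GCC 4.7 onwards.)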


-Petri



From: lng-odp-bounces@lists.linaro.org [mailto:lng-odp-bounces@lists.linaro.org] On Behalf Of ext Ola Liljedahl

Sent: Thursday, October 16, 2014 11:42 AM
To: Bill Fischofer
Cc: lng-odp-forward
Subject: Re: [lng-odp] [ODP/PATCH v1] Look ma, no barriers! C11 memory model

These suffixes stand for relaxed, release and acquire. They are important concepts in C11/C++11 atomics and memory models. It is all about what (observable) ordering is required in multithreaded (multiprocessor) environments, happens-before and happens-after relationships.

A relaxed access is independent of all other accesses and needs no synchronization.
An acquire access denotes some type of shared resource acquisition. Loads and stores after the acquire load must be prevented from moving up (either by the compiler or by the HW); this is a half-sided barrier. Loads and stores from before the acquire are allowed to move down.
A release access denotes the release of a shared resource. Loads and stores before the release store must be prevented from moving down (either by the compiler or by the HW); this is also a half-sided barrier. Loads and stores after the release are allowed to move up.

Code that uses atomic variables (e.g. for implementing shared memory data structures such as locks and rings) must know which type of atomic operation is required. The ODP ticket lock implementation makes a good example:
void odp_ticketlock_lock(odp_ticketlock_t *ticketlock)
{
        uint32_t ticket;

        ticket = odp_atomic32_fetch_add_rlx(&ticketlock->next_ticket, 1);   //the ticket counter does not protect anything so incrementing it can be relaxed

        while (ticket != odp_atomic32_load_acq(&ticketlock->cur_ticket))    //acquiring the currently served position will include (a half-sided) barrier so as to contain accesses from inside the critical section
                odp_spin();
}


void odp_ticketlock_unlock(odp_ticketlock_t *ticketlock)
{
        odp_atomic32_add_rls(&ticketlock->cur_ticket, 1);  //releasing the currently served position will also include (a half-sided) barrier to contain the accesses from inside the critical section
}

Implementations may use barriers of some kind inside these primitive atomic operations. Some architectures don't even need explicit barriers as they have memory access instructions (e.g. load and store) with acquire and release semantics. Full barriers are heavy (and semantically overkill) and you want to avoid them if possible. Using full barriers for updates to e.g. global statistics counters will hurt performance; such updates can be relaxed (they still need to be atomic of course).
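
For example, a relaxed statistics counter update needs nothing more than this (a minimal sketch using the GCC/Clang __atomic builtins, not code from the patch):

#include <stdint.h>

static uint64_t pkts_received; /* shared statistics counter */

static inline void stats_inc(void)
{
        /* Atomic update, but no ordering constraints and no barriers */
        __atomic_fetch_add(&pkts_received, 1, __ATOMIC_RELAXED);
}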

See these two good presentations by Herb Sutter of the C++ standards committee.
http://channel9.msdn.com/Shows/Going+Deep/Cpp-and-Beyond-2012-Herb-Sutter-atomic-Weapons-1-of-2
http://channel9.msdn.com/Shows/Going+Deep/Cpp-and-Beyond-2012-Herb-Sutter-atomic-Weapons-2-of-2

On 16 October 2014 01:18, Bill Fischofer <bill.fischofer@linaro.org> wrote:
Why the odd suffixes (_rlx, _rls, _acq) on these routines?  If they are intended to be generic wouldn't omitting these be better?
Ola Liljedahl Oct. 16, 2014, 10:20 a.m. UTC | #6
I think the API needs to be more explicit about what ordering guarantees are
made. As the original implementation is sequentially consistent, I think
there is a big risk of applications relying on this behavior without
knowing; SC is also what a naive programmer expects. Just modifying the
existing implementation is not enough in my opinion. ODP is a library for
multithreaded programming; pretending that single-threaded SC semantics are
all the user has to care about is wrong.

I was considering splitting the API into a private part (with all the
acquire/release functions that are used for higher-order synchronization
functionality) and a public part (basically 32-bit and 64-bit counter
support using the relaxed memory model). But I think applications can also
benefit from the acquire/release (and relaxed) functions: applications
might implement shared memory data structures themselves, not using e.g.
ODP locks. Think lock-less hash tables. There is no reason to limit
innovation in shared memory data structures and multithreaded programming
to the inside of ODP.
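
As an illustration, publishing a record to other threads with the proposed functions could look like this (a hypothetical sketch; 'rec', compute() and use_record() are made up, only the odp_atomic32_* calls and odp_spin() come from this patch):

/* Producer: fill in the record, then publish it with a release store */
rec->data = compute();                  /* plain stores to the record */
odp_atomic32_store_rls(&rec->ready, 1); /* earlier stores made visible first */

/* Consumer: the acquire load pairs with the release store */
while (odp_atomic32_load_acq(&rec->ready) == 0)
        odp_spin();
use_record(rec->data);                  /* guaranteed to see the stores above */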

It could still make sense to have a dedicated header file for e.g. atomic
statistics counters (odp_counters.h?) and not mix them with the generic
atomics support. But the definitions in odp_counters.h would just refer to
odp_atomic.h, so I am not sure if it is worth the hassle.

We are not allowed to use C11 features in the ODP linux-generic
implementation (C99 only); I brought up this issue recently but no change
was approved. When this restriction is lifted, we can change the
implementation to use C11 atomics directly.
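
For reference, the relaxed counter case would then map directly onto <stdatomic.h> (a minimal sketch of standard C11 usage, not code from this patch):

#include <stdatomic.h>
#include <stdint.h>

static _Atomic uint64_t seq; /* e.g. a statistics counter */

static inline uint64_t seq_fetch_inc(void)
{
        /* Atomic but unordered, like odp_atomic64_fetch_add_rlx() */
        return atomic_fetch_add_explicit(&seq, 1, memory_order_relaxed);
}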

-- Ola


On 16 October 2014 11:57, Savolainen, Petri (NSN - FI/Espoo) <
petri.savolainen@nsn.com> wrote:

> Hi,
>
> I think we don’t need to specify these in three different versions. It
> should be enough that odp_atomic_xxx is defined as “relaxed”, as it's like
> that already today, since we don’t promise (in API documentation) to
> include memory barriers into those calls. I think "relaxed" is the common
> case for _applications_, which would mainly modify counters through this
> API – and not implement synchronization data structures (like the
> ticketlock). If an ODP _implementation_ or _application platform_ implements
> such a data structure, it’s not a huge overhead to put those odp_sync_stores
> or compiler memory barriers there when needed. Applications would mainly use
> those (in thousands of places), but they would be implemented only once
> (in a few places).
>
>
> Why not just change this …
>
> /**
>  * Fetch and add atomic uint32
>  *
>  * @param ptr    An atomic variable
>  * @param value  A value to be added to the variable
>  *
>  * @return Value of the variable before the operation
>  */
> static inline uint32_t odp_atomic_fetch_add_u32(odp_atomic_u32_t *ptr,
>                                                 uint32_t value)
> {
>         return __sync_fetch_and_add(ptr, value);
> }
>
> … into this …
>
> static inline uint32_t odp_atomic_fetch_add_u32(odp_atomic_u32_t *ptr,
>                                                 uint32_t value)
> {
>         return __atomic_fetch_add(ptr, value, __ATOMIC_RELAXED);
> }
>
>
> -Petri
>
>
>
> From: lng-odp-bounces@lists.linaro.org [mailto:
> lng-odp-bounces@lists.linaro.org] On Behalf Of ext Ola Liljedahl
> Sent: Thursday, October 16, 2014 11:42 AM
> To: Bill Fischofer
> Cc: lng-odp-forward
> Subject: Re: [lng-odp] [ODP/PATCH v1] Look ma, no barriers! C11 memory
> model
>
> These suffixes stand for relaxed, release and acquire. They are important
> concepts in C11/C++11 atomics and memory models. It is all about what
> (observable) ordering is required in multithreaded (multiprocessor)
> environments, happens-before and happens-after relationships.
>
> A relaxed access is independent of all other accesses and needs no
> synchronization.
> An acquire access denotes some type of shared resource acquisition. Loads
> and stores after the acquire load must be prevented from moving up (either
> by the compiler or by the HW); this is a half-sided barrier. Loads and
> stores from before the acquire are allowed to move down.
> A release access denotes the release of a shared resource. Loads and stores
> before the release store must be prevented from moving down (either by the
> compiler or by the HW); this is also a half-sided barrier. Loads and stores
> after the release are allowed to move up.
>
> Code that uses atomic variables (e.g. for implementing shared memory data
> structures such as locks and rings) must know which type of atomic
> operation is required. The ODP ticket lock implementation makes a good
> example:
> void odp_ticketlock_lock(odp_ticketlock_t *ticketlock)
> {
>         uint32_t ticket;
>
>         ticket = odp_atomic32_fetch_add_rlx(&ticketlock->next_ticket, 1);
>   //the ticket counter does not protect anything so incrementing it can be
> relaxed
>
>         while (ticket != odp_atomic32_load_acq(&ticketlock->cur_ticket))
>  //acquiring the currently served position will include (a half-sided)
> barrier so as to contain accesses from inside the critical section
>                 odp_spin();
> }
>
>
> void odp_ticketlock_unlock(odp_ticketlock_t *ticketlock)
> {
>         odp_atomic32_add_rls(&ticketlock->cur_ticket, 1);  //releasing the
> currently served position will also include (a half-sided) barrier to
> contain the accesses from inside the critical section
> }
>
> Implementations may use barriers of some kind inside these primitive
> atomic operations. Some architectures don't even need explicit barriers as
> they have memory access instructions (e.g. load and store) with acquire and
> release semantics. Full barriers are heavy (and semantically overkill)
> and you want to avoid them if possible. Using full barriers for updates to
> e.g. global statistics counters will hurt performance; such updates can
> be relaxed (they still need to be atomic of course).
>
> See these two good presentations by Herb Sutter of the C++ standards
> committee.
>
> http://channel9.msdn.com/Shows/Going+Deep/Cpp-and-Beyond-2012-Herb-Sutter-atomic-Weapons-1-of-2
>
> http://channel9.msdn.com/Shows/Going+Deep/Cpp-and-Beyond-2012-Herb-Sutter-atomic-Weapons-2-of-2
>
> On 16 October 2014 01:18, Bill Fischofer <bill.fischofer@linaro.org>
> wrote:
> Why the odd suffixes (_rlx, _rls, _acq) on these routines?  If they are
> intended to be generic wouldn't omitting these be better?
>
>
>
Mike Holmes Oct. 16, 2014, 10:54 a.m. UTC | #7
On 16 October 2014 04:53, Ola Liljedahl <ola.liljedahl@linaro.org> wrote:

> Yes these fallbacks should be supported by Clang/LLVM. I used clang 3.4 to
> compile odp_atomic.h for different architectures (as LLVM by default
> supports cross compilation). Some (32-bit) architectures may not support
> e.g. __sync builtins on 64-bit data types but this is a limitation of those
> architectures. E.g. 32-bit PowerPC does not support 64-bit atomic
> operations; you will get a linker error because libgcc.a will not include
> the necessary helper routines. This was a problem with the original code as
> well. The new API actually supports a workaround for this limitation
> because now the atomic data types are structs and not just the scalar
> variables. This struct could be extended with e.g. a spin lock that could
> protect a multi-word implementation and allow atomic accesses. It wouldn't
> be very fast...
>
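
Such a spin-lock-based fallback might look roughly like this (a hypothetical sketch, assuming the existing odp_spinlock API; this is not part of the patch):

typedef struct {
        odp_spinlock_t lock; /* protects 'v' where no 64-bit atomics exist */
        uint64_t v;
} odp_atomic64_t;

static inline uint64_t odp_atomic64_fetch_add_rlx(odp_atomic64_t *ptr,
                                                  uint64_t incr)
{
        uint64_t old;
        odp_spinlock_lock(&ptr->lock);
        old = ptr->v;
        ptr->v = old + incr;
        odp_spinlock_unlock(&ptr->lock);
        return old;
}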
> Where do I add "-std=c99 --pedantic" in the ODP makefiles to try this for
> all of linux-generic?
>

 CFLAGS="-std=c99 -pedantic" ./configure
make -k  #to skip all the failing cases


>
> The true performance improvement would only come on architectures that
> support acquire and release semantics natively, e.g. ARMv8. But by combing
> through the code, I have decreased the number of barriers (there was some
> redundancy) so the code might be faster on e.g. Cortex-A15 and possibly
> OCTEON as well. Counter updates, which now can be relaxed (no barriers),
> should definitely be faster.
>
>
> On 16 October 2014 03:38, Mike Holmes <mike.holmes@linaro.org> wrote:
>
>>
>>
>> On 15 October 2014 19:18, Bill Fischofer <bill.fischofer@linaro.org>
>> wrote:
>>
>>> Why the odd suffixes (_rlx, _rls, _acq) on these routines?  If they are
>>> intended to be generic wouldn't omitting these be better?
>>>
>>> On Wed, Oct 15, 2014 at 4:46 PM, Ola Liljedahl <ola.liljedahl@linaro.org
>>> > wrote:
>>>
>>>> Signed-off-by: Ola Liljedahl <ola.liljedahl@linaro.org>
>>>> ---
>>>> Implementation of C11-based memory model for atomic operations.
>>>> Attempt to remove all explicit memory barriers (odp_sync_stores) from
>>>> code that
>>>> implements multithreaded synchronization primitives (e.g. locks,
>>>> barriers).
>>>> Rewrote such primitives to use the new atomic operations.
>>>> Optimized support for ARMv6/v7, ARMv8(aarch64), x86_64, MIPS64/OCTEON
>>>> Other architectures will fall back to GCC __sync builtins which often
>>>> include
>>>> unnecessarily heavy barrier/sync operations (always sequentially
>>>> consistent).
>>>>
>>>
>> Are these fallbacks also 100% supported by LLVM?
>> The ODP linux-generic implementation is already compiler-crippled and not
>> C-standard compliant, with Variable Length Arrays in Structs (VLAIS).
>> The Linux kernel is still trying to fix those GCCisms:
>> http://lkml.iu.edu/hypermail/linux/kernel/1410.1/03100.html
>>
>> If this works with -std=c99 -pedantic etc. for both gcc and llvm, then it
>> looks like it would be an improvement, going by your introduction. Is there
>> any form of benchmark to show that it helps?
>>
>>
>>> Fixed race conditions in odp_barrier_sync() (non-atomic wrap of counter)
>>>> and
>>>> odp_ring enqueue/dequeue (need release barrier but only had compiler
>>>> barrier).
>>>>
>>>>  example/generator/odp_generator.c                  |  43 +-
>>>>  example/ipsec/odp_ipsec.c                          |   2 +-
>>>>  example/odp_example/odp_example.c                  |   2 +-
>>>>  example/timer/odp_timer_test.c                     |   2 +-
>>>>  helper/include/odph_ring.h                         |   8 +-
>>>>  platform/linux-generic/include/api/odp_atomic.h    | 820
>>>> ++++++++++++---------
>>>>  platform/linux-generic/include/api/odp_barrier.h   |  10 +-
>>>>  platform/linux-generic/include/api/odp_rwlock.h    |  20 +-
>>>>  .../linux-generic/include/api/odp_ticketlock.h     |   4 +-
>>>>  .../linux-generic/include/odp_buffer_internal.h    |   2 +-
>>>>  platform/linux-generic/odp_barrier.c               |  43 +-
>>>>  platform/linux-generic/odp_buffer.c                |   3 +-
>>>>  platform/linux-generic/odp_crypto.c                |   4 +-
>>>>  platform/linux-generic/odp_queue.c                 |   7 +-
>>>>  platform/linux-generic/odp_ring.c                  |  86 ++-
>>>>  platform/linux-generic/odp_rwlock.c                |  46 +-
>>>>  platform/linux-generic/odp_thread.c                |   6 +-
>>>>  platform/linux-generic/odp_ticketlock.c            |  27 +-
>>>>  platform/linux-generic/odp_timer.c                 |  17 +-
>>>>  test/api_test/odp_atomic_test.c                    | 126 +---
>>>>  test/api_test/odp_atomic_test.h                    |   9 -
>>>>  21 files changed, 651 insertions(+), 636 deletions(-)
>>>>
>>>> diff --git a/example/generator/odp_generator.c
>>>> b/example/generator/odp_generator.c
>>>> index eb8b340..cf2d77b 100644
>>>> --- a/example/generator/odp_generator.c
>>>> +++ b/example/generator/odp_generator.c
>>>> @@ -62,10 +62,10 @@ typedef struct {
>>>>   * counters
>>>>  */
>>>>  static struct {
>>>> -       odp_atomic_u64_t seq;   /**< ip seq to be send */
>>>> -       odp_atomic_u64_t ip;    /**< ip packets */
>>>> -       odp_atomic_u64_t udp;   /**< udp packets */
>>>> -       odp_atomic_u64_t icmp;  /**< icmp packets */
>>>> +       odp_atomic64_t seq;     /**< ip seq to be send */
>>>> +       odp_atomic64_t ip;      /**< ip packets */
>>>> +       odp_atomic64_t udp;     /**< udp packets */
>>>> +       odp_atomic64_t icmp;    /**< icmp packets */
>>>>  } counters;
>>>>
>>>>  /** * Thread specific arguments
>>>> @@ -201,7 +201,7 @@ static void pack_udp_pkt(odp_buffer_t obuf)
>>>>         ip->tot_len = odp_cpu_to_be_16(args->appl.payload +
>>>> ODPH_UDPHDR_LEN +
>>>>                                        ODPH_IPV4HDR_LEN);
>>>>         ip->proto = ODPH_IPPROTO_UDP;
>>>> -       seq = odp_atomic_fetch_add_u64(&counters.seq, 1) % 0xFFFF;
>>>> +       seq = odp_atomic64_fetch_add_rlx(&counters.seq, 1) % 0xFFFF;
>>>>         ip->id = odp_cpu_to_be_16(seq);
>>>>         ip->chksum = 0;
>>>>         odph_ipv4_csum_update(pkt);
>>>> @@ -258,7 +258,7 @@ static void pack_icmp_pkt(odp_buffer_t obuf)
>>>>         ip->tot_len = odp_cpu_to_be_16(args->appl.payload +
>>>> ODPH_ICMPHDR_LEN +
>>>>                                        ODPH_IPV4HDR_LEN);
>>>>         ip->proto = ODPH_IPPROTO_ICMP;
>>>> -       seq = odp_atomic_fetch_add_u64(&counters.seq, 1) % 0xffff;
>>>> +       seq = odp_atomic64_fetch_add_rlx(&counters.seq, 1) % 0xffff;
>>>>         ip->id = odp_cpu_to_be_16(seq);
>>>>         ip->chksum = 0;
>>>>         odph_ipv4_csum_update(pkt);
>>>> @@ -334,13 +334,15 @@ static void *gen_send_thread(void *arg)
>>>>                 }
>>>>
>>>>                 if (args->appl.interval != 0) {
>>>> +                       uint64_t seq =
>>>> odp_atomic64_load_rlx(&counters.seq);
>>>>                         printf("  [%02i] send pkt no:%ju seq %ju\n",
>>>> -                              thr, counters.seq, counters.seq%0xffff);
>>>> +                              thr, seq, seq%0xffff);
>>>>                         /* TODO use odp timer */
>>>>                         usleep(args->appl.interval * 1000);
>>>>                 }
>>>> -               if (args->appl.number != -1 && counters.seq
>>>> -                   >= (unsigned int)args->appl.number) {
>>>> +               if (args->appl.number != -1 &&
>>>> +                   odp_atomic64_load_rlx(&counters.seq) >=
>>>> +                   (unsigned int)args->appl.number) {
>>>>                         break;
>>>>                 }
>>>>         }
>>>> @@ -348,7 +350,8 @@ static void *gen_send_thread(void *arg)
>>>>         /* receive number of reply pks until timeout */
>>>>         if (args->appl.mode == APPL_MODE_PING && args->appl.number > 0)
>>>> {
>>>>                 while (args->appl.timeout >= 0) {
>>>> -                       if (counters.icmp >= (unsigned
>>>> int)args->appl.number)
>>>> +                       if (odp_atomic64_load_rlx(&counters.icmp) >=
>>>> +                           (unsigned int)args->appl.number)
>>>>                                 break;
>>>>                         /* TODO use odp timer */
>>>>                         sleep(1);
>>>> @@ -358,10 +361,12 @@ static void *gen_send_thread(void *arg)
>>>>
>>>>         /* print info */
>>>>         if (args->appl.mode == APPL_MODE_UDP) {
>>>> -               printf("  [%02i] total send: %ju\n", thr, counters.seq);
>>>> +               printf("  [%02i] total send: %ju\n", thr,
>>>> +                      odp_atomic64_load_rlx(&counters.seq));
>>>>         } else if (args->appl.mode == APPL_MODE_PING) {
>>>>                 printf("  [%02i] total send: %ju total receive: %ju\n",
>>>> -                      thr, counters.seq, counters.icmp);
>>>> +                      thr, odp_atomic64_load_rlx(&counters.seq),
>>>> +                      odp_atomic64_load_rlx(&counters.icmp));
>>>>         }
>>>>         return arg;
>>>>  }
>>>> @@ -395,7 +400,7 @@ static void print_pkts(int thr, odp_packet_t
>>>> pkt_tbl[], unsigned len)
>>>>                 if (!odp_packet_inflag_ipv4(pkt))
>>>>                         continue;
>>>>
>>>> -               odp_atomic_inc_u64(&counters.ip);
>>>> +               odp_atomic64_add_rlx(&counters.ip, 1);
>>>>                 rlen += sprintf(msg, "receive Packet proto:IP ");
>>>>                 buf = odp_buffer_addr(odp_buffer_from_packet(pkt));
>>>>                 ip = (odph_ipv4hdr_t *)(buf +
>>>> odp_packet_l3_offset(pkt));
>>>> @@ -405,7 +410,7 @@ static void print_pkts(int thr, odp_packet_t
>>>> pkt_tbl[], unsigned len)
>>>>
>>>>                 /* udp */
>>>>                 if (ip->proto == ODPH_IPPROTO_UDP) {
>>>> -                       odp_atomic_inc_u64(&counters.udp);
>>>> +                       odp_atomic64_add_rlx(&counters.udp, 1);
>>>>                         udp = (odph_udphdr_t *)(buf + offset);
>>>>                         rlen += sprintf(msg + rlen, "UDP payload %d ",
>>>>                                         odp_be_to_cpu_16(udp->length) -
>>>> @@ -417,7 +422,7 @@ static void print_pkts(int thr, odp_packet_t
>>>> pkt_tbl[], unsigned len)
>>>>                         icmp = (odph_icmphdr_t *)(buf + offset);
>>>>                         /* echo reply */
>>>>                         if (icmp->type == ICMP_ECHOREPLY) {
>>>> -                               odp_atomic_inc_u64(&counters.icmp);
>>>> +                               odp_atomic64_add_rlx(&counters.icmp, 1);
>>>>                                 memcpy(&tvsend, buf + offset +
>>>> ODPH_ICMPHDR_LEN,
>>>>                                        sizeof(struct timeval));
>>>>                                 /* TODO This should be changed to use an
>>>> @@ -530,10 +535,10 @@ int main(int argc, char *argv[])
>>>>         }
>>>>
>>>>         /* init counters */
>>>> -       odp_atomic_init_u64(&counters.seq);
>>>> -       odp_atomic_init_u64(&counters.ip);
>>>> -       odp_atomic_init_u64(&counters.udp);
>>>> -       odp_atomic_init_u64(&counters.icmp);
>>>> +       odp_atomic64_store_rlx(&counters.seq, 0);
>>>> +       odp_atomic64_store_rlx(&counters.ip, 0);
>>>> +       odp_atomic64_store_rlx(&counters.udp, 0);
>>>> +       odp_atomic64_store_rlx(&counters.icmp, 0);
>>>>
>>>>         /* Reserve memory for args from shared mem */
>>>>         shm = odp_shm_reserve("shm_args", sizeof(args_t),
>>>> diff --git a/example/ipsec/odp_ipsec.c b/example/ipsec/odp_ipsec.c
>>>> index 2f2dc19..76c27d0 100644
>>>> --- a/example/ipsec/odp_ipsec.c
>>>> +++ b/example/ipsec/odp_ipsec.c
>>>> @@ -1223,7 +1223,7 @@ main(int argc, char *argv[])
>>>>         printf("Num worker threads: %i\n", num_workers);
>>>>
>>>>         /* Create a barrier to synchronize thread startup */
>>>> -       odp_barrier_init_count(&sync_barrier, num_workers);
>>>> +       odp_barrier_init(&sync_barrier, num_workers);
>>>>
>>>>         /*
>>>>          * By default core #0 runs Linux kernel background tasks.
>>>> diff --git a/example/odp_example/odp_example.c
>>>> b/example/odp_example/odp_example.c
>>>> index 0e9aa3d..c473395 100644
>>>> --- a/example/odp_example/odp_example.c
>>>> +++ b/example/odp_example/odp_example.c
>>>> @@ -1120,7 +1120,7 @@ int main(int argc, char *argv[])
>>>>         odp_shm_print_all();
>>>>
>>>>         /* Barrier to sync test case execution */
>>>> -       odp_barrier_init_count(&globals->barrier, num_workers);
>>>> +       odp_barrier_init(&globals->barrier, num_workers);
>>>>
>>>>         if (args.proc_mode) {
>>>>                 int ret;
>>>> diff --git a/example/timer/odp_timer_test.c
>>>> b/example/timer/odp_timer_test.c
>>>> index 78b2ae2..dfbeae9 100644
>>>> --- a/example/timer/odp_timer_test.c
>>>> +++ b/example/timer/odp_timer_test.c
>>>> @@ -372,7 +372,7 @@ int main(int argc, char *argv[])
>>>>         printf("\n");
>>>>
>>>>         /* Barrier to sync test case execution */
>>>> -       odp_barrier_init_count(&test_barrier, num_workers);
>>>> +       odp_barrier_init(&test_barrier, num_workers);
>>>>
>>>>         /* Create and launch worker threads */
>>>>         odph_linux_pthread_create(thread_tbl, num_workers, first_core,
>>>> diff --git a/helper/include/odph_ring.h b/helper/include/odph_ring.h
>>>> index 76c1db8..5e78b34 100644
>>>> --- a/helper/include/odph_ring.h
>>>> +++ b/helper/include/odph_ring.h
>>>> @@ -138,8 +138,8 @@ typedef struct odph_ring {
>>>>                 uint32_t sp_enqueue;     /* True, if single producer. */
>>>>                 uint32_t size;           /* Size of ring. */
>>>>                 uint32_t mask;           /* Mask (size-1) of ring. */
>>>> -               uint32_t head;          /* Producer head. */
>>>> -               uint32_t tail;          /* Producer tail. */
>>>> +               odp_atomic32_t head;    /* Producer head. */
>>>> +               odp_atomic32_t tail;    /* Producer tail. */
>>>>         } prod ODP_ALIGNED_CACHE;
>>>>
>>>>         /** @private Consumer */
>>>> @@ -147,8 +147,8 @@ typedef struct odph_ring {
>>>>                 uint32_t sc_dequeue;     /* True, if single consumer. */
>>>>                 uint32_t size;           /* Size of the ring. */
>>>>                 uint32_t mask;           /* Mask (size-1) of ring. */
>>>> -               uint32_t head;          /* Consumer head. */
>>>> -               uint32_t tail;          /* Consumer tail. */
>>>> +               odp_atomic32_t head;    /* Consumer head. */
>>>> +               odp_atomic32_t tail;    /* Consumer tail. */
>>>>         } cons ODP_ALIGNED_CACHE;
>>>>
>>>>         /** @private Memory space of ring starts here. */
>>>> diff --git a/platform/linux-generic/include/api/odp_atomic.h
>>>> b/platform/linux-generic/include/api/odp_atomic.h
>>>> index 0cc4cf4..89f183c 100644
>>>> --- a/platform/linux-generic/include/api/odp_atomic.h
>>>> +++ b/platform/linux-generic/include/api/odp_atomic.h
>>>> @@ -4,463 +4,559 @@
>>>>   * SPDX-License-Identifier:     BSD-3-Clause
>>>>   */
>>>>
>>>> -
>>>>  /**
>>>>   * @file
>>>>   *
>>>> - * ODP atomic operations
>>>> + * ODP atomic types and operations, semantically a subset of C11
>>>> atomics.
>>>> + * Scalar variable wrapped in a struct to avoid accessing scalar
>>>> directly
>>>> + * without using the required access functions.
>>>> + * Atomic functions must be used to operate on atomic variables!
>>>>   */
>>>>
>>>>  #ifndef ODP_ATOMIC_H_
>>>>  #define ODP_ATOMIC_H_
>>>>
>>>> +#include <stdint.h>
>>>> +#include <odp_align.h>
>>>> +#include <odp_hints.h>
>>>> +
>>>>  #ifdef __cplusplus
>>>>  extern "C" {
>>>>  #endif
>>>>
>>>> -
>>>> -#include <odp_std_types.h>
>>>> -
>>>> -
>>>> -/**
>>>> - * Atomic integer
>>>> - */
>>>> -typedef volatile int32_t odp_atomic_int_t;
>>>> -
>>>>  /**
>>>> - * Atomic unsigned integer 64 bits
>>>> + * 32-bit (unsigned) atomic type
>>>>   */
>>>> -typedef volatile uint64_t odp_atomic_u64_t;
>>>> +typedef struct {
>>>> +       uint32_t v; /**< Actual storage for the atomic variable */
>>>> +} odp_atomic32_t
>>>> +ODP_ALIGNED(sizeof(uint32_t)); /* Enforce alignment! */
>>>>
>>>>  /**
>>>> - * Atomic unsigned integer 32 bits
>>>> + * 64-bit (unsigned) atomic type
>>>>   */
>>>> -typedef volatile uint32_t odp_atomic_u32_t;
>>>> -
>>>> +typedef struct {
>>>> +       uint64_t v; /**< Actual storage for the atomic variable */
>>>> +} odp_atomic64_t
>>>> +ODP_ALIGNED(sizeof(uint64_t)); /* Enforce alignment! */
>>>>
>>>> -/**
>>>> - * Initialize atomic integer
>>>> - *
>>>> - * @param ptr    An integer atomic variable
>>>> - *
>>>> - * @note The operation is not synchronized with other threads
>>>> - */
>>>> -static inline void odp_atomic_init_int(odp_atomic_int_t *ptr)
>>>> -{
>>>> -       *ptr = 0;
>>>> -}
>>>> -
>>>> -/**
>>>> - * Load value of atomic integer
>>>> - *
>>>> - * @param ptr    An atomic variable
>>>> - *
>>>> - * @return atomic integer value
>>>> - *
>>>> - * @note The operation is not synchronized with other threads
>>>> - */
>>>> -static inline int odp_atomic_load_int(odp_atomic_int_t *ptr)
>>>> -{
>>>> -       return *ptr;
>>>> -}
>>>>
>>>> +/*****************************************************************************
>>>> + * Just a few helpers
>>>> +
>>>> *****************************************************************************/
>>>>
>>>> -/**
>>>> - * Store value to atomic integer
>>>> - *
>>>> - * @param ptr        An atomic variable
>>>> - * @param new_value  Store new_value to a variable
>>>> - *
>>>> - * @note The operation is not synchronized with other threads
>>>> - */
>>>> -static inline void odp_atomic_store_int(odp_atomic_int_t *ptr, int
>>>> new_value)
>>>> -{
>>>> -       *ptr = new_value;
>>>> -}
>>>> -
>>>> -/**
>>>> - * Fetch and add atomic integer
>>>> - *
>>>> - * @param ptr    An atomic variable
>>>> - * @param value  A value to be added to the variable
>>>> - *
>>>> - * @return Value of the variable before the operation
>>>> - */
>>>> -static inline int odp_atomic_fetch_add_int(odp_atomic_int_t *ptr, int
>>>> value)
>>>> -{
>>>> -       return __sync_fetch_and_add(ptr, value);
>>>> -}
>>>> -
>>>> -/**
>>>> - * Fetch and subtract atomic integer
>>>> - *
>>>> - * @param ptr    An atomic integer variable
>>>> - * @param value  A value to be subtracted from the variable
>>>> - *
>>>> - * @return Value of the variable before the operation
>>>> - */
>>>> -static inline int odp_atomic_fetch_sub_int(odp_atomic_int_t *ptr, int
>>>> value)
>>>> -{
>>>> -       return __sync_fetch_and_sub(ptr, value);
>>>> -}
>>>> -
>>>> -/**
>>>> - * Fetch and increment atomic integer by 1
>>>> - *
>>>> - * @param ptr    An atomic variable
>>>> - *
>>>> - * @return Value of the variable before the operation
>>>> - */
>>>> -static inline int odp_atomic_fetch_inc_int(odp_atomic_int_t *ptr)
>>>> -{
>>>> -       return odp_atomic_fetch_add_int(ptr, 1);
>>>> -}
>>>> -
>>>> -/**
>>>> - * Increment atomic integer by 1
>>>> - *
>>>> - * @param ptr    An atomic variable
>>>> - *
>>>> - */
>>>> -static inline void odp_atomic_inc_int(odp_atomic_int_t *ptr)
>>>> -{
>>>> -       odp_atomic_fetch_add_int(ptr, 1);
>>>> -}
>>>> -
>>>> -/**
>>>> - * Fetch and decrement atomic integer by 1
>>>> - *
>>>> - * @param ptr    An atomic int variable
>>>> - *
>>>> - * @return Value of the variable before the operation
>>>> - */
>>>> -static inline int odp_atomic_fetch_dec_int(odp_atomic_int_t *ptr)
>>>> -{
>>>> -       return odp_atomic_fetch_sub_int(ptr, 1);
>>>> -}
>>>> -
>>>> -/**
>>>> - * Decrement atomic integer by 1
>>>> - *
>>>> - * @param ptr    An atomic variable
>>>> - *
>>>> - */
>>>> -static inline void odp_atomic_dec_int(odp_atomic_int_t *ptr)
>>>> -{
>>>> -       odp_atomic_fetch_sub_int(ptr, 1);
>>>> -}
>>>> +#ifdef __OCTEON__
>>>> +/* OCTEON Write Memory Barrier */
>>>> +#define COMPILER_HW_BARRIER() __asm __volatile( \
>>>> +       /* Double syncw to work around errata */ \
>>>> +       ".set push\n\t.set arch=octeon\n\tsyncw\n\tsyncw\n\t.set pop" \
>>>> +       : : : "memory")
>>>> +/* syncw is also used to flush the write buffer which makes stores
>>>> visible
>>>> + * quicker which should be beneficial to release operations */
>>>> +#define OCTEON_FLUSH() __asm __volatile( \
>>>> +       ".set push\n\t.set arch=octeon\n\tsyncw\n\t.set pop" \
>>>> +       : : : "memory")
>>>> +#else
>>>> +/* __sync_synchronize() generates the right insn for ARMv6t2 and
>>>> ARMv7-a */
>>>> +/** Compiler and hardware full memory barrier */
>>>> +#define COMPILER_HW_BARRIER() __sync_synchronize()
>>>> +/** Flush write buffer on OCTEON */
>>>> +#define OCTEON_FLUSH() (void)0
>>>> +#endif
>>>>
>>>> -/**
>>>> - * Initialize atomic uint32
>>>> - *
>>>> - * @param ptr    An atomic variable
>>>> - *
>>>> - * @note The operation is not synchronized with other threads
>>>> - */
>>>> -static inline void odp_atomic_init_u32(odp_atomic_u32_t *ptr)
>>>> -{
>>>> -       *ptr = 0;
>>>> -}
>>>> +/** Compiler memory barrier */
>>>> +#define COMPILER_BARRIER() __asm __volatile("" : : : "memory")
>>>>
>>>> -/**
>>>> - * Load value of atomic uint32
>>>> - *
>>>> - * @param ptr    An atomic variable
>>>> - *
>>>> - * @return atomic uint32 value
>>>> - *
>>>> - * @note The operation is not synchronized with other threads
>>>> - */
>>>> -static inline uint32_t odp_atomic_load_u32(odp_atomic_u32_t *ptr)
>>>> -{
>>>> -       return *ptr;
>>>> -}
>>>>
>>>> +/*****************************************************************************
>>>> + * Operations on 32-bit atomics
>>>> + * odp_atomic32_load_rlx
>>>> + * odp_atomic32_store_rlx
>>>> + * odp_atomic32_load_acq
>>>> + * odp_atomic32_store_rls
>>>> + * odp_atomic32_cmp_and_swap_rlx - return old value
>>>> + * odp_atomic32_fetch_add_rlx - return old value
>>>> + * odp_atomic32_fetch_add_rls - return old value
>>>> + * odp_atomic32_add_rlx - no return value
>>>> + * odp_atomic32_add_rls - no return value
>>>> +
>>>> *****************************************************************************/
>>>>
>>>>  /**
>>>> - * Store value to atomic uint32
>>>> + * Relaxed atomic load of 32-bit atomic variable
>>>> + * @note Relaxed memory model, no barriers.
>>>>   *
>>>> - * @param ptr        An atomic variable
>>>> - * @param new_value  Store new_value to a variable
>>>> + * @param ptr   Pointer to a 32-bit atomic variable
>>>>   *
>>>> - * @note The operation is not synchronized with other threads
>>>> + * @return Value of the variable
>>>>   */
>>>> -static inline void odp_atomic_store_u32(odp_atomic_u32_t *ptr,
>>>> -                                       uint32_t new_value)
>>>> +static inline uint32_t odp_atomic32_load_rlx(const odp_atomic32_t *ptr)
>>>>  {
>>>> -       *ptr = new_value;
>>>> +       uint32_t val;
>>>> +       COMPILER_BARRIER();
>>>> +       /* Read of aligned word is atomic */
>>>> +       val = ptr->v;
>>>> +       COMPILER_BARRIER();
>>>> +       return val;
>>>>  }
>>>>
>>>>  /**
>>>> - * Fetch and add atomic uint32
>>>> - *
>>>> - * @param ptr    An atomic variable
>>>> - * @param value  A value to be added to the variable
>>>> + * Relaxed atomic store of 32-bit atomic variable
>>>> + * @note Relaxed memory model, no barriers.
>>>>   *
>>>> - * @return Value of the variable before the operation
>>>> + * @param ptr   Pointer to a 32-bit atomic variable
>>>> + * @param val   Value to write to the variable
>>>>   */
>>>> -static inline uint32_t odp_atomic_fetch_add_u32(odp_atomic_u32_t *ptr,
>>>> -                                               uint32_t value)
>>>> +static inline void odp_atomic32_store_rlx(odp_atomic32_t *ptr,
>>>> uint32_t val)
>>>>  {
>>>> -       return __sync_fetch_and_add(ptr, value);
>>>> +       COMPILER_BARRIER();
>>>> +       /* Write of aligned word is atomic */
>>>> +       ptr->v = val;
>>>> +       COMPILER_BARRIER();
>>>>  }
>>>>
>>>>  /**
>>>> - * Fetch and subtract uint32
>>>> + * Atomic load-acquire of 32-bit atomic variable
>>>> + * @note SC-load-acquire barrier, later accesses cannot move before
>>>> + * the load-acquire access.
>>>>   *
>>>> - * @param ptr    An atomic variable
>>>> - * @param value  A value to be sub to the variable
>>>> + * @param ptr   Pointer to a 32-bit atomic variable
>>>>   *
>>>> - * @return Value of the variable before the operation
>>>> + * @return Value of the variable
>>>>   */
>>>> -static inline uint32_t odp_atomic_fetch_sub_u32(odp_atomic_u32_t *ptr,
>>>> -                                               uint32_t value)
>>>> +static inline uint32_t odp_atomic32_load_acq(const odp_atomic32_t *ptr)
>>>>  {
>>>> -       return __sync_fetch_and_sub(ptr, value);
>>>> +#if defined __aarch64__
>>>> +       uint32_t val;
>>>> +       __asm __volatile("ldar %w0, [%1]"
>>>> +                : "=&r"(val)
>>>> +                                : "r"(&ptr->v)
>>>> +                                : "memory");
>>>> +       return val;
>>>> +#elif defined __arm__  || defined __mips64__ || defined __x86_64__
>>>> +       /* Read of aligned word is atomic */
>>>> +       uint32_t val = ptr->v;
>>>> +       /* To prevent later accesses from moving up */
>>>> +       /* FIXME: Herb Sutter claims HW barrier not needed on x86? */
>>>> +       COMPILER_HW_BARRIER();
>>>> +       return val;
>>>> +#else
>>>> +#warning odp_atomic32_load_acq() may not be efficiently implemented
>>>> +       /* Assume read of aligned word is atomic */
>>>> +       uint32_t val = ptr->v;
>>>> +       /* To prevent later accesses from moving up */
>>>> +       COMPILER_HW_BARRIER();
>>>> +       return val;
>>>> +#endif
>>>>  }
>>>>
>>>>  /**
>>>> - * Fetch and increment atomic uint32 by 1
>>>> - *
>>>> - * @param ptr    An atomic variable
>>>> - *
>>>> - * @return Value of the variable before the operation
>>>> - */
>>>> -#if defined __OCTEON__
>>>> -
>>>> -static inline uint32_t odp_atomic_fetch_inc_u32(odp_atomic_u32_t *ptr)
>>>> -{
>>>> -       uint32_t ret;
>>>> -
>>>> -       __asm__ __volatile__ ("syncws");
>>>> -       __asm__ __volatile__ ("lai %0,(%2)" : "=r" (ret), "+m" (ptr) :
>>>> -                             "r" (ptr));
>>>> -
>>>> -       return ret;
>>>> -}
>>>> -
>>>> + * Atomic store-release of 32-bit atomic variable
>>>> + * @note SC-store-release barrier, earlier accesses cannot move after
>>>> + * store-release access.
>>>> + *
>>>> + * @param ptr  Pointer to a 32-bit atomic variable
>>>> + * @param val  Value to write to the atomic variable
>>>> + */
>>>> +static inline void odp_atomic32_store_rls(odp_atomic32_t *ptr,
>>>> uint32_t val)
>>>> +{
>>>> +#if defined __arm__ /* A32/T32 ISA */
>>>> +       /* Compiler and HW barrier to prevent earlier accesses from
>>>> moving
>>>> +        * down */
>>>> +       COMPILER_HW_BARRIER();
>>>> +       /* Write of aligned word is atomic */
>>>> +       ptr->v = val;
>>>> +       /* Compiler and HW barrier to prevent this store from moving
>>>> down after
>>>> +        * a later load-acquire and thus create overlapping critical
>>>> sections.
>>>> +        * Herb Sutter thinks this is needed */
>>>> +       COMPILER_HW_BARRIER();
>>>> +#elif defined __aarch64__
>>>> +       __asm __volatile("stlr %w0, [%1]"
>>>> +                :
>>>> +                : "r"(val), "r"(&ptr->v)
>>>> +                                : "memory");
>>>> +#elif defined __mips64__
>>>> +       /* Compiler and HW barrier to prevent earlier accesses from
>>>> moving
>>>> +        * down */
>>>> +       COMPILER_HW_BARRIER();
>>>> +       /* Write of aligned word is atomic */
>>>> +       ptr->v = val;
>>>> +       /* Compiler and HW barrier to prevent this store from moving
>>>> down after
>>>> +        * a later load-acquire and thus create overlapping critical
>>>> sections.
>>>> +        * Herb Sutter thinks this is needed */
>>>> +       COMPILER_HW_BARRIER();
>>>> +#elif defined __x86_64__
>>>> +       /* This is actually an atomic exchange operation */
>>>> +       /* Generates good code on x86_64 */
>>>> +       (void)__sync_lock_test_and_set(&ptr->v, val);
>>>>  #else
>>>> -
>>>> -static inline uint32_t odp_atomic_fetch_inc_u32(odp_atomic_u32_t *ptr)
>>>> -{
>>>> -       return odp_atomic_fetch_add_u32(ptr, 1);
>>>> -}
>>>> -
>>>> +#warning odp_atomic32_store_rls() may not be efficiently implemented
>>>> +       /* This is actually an atomic exchange operation */
>>>> +       (void)__sync_lock_test_and_set(&ptr->v, val);
>>>>  #endif
>>>> -
>>>> -/**
>>>> - * Increment atomic uint32 by 1
>>>> - *
>>>> - * @param ptr    An atomic variable
>>>> - *
>>>> - */
>>>> -static inline void odp_atomic_inc_u32(odp_atomic_u32_t *ptr)
>>>> -{
>>>> -       odp_atomic_fetch_add_u32(ptr, 1);
>>>>  }
>>>>
>>>> -/**
>>>> - * Fetch and decrement uint32 by 1
>>>> - *
>>>> - * @param ptr    An atomic variable
>>>> - *
>>>> - * @return Value of the variable before the operation
>>>> - */
>>>> -static inline uint32_t odp_atomic_fetch_dec_u32(odp_atomic_u32_t *ptr)
>>>> -{
>>>> -       return odp_atomic_fetch_sub_u32(ptr, 1);
>>>> -}
>>>>
>>>>  /**
>>>> - * Decrement atomic uint32 by 1
>>>> - *
>>>> - * @param ptr    An atomic variable
>>>> - *
>>>> - */
>>>> -static inline void odp_atomic_dec_u32(odp_atomic_u32_t *ptr)
>>>> -{
>>>> -       odp_atomic_fetch_sub_u32(ptr, 1);
>>>> + * Atomic compare and swap of 32-bit atomic variable
>>>> + * @note Relaxed memory model, no barriers.
>>>> + * @note Not compare-and-set! Caller should compare return value with
>>>> expected
>>>> + * parameter to check if swap operation succeeded.
>>>> + *
>>>> + * @param ptr  Pointer to a 32-bit atomic variable
>>>> + * @param exp  Expected old value
>>>> + * @param val  New value
>>>> + * @return Actual old value, if different from 'exp' then swap failed
>>>> + */
>>>> +static inline uint32_t
>>>> +odp_atomic32_cmp_and_swap_rlx(odp_atomic32_t *ptr,
>>>> +                             uint32_t exp,
>>>> +                             uint32_t val)
>>>> +{
>>>> +#if defined __arm__ /* A32/T32 ISA */
>>>> +       uint32_t old;
>>>> +       int status;
>>>> +       do {
>>>> +               __asm __volatile("ldrex %0, [%1]"
>>>> +                : "=&r"(old)
>>>> +                                        : "r"(&ptr->v)
>>>> +                                        : "memory");
>>>> +               if (odp_unlikely(old != exp)) {
>>>> +                       /* Value has changed, can't proceed */
>>>> +                       /* Clear exclusive access monitor */
>>>> +                       __asm __volatile("clrex");
>>>> +                       break;
>>>> +               }
>>>> +               /* Current value is as expected, attempt to write new
>>>> value */
>>>> +               __asm __volatile("strex %0, %1, [%2]"
>>>> +                : "=&r"(status)
>>>> +                                        : "r"(val), "r"(&ptr->v)
>>>> +                                        : "memory");
>>>> +               /* Restart the loop so we can re-read the previous
>>>> value */
>>>> +       } while (odp_unlikely(status != 0));
>>>> +       return old;
>>>> +#elif defined __aarch64__
>>>> +       uint32_t old;
>>>> +       int status;
>>>> +       do {
>>>> +               __asm __volatile("ldxr %w0, [%1]"
>>>> +                : "=&r"(old)
>>>> +                                        : "r"(&ptr->v)
>>>> +                                        : "memory");
>>>> +               if (odp_unlikely(old != exp)) {
>>>> +                       /* Value has changed, can't proceed */
>>>> +                       /* Clear exclusive access monitor */
>>>> +                       __asm __volatile("clrex");
>>>> +                       break;
>>>> +               }
>>>> +               /* Current value is as expected, attempt to write new
>>>> value */
>>>> +               __asm __volatile("stxr %w0, %w1, [%2]"
>>>> +                : "=&r"(status)
>>>> +                                        : "r"(val), "r"(&ptr->v)
>>>> +                                        : "memory");
>>>> +               /* Restart the loop so we can re-read the previous
>>>> value */
>>>> +       } while (odp_unlikely(status != 0));
>>>> +       return old;
>>>> +#elif defined __mips64__
>>>> +       uint32_t old, new_val;
>>>> +       do {
>>>> +               __asm __volatile("llw %0, [%1]"
>>>> +                : "=&r"(old)
>>>> +                                        : "r"(&ptr->v)
>>>> +                                        : "memory");
>>>> +               if (odp_unlikely(old != exp)) {
>>>> +                       /* Value has changed, can't proceed */
>>>> +                       break;
>>>> +               }
>>>> +               /* Current value is as expected, attempt to write new
>>>> value */
>>>> +               new_val = val;
>>>> +               __asm __volatile("scw %0, [%1]"
>>>> +                : "+&r"(new_val)
>>>> +                                        : "r"(&ptr->v)
>>>> +                                        : "memory");
>>>> +       } while (odp_unlikely(new_val == 0));
>>>> +       return old;
>>>> +#elif defined __x86_64__
>>>> +       /* Generates good code on x86_64 */
>>>> +       return __sync_val_compare_and_swap(&ptr->v, exp, val);
>>>> +#else
>>>> +#warning odp_atomic32_cmp_and_swap_rlx() may not be efficiently
>>>> implemented
>>>> +       return __sync_val_compare_and_swap(&ptr->v, exp, val);
>>>> +#endif
>>>>  }
>>>>
>>>>  /**
>>>> - * Atomic compare and set for 32bit
>>>> - *
>>>> - * @param dst destination location into which the value will be
>>>> written.
>>>> - * @param exp expected value.
>>>> - * @param src new value.
>>>> - * @return Non-zero on success; 0 on failure.
>>>> - */
>>>> -static inline int
>>>> -odp_atomic_cmpset_u32(odp_atomic_u32_t *dst, uint32_t exp, uint32_t
>>>> src)
>>>> -{
>>>> -       return __sync_bool_compare_and_swap(dst, exp, src);
>>>> + * Atomic fetch and add to 32-bit atomic variable
>>>> + * @note Relaxed memory model, no barriers.
>>>> + * @note A - B <=> A + (-B)
>>>> + *
>>>> + * @param ptr   Pointer to a 32-bit atomic variable
>>>> + * @param incr  The value to be added to the atomic variable
>>>> + *
>>>> + * @return Value of the atomic variable before the addition
>>>> + */
>>>> +static inline uint32_t odp_atomic32_fetch_add_rlx(odp_atomic32_t *ptr,
>>>> +               uint32_t incr)
>>>> +{
>>>> +#if defined __arm__ /* A32/T32 ISA */
>>>> +       uint32_t old_val, new_val;
>>>> +       int status;
>>>> +       do {
>>>> +               __asm __volatile("ldrex %0, [%1]"
>>>> +                : "=&r"(old_val)
>>>> +                                        : "r"(&ptr->v)
>>>> +                                        : "memory");
>>>> +               new_val = old_val + incr;
>>>> +               __asm __volatile("strex %0, %1, [%2]"
>>>> +                : "=&r"(status)
>>>> +                                        : "r"(new_val), "r"(&ptr->v)
>>>> +                                        : "memory");
>>>> +       } while (odp_unlikely(status != 0));
>>>> +       return old_val;
>>>> +#elif defined __aarch64__
>>>> +       uint32_t old_val, new_val;
>>>> +       int status;
>>>> +       do {
>>>> +               __asm __volatile("ldxr %w0, [%1]"
>>>> +                : "=&r"(old_val)
>>>> +                                        : "r"(&ptr->v)
>>>> +                                        : "memory");
>>>> +               new_val = old_val + incr;
>>>> +               __asm __volatile("stxr %w0, %w1, [%2]"
>>>> +                : "=&r"(status)
>>>> +                                        : "r"(new_val), "r"(&ptr->v)
>>>> +                                        : "memory");
>>>> +       } while (odp_unlikely(status != 0));
>>>> +       return old_val;
>>>> +#elif defined __mips64__
>>>> +       uint32_t old_val, new_val;
>>>> +       do {
>>>> +               __asm __volatile("llw %0, [%1]"
>>>> +                : "=&r"(old_val)
>>>> +                                        : "r"(&ptr->v)
>>>> +                                        : "memory");
>>>> +               new_val = old_val + incr;
>>>> +               __asm __volatile("scw %0, [%1]"
>>>> +                : "+&r"(new_val)
>>>> +                                        : "r"(&ptr->v)
>>>> +                                        : "memory");
>>>> +       } while (odp_unlikely(new_val == 0));
>>>> +       return old_val;
>>>> +#elif defined __x86_64__
>>>> +       /* Generates good code on x86_64 */
>>>> +       return __sync_fetch_and_add(&ptr->v, incr);
>>>> +#else
>>>> +#warning odp_atomic32_fetch_add_rlx() may not be efficiently
>>>> implemented
>>>> +       return __sync_fetch_and_add(&ptr->v, incr);
>>>> +#endif
>>>>  }
>>>>
>>>>  /**
>>>> - * Initialize atomic uint64
>>>> + * Atomic fetch and add to 32-bit atomic variable
>>>> + * @note Sequential consistent memory model, barriers before and after
>>>> the
>>>> + * operation.
>>>> + * @note A - B <=> A + (-B)
>>>>   *
>>>> - * @param ptr    An atomic variable
>>>> + * @param ptr   Pointer to a 32-bit atomic variable
>>>> + * @param incr  The value to be added to the atomic variable
>>>>   *
>>>> - * @note The operation is not synchronized with other threads
>>>> + * @return Value of the atomic variable before the addition
>>>>   */
>>>> -static inline void odp_atomic_init_u64(odp_atomic_u64_t *ptr)
>>>> +static inline uint32_t odp_atomic32_fetch_add_rls(odp_atomic32_t *ptr,
>>>> +               uint32_t incr)
>>>>  {
>>>> -       *ptr = 0;
>>>> +#if defined __arm__ /* A32/T32 ISA */
>>>> +       COMPILER_HW_BARRIER();
>>>> +       return odp_atomic32_fetch_add_rlx(ptr, incr);
>>>> +#elif defined __aarch64__
>>>> +       /* We basically get acquire/release semantics */
>>>> +       return __sync_fetch_and_add(&ptr->v, incr);
>>>> +#elif defined __mips64__
>>>> +       uint32_t old;
>>>> +       COMPILER_HW_BARRIER();
>>>> +       old = odp_atomic32_fetch_add_rlx(ptr, incr);
>>>> +       OCTEON_FLUSH();
>>>> +       return old;
>>>> +#elif defined __x86_64__
>>>> +       /* Generates good code on x86_64 */
>>>> +       return __sync_fetch_and_add(&ptr->v, incr);
>>>> +#else
>>>> +#warning odp_atomic32_fetch_add_rls() may not be efficiently
>>>> implemented
>>>> +       return __sync_fetch_and_add(&ptr->v, incr);
>>>> +#endif
>>>>  }
>>>>
>>>>  /**
>>>> - * Load value of atomic uint64
>>>> - *
>>>> - * @param ptr    An atomic variable
>>>> + * Atomic add to 32-bit atomic variable
>>>> + * @note Relaxed memory model, no barriers.
>>>>   *
>>>> - * @return atomic uint64 value
>>>> - *
>>>> - * @note The operation is not synchronized with other threads
>>>> + * @param ptr   Pointer to a 32-bit atomic variable
>>>> + * @param incr  The value to be added to the atomic variable
>>>>   */
>>>> -static inline uint64_t odp_atomic_load_u64(odp_atomic_u64_t *ptr)
>>>> +static inline void odp_atomic32_add_rlx(odp_atomic32_t *ptr,
>>>> +                                       uint32_t incr)
>>>>  {
>>>> -       return *ptr;
>>>> +       /* Use odp_atomic32_fetch_add_rlx() for now */
>>>> +       (void)odp_atomic32_fetch_add_rlx(ptr, incr);
>>>>  }
>>>>
>>>>  /**
>>>> - * Store value to atomic uint64
>>>> - *
>>>> - * @param ptr        An atomic variable
>>>> - * @param new_value  Store new_value to a variable
>>>> + * Atomic add to 32-bit atomic variable
>>>> + * @note Sequential consistent memory model, barriers before and after
>>>> the
>>>> + * operation.
>>>>   *
>>>> - * @note The operation is not synchronized with other threads
>>>> + * @param ptr   Pointer to a 32-bit atomic variable
>>>> + * @param incr  The value to be added to the atomic variable
>>>>   */
>>>> -static inline void odp_atomic_store_u64(odp_atomic_u64_t *ptr,
>>>> -                                       uint64_t new_value)
>>>> +static inline void odp_atomic32_add_rls(odp_atomic32_t *ptr, uint32_t
>>>> incr)
>>>>  {
>>>> -       *ptr = new_value;
>>>> +       /* Use odp_atomic32_fetch_add_rls() for now */
>>>> +       (void)odp_atomic32_fetch_add_rls(ptr, incr);
>>>>  }
>>>>
>>>> -/**
>>>> - * Add atomic uint64
>>>> - *
>>>> - * @param ptr    An atomic variable
>>>> - * @param value  A value to be added to the variable
>>>> - *
>>>> - */
>>>> -static inline void odp_atomic_add_u64(odp_atomic_u64_t *ptr, uint64_t
>>>> value)
>>>> -{
>>>> -       __sync_fetch_and_add(ptr, value);
>>>> -}
>>>>
>>>> +/*****************************************************************************
>>>> + * Operations on 64-bit atomics
>>>> + * odp_atomic64_load_rlx
>>>> + * odp_atomic64_store_rlx
>>>> + * odp_atomic64_fetch_add_rlx
>>>> + * odp_atomic64_add_rlx
>>>> +
>>>> *****************************************************************************/
>>>>
>>>>  /**
>>>> - * Fetch and add atomic uint64
>>>> + * Relaxed atomic load of 64-bit atomic variable
>>>> + * @note Relaxed memory model, no barriers.
>>>>   *
>>>> - * @param ptr    An atomic variable
>>>> - * @param value  A value to be added to the variable
>>>> + * @param ptr   Pointer to a 64-bit atomic variable
>>>>   *
>>>> - * @return Value of the variable before the operation
>>>> + * @return Value of the atomic variable
>>>>   */
>>>> -
>>>> -#if defined __powerpc__ && !defined __powerpc64__
>>>> -static inline uint64_t odp_atomic_fetch_add_u64(odp_atomic_u64_t *ptr,
>>>> -                                               uint64_t value)
>>>> +static inline uint64_t odp_atomic64_load_rlx(odp_atomic64_t *ptr)
>>>>  {
>>>> -       return __sync_fetch_and_add((odp_atomic_u32_t *)ptr,
>>>> -                                   (uint32_t)value);
>>>> -}
>>>> +#if defined __arm__ /* A32/T32 ISA */
>>>> +       uint64_t val;
>>>> +       __asm __volatile("ldrexd %0, %H0, [%1]\n\t"
>>>> +                        "clrex" /* Clear exclusive access monitor */
>>>> +                : "=&r"(val)
>>>> +                                : "r"(&ptr->v)
>>>> +                                : );
>>>> +       return val;
>>>> +#elif defined __x86_64__ || defined __aarch64__ || defined __mips64__
>>>> +       /* Read of aligned quad/double word is atomic */
>>>> +       return ptr->v;
>>>>  #else
>>>> -static inline uint64_t odp_atomic_fetch_add_u64(odp_atomic_u64_t *ptr,
>>>> -                                               uint64_t value)
>>>> -{
>>>> -       return __sync_fetch_and_add(ptr, value);
>>>> -}
>>>> +#warning odp_atomic64_load_rlx() may not be efficiently implemented
>>>> +       return __sync_fetch_and_or(&ptr->v, 0);
>>>>  #endif
>>>> -/**
>>>> - * Subtract atomic uint64
>>>> - *
>>>> - * @param ptr    An atomic variable
>>>> - * @param value  A value to be subtracted from the variable
>>>> - *
>>>> - */
>>>> -static inline void odp_atomic_sub_u64(odp_atomic_u64_t *ptr, uint64_t
>>>> value)
>>>> -{
>>>> -       __sync_fetch_and_sub(ptr, value);
>>>>  }
>>>>
>>>>  /**
>>>> - * Fetch and subtract atomic uint64
>>>> - *
>>>> - * @param ptr    An atomic variable
>>>> - * @param value  A value to be subtracted from the variable
>>>> - *
>>>> - * @return Value of the variable before the operation
>>>> - */
>>>> -#if defined __powerpc__ && !defined __powerpc64__
>>>> -static inline uint64_t odp_atomic_fetch_sub_u64(odp_atomic_u64_t *ptr,
>>>> -                                               uint64_t value)
>>>> -{
>>>> -       return __sync_fetch_and_sub((odp_atomic_u32_t *)ptr,
>>>> -                                   (uint32_t)value);
>>>> -}
>>>> + * Relaxed atomic store of 64-bit atomic variable
>>>> + * @note Relaxed memory model, no barriers.
>>>> + *
>>>> + * @param ptr  Pointer to a 64-bit atomic variable
>>>> + * @param val  Value to write to the atomic variable
>>>> + */
>>>> +static inline void odp_atomic64_store_rlx(odp_atomic64_t *ptr,
>>>> +               uint64_t val)
>>>> +{
>>>> +#if defined __arm__ /* A32/T32 ISA */
>>>> +       uint64_t old_val;
>>>> +       int status;
>>>> +       do {
>>>> +               /* Read atomic variable exclusively so we can write to
>>>> it
>>>> +                * later */
>>>> +               __asm __volatile("ldrexd %0, %H0, [%1]"
>>>> +                : "=&r"(old_val)
>>>> +                                        : "r"(&ptr->v)
>>>> +                                        : "memory");
>>>> +               (void)old_val; /* Ignore old value */
>>>> +               /* Attempt to write the new value */
>>>> +               __asm __volatile("strexd %0, %1, %H1, [%2]"
>>>> +                : "=&r"(status)
>>>> +                                        : "r"(val), "r"(&ptr->v)
>>>> +                                        : "memory");
>>>> +       } while (odp_unlikely(status != 0)); /* Retry until write
>>>> succeeds */
>>>> +#elif defined __x86_64__ || defined __aarch64__ || defined __mips64__
>>>> +       /* Write of aligned quad/double word is atomic */
>>>> +       ptr->v = val;
>>>>  #else
>>>> -static inline uint64_t odp_atomic_fetch_sub_u64(odp_atomic_u64_t *ptr,
>>>> -                                               uint64_t value)
>>>> -{
>>>> -       return __sync_fetch_and_sub(ptr, value);
>>>> -}
>>>> +#warning odp_atomic64_store_rlx() may not be efficiently implemented
>>>> +       /* This is actually an atomic exchange operation */
>>>> +       (void)__sync_lock_test_and_set(&ptr->v, val);
>>>>  #endif
>>>> -/**
>>>> - * Fetch and increment atomic uint64 by 1
>>>> - *
>>>> - * @param ptr    An atomic variable
>>>> - *
>>>> - * @return Value of the variable before the operation
>>>> - */
>>>> -static inline uint64_t odp_atomic_fetch_inc_u64(odp_atomic_u64_t *ptr)
>>>> -{
>>>> -       return odp_atomic_fetch_add_u64(ptr, 1);
>>>> -}
>>>> -
>>>> -/**
>>>> - * Increment atomic uint64 by 1
>>>> - *
>>>> - * @param ptr    An atomic variable
>>>> - *
>>>> - */
>>>> -static inline void odp_atomic_inc_u64(odp_atomic_u64_t *ptr)
>>>> -{
>>>> -       odp_atomic_fetch_add_u64(ptr, 1);
>>>> -}
>>>> -
>>>> -/**
>>>> - * Fetch and decrement atomic uint64 by 1
>>>> - *
>>>> - * @param ptr    An atomic variable
>>>> - *
>>>> - * @return Value of the variable before the operation
>>>> - */
>>>> -static inline uint64_t odp_atomic_fetch_dec_u64(odp_atomic_u64_t *ptr)
>>>> -{
>>>> -       return odp_atomic_fetch_sub_u64(ptr, 1);
>>>>  }
>>>>
>>>>  /**
>>>> - * Decrement atomic uint64 by 1
>>>> - *
>>>> - * @param ptr    An atomic variable
>>>> - *
>>>> - */
>>>> -static inline void odp_atomic_dec_u64(odp_atomic_u64_t *ptr)
>>>> -{
>>>> -       odp_atomic_fetch_sub_u64(ptr, 1);
>>>> + * Atomic fetch and add to 64-bit atomic variable
>>>> + * @note Relaxed memory model, no barriers.
>>>> + *
>>>> + * @param ptr   Pointer to a 64-bit atomic variable
>>>> + * @param incr  The value to be added to the atomic variable
>>>> + *
>>>> + * @return Value of the atomic variable before the addition
>>>> + */
>>>> +static inline uint64_t odp_atomic64_fetch_add_rlx(odp_atomic64_t *ptr,
>>>> +               uint64_t incr)
>>>> +{
>>>> +#if defined __arm__ /* A32/T32 ISA */
>>>> +       uint64_t old_val, new_val;
>>>> +       int status;
>>>> +       do {
>>>> +               __asm __volatile("ldrexd %0, %H0, [%1]"
>>>> +                : "=&r"(old_val)
>>>> +                                        : "r"(&ptr->v)
>>>> +                                        : "memory");
>>>> +               new_val = old_val + incr;
>>>> +               __asm __volatile("strexd %0, %1, %H1, [%2]"
>>>> +                : "=&r"(status)
>>>> +                                        : "r"(new_val), "r"(&ptr->v)
>>>> +                                        : "memory");
>>>> +       } while (odp_unlikely(status != 0)); /* Retry until write succeeds */
>>>> +       return old_val;
>>>> +#elif defined __aarch64__
>>>> +       uint64_t old_val, new_val;
>>>> +       int status;
>>>> +       do {
>>>> +               __asm __volatile("ldxr %x0, [%1]"
>>>> +                : "=&r"(old_val)
>>>> +                                        : "r"(&ptr->v)
>>>> +                                        : "memory");
>>>> +               new_val = old_val + incr;
>>>> +               __asm __volatile("stxr %w0, %x1, [%2]"
>>>> +                : "=&r"(status)
>>>> +                                        : "r"(new_val), "r"(&ptr->v)
>>>> +                                        : "memory");
>>>> +       } while (odp_unlikely(status != 0)); /* Retry until write succeeds */
>>>> +       return old_val;
>>>> +#elif defined __mips64__
>>>> +       uint64_t old_val, new_val;
>>>> +       do {
>>>> +               __asm __volatile("ll %0, [%1]"
>>>> +                : "=&r"(old_val)
>>>> +                                        : "r"(&ptr->v)
>>>> +                                        : "memory");
>>>> +               new_val = old_val + incr;
>>>> +               __asm __volatile("sc %0, [%1]"
>>>> +                : "+&r"(new_val)
>>>> +                                        : "r"(&ptr->v)
>>>> +                                        : "memory");
>>>> +       } while (odp_unlikely(new_val == 0));
>>>> +       return old_val;
>>>> +#elif defined __x86_64__
>>>> +       /* Generates good code on x86_64 */
>>>> +       return __sync_fetch_and_add(&ptr->v, incr);
>>>> +#else
>>>> +#warning odp_atomic64_fetch_add_rlx() may not be efficiently implemented
>>>> +       return __sync_fetch_and_add(&ptr->v, incr);
>>>> +#endif
>>>>  }
>>>>
>>>>  /**
>>>> - * Atomic compare and set for 64bit
>>>> + * Atomic add to 64-bit atomic variable
>>>> + * @note Relaxed memory model, no barriers.
>>>>   *
>>>> - * @param dst destination location into which the value will be written.
>>>> - * @param exp expected value.
>>>> - * @param src new value.
>>>> - * @return Non-zero on success; 0 on failure.
>>>> + * @param ptr   Pointer to a 64-bit atomic variable
>>>> + * @param incr  The value to be added to the atomic variable
>>>>   */
>>>> -static inline int
>>>> -odp_atomic_cmpset_u64(odp_atomic_u64_t *dst, uint64_t exp, uint64_t src)
>>>> +static inline void odp_atomic64_add_rlx(odp_atomic64_t *ptr, uint64_t incr)
>>>>  {
>>>> -       return __sync_bool_compare_and_swap(dst, exp, src);
>>>> +       (void)odp_atomic64_fetch_add_rlx(ptr, incr);
>>>>  }
>>>>
>>>>  #ifdef __cplusplus
>>>> diff --git a/platform/linux-generic/include/api/odp_barrier.h b/platform/linux-generic/include/api/odp_barrier.h
>>>> index a7b3215..f8eae9a 100644
>>>> --- a/platform/linux-generic/include/api/odp_barrier.h
>>>> +++ b/platform/linux-generic/include/api/odp_barrier.h
>>>> @@ -27,18 +27,18 @@ extern "C" {
>>>>   * ODP execution barrier
>>>>   */
>>>>  typedef struct odp_barrier_t {
>>>> -       int              count;  /**< @private Thread count */
>>>> -       odp_atomic_int_t bar;    /**< @private Barrier counter */
>>>> +       uint32_t       num_threads;  /**< @private Thread count (constant) */
>>>> +       odp_atomic32_t in_barrier;   /**< @private Threads in barrier */
>>>>  } odp_barrier_t;
>>>>
>>>>
>>>>  /**
>>>>   * Init barrier with thread count
>>>>   *
>>>> - * @param barrier    Barrier
>>>> - * @param count      Thread count
>>>> + * @param barrier     Barrier
>>>> + * @param num_threads Number of threads which share the barrier
>>>>   */
>>>> -void odp_barrier_init_count(odp_barrier_t *barrier, int count);
>>>> +void odp_barrier_init(odp_barrier_t *barrier, uint32_t num_threads);
>>>>
>>>>
>>>>  /**
>>>> diff --git a/platform/linux-generic/include/api/odp_rwlock.h b/platform/linux-generic/include/api/odp_rwlock.h
>>>> index 252ebb2..ff8a9a2 100644
>>>> --- a/platform/linux-generic/include/api/odp_rwlock.h
>>>> +++ b/platform/linux-generic/include/api/odp_rwlock.h
>>>> @@ -10,26 +10,30 @@
>>>>  /**
>>>>   * @file
>>>>   *
>>>> - * ODP RW Locks
>>>> + * ODP read/write lock
>>>> + * RW lock supports multiple concurrent readers but only one (exclusive) writer.
>>>>   */
>>>>
>>>> +#include <odp_atomic.h>
>>>> +
>>>>  #ifdef __cplusplus
>>>>  extern "C" {
>>>>  #endif
>>>>
>>>>  /**
>>>>   * The odp_rwlock_t type.
>>>> - * write lock count is -1,
>>>> - * read lock count > 0
>>>> + * write lock is ~0U
>>>> + * read lock count >0 && <~0U
>>>>   */
>>>>  typedef struct {
>>>> -       volatile int32_t cnt; /**< -1 Write lock,
>>>> -                               > 0 for Read lock. */
>>>> +       odp_atomic32_t cnt; /**< == 0: unlocked,
>>>> +                                == ~0: locked for write,
>>>> +                                > 0 number of concurrent read locks */
>>>>  } odp_rwlock_t;
>>>>
>>>>
>>>>  /**
>>>> - * Initialize the rwlock to an unlocked state.
>>>> + * Initialize the rwlock to the unlocked state.
>>>>   *
>>>>   * @param rwlock pointer to the RW Lock.
>>>>   */
>>>> @@ -50,14 +54,14 @@ void odp_rwlock_read_lock(odp_rwlock_t *rwlock);
>>>>  void odp_rwlock_read_unlock(odp_rwlock_t *rwlock);
>>>>
>>>>  /**
>>>> - * Aquire a write lock.
>>>> + * Acquire the write lock.
>>>>   *
>>>>   * @param rwlock pointer to a RW Lock.
>>>>   */
>>>>  void odp_rwlock_write_lock(odp_rwlock_t *rwlock);
>>>>
>>>>  /**
>>>> - * Release a write lock.
>>>> + * Release the write lock.
>>>>   *
>>>>   * @param rwlock pointer to a RW Lock.
>>>>   */
>>>> diff --git a/platform/linux-generic/include/api/odp_ticketlock.h b/platform/linux-generic/include/api/odp_ticketlock.h
>>>> index 6277a18..c4b5e34 100644
>>>> --- a/platform/linux-generic/include/api/odp_ticketlock.h
>>>> +++ b/platform/linux-generic/include/api/odp_ticketlock.h
>>>> @@ -27,8 +27,8 @@ extern "C" {
>>>>   * ODP ticketlock
>>>>   */
>>>>  typedef struct odp_ticketlock_t {
>>>> -       odp_atomic_u32_t  next_ticket; /**< @private Next ticket */
>>>> -       volatile uint32_t cur_ticket;  /**< @private Current ticket */
>>>> +       odp_atomic32_t next_ticket; /**< @private Next ticket */
>>>> +       odp_atomic32_t cur_ticket;  /**< @private Current ticket */
>>>>  } odp_ticketlock_t;
>>>>
>>>>
>>>> diff --git a/platform/linux-generic/include/odp_buffer_internal.h b/platform/linux-generic/include/odp_buffer_internal.h
>>>> index 2002b51..530ab96 100644
>>>> --- a/platform/linux-generic/include/odp_buffer_internal.h
>>>> +++ b/platform/linux-generic/include/odp_buffer_internal.h
>>>> @@ -88,7 +88,7 @@ typedef struct odp_buffer_hdr_t {
>>>>         uint32_t                 index;      /* buf index in the pool */
>>>>         size_t                   size;       /* max data size */
>>>>         size_t                   cur_offset; /* current offset */
>>>> -       odp_atomic_int_t         ref_count;  /* reference count */
>>>> +       odp_atomic32_t           ref_count;  /* reference count */
>>>>         odp_buffer_scatter_t     scatter;    /* Scatter/gather list */
>>>>         int                      type;       /* type of next header */
>>>>         odp_buffer_pool_t        pool_hdl;   /* buffer pool handle */
>>>> diff --git a/platform/linux-generic/odp_barrier.c b/platform/linux-generic/odp_barrier.c
>>>> index a82b294..6c3b884 100644
>>>> --- a/platform/linux-generic/odp_barrier.c
>>>> +++ b/platform/linux-generic/odp_barrier.c
>>>> @@ -8,41 +8,48 @@
>>>>  #include <odp_sync.h>
>>>>  #include <odp_spin_internal.h>
>>>>
>>>> -void odp_barrier_init_count(odp_barrier_t *barrier, int count)
>>>> +void odp_barrier_init(odp_barrier_t *barrier, uint32_t num_threads)
>>>>  {
>>>> -       barrier->count = count;
>>>> -       barrier->bar = 0;
>>>> -       odp_sync_stores();
>>>> +       barrier->num_threads = num_threads; /* Constant after initialisation */
>>>> +       odp_atomic32_store_rls(&barrier->in_barrier, 0);
>>>>  }
>>>>
>>>>  /*
>>>>   * Efficient barrier_sync -
>>>>   *
>>>>   *   Barriers are initialized with a count of the number of callers
>>>> - *   that must sync on the barrier before any may proceed.
>>>> + *   that must sync on (enter) the barrier before any may proceed (exit).
>>>>   *
>>>>   *   To avoid race conditions and to permit the barrier to be fully
>>>>   *   reusable, the barrier value cycles between 0..2*count-1. When
>>>> - *   synchronizing the wasless variable simply tracks which half of
>>>> + *   synchronizing the waslow variable simply tracks which half of
>>>>   *   the cycle the barrier was in upon entry.  Exit is when the
>>>>   *   barrier crosses to the other half of the cycle.
>>>>   */
>>>>
>>>>  void odp_barrier_sync(odp_barrier_t *barrier)
>>>>  {
>>>> -       int count;
>>>> -       int wasless;
>>>> +       uint32_t count;
>>>> +       bool waslow;
>>>>
>>>> -       odp_sync_stores();
>>>> -       wasless = barrier->bar < barrier->count;
>>>> -       count = odp_atomic_fetch_inc_int(&barrier->bar);
>>>> +       /* FIXME do we need acquire barrier as well? */
>>>> +       /* Increase threads in_barrier count, this will automatically release
>>>> +        * the other threads when lower/upper range is switched */
>>>> +       count = odp_atomic32_fetch_add_rls(&barrier->in_barrier, 1);
>>>> +       /* Compute lower or higher range indicator */
>>>> +       waslow = count < barrier->num_threads;
>>>>
>>>> -       if (count == 2*barrier->count-1) {
>>>> -               barrier->bar = 0;
>>>> -       } else {
>>>> -               while ((barrier->bar < barrier->count) == wasless)
>>>> -                       odp_spin();
>>>> +       /* Check if in_barrier count has "wrapped" */
>>>> +       if (count == 2 * barrier->num_threads - 1) {
>>>> +               /* Manually wrap the counter */
>>>> +               odp_atomic32_add_rls(&barrier->in_barrier,
>>>> +                       (uint32_t)(-2*(int)barrier->num_threads));
>>>> +               /* We don't need to wait below, return immediately */
>>>> +               return;
>>>> +       }
>>>> +       /* Wait for counter to change half */
>>>> +       while ((odp_atomic32_load_rlx(&barrier->in_barrier) <
>>>> +              barrier->num_threads) == waslow) {
>>>> +               odp_spin();
>>>>         }
>>>> -
>>>> -       odp_mem_barrier();
>>>>  }
>>>> diff --git a/platform/linux-generic/odp_buffer.c b/platform/linux-generic/odp_buffer.c
>>>> index e54e0e7..a5939f3 100644
>>>> --- a/platform/linux-generic/odp_buffer.c
>>>> +++ b/platform/linux-generic/odp_buffer.c
>>>> @@ -73,7 +73,8 @@ int odp_buffer_snprint(char *str, size_t n, odp_buffer_t buf)
>>>>         len += snprintf(&str[len], n-len,
>>>>                         "  cur_offset   %zu\n",       hdr->cur_offset);
>>>>         len += snprintf(&str[len], n-len,
>>>> -                       "  ref_count    %i\n",        hdr->ref_count);
>>>> +                       "  ref_count    %u\n",
>>>> +                       odp_atomic32_load_rlx(&hdr->ref_count));
>>>>         len += snprintf(&str[len], n-len,
>>>>                         "  type         %i\n",        hdr->type);
>>>>         len += snprintf(&str[len], n-len,
>>>> diff --git a/platform/linux-generic/odp_crypto.c b/platform/linux-generic/odp_crypto.c
>>>> index b37ad6b..d9fff10 100644
>>>> --- a/platform/linux-generic/odp_crypto.c
>>>> +++ b/platform/linux-generic/odp_crypto.c
>>>> @@ -26,7 +26,7 @@
>>>>  #define MAX_SESSIONS 32
>>>>
>>>>  typedef struct {
>>>> -       odp_atomic_u32_t next;
>>>> +       odp_atomic32_t   next;
>>>>         uint32_t         max;
>>>>         odp_crypto_generic_session_t sessions[0];
>>>>  } odp_crypto_global_t;
>>>> @@ -58,7 +58,7 @@ odp_crypto_generic_session_t *alloc_session(void)
>>>>         uint32_t idx;
>>>>         odp_crypto_generic_session_t *session = NULL;
>>>>
>>>> -       idx = odp_atomic_fetch_inc_u32(&global->next);
>>>> +       idx = odp_atomic32_fetch_add_rlx(&global->next, 1);
>>>>         if (idx < global->max) {
>>>>                 session = &global->sessions[idx];
>>>>                 session->index = idx;
>>>> diff --git a/platform/linux-generic/odp_queue.c b/platform/linux-generic/odp_queue.c
>>>> index 1318bcd..08c0d29 100644
>>>> --- a/platform/linux-generic/odp_queue.c
>>>> +++ b/platform/linux-generic/odp_queue.c
>>>> @@ -214,8 +214,13 @@ int odp_queue_set_context(odp_queue_t handle, void *context)
>>>>  {
>>>>         queue_entry_t *queue;
>>>>         queue = queue_to_qentry(handle);
>>>> +       /* Setting a new queue context can be viewed as a release operation,
>>>> +        * all writes to the context must be observable before the context
>>>> +        * is made observable */
>>>>         odp_sync_stores();
>>>> -       queue->s.param.context = context;
>>>> +       queue->s.param.context = context; /* Store-release */
>>>> +       /* Ensure queue modification is globally visible before we return
>>>> +        * and the application might cause the queue to be scheduled */
>>>>         odp_sync_stores();
>>>>         return 0;
>>>>  }
>>>> diff --git a/platform/linux-generic/odp_ring.c b/platform/linux-generic/odp_ring.c
>>>> index 632aa66..d1ec825 100644
>>>> --- a/platform/linux-generic/odp_ring.c
>>>> +++ b/platform/linux-generic/odp_ring.c
>>>> @@ -187,10 +187,10 @@ odph_ring_create(const char *name, unsigned count, unsigned flags)
>>>>                 r->cons.size = count;
>>>>                 r->prod.mask = count-1;
>>>>                 r->cons.mask = count-1;
>>>> -               r->prod.head = 0;
>>>> -               r->cons.head = 0;
>>>> -               r->prod.tail = 0;
>>>> -               r->cons.tail = 0;
>>>> +               odp_atomic32_store_rlx(&r->prod.head, 0);
>>>> +               odp_atomic32_store_rlx(&r->cons.head, 0);
>>>> +               odp_atomic32_store_rlx(&r->prod.tail, 0);
>>>> +               odp_atomic32_store_rlx(&r->cons.tail, 0);
>>>>
>>>>                 TAILQ_INSERT_TAIL(&odp_ring_list, r, next);
>>>>         } else {
>>>> @@ -227,7 +227,7 @@ int __odph_ring_mp_do_enqueue(odph_ring_t *r, void * const *obj_table,
>>>>         uint32_t prod_head, prod_next;
>>>>         uint32_t cons_tail, free_entries;
>>>>         const unsigned max = n;
>>>> -       int success;
>>>> +       bool ok;
>>>>         unsigned i;
>>>>         uint32_t mask = r->prod.mask;
>>>>         int ret;
>>>> @@ -237,8 +237,8 @@ int __odph_ring_mp_do_enqueue(odph_ring_t *r, void * const *obj_table,
>>>>                 /* Reset n to the initial burst count */
>>>>                 n = max;
>>>>
>>>> -               prod_head = r->prod.head;
>>>> -               cons_tail = r->cons.tail;
>>>> +               prod_head = odp_atomic32_load_rlx(&r->prod.head);
>>>> +               cons_tail = odp_atomic32_load_acq(&r->cons.tail);
>>>>                 /* The subtraction is done between two unsigned 32bits value
>>>>                  * (the result is always modulo 32 bits even if we have
>>>>                  * prod_head > cons_tail). So 'free_entries' is always between 0
>>>> @@ -259,13 +259,13 @@ int __odph_ring_mp_do_enqueue(odph_ring_t *r,
>>>> void * const *obj_table,
>>>>                 }
>>>>
>>>>                 prod_next = prod_head + n;
>>>> -               success = odp_atomic_cmpset_u32(&r->prod.head, prod_head,
>>>> -                                             prod_next);
>>>> -       } while (odp_unlikely(success == 0));
>>>> +               ok = odp_atomic32_cmp_and_swap_rlx(&r->prod.head,
>>>> +                                                  prod_head,
>>>> +                                                  prod_next) == prod_head;
>>>> +       } while (odp_unlikely(!ok));
>>>>
>>>>         /* write entries in ring */
>>>>         ENQUEUE_PTRS();
>>>> -       odp_mem_barrier();
>>>>
>>>>         /* if we exceed the watermark */
>>>>         if (odp_unlikely(((mask + 1) - free_entries + n) > r->prod.watermark)) {
>>>> @@ -279,10 +279,10 @@ int __odph_ring_mp_do_enqueue(odph_ring_t *r, void * const *obj_table,
>>>>          * If there are other enqueues in progress that preceeded us,
>>>>          * we need to wait for them to complete
>>>>          */
>>>> -       while (odp_unlikely(r->prod.tail != prod_head))
>>>> +       while (odp_unlikely(odp_atomic32_load_rlx(&r->prod.tail) != prod_head))
>>>>                 odp_spin();
>>>>
>>>> -       r->prod.tail = prod_next;
>>>> +       odp_atomic32_store_rls(&r->prod.tail, prod_next);
>>>>         return ret;
>>>>  }
>>>>
>>>> @@ -298,8 +298,8 @@ int __odph_ring_sp_do_enqueue(odph_ring_t *r, void * const *obj_table,
>>>>         uint32_t mask = r->prod.mask;
>>>>         int ret;
>>>>
>>>> -       prod_head = r->prod.head;
>>>> -       cons_tail = r->cons.tail;
>>>> +       prod_head = odp_atomic32_load_rlx(&r->prod.head);
>>>> +       cons_tail = odp_atomic32_load_acq(&r->cons.tail);
>>>>         /* The subtraction is done between two unsigned 32bits value
>>>>          * (the result is always modulo 32 bits even if we have
>>>>          * prod_head > cons_tail). So 'free_entries' is always between 0
>>>> @@ -320,11 +320,10 @@ int __odph_ring_sp_do_enqueue(odph_ring_t *r, void * const *obj_table,
>>>>         }
>>>>
>>>>         prod_next = prod_head + n;
>>>> -       r->prod.head = prod_next;
>>>> +       odp_atomic32_store_rlx(&r->prod.head, prod_next);
>>>>
>>>>         /* write entries in ring */
>>>>         ENQUEUE_PTRS();
>>>> -       odp_mem_barrier();
>>>>
>>>>         /* if we exceed the watermark */
>>>>         if (odp_unlikely(((mask + 1) - free_entries + n) > r->prod.watermark)) {
>>>> @@ -334,7 +333,7 @@ int __odph_ring_sp_do_enqueue(odph_ring_t *r, void * const *obj_table,
>>>>                 ret = (behavior == ODPH_RING_QUEUE_FIXED) ? 0 : n;
>>>>         }
>>>>
>>>> -       r->prod.tail = prod_next;
>>>> +       odp_atomic32_store_rls(&r->prod.tail, prod_next);
>>>>         return ret;
>>>>  }
>>>>
>>>> @@ -348,7 +347,7 @@ int __odph_ring_mc_do_dequeue(odph_ring_t *r, void **obj_table,
>>>>         uint32_t cons_head, prod_tail;
>>>>         uint32_t cons_next, entries;
>>>>         const unsigned max = n;
>>>> -       int success;
>>>> +       bool ok;
>>>>         unsigned i;
>>>>         uint32_t mask = r->prod.mask;
>>>>
>>>> @@ -357,8 +356,8 @@ int __odph_ring_mc_do_dequeue(odph_ring_t *r, void **obj_table,
>>>>                 /* Restore n as it may change every loop */
>>>>                 n = max;
>>>>
>>>> -               cons_head = r->cons.head;
>>>> -               prod_tail = r->prod.tail;
>>>> +               cons_head = odp_atomic32_load_rlx(&r->cons.head);
>>>> +               prod_tail = odp_atomic32_load_acq(&r->prod.tail);
>>>>                 /* The subtraction is done between two unsigned 32bits value
>>>>                  * (the result is always modulo 32 bits even if we have
>>>>                  * cons_head > prod_tail). So 'entries' is always between 0
>>>> @@ -378,22 +377,22 @@ int __odph_ring_mc_do_dequeue(odph_ring_t *r, void **obj_table,
>>>>                 }
>>>>
>>>>                 cons_next = cons_head + n;
>>>> -               success = odp_atomic_cmpset_u32(&r->cons.head, cons_head,
>>>> -                                             cons_next);
>>>> -       } while (odp_unlikely(success == 0));
>>>> +               ok = odp_atomic32_cmp_and_swap_rlx(&r->cons.head,
>>>> +                                                  cons_head,
>>>> +                                                  cons_next) == cons_head;
>>>> +       } while (odp_unlikely(!ok));
>>>>
>>>>         /* copy in table */
>>>>         DEQUEUE_PTRS();
>>>> -       odp_mem_barrier();
>>>>
>>>>         /*
>>>>          * If there are other dequeues in progress that preceded us,
>>>>          * we need to wait for them to complete
>>>>          */
>>>> -       while (odp_unlikely(r->cons.tail != cons_head))
>>>> +       while (odp_unlikely(odp_atomic32_load_rlx(&r->cons.tail) != cons_head))
>>>>                 odp_spin();
>>>>
>>>> -       r->cons.tail = cons_next;
>>>> +       odp_atomic32_store_rls(&r->cons.tail, cons_next);
>>>>
>>>>         return behavior == ODPH_RING_QUEUE_FIXED ? 0 : n;
>>>>  }
>>>> @@ -409,8 +408,8 @@ int __odph_ring_sc_do_dequeue(odph_ring_t *r, void **obj_table,
>>>>         unsigned i;
>>>>         uint32_t mask = r->prod.mask;
>>>>
>>>> -       cons_head = r->cons.head;
>>>> -       prod_tail = r->prod.tail;
>>>> +       cons_head = odp_atomic32_load_rlx(&r->cons.head);
>>>> +       prod_tail = odp_atomic32_load_acq(&r->prod.tail);
>>>>         /* The subtraction is done between two unsigned 32bits value
>>>>          * (the result is always modulo 32 bits even if we have
>>>>          * cons_head > prod_tail). So 'entries' is always between 0
>>>> @@ -429,13 +428,12 @@ int __odph_ring_sc_do_dequeue(odph_ring_t *r, void **obj_table,
>>>>         }
>>>>
>>>>         cons_next = cons_head + n;
>>>> -       r->cons.head = cons_next;
>>>> +       odp_atomic32_store_rlx(&r->cons.head, cons_next);
>>>>
>>>>         /* copy in table */
>>>>         DEQUEUE_PTRS();
>>>> -       odp_mem_barrier();
>>>>
>>>> -       r->cons.tail = cons_next;
>>>> +       odp_atomic32_store_rls(&r->cons.tail, cons_next);
>>>>         return behavior == ODPH_RING_QUEUE_FIXED ? 0 : n;
>>>>  }
>>>>
>>>> @@ -482,8 +480,8 @@ int odph_ring_sc_dequeue_bulk(odph_ring_t *r, void **obj_table, unsigned n)
>>>>   */
>>>>  int odph_ring_full(const odph_ring_t *r)
>>>>  {
>>>> -       uint32_t prod_tail = r->prod.tail;
>>>> -       uint32_t cons_tail = r->cons.tail;
>>>> +       uint32_t prod_tail = odp_atomic32_load_rlx(&r->prod.tail);
>>>> +       uint32_t cons_tail = odp_atomic32_load_rlx(&r->cons.tail);
>>>>         return (((cons_tail - prod_tail - 1) & r->prod.mask) == 0);
>>>>  }
>>>>
>>>> @@ -492,8 +490,8 @@ int odph_ring_full(const odph_ring_t *r)
>>>>   */
>>>>  int odph_ring_empty(const odph_ring_t *r)
>>>>  {
>>>> -       uint32_t prod_tail = r->prod.tail;
>>>> -       uint32_t cons_tail = r->cons.tail;
>>>> +       uint32_t prod_tail = odp_atomic32_load_rlx(&r->prod.tail);
>>>> +       uint32_t cons_tail = odp_atomic32_load_rlx(&r->cons.tail);
>>>>         return !!(cons_tail == prod_tail);
>>>>  }
>>>>
>>>> @@ -502,8 +500,8 @@ int odph_ring_empty(const odph_ring_t *r)
>>>>   */
>>>>  unsigned odph_ring_count(const odph_ring_t *r)
>>>>  {
>>>> -       uint32_t prod_tail = r->prod.tail;
>>>> -       uint32_t cons_tail = r->cons.tail;
>>>> +       uint32_t prod_tail = odp_atomic32_load_rlx(&r->prod.tail);
>>>> +       uint32_t cons_tail = odp_atomic32_load_rlx(&r->cons.tail);
>>>>         return (prod_tail - cons_tail) & r->prod.mask;
>>>>  }
>>>>
>>>> @@ -512,8 +510,8 @@ unsigned odph_ring_count(const odph_ring_t *r)
>>>>   */
>>>>  unsigned odph_ring_free_count(const odph_ring_t *r)
>>>>  {
>>>> -       uint32_t prod_tail = r->prod.tail;
>>>> -       uint32_t cons_tail = r->cons.tail;
>>>> +       uint32_t prod_tail = odp_atomic32_load_rlx(&r->prod.tail);
>>>> +       uint32_t cons_tail = odp_atomic32_load_rlx(&r->cons.tail);
>>>>         return (cons_tail - prod_tail - 1) & r->prod.mask;
>>>>  }
>>>>
>>>> @@ -523,10 +521,10 @@ void odph_ring_dump(const odph_ring_t *r)
>>>>         ODP_DBG("ring <%s>@%p\n", r->name, r);
>>>>         ODP_DBG("  flags=%x\n", r->flags);
>>>>         ODP_DBG("  size=%"PRIu32"\n", r->prod.size);
>>>> -       ODP_DBG("  ct=%"PRIu32"\n", r->cons.tail);
>>>> -       ODP_DBG("  ch=%"PRIu32"\n", r->cons.head);
>>>> -       ODP_DBG("  pt=%"PRIu32"\n", r->prod.tail);
>>>> -       ODP_DBG("  ph=%"PRIu32"\n", r->prod.head);
>>>> +       ODP_DBG("  ct=%"PRIu32"\n",
>>>> odp_atomic32_load_rlx(&r->cons.tail));
>>>> +       ODP_DBG("  ch=%"PRIu32"\n",
>>>> odp_atomic32_load_rlx(&r->cons.head));
>>>> +       ODP_DBG("  pt=%"PRIu32"\n",
>>>> odp_atomic32_load_rlx(&r->prod.tail));
>>>> +       ODP_DBG("  ph=%"PRIu32"\n",
>>>> odp_atomic32_load_rlx(&r->prod.head));
>>>>         ODP_DBG("  used=%u\n", odph_ring_count(r));
>>>>         ODP_DBG("  avail=%u\n", odph_ring_free_count(r));
>>>>         if (r->prod.watermark == r->prod.size)
>>>> diff --git a/platform/linux-generic/odp_rwlock.c b/platform/linux-generic/odp_rwlock.c
>>>> index 11c8dd7..ba0a7ca 100644
>>>> --- a/platform/linux-generic/odp_rwlock.c
>>>> +++ b/platform/linux-generic/odp_rwlock.c
>>>> @@ -4,58 +4,56 @@
>>>>   * SPDX-License-Identifier:     BSD-3-Clause
>>>>   */
>>>>
>>>> +#include <stdbool.h>
>>>>  #include <odp_atomic.h>
>>>>  #include <odp_rwlock.h>
>>>> -
>>>>  #include <odp_spin_internal.h>
>>>>
>>>>  void odp_rwlock_init(odp_rwlock_t *rwlock)
>>>>  {
>>>> -       rwlock->cnt = 0;
>>>> +       odp_atomic32_store_rlx(&rwlock->cnt, 0);
>>>>  }
>>>>
>>>>  void odp_rwlock_read_lock(odp_rwlock_t *rwlock)
>>>>  {
>>>> -       int32_t cnt;
>>>> -       int  is_locked = 0;
>>>> -
>>>> -       while (is_locked == 0) {
>>>> -               cnt = rwlock->cnt;
>>>> +       bool gotit;
>>>> +       do {
>>>> +               uint32_t cnt = odp_atomic32_load_acq(&rwlock->cnt);
>>>>                 /* waiting for read lock */
>>>> -               if (cnt < 0) {
>>>> +               if ((int32_t)cnt < 0) {
>>>>                         odp_spin();
>>>>                         continue;
>>>>                 }
>>>> -               is_locked = odp_atomic_cmpset_u32(
>>>> -                                       (volatile uint32_t *)&rwlock->cnt,
>>>> -                                             cnt, cnt + 1);
>>>> -       }
>>>> +               /* Attempt to take another read lock */
>>>> +               gotit = odp_atomic32_cmp_and_swap_rlx(&rwlock->cnt,
>>>> +                                                     cnt, cnt + 1) == cnt;
>>>> +       } while (!gotit);
>>>>  }
>>>>
>>>>  void odp_rwlock_read_unlock(odp_rwlock_t *rwlock)
>>>>  {
>>>> -       odp_atomic_dec_u32((odp_atomic_u32_t *)(intptr_t)&rwlock->cnt);
>>>> +       /* Release one read lock by subtracting 1 */
>>>> +       odp_atomic32_add_rls(&rwlock->cnt, (uint32_t)-1);
>>>>  }
>>>>
>>>>  void odp_rwlock_write_lock(odp_rwlock_t *rwlock)
>>>>  {
>>>> -       int32_t cnt;
>>>> -       int is_locked = 0;
>>>> -
>>>> -       while (is_locked == 0) {
>>>> -               cnt = rwlock->cnt;
>>>> -               /* lock aquired, wait */
>>>> +       bool gotit;
>>>> +       do {
>>>> +               uint32_t cnt = odp_atomic32_load_acq(&rwlock->cnt);
>>>>                 if (cnt != 0) {
>>>> +                       /* Lock is busy */
>>>>                         odp_spin();
>>>>                         continue;
>>>>                 }
>>>> -               is_locked = odp_atomic_cmpset_u32(
>>>> -                                       (volatile uint32_t *)&rwlock->cnt,
>>>> -                                             0, -1);
>>>> -       }
>>>> +               /* Attempt to take write lock */
>>>> +               gotit = odp_atomic32_cmp_and_swap_rlx(&rwlock->cnt, 0,
>>>> +                                                     (uint32_t)-1) == 0;
>>>> +       } while (!gotit);
>>>>  }
>>>>
>>>>  void odp_rwlock_write_unlock(odp_rwlock_t *rwlock)
>>>>  {
>>>> -       odp_atomic_inc_u32((odp_atomic_u32_t *)(intptr_t)&rwlock->cnt);
>>>> +       /* Release the write lock by adding 1 */
>>>> +       odp_atomic32_add_rls(&rwlock->cnt, 1);
>>>>  }
>>>> diff --git a/platform/linux-generic/odp_thread.c b/platform/linux-generic/odp_thread.c
>>>> index b869b27..569b235 100644
>>>> --- a/platform/linux-generic/odp_thread.c
>>>> +++ b/platform/linux-generic/odp_thread.c
>>>> @@ -31,7 +31,7 @@ typedef struct {
>>>>
>>>>  typedef struct {
>>>>         thread_state_t   thr[ODP_CONFIG_MAX_THREADS];
>>>> -       odp_atomic_int_t num;
>>>> +       odp_atomic32_t   num;
>>>>
>>>>  } thread_globals_t;
>>>>
>>>> @@ -67,7 +67,7 @@ static int thread_id(void)
>>>>         int id;
>>>>         int cpu;
>>>>
>>>> -       id = odp_atomic_fetch_add_int(&thread_globals->num, 1);
>>>> +       id = (int)odp_atomic32_fetch_add_rlx(&thread_globals->num, 1);
>>>>
>>>>         if (id >= ODP_CONFIG_MAX_THREADS) {
>>>>                 ODP_ERR("Too many threads\n");
>>>> @@ -77,7 +77,7 @@ static int thread_id(void)
>>>>         cpu = sched_getcpu();
>>>>
>>>>         if (cpu < 0) {
>>>> -               ODP_ERR("getcpu failed\n");
>>>> +               ODP_ERR("sched_getcpu failed\n");
>>>>                 return -1;
>>>>         }
>>>>
>>>> diff --git a/platform/linux-generic/odp_ticketlock.c b/platform/linux-generic/odp_ticketlock.c
>>>> index be5b885..cadc0e0 100644
>>>> --- a/platform/linux-generic/odp_ticketlock.c
>>>> +++ b/platform/linux-generic/odp_ticketlock.c
>>>> @@ -12,9 +12,8 @@
>>>>
>>>>  void odp_ticketlock_init(odp_ticketlock_t *ticketlock)
>>>>  {
>>>> -       ticketlock->next_ticket = 0;
>>>> -       ticketlock->cur_ticket  = 0;
>>>> -       odp_sync_stores();
>>>> +       odp_atomic32_store_rlx(&ticketlock->next_ticket, 0);
>>>> +       odp_atomic32_store_rlx(&ticketlock->cur_ticket, 0);
>>>>  }
>>>>
>>>>
>>>> @@ -22,30 +21,14 @@ void odp_ticketlock_lock(odp_ticketlock_t *ticketlock)
>>>>  {
>>>>         uint32_t ticket;
>>>>
>>>> -       ticket = odp_atomic_fetch_inc_u32(&ticketlock->next_ticket);
>>>> +       ticket = odp_atomic32_fetch_add_rlx(&ticketlock->next_ticket, 1);
>>>>
>>>> -       while (ticket != ticketlock->cur_ticket)
>>>> +       while (ticket != odp_atomic32_load_acq(&ticketlock->cur_ticket))
>>>>                 odp_spin();
>>>> -
>>>> -       odp_mem_barrier();
>>>>  }
>>>>
>>>>
>>>>  void odp_ticketlock_unlock(odp_ticketlock_t *ticketlock)
>>>>  {
>>>> -       odp_sync_stores();
>>>> -
>>>> -       ticketlock->cur_ticket++;
>>>> -
>>>> -#if defined __OCTEON__
>>>> -       odp_sync_stores();
>>>> -#else
>>>> -       odp_mem_barrier();
>>>> -#endif
>>>> -}
>>>> -
>>>> -
>>>> -int odp_ticketlock_is_locked(odp_ticketlock_t *ticketlock)
>>>> -{
>>>> -       return ticketlock->cur_ticket != ticketlock->next_ticket;
>>>> +       odp_atomic32_add_rls(&ticketlock->cur_ticket, 1);
>>>>  }
>>>> diff --git a/platform/linux-generic/odp_timer.c b/platform/linux-generic/odp_timer.c
>>>> index 313c713..938429f 100644
>>>> --- a/platform/linux-generic/odp_timer.c
>>>> +++ b/platform/linux-generic/odp_timer.c
>>>> @@ -32,8 +32,8 @@ typedef struct {
>>>>
>>>>  typedef struct {
>>>>         int               allocated;
>>>> -       volatile int      active;
>>>> -       volatile uint64_t cur_tick;
>>>> +       odp_atomic32_t    active;
>>>> +       odp_atomic64_t    cur_tick;
>>>>         timer_t           timerid;
>>>>         odp_timer_t       timer_hdl;
>>>>         odp_buffer_pool_t pool;
>>>> @@ -150,16 +150,14 @@ static void notify_function(union sigval sigval)
>>>>
>>>>         timer = sigval.sival_ptr;
>>>>
>>>> -       if (timer->active == 0) {
>>>> +       if (odp_atomic32_load_rlx(&timer->active) == 0) {
>>>>                 ODP_DBG("Timer (%u) not active\n", timer->timer_hdl);
>>>>                 return;
>>>>         }
>>>>
>>>>         /* ODP_DBG("Tick\n"); */
>>>>
>>>> -       cur_tick = timer->cur_tick++;
>>>> -
>>>> -       odp_sync_stores();
>>>> +       cur_tick = odp_atomic64_fetch_add_rlx(&timer->cur_tick, 1);
>>>>
>>>>         tick = &timer->tick[cur_tick % MAX_TICKS];
>>>>
>>>> @@ -318,8 +316,7 @@ odp_timer_t odp_timer_create(const char *name, odp_buffer_pool_t pool,
>>>>                 timer->tick[i].list = NULL;
>>>>         }
>>>>
>>>> -       timer->active = 1;
>>>> -       odp_sync_stores();
>>>> +       odp_atomic32_store_rls(&timer->active, 1);
>>>>
>>>>         timer_start(timer);
>>>>
>>>> @@ -340,7 +337,7 @@ odp_timer_tmo_t odp_timer_absolute_tmo(odp_timer_t timer_hdl, uint64_t tmo_tick,
>>>>         id = (int)timer_hdl - 1;
>>>>         timer = &odp_timer.timer[id];
>>>>
>>>> -       cur_tick = timer->cur_tick;
>>>> +       cur_tick = odp_atomic64_load_rlx(&timer->cur_tick);
>>>>         if (tmo_tick <= cur_tick) {
>>>>                 ODP_DBG("timeout too close\n");
>>>>                 return ODP_TIMER_TMO_INVALID;
>>>> @@ -416,7 +413,7 @@ uint64_t odp_timer_current_tick(odp_timer_t timer_hdl)
>>>>         uint32_t id;
>>>>
>>>>         id = timer_hdl - 1;
>>>> -       return odp_timer.timer[id].cur_tick;
>>>> +       return odp_atomic64_load_rlx(&odp_timer.timer[id].cur_tick);
>>>>  }
>>>>
>>>>  odp_timeout_t odp_timeout_from_buffer(odp_buffer_t buf)
>>>> diff --git a/test/api_test/odp_atomic_test.c b/test/api_test/odp_atomic_test.c
>>>> index 9019d4f..4d27b32 100644
>>>> --- a/test/api_test/odp_atomic_test.c
>>>> +++ b/test/api_test/odp_atomic_test.c
>>>> @@ -10,17 +10,14 @@
>>>>  #include <odp_common.h>
>>>>  #include <odp_atomic_test.h>
>>>>
>>>> -static odp_atomic_int_t a32;
>>>> -static odp_atomic_u32_t a32u;
>>>> -static odp_atomic_u64_t a64u;
>>>> +static odp_atomic32_t a32u;
>>>> +static odp_atomic64_t a64u;
>>>>
>>>> -static odp_atomic_int_t numthrds;
>>>> +static odp_barrier_t barrier;
>>>>
>>>>  static const char * const test_name[] = {
>>>>         "dummy",
>>>>         "test atomic basic ops add/sub/inc/dec",
>>>> -       "test atomic inc/dec of signed word",
>>>> -       "test atomic add/sub of signed word",
>>>>         "test atomic inc/dec of unsigned word",
>>>>         "test atomic add/sub of unsigned word",
>>>>         "test atomic inc/dec of unsigned double word",
>>>> @@ -31,39 +28,29 @@ static struct timeval tv0[MAX_WORKERS], tv1[MAX_WORKERS];
>>>>
>>>>  static void usage(void)
>>>>  {
>>>> -       printf("\n./odp_atomic -t <testcase> -n <num of pthread>,\n\n"
>>>> +       printf("\n./odp_atomic -t <testcase> -n <num of threads>\n\n"
>>>>                "\t<testcase> is\n"
>>>>                "\t\t1 - Test mix(does inc,dec,add,sub on 32/64 bit)\n"
>>>> -              "\t\t2 - Test inc dec of signed word\n"
>>>> -              "\t\t3 - Test add sub of signed word\n"
>>>> -              "\t\t4 - Test inc dec of unsigned word\n"
>>>> -              "\t\t5 - Test add sub of unsigned word\n"
>>>> -              "\t\t6 - Test inc dec of double word\n"
>>>> -              "\t\t7 - Test add sub of double word\n"
>>>> -              "\t<num of pthread> is optional\n"
>>>> -              "\t\t<1 - 31> - no of pthreads to start\n"
>>>> +              "\t\t2 - Test inc dec of unsigned word\n"
>>>> +              "\t\t3 - Test add sub of unsigned word\n"
>>>> +              "\t\t4 - Test inc dec of double word\n"
>>>> +              "\t\t5 - Test add sub of double word\n"
>>>> +              "\t<num of thread> is optional\n"
>>>> +              "\t\t<1 - 31> - no of threads to start\n"
>>>>                "\t\tif user doesn't specify this option, then\n"
>>>> -              "\t\tno of pthreads created is equivalent to no of
>>>> cores\n"
>>>> +              "\t\tno of threads created is equivalent to no of
>>>> cores\n"
>>>>                "\t\tavailable in the system\n"
>>>>                "\tExample usage:\n"
>>>>                "\t\t./odp_atomic -t 2\n"
>>>>                "\t\t./odp_atomic -t 3 -n 12\n");
>>>>  }
>>>>
>>>> -void test_atomic_inc_32(void)
>>>> -{
>>>> -       int i;
>>>> -
>>>> -       for (i = 0; i < CNT; i++)
>>>> -               odp_atomic_inc_int(&a32);
>>>> -}
>>>> -
>>>>  void test_atomic_inc_u32(void)
>>>>  {
>>>>         int i;
>>>>
>>>>         for (i = 0; i < CNT; i++)
>>>> -               odp_atomic_inc_u32(&a32u);
>>>> +               odp_atomic32_add_rlx(&a32u, 1);
>>>>  }
>>>>
>>>>  void test_atomic_inc_64(void)
>>>> @@ -71,15 +58,7 @@ void test_atomic_inc_64(void)
>>>>         int i;
>>>>
>>>>         for (i = 0; i < CNT; i++)
>>>> -               odp_atomic_inc_u64(&a64u);
>>>> -}
>>>> -
>>>> -void test_atomic_dec_32(void)
>>>> -{
>>>> -       int i;
>>>> -
>>>> -       for (i = 0; i < CNT; i++)
>>>> -               odp_atomic_dec_int(&a32);
>>>> +               odp_atomic64_add_rlx(&a64u, 1);
>>>>  }
>>>>
>>>>  void test_atomic_dec_u32(void)
>>>> @@ -87,7 +66,7 @@ void test_atomic_dec_u32(void)
>>>>         int i;
>>>>
>>>>         for (i = 0; i < CNT; i++)
>>>> -               odp_atomic_dec_u32(&a32u);
>>>> +               odp_atomic32_add_rlx(&a32u, (uint32_t)-1);
>>>>  }
>>>>
>>>>  void test_atomic_dec_64(void)
>>>> @@ -95,15 +74,7 @@ void test_atomic_dec_64(void)
>>>>         int i;
>>>>
>>>>         for (i = 0; i < CNT; i++)
>>>> -               odp_atomic_dec_u64(&a64u);
>>>> -}
>>>> -
>>>> -void test_atomic_add_32(void)
>>>> -{
>>>> -       int i;
>>>> -
>>>> -       for (i = 0; i < (CNT / ADD_SUB_CNT); i++)
>>>> -               odp_atomic_fetch_add_int(&a32, ADD_SUB_CNT);
>>>> +               odp_atomic64_add_rlx(&a64u, (uint64_t)-1);
>>>>  }
>>>>
>>>>  void test_atomic_add_u32(void)
>>>> @@ -111,7 +82,7 @@ void test_atomic_add_u32(void)
>>>>         int i;
>>>>
>>>>         for (i = 0; i < (CNT / ADD_SUB_CNT); i++)
>>>> -               odp_atomic_fetch_add_u32(&a32u, ADD_SUB_CNT);
>>>> +               odp_atomic32_fetch_add_rlx(&a32u, ADD_SUB_CNT);
>>>>  }
>>>>
>>>>  void test_atomic_add_64(void)
>>>> @@ -119,15 +90,7 @@ void test_atomic_add_64(void)
>>>>         int i;
>>>>
>>>>         for (i = 0; i < (CNT / ADD_SUB_CNT); i++)
>>>> -               odp_atomic_fetch_add_u64(&a64u, ADD_SUB_CNT);
>>>> -}
>>>> -
>>>> -void test_atomic_sub_32(void)
>>>> -{
>>>> -       int i;
>>>> -
>>>> -       for (i = 0; i < (CNT / ADD_SUB_CNT); i++)
>>>> -               odp_atomic_fetch_sub_int(&a32, ADD_SUB_CNT);
>>>> +               odp_atomic64_fetch_add_rlx(&a64u, ADD_SUB_CNT);
>>>>  }
>>>>
>>>>  void test_atomic_sub_u32(void)
>>>> @@ -135,7 +98,7 @@ void test_atomic_sub_u32(void)
>>>>         int i;
>>>>
>>>>         for (i = 0; i < (CNT / ADD_SUB_CNT); i++)
>>>> -               odp_atomic_fetch_sub_u32(&a32u, ADD_SUB_CNT);
>>>> +               odp_atomic32_fetch_add_rlx(&a32u, -ADD_SUB_CNT);
>>>>  }
>>>>
>>>>  void test_atomic_sub_64(void)
>>>> @@ -143,19 +106,7 @@ void test_atomic_sub_64(void)
>>>>         int i;
>>>>
>>>>         for (i = 0; i < (CNT / ADD_SUB_CNT); i++)
>>>> -               odp_atomic_fetch_sub_u64(&a64u, ADD_SUB_CNT);
>>>> -}
>>>> -
>>>> -void test_atomic_inc_dec_32(void)
>>>> -{
>>>> -       test_atomic_inc_32();
>>>> -       test_atomic_dec_32();
>>>> -}
>>>> -
>>>> -void test_atomic_add_sub_32(void)
>>>> -{
>>>> -       test_atomic_add_32();
>>>> -       test_atomic_sub_32();
>>>> +               odp_atomic64_fetch_add_rlx(&a64u, -ADD_SUB_CNT);
>>>>  }
>>>>
>>>>  void test_atomic_inc_dec_u32(void)
>>>> @@ -188,11 +139,6 @@ void test_atomic_add_sub_64(void)
>>>>   */
>>>>  void test_atomic_basic(void)
>>>>  {
>>>> -       test_atomic_inc_32();
>>>> -       test_atomic_dec_32();
>>>> -       test_atomic_add_32();
>>>> -       test_atomic_sub_32();
>>>> -
>>>>         test_atomic_inc_u32();
>>>>         test_atomic_dec_u32();
>>>>         test_atomic_add_u32();
>>>> @@ -206,31 +152,24 @@ void test_atomic_basic(void)
>>>>
>>>>  void test_atomic_init(void)
>>>>  {
>>>> -       odp_atomic_init_int(&a32);
>>>> -       odp_atomic_init_u32(&a32u);
>>>> -       odp_atomic_init_u64(&a64u);
>>>> +       odp_atomic32_store_rlx(&a32u, 0);
>>>> +       odp_atomic64_store_rlx(&a64u, 0);
>>>>  }
>>>>
>>>>  void test_atomic_store(void)
>>>>  {
>>>> -       odp_atomic_store_int(&a32, S32_INIT_VAL);
>>>> -       odp_atomic_store_u32(&a32u, U32_INIT_VAL);
>>>> -       odp_atomic_store_u64(&a64u, U64_INIT_VAL);
>>>> +       odp_atomic32_store_rlx(&a32u, U32_INIT_VAL);
>>>> +       odp_atomic64_store_rlx(&a64u, U64_INIT_VAL);
>>>>  }
>>>>
>>>>  int test_atomic_validate(void)
>>>>  {
>>>> -       if (odp_atomic_load_int(&a32) != S32_INIT_VAL) {
>>>> -               ODP_ERR("Atomic signed 32 usual functions failed\n");
>>>> -               return -1;
>>>> -       }
>>>> -
>>>> -       if (odp_atomic_load_u32(&a32u) != U32_INIT_VAL) {
>>>> +       if (odp_atomic32_load_rlx(&a32u) != U32_INIT_VAL) {
>>>>                 ODP_ERR("Atomic u32 usual functions failed\n");
>>>>                 return -1;
>>>>         }
>>>>
>>>> -       if (odp_atomic_load_u64(&a64u) != U64_INIT_VAL) {
>>>> +       if (odp_atomic64_load_rlx(&a64u) != U64_INIT_VAL) {
>>>>                 ODP_ERR("Atomic u64 usual functions failed\n");
>>>>                 return -1;
>>>>         }
>>>> @@ -247,11 +186,8 @@ static void *run_thread(void *arg)
>>>>
>>>>         ODP_DBG("Thread %i starts\n", thr);
>>>>
>>>> -       odp_atomic_inc_int(&numthrds);
>>>> -
>>>> -       /* Wait here until all pthreads are created */
>>>> -       while (*(volatile int *)&numthrds < parg->numthrds)
>>>> -               ;
>>>> +       /* Wait here until all threads have arrived */
>>>> +       odp_barrier_sync(&barrier);
>>>>
>>>>         gettimeofday(&tv0[thr], NULL);
>>>>
>>>> @@ -259,12 +195,6 @@ static void *run_thread(void *arg)
>>>>         case TEST_MIX:
>>>>                 test_atomic_basic();
>>>>                 break;
>>>> -       case TEST_INC_DEC_S32:
>>>> -               test_atomic_inc_dec_32();
>>>> -               break;
>>>> -       case TEST_ADD_SUB_S32:
>>>> -               test_atomic_add_sub_32();
>>>> -               break;
>>>>         case TEST_INC_DEC_U32:
>>>>                 test_atomic_inc_dec_u32();
>>>>                 break;
>>>> @@ -327,7 +257,6 @@ int main(int argc, char *argv[])
>>>>         if (pthrdnum == 0)
>>>>                 pthrdnum = odp_sys_core_count();
>>>>
>>>> -       odp_atomic_init_int(&numthrds);
>>>>         test_atomic_init();
>>>>         test_atomic_store();
>>>>
>>>> @@ -342,6 +271,7 @@ int main(int argc, char *argv[])
>>>>                 usage();
>>>>                 goto err_exit;
>>>>         }
>>>> +       odp_barrier_init(&barrier, pthrdnum);
>>>>         odp_test_thread_create(run_thread, &thrdarg);
>>>>
>>>>         odp_test_thread_exit(&thrdarg);
>>>> diff --git a/test/api_test/odp_atomic_test.h b/test/api_test/odp_atomic_test.h
>>>> index 7814da5..aaa9d34 100644
>>>> --- a/test/api_test/odp_atomic_test.h
>>>> +++ b/test/api_test/odp_atomic_test.h
>>>> @@ -18,14 +18,11 @@
>>>>  #define ADD_SUB_CNT    5
>>>>
>>>>  #define        CNT 500000
>>>> -#define        S32_INIT_VAL    (1UL << 10)
>>>>  #define        U32_INIT_VAL    (1UL << 10)
>>>>  #define        U64_INIT_VAL    (1ULL << 33)
>>>>
>>>>  typedef enum {
>>>>         TEST_MIX = 1, /* Must be first test case num */
>>>> -       TEST_INC_DEC_S32,
>>>> -       TEST_ADD_SUB_S32,
>>>>         TEST_INC_DEC_U32,
>>>>         TEST_ADD_SUB_U32,
>>>>         TEST_INC_DEC_64,
>>>> @@ -34,16 +31,10 @@ typedef enum {
>>>>  } odp_test_atomic_t;
>>>>
>>>>
>>>> -void test_atomic_inc_dec_32(void);
>>>> -void test_atomic_add_sub_32(void);
>>>>  void test_atomic_inc_dec_u32(void);
>>>>  void test_atomic_add_sub_u32(void);
>>>>  void test_atomic_inc_dec_64(void);
>>>>  void test_atomic_add_sub_64(void);
>>>> -void test_atomic_inc_32(void);
>>>> -void test_atomic_dec_32(void);
>>>> -void test_atomic_add_32(void);
>>>> -void test_atomic_sub_32(void);
>>>>  void test_atomic_inc_u32(void);
>>>>  void test_atomic_dec_u32(void);
>>>>  void test_atomic_add_u32(void);
>>>> --
>>>> 1.9.1
>>>>
>>>>
Ola Liljedahl Oct. 16, 2014, 12:04 p.m. UTC | #8
One specific goal of this updated API is to access shared data correctly
(in the way defined by C11/C++11 standards, not using some ad-hoc model)
and to remove the need for explicit (HW and compiler) barriers. The API
should specify the actual intent of operations; a full memory barrier
does not. odp_sync_stores() is also not a complete solution as it only
orders stores, not loads. So far we have been lucky because the
implementations of odp_sync_stores() have always used a full barrier, but
that is not what the function specifies.

Original implementation:
void odp_ticketlock_lock(odp_ticketlock_t *ticketlock)
{
        uint32_t ticket;

        ticket = odp_atomic_fetch_inc_u32(&ticketlock->next_ticket);

        while (ticket != ticketlock->cur_ticket)
                odp_spin();

        odp_mem_barrier();
}

New implementation:
void odp_ticketlock_lock(odp_ticketlock_t *ticketlock)
{
        uint32_t ticket;

        ticket = odp_atomic32_fetch_add_rlx(&ticketlock->next_ticket, 1);

        while (ticket != odp_atomic32_load_acq(&ticketlock->cur_ticket))
                odp_spin();
}

See the difference? Spot the bug in the original implementation?

The bug is that the lock() function does not have any acquire (or full)
barrier between the point where the lock is acquired (when ticket ==
lock->cur_ticket) and the return to the caller. This means a subsequent
load operation in the caller could be speculated before the lock has been
acquired (while we are still spinning) and thus read shared data while the
lock is still held by another thread. The full barriers that are part of the
atomic_fetch_inc operation don't help.

Load-acquire will include the necessary barriers to make sure later loads
are not speculated before the lock is acquired (ticket ==
lock->cur_ticket). The barriers come automagically when you realize all
critical-enter operations need some type of acquire operation (e.g.
load-acquire) and all critical-exit operations need a release
operation(e.g. store-release).

-- Ola

On 16 October 2014 11:57, Savolainen, Petri (NSN - FI/Espoo) <
petri.savolainen@nsn.com> wrote:

> Hi,
>
> I think we don’t need  to specify these in three different versions. It
> should be enough that odp_atomic_xxx is defined as “relaxed”, as it's like
> that already today, since we don’t promise (in API documentation) to
> include memory barriers into those calls. I think "relaxed" is the common
> case for _applications_, those would mainly modify counters through this
> API – and not implement synchronization data structures (like the
> ticketlock). If ODP _implementation_ or _application platform_ implements
> such data structure, it’s not huge overhead to put those odp_sync_stores or
> compiler memory barriers there when needed. Application would mainly use
> those (in thousands of places), but those would be implemented only once
> (in a few places).
>
>
> Why not just change this …
>
> /**
>  * Fetch and add atomic uint32
>  *
>  * @param ptr    An atomic variable
>  * @param value  A value to be added to the variable
>  *
>  * @return Value of the variable before the operation
>  */
> static inline uint32_t odp_atomic_fetch_add_u32(odp_atomic_u32_t *ptr,
>                                                 uint32_t value)
> {
>         return __sync_fetch_and_add(ptr, value);
> }
>
> … into this …
>
> static inline uint32_t odp_atomic_fetch_add_u32(odp_atomic_u32_t *ptr,
>                                                 uint32_t value)
> {
>         return __atomic_fetch_add(ptr, value, __ATOMIC_RELAXED);
> }
>
>
> -Petri
>
>
>
> From: lng-odp-bounces@lists.linaro.org [mailto:
> lng-odp-bounces@lists.linaro.org] On Behalf Of ext Ola Liljedahl
> Sent: Thursday, October 16, 2014 11:42 AM
> To: Bill Fischofer
> Cc: lng-odp-forward
> Subject: Re: [lng-odp] [ODP/PATCH v1] Look ma, no barriers! C11 memory
> model
>
> These suffixes stand for relaxed, release and acquire. They are important
> concepts in C11/C++11 atomics and memory models. It is all about what
> (observable) ordering is required in multithreaded (multiprocessor)
> environments, happens-before and happens-after relationships.
>
> A relaxed access is independent of all other accesses and needs no
> synchronization.
> An acquire access denotes some type of shared resource acquisition. Loads
> and stores after the acquire load must be prevented from moving up (either
> by compiler or by the HW); this is a half-sided barrier. Loads and stores
> from before the acquire are allowed to move down.
> A release access denotes the release of a shared resource. Loads and stores
> before the release store must be prevented from moving down (either by
> compiler or by HW); this is also a half-sided barrier. Loads and stores
> after the release are allowed to move up.
>
> Code that uses atomic variables (e.g. for implementing shared memory data
> structures such as locks and rings) must know which type of atomic
> operations is required. The ODP ticket lock implementation makes a good
> example:
> void odp_ticketlock_lock(odp_ticketlock_t *ticketlock)
> {
>         uint32_t ticket;
>
>         ticket = odp_atomic32_fetch_add_rlx(&ticketlock->next_ticket, 1);
>   //the ticket counter does not protect anything, so incrementing it can be
> relaxed
>
>         while (ticket != odp_atomic32_load_acq(&ticketlock->cur_ticket))
>  //acquiring the currently served position will include (a half-sided)
> barrier so as to contain accesses from inside the critical section
>                 odp_spin();
> }
>
>
> void odp_ticketlock_unlock(odp_ticketlock_t *ticketlock)
> {
>         odp_atomic32_add_rls(&ticketlock->cur_ticket, 1);  //releasing the
> currently served position will also include (a half-sided) barrier to
> contain inside accesses
> }
>
> Implementations may use barriers of some kind inside these primitive
> atomic operations. Some architectures don't even need explicit barriers as
> they have memory access instructions (e.g. load and store) with acquire and
> release semantics. Full barriers are heavy (and semantically an overkill)
> and you want to avoid them if possible. Using full barriers for updates to
> e.g. global statistics counters will hurt performance; such updates can
> be relaxed (they still need to be atomic of course).
>
> See these two good presentations by Herb Sutter of the C++ standards
> committee.
>
> http://channel9.msdn.com/Shows/Going+Deep/Cpp-and-Beyond-2012-Herb-Sutter-atomic-Weapons-1-of-2
>
> http://channel9.msdn.com/Shows/Going+Deep/Cpp-and-Beyond-2012-Herb-Sutter-atomic-Weapons-2-of-2
>
> On 16 October 2014 01:18, Bill Fischofer <bill.fischofer@linaro.org>
> wrote:
> Why the odd suffixes (_rlx, _rls, _acq) on these routines?  If they are
> intended to be generic wouldn't omitting these be better?
Savolainen, Petri (NSN - FI/Espoo) Oct. 16, 2014, 1:29 p.m. UTC | #9
Hi,

OK, there may be a bug in the ticketlock on ARMv8. But is the right solution to rewrite the whole of odp_atomic.h? Or maybe replace odp_mem_barrier() with an odp_sync_loads() – which would ensure correct load ordering over the lock (prevent loads from moving over it).

My concern is that the API gets big/complex although the common case for the application could be simple (only “relaxed”). If the other use cases are related to building synchronization primitives, that is a limited problem which could be handled with correct usage of load/store barriers (or in assembly in an optimized ODP implementation).
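
Such a hypothetical odp_sync_loads() could be a thin wrapper over an
acquire fence (a sketch only; no such function exists in the current API
or in the patch):

/* Hypothetical counterpart to odp_sync_stores(): loads before this
 * point may not be reordered with loads and stores after it */
static inline void odp_sync_loads(void)
{
        __atomic_thread_fence(__ATOMIC_ACQUIRE);
}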

Also, odp_atomic.h could be replaced by e.g. odp_counter.h in the future, if that’s the main use case for applications…
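
If counters really are the dominant application use case, such an
odp_counter.h could be a thin layer over relaxed atomics. A hypothetical
sketch (this header and these names do not exist today):

typedef struct {
        uint64_t v; /* Counter value */
} odp_counter64_t;

static inline void odp_counter64_inc(odp_counter64_t *cnt)
{
        /* Atomicity without any ordering is all a statistics counter
         * needs */
        (void)__atomic_fetch_add(&cnt->v, 1, __ATOMIC_RELAXED);
}

static inline uint64_t odp_counter64_read(odp_counter64_t *cnt)
{
        return __atomic_load_n(&cnt->v, __ATOMIC_RELAXED);
}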


-Petri


From: ext Ola Liljedahl [mailto:ola.liljedahl@linaro.org]

Sent: Thursday, October 16, 2014 3:05 PM
To: Savolainen, Petri (NSN - FI/Espoo)
Cc: Bill Fischofer; lng-odp-forward
Subject: Re: [lng-odp] [ODP/PATCH v1] Look ma, no barriers! C11 memory model

One specific goal of this updated API is to access shared data correctly (in the way defined by C11/C++11 standards, not using some ad-hoc model) and to remove the need for explicit (HW and compiler) barriers. The API should specify the actual intent of operations, the full memory barrier does not. odp_sync_stores() is also not a complete solution as it only orders stores, not loads. Now we have been lucky because the implementations of odp_sync_stores() have always used a full barrier but that's not the specified function.

Original implementation:
void odp_ticketlock_lock(odp_ticketlock_t *ticketlock)
{
        uint32_t ticket;

        ticket = odp_atomic_fetch_inc_u32(&ticketlock->next_ticket);

        while (ticket != ticketlock->cur_ticket)
                odp_spin();

        odp_mem_barrier();
}

New implementation:
void odp_ticketlock_lock(odp_ticketlock_t *ticketlock)
{
        uint32_t ticket;

        ticket = odp_atomic32_fetch_add_rlx(&ticketlock->next_ticket, 1);

        while (ticket != odp_atomic32_load_acq(&ticketlock->cur_ticket))
                odp_spin();
}

See the difference? Spot the bug in the original implementation?

The bug is that the lock() function does not have any acquire (or full) barrier between the lock has been acquired (when ticket == lock->cur_ticket) and the return to the caller. This means a subsequent load operation in the caller could be speculated before the lock has been acquired (while we are still spinning) so read something when the lock is held by another thread. The full barriers that are part of the atomic_fetch_inc operation don't help.

Load-acquire will include the necessary barriers to make sure later loads are not speculated before the lock is acquired (ticket == lock->cur_ticket). The barriers come automagically when you realize all critical-enter operations need some type of acquire operation (e.g. load-acquire) and all critical-exit operations need a release operation(e.g. store-release).

-- Ola

On 16 October 2014 11:57, Savolainen, Petri (NSN - FI/Espoo) <petri.savolainen@nsn.com<mailto:petri.savolainen@nsn.com>> wrote:
Hi,

I think we don’t need  to specify these in three different versions. It should be enough that odp_atomic_xxx is defined as “relaxed”, as it's like that already today, since we don’t promise (in API documentation) to include memory barriers into those calls. I think "relaxed" is the common case for _applications_, those would mainly modify counters through this API – and not implement synchronization data structures (like the ticketlock). If ODP _implementation_ or _application platform_ implements such data structure, it’s not huge overhead to put those odp_sync_stores or compiler memory barriers there when needed. Application would mainly use those (in thousands of places), but those would be implemented only once (in few places).


Why not just change this …

/**
 * Fetch and add atomic uint32
 *
 * @param ptr    An atomic variable
 * @param value  A value to be added to the variable
 *
 * @return Value of the variable before the operation
 */
static inline uint32_t odp_atomic_fetch_add_u32(odp_atomic_u32_t *ptr,
                                                uint32_t value)
{
        return __sync_fetch_and_add(ptr, value);
}

… into this …

static inline uint32_t odp_atomic_fetch_add_u32(odp_atomic_u32_t *ptr,
                                                uint32_t value)
{
        return __atomic_fetch_add(ptr, value, __ATOMIC_RELAXED);
}


-Petri



From: lng-odp-bounces@lists.linaro.org [mailto:lng-odp-bounces@lists.linaro.org] On Behalf Of ext Ola Liljedahl

Sent: Thursday, October 16, 2014 11:42 AM
To: Bill Fischofer
Cc: lng-odp-forward
Subject: Re: [lng-odp] [ODP/PATCH v1] Look ma, no barriers! C11 memory model

These suffixes stand for relaxed, release and acquire. They are important concepts in C11/C++11 atomics and memory models. It is all about what (observable) ordering is required in multithreaded (multiprocessor) environments: happens-before and happens-after relationships.

A relaxed access is independent of all other accesses and needs no synchronization.
An acquire access denotes some type of shared resource acquisition. Loads and stores after the acquire load must be prevented from moving up (either by the compiler or by the HW); this is a half-sided barrier. Loads and stores from before the acquire are allowed to move down.
A release access denotes the release of a shared resource. Loads and stores before the release store must be prevented from moving down (either by the compiler or by the HW); this is also a half-sided barrier. Loads and stores after the release are allowed to move up.

Code that uses atomic variables (e.g. for implementing shared memory data structures such as locks and rings) must know which type of atomic operations is required. The ODP ticket lock implementation makes a good example:
void odp_ticketlock_lock(odp_ticketlock_t *ticketlock)
{
        uint32_t ticket;

        ticket = odp_atomic32_fetch_add_rlx(&ticketlock->next_ticket, 1);   //the ticket counter does not protect anything, so incrementing it can be relaxed

        while (ticket != odp_atomic32_load_acq(&ticketlock->cur_ticket))    //acquiring the currently served position includes a (half-sided) barrier so as to contain accesses from inside the critical section
                odp_spin();
}


void odp_ticketlock_unlock(odp_ticketlock_t *ticketlock)
{
        odp_atomic32_add_rls(&ticketlock->cur_ticket, 1);  //releasing the currently served position also includes a (half-sided) barrier to contain accesses from inside the critical section
}

Implementations may use barriers of some kind inside these primitive atomic operations. Some architectures don't even need explicit barriers as they have memory access instructions (e.g. load and store) with acquire and release semantics. Full barriers are heavy (and semantically overkill) and you want to avoid them if possible. Using full barriers for updates to e.g. global statistics counters will hurt performance; such updates can be relaxed (they still need to be atomic of course).
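As a concrete illustration of the counter case, here is a minimal sketch of a relaxed statistics counter using the same GCC builtins; the update is atomic with respect to other threads but imposes no ordering, and none is needed:

#include <stdint.h>

static inline void stats_inc(uint64_t *counter)
{
        /* Atomic, but no barriers: other memory accesses may be freely
         * reordered around this update */
        __atomic_fetch_add(counter, 1, __ATOMIC_RELAXED);
}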

See these two good presentations by Herb Sutter of the C++ standards committee:
http://channel9.msdn.com/Shows/Going+Deep/Cpp-and-Beyond-2012-Herb-Sutter-atomic-Weapons-1-of-2
http://channel9.msdn.com/Shows/Going+Deep/Cpp-and-Beyond-2012-Herb-Sutter-atomic-Weapons-2-of-2

On 16 October 2014 01:18, Bill Fischofer <bill.fischofer@linaro.org> wrote:
Why the odd suffixes (_rlx, _rls, _acq) on these routines?  If they are intended to be generic wouldn't omitting these be better?
Ola Liljedahl Oct. 16, 2014, 1:45 p.m. UTC | #10
Yes, I think redesigning the API to mimic C11 atomics and follow a
standardised memory model is the way to go. Over time, users will become
more familiar with the memory models of C11 and C++11, and ODP will not
benefit from doing something different. This patch is all it takes for
ODP to step into the modern world.

The new atomics API is smaller, 13 functions vs. 34 in the original API. No
real loss of functionality (the atomic_int type wasn't really needed); on
the contrary, I would say there is more functionality now.

I am still open to an odp_counters.h API but think odp_atomics.h should be
public as well.

The bug in ticketlock_lock() is not only on ARMv8. I think it exists on all
architectures that allow loads to be reordered.

-- Ola
Bill Fischofer Oct. 16, 2014, 1:58 p.m. UTC | #11
I agree that it is better to make these changes now rather than later, when
more code would be impacted by them. Anything we can do to help ensure the
correctness of parallel/concurrent code will pay many dividends going
forward. The changes seem well thought out and self-contained, and most of
them are internal to other APIs rather than being something that will be
widely used in application code.

Savolainen, Petri (NSN - FI/Espoo) Oct. 17, 2014, 6:52 a.m. UTC | #12
Hi,

There are fewer functions now because you deleted e.g. the increment/decrement/subtract functions. Inc/dec functions are needed to optimize the common case; we already have ISAs today with optimized instructions for that. Subtract functionality is also needed – why did you remove it? It’s better to have an explicit API for inc/dec/sub than to hide the operation in an integer value. Also, I'm not yet sure we can avoid signed versions – maybe int is not needed, but we may need int32 or int64 instead.

Also, C11 defines 6 different memory order models – when will the remaining models be added?

When you add up those functions and multiply by 2, 3 … 6 (memory models), it’s a lot.
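For reference, the six orders from C11's <stdatomic.h> being counted here, shown on a single operation through the standard _explicit form (illustration only):

#include <stdatomic.h>
#include <stdint.h>

static uint32_t fetch_add_every_order(_Atomic uint32_t *p, uint32_t v)
{
        atomic_fetch_add_explicit(p, v, memory_order_relaxed);
        atomic_fetch_add_explicit(p, v, memory_order_consume);
        atomic_fetch_add_explicit(p, v, memory_order_acquire);
        atomic_fetch_add_explicit(p, v, memory_order_release);
        atomic_fetch_add_explicit(p, v, memory_order_acq_rel);
        return atomic_fetch_add_explicit(p, v, memory_order_seq_cst);
}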

-Petri
Ola Liljedahl Oct. 17, 2014, 8:22 a.m. UTC | #13
If there is some architecture that has atomic instructions for
incrementing/decrementing by 1 and using those instructions actually
increases performance, then we can add such atomic operations as well.
Anyone can contribute a patch.
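Until then, increment is trivially expressed through the existing operation; a minimal sketch using the function and type names proposed in this patch:

static inline void odp_atomic32_inc_rlx(odp_atomic32_t *ptr)
{
        /* The natural fallback when no dedicated inc instruction exists */
        (void)odp_atomic32_fetch_add_rlx(ptr, 1);
}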

Subtraction can be expressed using addition (A - B <=> A + (-B)), so I did
not see a strong need for an atomic subtract operation. When do you
actually need atomic operations on signed types? You can always cast the
returned value to a signed type of the same size when you need to
interpret it as signed.
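If a subtract is wanted anyway, a minimal sketch reusing the fetch-and-add from this patch shows why it costs nothing extra (unsigned arithmetic wraps modulo 2^32, so adding the two's complement of the operand subtracts it):

static inline uint32_t odp_atomic32_fetch_sub_rlx(odp_atomic32_t *ptr,
                                                  uint32_t decr)
{
        /* A - B <=> A + (-B): 0 - decr is the two's complement of decr */
        return odp_atomic32_fetch_add_rlx(ptr, (uint32_t)0 - decr);
}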

C11 defines a lot of memory models but not all of them may actually be
needed. Consume is there basically only because of Alpha, which doesn't
honor data dependencies. Full sequentially consistent (SC) is slow and you
probably want to avoid it. I can understand that the same atomic operation
may want to be both acquire and release, but I am waiting for an actual use
case to show up.

We could modify the API to take the memory model as a parameter (as C11
atomics does). Each function would then have a switch statement; the
compiler would likely remove the redundant code when the memory model
parameter is a constant. Such an API would keep the number of functions
independent of the number of supported memory models. However, this style
of API also requires us to support all memory models (or those we choose to
support) for all atomic operations, which is probably not required by
actual use cases. I could however provide a header file which uses this
style for comparison. Then the ODP atomic API would be even closer
syntactically to C11, which would be a good thing.
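A sketch of what such an API could look like (the enum and function names here are hypothetical, not part of this patch):

typedef enum {
        ODP_MEMORDER_RLX,  /* relaxed */
        ODP_MEMORDER_ACQ,  /* acquire */
        ODP_MEMORDER_RLS   /* release */
} odp_memorder_t;

static inline uint32_t odp_atomic32_fetch_add(uint32_t *ptr,
                                              uint32_t incr,
                                              odp_memorder_t mo)
{
        /* With a constant 'mo' the compiler folds the switch away */
        switch (mo) {
        case ODP_MEMORDER_ACQ:
                return __atomic_fetch_add(ptr, incr, __ATOMIC_ACQUIRE);
        case ODP_MEMORDER_RLS:
                return __atomic_fetch_add(ptr, incr, __ATOMIC_RELEASE);
        default:
                return __atomic_fetch_add(ptr, incr, __ATOMIC_RELAXED);
        }
}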

-- Ola
Savolainen, Petri (NSN - FI/Espoo) Oct. 17, 2014, 8:46 a.m. UTC | #14
+ * Atomic fetch and add to 32-bit atomic variable
+ * @note Relaxed memory model, no barriers.
+ * @note A - B <=> A + (-B)
+ *
+ * @param ptr   Pointer to a 32-bit atomic variable
+ * @param incr  The value to be added to the atomic variable
+ *
+ * @return Value of the atomic variable before the addition
+ */
+static inline uint32_t odp_atomic32_fetch_add_rlx(odp_atomic32_t *ptr,
+                            uint32_t incr)

This adds an unsigned integer, so –B does not fit nicely in the definition. If you change it to signed, then you limit the max add/sub value to 31 bits (a separate subtract does not have that limit in the API).

I’d optimize for the common case (relaxed) and handle the other models with another function taking an additional parameter. Also, I’d prefer to keep the current function naming convention, e.g. odp_atomic_fetch_add_u32(). So instead of rewriting the whole API, you could consider the current API as the starting point.

-Petri
Ola Liljedahl Oct. 17, 2014, 9:05 a.m. UTC | #15
On 17 October 2014 10:46, Savolainen, Petri (NSN - FI/Espoo) <
petri.savolainen@nsn.com> wrote:

>  + * Atomic fetch and add to 32-bit atomic variable
>  + * @note Relaxed memory model, no barriers.
>  + * @note A - B <=> A + (-B)
>  + *
>  + * @param ptr   Pointer to a 32-bit atomic variable
>  + * @param incr  The value to be added to the atomic variable
>  + *
>  + * @return Value of the atomic variable before the addition
>  + */
>  +static inline uint32_t odp_atomic32_fetch_add_rlx(odp_atomic32_t *ptr,
>  +                            uint32_t incr)
>
> This adds an unsigned integer, so –B does not fit nicely in the
> definition. If you change it to signed, then you limit the max add/sub
> value to 31 bits (a separate subtract does not have that limit in the API).
>
It's just bits; overflow doesn't matter, neither to C nor to the HW that
implements C semantics. We are not returning any condition codes which
could depend on the type of operation being performed. I can add an atomic
(32-bit) subtract operation if this is important to you. There is at least
one use of atomic subtraction in ODP linux-generic, so the function would
not be unused. I don't see the same need for 64-bit atomic variables; they
are not used internally as building blocks for higher-level synchronization
primitives and will probably just be used for statistics of different
kinds. That is another reason to separate out the counter types and
operations from the more powerful atomic building blocks.
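
For example, a relaxed atomic subtract could be layered on top of the existing add as a thin wrapper (a hypothetical helper, not part of this patch):

static inline uint32_t odp_atomic32_fetch_sub_rlx(odp_atomic32_t *ptr,
                                                  uint32_t decr)
{
        /* A - B <=> A + (-B): unsigned arithmetic wraps modulo 2^32,
         * so negating the unsigned operand is well-defined */
        return odp_atomic32_fetch_add_rlx(ptr, (uint32_t)-decr);
}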



>
>
> I’d optimize for the common case (relaxed) and handle other models with
> another function with an additional parameter. Also, I’d prefer to keep the
> current function naming convention e.g. odp_atomic_fetch_add_u32(). So,
> instead of rewriting the whole API you could consider the current API as
> the starting point.
>
The current API was the starting point. But the
odp_atomic_fetch_add_u32_rlx() style makes for very long names. Combine
that with our style rules that use 8-space tabs for indentation, and
horizontal screen real estate starts to become scarce. I would actually
prefer even shorter function names. Making the memory model a parameter
would cut a couple of characters off the function name.
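
Such a parameterised API might look like the sketch below (the enum and the names are hypothetical, chosen only for illustration; with a compile-time constant model argument the compiler should fold the switch away):

typedef enum {
        ODP_MEMMODEL_RLX, /* relaxed */
        ODP_MEMMODEL_ACQ, /* acquire */
        ODP_MEMMODEL_RLS  /* release */
} odp_memmodel_t;

static inline uint32_t odp_atomic32_fetch_add(odp_atomic32_t *ptr,
                                              uint32_t incr,
                                              odp_memmodel_t model)
{
        switch (model) {
        case ODP_MEMMODEL_ACQ:
                return __atomic_fetch_add(&ptr->v, incr, __ATOMIC_ACQUIRE);
        case ODP_MEMMODEL_RLS:
                return __atomic_fetch_add(&ptr->v, incr, __ATOMIC_RELEASE);
        default:
                return __atomic_fetch_add(&ptr->v, incr, __ATOMIC_RELAXED);
        }
}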


>
> -Petri
Jerin Jacob Oct. 17, 2014, 9:07 a.m. UTC | #16
On Fri, Oct 17, 2014 at 10:22:37AM +0200, Ola Liljedahl wrote:
> If there is some architecture that has atomic instructions for
> incrementing/decrementing by 1 and using those instructions actually
> increases performance, then we can add such atomic operations as well.
> Anyone can contribute a patch.

Octeon has atomic instructions for incrementing/decrementing by 1, and
inc/dec functions are needed to optimize the common case, so we would
like to keep odp_atomic_fetch_inc* for the new model also.
I can send a patch for the Octeon implementation, but we would like to
keep the abstraction (dedicated APIs for inc/dec) that exists today for
the new model as well.


Ola Liljedahl Oct. 17, 2014, 9:31 a.m. UTC | #17
On 17 October 2014 11:05, Ola Liljedahl <ola.liljedahl@linaro.org> wrote:

> It's just bits, overflow doesn't matter, neither to C nor to the HW that
> implements C semantics. We are not returning any condition codes which
> could depend on the type of operation being performed. I can add an atomic
> (32-bit) subtract operation if this is important to you. There is at least
> one usage in ODP linux-generic of an atomic subtraction so the function
> would not be unused. I don't see the same need for 64-bit atomic variables,
> they are not used internally as building blocks for higher level
> synchronization primitives, probably just going to be used for statistics
> of different kinds. Another reasons to separate out the counter types and
> operations from the more powerful atomic building blocks.
>

x86 has an xadd instruction but no corresponding xsub instruction; for
the same reason, I assume, it is not really needed.

GCC 4.9 on ARM turns the (A + (-B)) expression (from
odp_atomic32_fetch_add_rls(ptr, -decr)) into a subtract instruction (not
using any negate instruction). Smart compiler.

By adding a range of redundant functions, we would indeed increase the
size of the API. What is most important here?

-- Ola



Ola Liljedahl Oct. 17, 2014, 9:34 a.m. UTC | #18
On 17 October 2014 11:07, Jerin Jacob <jerin.jacob@caviumnetworks.com>
wrote:

> On Fri, Oct 17, 2014 at 10:22:37AM +0200, Ola Liljedahl wrote:
> > If there is some architecture that has atomic instructions for
> > incrementing/decrementing by 1 and using those instructions actually
> > increases performance, then we can add such atomic operations as well.
> > Anyone can contribute a patch.
>
> Octeon has atomic instructions for incrementing/decrementing by 1 and
> Inc/dec functions are need to optimize the common case
> so we would like keep odp_atomic_fetch_inc*
> for the new model also.
> I can send a patch for the octeon implementation but we would like keep
> the abstraction(dedicated API's for inc/dec)
> as exist today for the new model as well.
>
OK, this is useful feedback.

Possibly odp_atomic_fetch_inc() functions replace the need for
odp_atomic_fetch_add() operations (returning the old value). Statistics
counter operations don't have to return any old value, so they would only
need odp_atomic_add() operations.
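
A dedicated inc/dec API could keep today's abstraction while still letting targets such as Octeon drop in a specialized instruction; a generic fallback might simply wrap the add (hypothetical code, not part of this patch):

static inline uint32_t odp_atomic32_fetch_inc_rlx(odp_atomic32_t *ptr)
{
        /* Generic fallback: an Octeon implementation could override this
         * with its dedicated atomic increment instruction */
        return odp_atomic32_fetch_add_rlx(ptr, 1);
}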

-- Ola


Jerin Jacob Oct. 17, 2014, 9:46 a.m. UTC | #19
On Fri, Oct 17, 2014 at 11:34:36AM +0200, Ola Liljedahl wrote:
> OK this is useful feedback.
> 
> Possibly odp_atomic_fetch_inc() functions replaces the need for
> odp_atomic_fetch_add() operations (returning the old value). Statistics
> counter operations don't have to return any old value so would only need
> odp_atomic_add() operations.

But there are use cases that need the "fetch" type as well; for
example, the existing odp_ticketlock_lock implementation.

> 
> -- Ola
> 
> 
> >
> >
> > >
> > > Subtraction can be expressed using addition (A - B <=> A + (-B)) so I did
> > > not see a strong need for an atomic subtract operation. When do you
> > > actually need atomic operations on signed types? You can always cast the
> > > returned value into a signed type of the same size when you need to
> > > interpret the value as a signed type.
> > >
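
For example, a subtract helper could be layered on the existing add; the
wrapper name below is hypothetical and not part of the patch:

/* Atomic subtract expressed as A + (-B), using unsigned wrap-around */
static inline void odp_atomic32_sub_rlx(odp_atomic32_t *ptr, uint32_t decr)
{
	odp_atomic32_add_rlx(ptr, (uint32_t)0 - decr);
}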
> > > C11 defines a lot of memory models but not all of them might actually be
> > > needed. Consume is there basically only for Alpha which doesn't honor
> > data
> > > dependencies. Full sequentially consistent (SC) is slow and you probably
> > > want to avoid that. I can understand that same atomic operation want to
> > be
> > > both acquire and release but I am waiting for an actual use case to show
> > up.
> > >
> > > We could modify the API to take the memory model as a parameter (as C11
> > > atomics does). Each function would then have a switch statement, and the
> > > compiler would likely remove the redundant code when the memory model
> > > parameter is a constant. Such an API would keep the number of functions
> > > independent of the number of supported memory models. However, this
> > > style of API also requires us to support all memory models (or those we
> > > choose to support) for all atomic operations, which is probably not
> > > required by actual use cases. I could however provide a header file
> > > which uses this style for comparison. Then the ODP atomic API would be
> > > even closer syntactically to C11, which would be a good thing.
> > >
> > > -- Ola
> > >
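
For comparison, a sketch of that parameterised style built on the GCC
__atomic builtins, using the odp_atomic32_t struct from the patch (the enum
and function names are hypothetical, not part of this patch):

typedef enum {
	ODP_MO_RLX, /* relaxed */
	ODP_MO_ACQ, /* acquire */
	ODP_MO_RLS  /* release */
} odp_memorder_t;

static inline uint32_t odp_atomic32_fetch_add(odp_atomic32_t *ptr,
					      uint32_t incr,
					      odp_memorder_t mo)
{
	/* With a constant 'mo' the compiler folds the switch away */
	switch (mo) {
	case ODP_MO_ACQ:
		return __atomic_fetch_add(&ptr->v, incr, __ATOMIC_ACQUIRE);
	case ODP_MO_RLS:
		return __atomic_fetch_add(&ptr->v, incr, __ATOMIC_RELEASE);
	default:
		return __atomic_fetch_add(&ptr->v, incr, __ATOMIC_RELAXED);
	}
}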
> > >
> > > On 17 October 2014 08:52, Savolainen, Petri (NSN - FI/Espoo) <
> > > petri.savolainen@nsn.com> wrote:
> > >
> > > >  Hi,
> > > >
> > > >
> > > >
> > > > There are fewer functions now because you deleted e.g.
> > > > increment/decrement/subtract functions. Inc/dec functions are needed
> > > > to optimize the common case; we already have ISAs today with optimized
> > > > instructions for that. Subtract functionality is also needed – why did
> > > > you remove it? It’s better to have an explicit API for inc/dec/sub
> > > > than to hide it in an integer value. Also, we are not yet sure if we
> > > > can avoid signed versions - maybe int is not needed, but maybe we need
> > > > int32 or int64 instead.
> > > >
> > > >
> > > >
> > > > Also, C11 defines 6 different memory order models – when will the
> > > > remaining models be added?
> > > >
> > > >
> > > >
> > > > When you add up those and multiply by 2, 3 … 6 (memory models) – it’s a
> > > > lot.
> > > >
> > > >
> > > >
> > > > -Petri
> > > >
> > > >
> > > >
> > > >
> > > >
> > > >
> > > >
> > > >
> > > >
> > > > *From:* ext Ola Liljedahl [mailto:ola.liljedahl@linaro.org]
> > > > *Sent:* Thursday, October 16, 2014 4:45 PM
> > > >
> > > > *To:* Savolainen, Petri (NSN - FI/Espoo)
> > > > *Cc:* Bill Fischofer; lng-odp-forward
> > > > *Subject:* Re: [lng-odp] [ODP/PATCH v1] Look ma, no barriers! C11
> > memory
> > > > model
> > > >
> > > >
> > > >
> > > > Yes, I think redesigning the API to mimic C11 atomics and follow a
> > > > standardised memory model is the way to go. Over time, users will be
> > > > more familiar with the memory models of C11 and C++11, and doing
> > > > something different in ODP would not benefit anyone. This patch is all
> > > > it takes for ODP to step into the modern world.
> > > >
> > > >
> > > >
> > > > The new atomics API is smaller, 13 functions vs. 34 in the original
> > > > API. No real loss of functionality (the atomic_int type wasn't really
> > > > needed); on the contrary, I would say there is more functionality now.
> > > >
> > > >
> > > >
> > > > I am still open to an odp_counters.h API but think odp_atomics.h
> > should be
> > > > public as well.
> > > >
> > > >
> > > >
> > > > The bug in ticketlock_lock() is not only on ARMv8. I think it exists on
> > > > all architectures that allow loads to be reordered.
> > > >
> > > >
> > > >
> > > > -- Ola
> > > >
> > > >
> > > >
> > > > On 16 October 2014 15:29, Savolainen, Petri (NSN - FI/Espoo) <
> > > > petri.savolainen@nsn.com> wrote:
> > > >
> > > > Hi,
> > > >
> > > >
> > > >
> > > > OK, there may be a bug in ticketlock on ARMv8. But is the right
> > > > solution to rewrite the whole odp_atomic.h? Or maybe replace
> > > > odp_mem_barrier() with an odp_sync_loads() – which would ensure
> > > > correct load ordering over the lock (prevent loads from moving over
> > > > it).
> > > >
> > > >
> > > >
> > > > My concern is that the API gets big/complex although the common case
> > > > for the application could be simple (only “relaxed”). If other use
> > > > cases are related to building synchronization primitives, it’s a
> > > > limited problem that could be handled with correct usage of load/store
> > > > barriers (or in assembly in an optimized ODP implementation).
> > > >
> > > >
> > > >
> > > > Also odp_atomic.h could be replaced by e.g. odp_counter.h in future, if
> > > > that’s the main use case for applications…
> > > >
> > > >
> > > >
> > > >
> > > >
> > > > -Petri
> > > >
> > > >
> > > >
> > > >
> > > >
> > > > *From:* ext Ola Liljedahl [mailto:ola.liljedahl@linaro.org]
> > > > *Sent:* Thursday, October 16, 2014 3:05 PM
> > > > *To:* Savolainen, Petri (NSN - FI/Espoo)
> > > > *Cc:* Bill Fischofer; lng-odp-forward
> > > >
> > > >
> > > > *Subject:* Re: [lng-odp] [ODP/PATCH v1] Look ma, no barriers! C11
> > memory
> > > > model
> > > >
> > > >
> > > >
> > > > One specific goal of this updated API is to access shared data
> > > > correctly (in the way defined by the C11/C++11 standards, not using
> > > > some ad-hoc model) and to remove the need for explicit (HW and
> > > > compiler) barriers. The API should specify the actual intent of
> > > > operations; a full memory barrier does not. odp_sync_stores() is also
> > > > not a complete solution as it only orders stores, not loads. So far we
> > > > have been lucky because the implementations of odp_sync_stores() have
> > > > always used a full barrier, but that's not what the function specifies.
> > > >
> > > >
> > > >
> > > > Original implementation:
> > > >
> > > > void odp_ticketlock_lock(odp_ticketlock_t *ticketlock)
> > > >
> > > > {
> > > >
> > > >         uint32_t ticket;
> > > >
> > > >
> > > >
> > > >         ticket = odp_atomic_fetch_inc_u32(&ticketlock->next_ticket);
> > > >
> > > >
> > > >
> > > >         while (ticket != ticketlock->cur_ticket)
> > > >
> > > >                 odp_spin();
> > > >
> > > >
> > > >
> > > >         odp_mem_barrier();
> > > >
> > > > }
> > > >
> > > >
> > > >
> > > > New implementation:
> > > >
> > > > void odp_ticketlock_lock(odp_ticketlock_t *ticketlock)
> > > >
> > > > {
> > > >
> > > >         uint32_t ticket;
> > > >
> > > >
> > > >
> > > >         ticket = odp_atomic32_fetch_add_rlx(&ticketlock->next_ticket, 1);
> > > >
> > > >
> > > >
> > > >         while (ticket != odp_atomic32_load_acq(&ticketlock->cur_ticket))
> > > >
> > > >                 odp_spin();
> > > >
> > > > }
> > > >
> > > >
> > > >
> > > > See the difference? Spot the bug in the original implementation?
> > > >
> > > >
> > > >
> > > > The bug is that the lock() function does not have any acquire (or
> > > > full) barrier between the point where the lock is acquired (when
> > > > ticket == lock->cur_ticket) and the return to the caller. This means a
> > > > subsequent load operation in the caller could be speculated before the
> > > > lock has been acquired (while we are still spinning) and thus read
> > > > something while the lock is held by another thread. The full barriers
> > > > that are part of the atomic_fetch_inc operation don't help.
> > > >
> > > >
> > > >
> > > > Load-acquire will include the necessary barriers to make sure later
> > > > loads are not speculated before the lock is acquired (ticket ==
> > > > lock->cur_ticket). The barriers come automagically once you realize
> > > > that all critical-section-enter operations need some type of acquire
> > > > operation (e.g. load-acquire) and all critical-section-exit operations
> > > > need a release operation (e.g. store-release).
> > > >
> > > >
> > > >
> > > > -- Ola
> > > >
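
The same enter/exit pattern holds for any lock. A minimal test-and-set
spinlock, sketched here with the GCC __atomic builtins (not part of the
patch):

static inline void spinlock_lock(uint32_t *lock)
{
	/* Acquire: later loads/stores cannot move up before this */
	while (__atomic_exchange_n(lock, 1, __ATOMIC_ACQUIRE))
		odp_spin();
}

static inline void spinlock_unlock(uint32_t *lock)
{
	/* Release: earlier loads/stores cannot move down after this */
	__atomic_store_n(lock, 0, __ATOMIC_RELEASE);
}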
> > > >
> > > >
> > > > On 16 October 2014 11:57, Savolainen, Petri (NSN - FI/Espoo) <
> > > > petri.savolainen@nsn.com> wrote:
> > > >
> > > > Hi,
> > > >
> > > > I think we don’t need to specify these in three different versions. It
> > > > should be enough that odp_atomic_xxx is defined as “relaxed”, as it's
> > > > like that already today, since we don’t promise (in the API
> > > > documentation) to include memory barriers in those calls. I think
> > > > "relaxed" is the common case for _applications_, which would mainly
> > > > modify counters through this API – and not implement synchronization
> > > > data structures (like the ticketlock). If the ODP _implementation_ or
> > > > _application platform_ implements such a data structure, it’s not a
> > > > huge overhead to put those odp_sync_stores or compiler memory barriers
> > > > there when needed. Applications would mainly use those (in thousands
> > > > of places), but they would be implemented only once (in a few places).
> > > >
> > > >
> > > > Why not just change this …
> > > >
> > > > /**
> > > >  * Fetch and add atomic uint32
> > > >  *
> > > >  * @param ptr    An atomic variable
> > > >  * @param value  A value to be added to the variable
> > > >  *
> > > >  * @return Value of the variable before the operation
> > > >  */
> > > > static inline uint32_t odp_atomic_fetch_add_u32(odp_atomic_u32_t *ptr,
> > > >                                                 uint32_t value)
> > > > {
> > > >         return __sync_fetch_and_add(ptr, value);
> > > > }
> > > >
> > > > … into this …
> > > >
> > > > static inline uint32_t odp_atomic_fetch_add_u32(odp_atomic_u32_t *ptr,
> > > >                                                 uint32_t value)
> > > > {
> > > >         return __atomic_fetch_add (ptr, value, __ATOMIC_RELAXED)
> > > > }
> > > >
> > > >
> > > > -Petri
> > > >
> > > >
> > > >
> > > > From: lng-odp-bounces@lists.linaro.org [mailto:
> > > > lng-odp-bounces@lists.linaro.org] On Behalf Of ext Ola Liljedahl
> > > > Sent: Thursday, October 16, 2014 11:42 AM
> > > > To: Bill Fischofer
> > > > Cc: lng-odp-forward
> > > > Subject: Re: [lng-odp] [ODP/PATCH v1] Look ma, no barriers! C11 memory
> > > > model
> > > >
> > > >
> > > > These suffixes stand for relaxed, release and acquire. They are
> > > > important concepts in C11/C++11 atomics and memory models. It is all
> > > > about what (observable) ordering is required in multithreaded
> > > > (multiprocessor) environments: happens-before and happens-after
> > > > relationships.
> > > >
> > > > A relaxed access is independent of all other accesses and needs no
> > > > synchronization.
> > > > An acquire access denotes some type of shared resource acquisition.
> > > > Loads and stores after the acquire load must be prevented from moving
> > > > up (either by the compiler or by the HW); this is a half-sided
> > > > barrier. Loads and stores from before the acquire are allowed to move
> > > > down.
> > > > A release access denotes the release of a shared resource. Loads and
> > > > stores before the release store must be prevented from moving down
> > > > (either by the compiler or by the HW); this is also a half-sided
> > > > barrier. Loads and stores after the release are allowed to move up.
> > > >
> > > > Code that uses atomic variables (e.g. for implementing shared memory
> > > > data structures such as locks and rings) must know which type of atomic
> > > > operations is required. The ODP ticket lock implementation makes a good
> > > > example:
> > > > void odp_ticketlock_lock(odp_ticketlock_t *ticketlock)
> > > > {
> > > >         uint32_t ticket;
> > > >
> > > >         ticket = odp_atomic32_fetch_add_rlx(&ticketlock->next_ticket, 1);
> > > >   //the ticket counter does not protect anything so incrementing it
> > > > can be relaxed
> > > >
> > > >         while (ticket != odp_atomic32_load_acq(&ticketlock->cur_ticket))
> > > >  //acquiring the currently served position will include (a half-sided)
> > > > barrier so as to contain accesses from inside the critical section
> > > >                 odp_spin();
> > > > }
> > > >
> > > >
> > > > void odp_ticketlock_unlock(odp_ticketlock_t *ticketlock)
> > > > {
> > > >         odp_atomic32_add_rls(&ticketlock->cur_ticket, 1);  //releasing
> > > > the currently served position will also include (a half-sided) barrier
> > > > to contain accesses from inside the critical section
> > > > }
> > > >
> > > > Implementations may use barriers of some kind inside these primitive
> > > > atomic operations. Some architectures don't even need explicit
> > > > barriers as they have memory access instructions (e.g. load and store)
> > > > with acquire and release semantics. Full barriers are heavy (and
> > > > semantically overkill) and you want to avoid them if possible. Using
> > > > full barriers for updates to e.g. global statistics counters will hurt
> > > > performance; such updates can be relaxed (they still need to be
> > > > atomic, of course).
> > > >
> > > > See these two good presentations by Herb Sutter of the C++ standards
> > > > committee:
> > > >
> > > >
> > http://channel9.msdn.com/Shows/Going+Deep/Cpp-and-Beyond-2012-Herb-Sutter-atomic-Weapons-1-of-2
> > > >
> > > >
> > http://channel9.msdn.com/Shows/Going+Deep/Cpp-and-Beyond-2012-Herb-Sutter-atomic-Weapons-2-of-2
> > > >
> > > > On 16 October 2014 01:18, Bill Fischofer <bill.fischofer@linaro.org>
> > > > wrote:
> > > > Why the odd suffixes (_rlx, _rls, _acq) on these routines?  If they are
> > > > intended to be generic wouldn't omitting these be better?
> > > >
> > > >
> > > >
> > > >
> > > >
> >
> > > _______________________________________________
> > > lng-odp mailing list
> > > lng-odp@lists.linaro.org
> > > http://lists.linaro.org/mailman/listinfo/lng-odp
> >
> >
diff mbox

Patch

diff --git a/example/generator/odp_generator.c b/example/generator/odp_generator.c
index eb8b340..cf2d77b 100644
--- a/example/generator/odp_generator.c
+++ b/example/generator/odp_generator.c
@@ -62,10 +62,10 @@  typedef struct {
  * counters
 */
 static struct {
-	odp_atomic_u64_t seq;	/**< ip seq to be send */
-	odp_atomic_u64_t ip;	/**< ip packets */
-	odp_atomic_u64_t udp;	/**< udp packets */
-	odp_atomic_u64_t icmp;	/**< icmp packets */
+	odp_atomic64_t seq;	/**< ip seq to be send */
+	odp_atomic64_t ip;	/**< ip packets */
+	odp_atomic64_t udp;	/**< udp packets */
+	odp_atomic64_t icmp;	/**< icmp packets */
 } counters;
 
 /** * Thread specific arguments
@@ -201,7 +201,7 @@  static void pack_udp_pkt(odp_buffer_t obuf)
 	ip->tot_len = odp_cpu_to_be_16(args->appl.payload + ODPH_UDPHDR_LEN +
 				       ODPH_IPV4HDR_LEN);
 	ip->proto = ODPH_IPPROTO_UDP;
-	seq = odp_atomic_fetch_add_u64(&counters.seq, 1) % 0xFFFF;
+	seq = odp_atomic64_fetch_add_rlx(&counters.seq, 1) % 0xFFFF;
 	ip->id = odp_cpu_to_be_16(seq);
 	ip->chksum = 0;
 	odph_ipv4_csum_update(pkt);
@@ -258,7 +258,7 @@  static void pack_icmp_pkt(odp_buffer_t obuf)
 	ip->tot_len = odp_cpu_to_be_16(args->appl.payload + ODPH_ICMPHDR_LEN +
 				       ODPH_IPV4HDR_LEN);
 	ip->proto = ODPH_IPPROTO_ICMP;
-	seq = odp_atomic_fetch_add_u64(&counters.seq, 1) % 0xffff;
+	seq = odp_atomic64_fetch_add_rlx(&counters.seq, 1) % 0xffff;
 	ip->id = odp_cpu_to_be_16(seq);
 	ip->chksum = 0;
 	odph_ipv4_csum_update(pkt);
@@ -334,13 +334,15 @@  static void *gen_send_thread(void *arg)
 		}
 
 		if (args->appl.interval != 0) {
+			uint64_t seq = odp_atomic64_load_rlx(&counters.seq);
 			printf("  [%02i] send pkt no:%ju seq %ju\n",
-			       thr, counters.seq, counters.seq%0xffff);
+			       thr, seq, seq%0xffff);
 			/* TODO use odp timer */
 			usleep(args->appl.interval * 1000);
 		}
-		if (args->appl.number != -1 && counters.seq
-		    >= (unsigned int)args->appl.number) {
+		if (args->appl.number != -1 &&
+		    odp_atomic64_load_rlx(&counters.seq) >=
+		    (unsigned int)args->appl.number) {
 			break;
 		}
 	}
@@ -348,7 +350,8 @@  static void *gen_send_thread(void *arg)
 	/* receive number of reply pks until timeout */
 	if (args->appl.mode == APPL_MODE_PING && args->appl.number > 0) {
 		while (args->appl.timeout >= 0) {
-			if (counters.icmp >= (unsigned int)args->appl.number)
+			if (odp_atomic64_load_rlx(&counters.icmp) >=
+			    (unsigned int)args->appl.number)
 				break;
 			/* TODO use odp timer */
 			sleep(1);
@@ -358,10 +361,12 @@  static void *gen_send_thread(void *arg)
 
 	/* print info */
 	if (args->appl.mode == APPL_MODE_UDP) {
-		printf("  [%02i] total send: %ju\n", thr, counters.seq);
+		printf("  [%02i] total send: %ju\n", thr,
+		       odp_atomic64_load_rlx(&counters.seq));
 	} else if (args->appl.mode == APPL_MODE_PING) {
 		printf("  [%02i] total send: %ju total receive: %ju\n",
-		       thr, counters.seq, counters.icmp);
+		       thr, odp_atomic64_load_rlx(&counters.seq),
+		       odp_atomic64_load_rlx(&counters.icmp));
 	}
 	return arg;
 }
@@ -395,7 +400,7 @@  static void print_pkts(int thr, odp_packet_t pkt_tbl[], unsigned len)
 		if (!odp_packet_inflag_ipv4(pkt))
 			continue;
 
-		odp_atomic_inc_u64(&counters.ip);
+		odp_atomic64_add_rlx(&counters.ip, 1);
 		rlen += sprintf(msg, "receive Packet proto:IP ");
 		buf = odp_buffer_addr(odp_buffer_from_packet(pkt));
 		ip = (odph_ipv4hdr_t *)(buf + odp_packet_l3_offset(pkt));
@@ -405,7 +410,7 @@  static void print_pkts(int thr, odp_packet_t pkt_tbl[], unsigned len)
 
 		/* udp */
 		if (ip->proto == ODPH_IPPROTO_UDP) {
-			odp_atomic_inc_u64(&counters.udp);
+			odp_atomic64_add_rlx(&counters.udp, 1);
 			udp = (odph_udphdr_t *)(buf + offset);
 			rlen += sprintf(msg + rlen, "UDP payload %d ",
 					odp_be_to_cpu_16(udp->length) -
@@ -417,7 +422,7 @@  static void print_pkts(int thr, odp_packet_t pkt_tbl[], unsigned len)
 			icmp = (odph_icmphdr_t *)(buf + offset);
 			/* echo reply */
 			if (icmp->type == ICMP_ECHOREPLY) {
-				odp_atomic_inc_u64(&counters.icmp);
+				odp_atomic64_add_rlx(&counters.icmp, 1);
 				memcpy(&tvsend, buf + offset + ODPH_ICMPHDR_LEN,
 				       sizeof(struct timeval));
 				/* TODO This should be changed to use an
@@ -530,10 +535,10 @@  int main(int argc, char *argv[])
 	}
 
 	/* init counters */
-	odp_atomic_init_u64(&counters.seq);
-	odp_atomic_init_u64(&counters.ip);
-	odp_atomic_init_u64(&counters.udp);
-	odp_atomic_init_u64(&counters.icmp);
+	odp_atomic64_store_rlx(&counters.seq, 0);
+	odp_atomic64_store_rlx(&counters.ip, 0);
+	odp_atomic64_store_rlx(&counters.udp, 0);
+	odp_atomic64_store_rlx(&counters.icmp, 0);
 
 	/* Reserve memory for args from shared mem */
 	shm = odp_shm_reserve("shm_args", sizeof(args_t),
diff --git a/example/ipsec/odp_ipsec.c b/example/ipsec/odp_ipsec.c
index 2f2dc19..76c27d0 100644
--- a/example/ipsec/odp_ipsec.c
+++ b/example/ipsec/odp_ipsec.c
@@ -1223,7 +1223,7 @@  main(int argc, char *argv[])
 	printf("Num worker threads: %i\n", num_workers);
 
 	/* Create a barrier to synchronize thread startup */
-	odp_barrier_init_count(&sync_barrier, num_workers);
+	odp_barrier_init(&sync_barrier, num_workers);
 
 	/*
 	 * By default core #0 runs Linux kernel background tasks.
diff --git a/example/odp_example/odp_example.c b/example/odp_example/odp_example.c
index 0e9aa3d..c473395 100644
--- a/example/odp_example/odp_example.c
+++ b/example/odp_example/odp_example.c
@@ -1120,7 +1120,7 @@  int main(int argc, char *argv[])
 	odp_shm_print_all();
 
 	/* Barrier to sync test case execution */
-	odp_barrier_init_count(&globals->barrier, num_workers);
+	odp_barrier_init(&globals->barrier, num_workers);
 
 	if (args.proc_mode) {
 		int ret;
diff --git a/example/timer/odp_timer_test.c b/example/timer/odp_timer_test.c
index 78b2ae2..dfbeae9 100644
--- a/example/timer/odp_timer_test.c
+++ b/example/timer/odp_timer_test.c
@@ -372,7 +372,7 @@  int main(int argc, char *argv[])
 	printf("\n");
 
 	/* Barrier to sync test case execution */
-	odp_barrier_init_count(&test_barrier, num_workers);
+	odp_barrier_init(&test_barrier, num_workers);
 
 	/* Create and launch worker threads */
 	odph_linux_pthread_create(thread_tbl, num_workers, first_core,
diff --git a/helper/include/odph_ring.h b/helper/include/odph_ring.h
index 76c1db8..5e78b34 100644
--- a/helper/include/odph_ring.h
+++ b/helper/include/odph_ring.h
@@ -138,8 +138,8 @@  typedef struct odph_ring {
 		uint32_t sp_enqueue;     /* True, if single producer. */
 		uint32_t size;           /* Size of ring. */
 		uint32_t mask;           /* Mask (size-1) of ring. */
-		uint32_t head;		/* Producer head. */
-		uint32_t tail;		/* Producer tail. */
+		odp_atomic32_t head;	/* Producer head. */
+		odp_atomic32_t tail;	/* Producer tail. */
 	} prod ODP_ALIGNED_CACHE;
 
 	/** @private Consumer */
@@ -147,8 +147,8 @@  typedef struct odph_ring {
 		uint32_t sc_dequeue;     /* True, if single consumer. */
 		uint32_t size;           /* Size of the ring. */
 		uint32_t mask;           /* Mask (size-1) of ring. */
-		uint32_t head;		/* Consumer head. */
-		uint32_t tail;		/* Consumer tail. */
+		odp_atomic32_t head;	/* Consumer head. */
+		odp_atomic32_t tail;	/* Consumer tail. */
 	} cons ODP_ALIGNED_CACHE;
 
 	/** @private Memory space of ring starts here. */
diff --git a/platform/linux-generic/include/api/odp_atomic.h b/platform/linux-generic/include/api/odp_atomic.h
index 0cc4cf4..89f183c 100644
--- a/platform/linux-generic/include/api/odp_atomic.h
+++ b/platform/linux-generic/include/api/odp_atomic.h
@@ -4,463 +4,559 @@ 
  * SPDX-License-Identifier:     BSD-3-Clause
  */
 
-
 /**
  * @file
  *
- * ODP atomic operations
+ * ODP atomic types and operations, semantically a subset of C11 atomics.
+ * Each scalar variable is wrapped in a struct so that it cannot be
+ * accessed directly, bypassing the required access functions.
+ * Atomic functions must be used to operate on atomic variables!
  */
 
 #ifndef ODP_ATOMIC_H_
 #define ODP_ATOMIC_H_
 
+#include <stdint.h>
+#include <odp_align.h>
+#include <odp_hints.h>
+
 #ifdef __cplusplus
 extern "C" {
 #endif
 
-
-#include <odp_std_types.h>
-
-
-/**
- * Atomic integer
- */
-typedef volatile int32_t odp_atomic_int_t;
-
 /**
- * Atomic unsigned integer 64 bits
+ * 32-bit (unsigned) atomic type
  */
-typedef volatile uint64_t odp_atomic_u64_t;
+typedef struct {
+	uint32_t v; /**< Actual storage for the atomic variable */
+} odp_atomic32_t
+ODP_ALIGNED(sizeof(uint32_t)); /* Enforce alignment! */
 
 /**
- * Atomic unsigned integer 32 bits
+ * 64-bit (unsigned) atomic type
  */
-typedef volatile uint32_t odp_atomic_u32_t;
-
+typedef struct {
+	uint64_t v; /**< Actual storage for the atomic variable */
+} odp_atomic64_t
+ODP_ALIGNED(sizeof(uint64_t)); /* Enforce alignment! */
 
-/**
- * Initialize atomic integer
- *
- * @param ptr    An integer atomic variable
- *
- * @note The operation is not synchronized with other threads
- */
-static inline void odp_atomic_init_int(odp_atomic_int_t *ptr)
-{
-	*ptr = 0;
-}
-
-/**
- * Load value of atomic integer
- *
- * @param ptr    An atomic variable
- *
- * @return atomic integer value
- *
- * @note The operation is not synchronized with other threads
- */
-static inline int odp_atomic_load_int(odp_atomic_int_t *ptr)
-{
-	return *ptr;
-}
+/*****************************************************************************
+ * Just a few helpers
+ *****************************************************************************/
 
-/**
- * Store value to atomic integer
- *
- * @param ptr        An atomic variable
- * @param new_value  Store new_value to a variable
- *
- * @note The operation is not synchronized with other threads
- */
-static inline void odp_atomic_store_int(odp_atomic_int_t *ptr, int new_value)
-{
-	*ptr = new_value;
-}
-
-/**
- * Fetch and add atomic integer
- *
- * @param ptr    An atomic variable
- * @param value  A value to be added to the variable
- *
- * @return Value of the variable before the operation
- */
-static inline int odp_atomic_fetch_add_int(odp_atomic_int_t *ptr, int value)
-{
-	return __sync_fetch_and_add(ptr, value);
-}
-
-/**
- * Fetch and subtract atomic integer
- *
- * @param ptr    An atomic integer variable
- * @param value  A value to be subtracted from the variable
- *
- * @return Value of the variable before the operation
- */
-static inline int odp_atomic_fetch_sub_int(odp_atomic_int_t *ptr, int value)
-{
-	return __sync_fetch_and_sub(ptr, value);
-}
-
-/**
- * Fetch and increment atomic integer by 1
- *
- * @param ptr    An atomic variable
- *
- * @return Value of the variable before the operation
- */
-static inline int odp_atomic_fetch_inc_int(odp_atomic_int_t *ptr)
-{
-	return odp_atomic_fetch_add_int(ptr, 1);
-}
-
-/**
- * Increment atomic integer by 1
- *
- * @param ptr    An atomic variable
- *
- */
-static inline void odp_atomic_inc_int(odp_atomic_int_t *ptr)
-{
-	odp_atomic_fetch_add_int(ptr, 1);
-}
-
-/**
- * Fetch and decrement atomic integer by 1
- *
- * @param ptr    An atomic int variable
- *
- * @return Value of the variable before the operation
- */
-static inline int odp_atomic_fetch_dec_int(odp_atomic_int_t *ptr)
-{
-	return odp_atomic_fetch_sub_int(ptr, 1);
-}
-
-/**
- * Decrement atomic integer by 1
- *
- * @param ptr    An atomic variable
- *
- */
-static inline void odp_atomic_dec_int(odp_atomic_int_t *ptr)
-{
-	odp_atomic_fetch_sub_int(ptr, 1);
-}
+#ifdef __OCTEON__
+/* OCTEON Write Memory Barrier */
+#define COMPILER_HW_BARRIER() __asm __volatile( \
+	/* Double syncw to work around errata */ \
+	".set push\n\t.set arch=octeon\n\tsyncw\n\tsyncw\n\t.set pop" \
+	: : : "memory")
+/* syncw is also used to flush the write buffer, which makes stores visible
+ * more quickly and should be beneficial to release operations */
+#define OCTEON_FLUSH() __asm __volatile( \
+	".set push\n\t.set arch=octeon\n\tsyncw\n\t.set pop" \
+	: : : "memory")
+#else
+/* __sync_synchronize() generates the right insn for ARMv6t2 and ARMv7-a */
+/** Compiler and hardware full memory barrier */
+#define COMPILER_HW_BARRIER() __sync_synchronize()
+/** Flush write buffer on OCTEON */
+#define OCTEON_FLUSH() (void)0
+#endif
 
-/**
- * Initialize atomic uint32
- *
- * @param ptr    An atomic variable
- *
- * @note The operation is not synchronized with other threads
- */
-static inline void odp_atomic_init_u32(odp_atomic_u32_t *ptr)
-{
-	*ptr = 0;
-}
+/** Compiler memory barrier */
+#define COMPILER_BARRIER() __asm __volatile("" : : : "memory")
 
-/**
- * Load value of atomic uint32
- *
- * @param ptr    An atomic variable
- *
- * @return atomic uint32 value
- *
- * @note The operation is not synchronized with other threads
- */
-static inline uint32_t odp_atomic_load_u32(odp_atomic_u32_t *ptr)
-{
-	return *ptr;
-}
+/*****************************************************************************
+ * Operations on 32-bit atomics
+ * odp_atomic32_load_rlx
+ * odp_atomic32_store_rlx
+ * odp_atomic32_load_acq
+ * odp_atomic32_store_rls
+ * odp_atomic32_cmp_and_swap_rlx - return old value
+ * odp_atomic32_fetch_add_rlx - return old value
+ * odp_atomic32_fetch_add_rls - return old value
+ * odp_atomic32_add_rlx - no return value
+ * odp_atomic32_add_rls - no return value
+ *****************************************************************************/
 
 /**
- * Store value to atomic uint32
+ * Relaxed atomic load of 32-bit atomic variable
+ * @note Relaxed memory model, no barriers.
  *
- * @param ptr        An atomic variable
- * @param new_value  Store new_value to a variable
+ * @param ptr   Pointer to a 32-bit atomic variable
  *
- * @note The operation is not synchronized with other threads
+ * @return Value of the variable
  */
-static inline void odp_atomic_store_u32(odp_atomic_u32_t *ptr,
-					uint32_t new_value)
+static inline uint32_t odp_atomic32_load_rlx(const odp_atomic32_t *ptr)
 {
-	*ptr = new_value;
+	uint32_t val;
+	COMPILER_BARRIER();
+	/* Read of aligned word is atomic */
+	val = ptr->v;
+	COMPILER_BARRIER();
+	return val;
 }
 
 /**
- * Fetch and add atomic uint32
- *
- * @param ptr    An atomic variable
- * @param value  A value to be added to the variable
+ * Relaxed atomic store of 32-bit atomic variable
+ * @note Relaxed memory model, no barriers.
  *
- * @return Value of the variable before the operation
+ * @param ptr   Pointer to a 32-bit atomic variable
+ * @param val   Value to write to the variable
  */
-static inline uint32_t odp_atomic_fetch_add_u32(odp_atomic_u32_t *ptr,
-						uint32_t value)
+static inline void odp_atomic32_store_rlx(odp_atomic32_t *ptr, uint32_t val)
 {
-	return __sync_fetch_and_add(ptr, value);
+	COMPILER_BARRIER();
+	/* Write of aligned word is atomic */
+	ptr->v = val;
+	COMPILER_BARRIER();
 }
 
 /**
- * Fetch and subtract uint32
+ * Atomic load-acquire of 32-bit atomic variable
+ * @note SC-load-acquire barrier, later accesses cannot move before
+ * the load-acquire access.
  *
- * @param ptr    An atomic variable
- * @param value  A value to be sub to the variable
+ * @param ptr   Pointer to a 32-bit atomic variable
  *
- * @return Value of the variable before the operation
+ * @return Value of the variable
  */
-static inline uint32_t odp_atomic_fetch_sub_u32(odp_atomic_u32_t *ptr,
-						uint32_t value)
+static inline uint32_t odp_atomic32_load_acq(const odp_atomic32_t *ptr)
 {
-	return __sync_fetch_and_sub(ptr, value);
+#if defined __aarch64__
+	uint32_t val;
+	__asm __volatile("ldar %w0, [%1]"
+		 : "=&r"(val)
+				 : "r"(&ptr->v)
+				 : "memory");
+	return val;
+#elif defined __arm__  || defined __mips64__ || defined __x86_64__
+	/* Read of aligned word is atomic */
+	uint32_t val = ptr->v;
+	/* To prevent later accesses from moving up */
+	/* FIXME: Herb Sutter claims HW barrier not needed on x86? */
+	COMPILER_HW_BARRIER();
+	return val;
+#else
+#warning odp_atomic32_load_acq() may not be efficiently implemented
+	/* Assume read of aligned word is atomic */
+	uint32_t val = ptr->v;
+	/* To prevent later accesses from moving up */
+	COMPILER_HW_BARRIER();
+	return val;
+#endif
 }
 
 /**
- * Fetch and increment atomic uint32 by 1
- *
- * @param ptr    An atomic variable
- *
- * @return Value of the variable before the operation
- */
-#if defined __OCTEON__
-
-static inline uint32_t odp_atomic_fetch_inc_u32(odp_atomic_u32_t *ptr)
-{
-	uint32_t ret;
-
-	__asm__ __volatile__ ("syncws");
-	__asm__ __volatile__ ("lai %0,(%2)" : "=r" (ret), "+m" (ptr) :
-			      "r" (ptr));
-
-	return ret;
-}
-
+ * Atomic store-release of 32-bit atomic variable
+ * @note SC-store-release barrier, earlier accesses cannot move after
+ * the store-release access.
+ *
+ * @param ptr  Pointer to a 32-bit atomic variable
+ * @param val  Value to write to the atomic variable
+ */
+static inline void odp_atomic32_store_rls(odp_atomic32_t *ptr, uint32_t val)
+{
+#if defined __arm__ /* A32/T32 ISA */
+	/* Compiler and HW barrier to prevent earlier accesses from moving
+	 * down */
+	COMPILER_HW_BARRIER();
+	/* Write of aligned word is atomic */
+	ptr->v = val;
+	/* Compiler and HW barrier to prevent this store from moving down after
+	 * a later load-acquire and thus create overlapping critical sections.
+	 * Herb Sutter thinks this is needed */
+	COMPILER_HW_BARRIER();
+#elif defined __aarch64__
+	__asm __volatile("stlr %w0, [%1]"
+		 :
+		 : "r"(val), "r"(&ptr->v)
+				 : "memory");
+#elif defined __mips64__
+	/* Compiler and HW barrier to prevent earlier accesses from moving
+	 * down */
+	COMPILER_HW_BARRIER();
+	/* Write of aligned word is atomic */
+	ptr->v = val;
+	/* Compiler and HW barrier to prevent this store from moving down after
+	 * a later load-acquire and thus create overlapping critical sections.
+	 * Herb Sutter thinks this is needed */
+	COMPILER_HW_BARRIER();
+#elif defined __x86_64__
+	/* This is actually an atomic exchange operation */
+	/* Generates good code on x86_64 */
+	(void)__sync_lock_test_and_set(&ptr->v, val);
 #else
-
-static inline uint32_t odp_atomic_fetch_inc_u32(odp_atomic_u32_t *ptr)
-{
-	return odp_atomic_fetch_add_u32(ptr, 1);
-}
-
+#warning odp_atomic32_store_rls() may not be efficiently implemented
+	/* This is actually an atomic exchange operation */
+	(void)__sync_lock_test_and_set(&ptr->v, val);
 #endif
-
-/**
- * Increment atomic uint32 by 1
- *
- * @param ptr    An atomic variable
- *
- */
-static inline void odp_atomic_inc_u32(odp_atomic_u32_t *ptr)
-{
-	odp_atomic_fetch_add_u32(ptr, 1);
 }
 
-/**
- * Fetch and decrement uint32 by 1
- *
- * @param ptr    An atomic variable
- *
- * @return Value of the variable before the operation
- */
-static inline uint32_t odp_atomic_fetch_dec_u32(odp_atomic_u32_t *ptr)
-{
-	return odp_atomic_fetch_sub_u32(ptr, 1);
-}
 
 /**
- * Decrement atomic uint32 by 1
- *
- * @param ptr    An atomic variable
- *
- */
-static inline void odp_atomic_dec_u32(odp_atomic_u32_t *ptr)
-{
-	odp_atomic_fetch_sub_u32(ptr, 1);
+ * Atomic compare and swap of 32-bit atomic variable
+ * @note Relaxed memory model, no barriers.
+ * @note Not compare-and-set! The caller should compare the return value with
+ * the expected parameter to check whether the swap operation succeeded.
+ *
+ * @param ptr  Pointer to a 32-bit atomic variable
+ * @param exp  Expected old value
+ * @param val  New value
+ * @return Actual old value, if different from 'exp' then swap failed
+ */
+static inline uint32_t
+odp_atomic32_cmp_and_swap_rlx(odp_atomic32_t *ptr,
+			      uint32_t exp,
+			      uint32_t val)
+{
+#if defined __arm__ /* A32/T32 ISA */
+	uint32_t old;
+	int status;
+	do {
+		__asm __volatile("ldrex %0, [%1]"
+		 : "=&r"(old)
+					 : "r"(&ptr->v)
+					 : "memory");
+		if (odp_unlikely(old != exp)) {
+			/* Value has changed, can't proceed */
+			/* Clear exclusive access monitor */
+			__asm __volatile("clrex");
+			break;
+		}
+		/* Current value is as expected, attempt to write new value */
+		__asm __volatile("strex %0, %1, [%2]"
+		 : "=&r"(status)
+					 : "r"(val), "r"(&ptr->v)
+					 : "memory");
+		/* Restart the loop so we can re-read the previous value */
+	} while (odp_unlikely(status != 0));
+	return old;
+#elif defined __aarch64__
+	uint32_t old;
+	int status;
+	do {
+		__asm __volatile("ldxr %w0, [%1]"
+		 : "=&r"(old)
+					 : "r"(&ptr->v)
+					 : "memory");
+		if (odp_unlikely(old != exp)) {
+			/* Value has changed, can't proceed */
+			/* Clear exclusive access monitor */
+			__asm __volatile("clrex");
+			break;
+		}
+		/* Current value is as expected, attempt to write new value */
+		__asm __volatile("stxr %w0, %w1, [%2]"
+		 : "=&r"(status)
+					 : "r"(val), "r"(&ptr->v)
+					 : "memory");
+		/* Restart the loop so we can re-read the previous value */
+	} while (odp_unlikely(status != 0));
+	return old;
+#elif defined __mips64__
+	uint32_t old, new_val;
+	do {
+		__asm __volatile("ll %0, 0(%1)"
+		 : "=&r"(old)
+					 : "r"(&ptr->v)
+					 : "memory");
+		if (odp_unlikely(old != exp)) {
+			/* Value has changed, can't proceed */
+			break;
+		}
+		/* Current value is as expected, attempt to write new value */
+		new_val = val;
+		__asm __volatile("sc %0, 0(%1)"
+		 : "+&r"(new_val)
+					 : "r"(&ptr->v)
+					 : "memory");
+	} while (odp_unlikely(new_val == 0));
+	return old;
+#elif defined __x86_64__
+	/* Generates good code on x86_64 */
+	return __sync_val_compare_and_swap(&ptr->v, exp, val);
+#else
+#warning odp_atomic32_cmp_and_swap_rlx() may not be efficiently implemented
+	return __sync_val_compare_and_swap(&ptr->v, exp, val);
+#endif
 }
 
 /**
- * Atomic compare and set for 32bit
- *
- * @param dst destination location into which the value will be written.
- * @param exp expected value.
- * @param src new value.
- * @return Non-zero on success; 0 on failure.
- */
-static inline int
-odp_atomic_cmpset_u32(odp_atomic_u32_t *dst, uint32_t exp, uint32_t src)
-{
-	return __sync_bool_compare_and_swap(dst, exp, src);
+ * Atomic fetch and add to 32-bit atomic variable
+ * @note Relaxed memory model, no barriers.
+ * @note A - B <=> A + (-B)
+ *
+ * @param ptr   Pointer to a 32-bit atomic variable
+ * @param incr  The value to be added to the atomic variable
+ *
+ * @return Value of the atomic variable before the addition
+ */
+static inline uint32_t odp_atomic32_fetch_add_rlx(odp_atomic32_t *ptr,
+		uint32_t incr)
+{
+#if defined __arm__ /* A32/T32 ISA */
+	uint32_t old_val, new_val;
+	int status;
+	do {
+		__asm __volatile("ldrex %0, [%1]"
+		 : "=&r"(old_val)
+					 : "r"(&ptr->v)
+					 : "memory");
+		new_val = old_val + incr;
+		__asm __volatile("strex %0, %1, [%2]"
+		 : "=&r"(status)
+					 : "r"(new_val), "r"(&ptr->v)
+					 : "memory");
+	} while (odp_unlikely(status != 0));
+	return old_val;
+#elif defined __aarch64__
+	uint32_t old_val, new_val;
+	int status;
+	do {
+		__asm __volatile("ldxr %w0, [%1]"
+		 : "=&r"(old_val)
+					 : "r"(&ptr->v)
+					 : "memory");
+		new_val = old_val + incr;
+		__asm __volatile("stxr %w0, %w1, [%2]"
+		 : "=&r"(status)
+					 : "r"(new_val), "r"(&ptr->v)
+					 : "memory");
+	} while (odp_unlikely(status != 0));
+	return old_val;
+#elif defined __mips64__
+	uint32_t old_val, new_val;
+	do {
+		__asm __volatile("ll %0, 0(%1)"
+		 : "=&r"(old_val)
+					 : "r"(&ptr->v)
+					 : "memory");
+		new_val = old_val + incr;
+		__asm __volatile("sc %0, 0(%1)"
+		 : "+&r"(new_val)
+					 : "r"(&ptr->v)
+					 : "memory");
+	} while (odp_unlikely(new_val == 0));
+	return old_val;
+#elif defined __x86_64__
+	/* Generates good code on x86_64 */
+	return __sync_fetch_and_add(&ptr->v, incr);
+#else
+#warning odp_atomic32_fetch_add_rlx() may not be efficiently implemented
+	return __sync_fetch_and_add(&ptr->v, incr);
+#endif
 }
 
 /**
- * Initialize atomic uint64
+ * Atomic fetch and add to 32-bit atomic variable
+ * @note Sequential consistent memory model, barriers before and after the
+ * operation.
+ * @note A - B <=> A + (-B)
  *
- * @param ptr    An atomic variable
+ * @param ptr   Pointer to a 32-bit atomic variable
+ * @param incr  The value to be added to the atomic variable
  *
- * @note The operation is not synchronized with other threads
+ * @return Value of the atomic variable before the addition
  */
-static inline void odp_atomic_init_u64(odp_atomic_u64_t *ptr)
+static inline uint32_t odp_atomic32_fetch_add_rls(odp_atomic32_t *ptr,
+		uint32_t incr)
 {
-	*ptr = 0;
+#if defined __arm__ /* A32/T32 ISA */
+	COMPILER_HW_BARRIER();
+	return odp_atomic32_fetch_add_rlx(ptr, incr);
+#elif defined __aarch64__
+	/* We basically get acquire/release semantics */
+	return __sync_fetch_and_add(&ptr->v, incr);
+#elif defined __mips64__
+	uint32_t old;
+	COMPILER_HW_BARRIER();
+	old = odp_atomic32_fetch_add_rlx(ptr, incr);
+	OCTEON_FLUSH();
+	return old;
+#elif defined __x86_64__
+	/* Generates good code on x86_64 */
+	return __sync_fetch_and_add(&ptr->v, incr);
+#else
+#warning odp_atomic32_fetch_add_rls() may not be efficiently implemented
+	return __sync_fetch_and_add(&ptr->v, incr);
+#endif
 }
 
 /**
- * Load value of atomic uint64
- *
- * @param ptr    An atomic variable
+ * Atomic add to 32-bit atomic variable
+ * @note Relaxed memory model, no barriers.
  *
- * @return atomic uint64 value
- *
- * @note The operation is not synchronized with other threads
+ * @param ptr   Pointer to a 32-bit atomic variable
+ * @param incr  The value to be added to the atomic variable
  */
-static inline uint64_t odp_atomic_load_u64(odp_atomic_u64_t *ptr)
+static inline void odp_atomic32_add_rlx(odp_atomic32_t *ptr,
+					uint32_t incr)
 {
-	return *ptr;
+	/* Use odp_atomic32_fetch_add_rlx() for now */
+	(void)odp_atomic32_fetch_add_rlx(ptr, incr);
 }
 
 /**
- * Store value to atomic uint64
- *
- * @param ptr        An atomic variable
- * @param new_value  Store new_value to a variable
+ * Atomic add to 32-bit atomic variable
+ * @note Sequential consistent memory model, barriers before and after the
+ * operation.
  *
- * @note The operation is not synchronized with other threads
+ * @param ptr   Pointer to a 32-bit atomic variable
+ * @param incr  The value to be added to the atomic variable
  */
-static inline void odp_atomic_store_u64(odp_atomic_u64_t *ptr,
-					uint64_t new_value)
+static inline void odp_atomic32_add_rls(odp_atomic32_t *ptr, uint32_t incr)
 {
-	*ptr = new_value;
+	/* Use odp_atomic32_fetch_add_rls() for now */
+	(void)odp_atomic32_fetch_add_rls(ptr, incr);
 }
 
-/**
- * Add atomic uint64
- *
- * @param ptr    An atomic variable
- * @param value  A value to be added to the variable
- *
- */
-static inline void odp_atomic_add_u64(odp_atomic_u64_t *ptr, uint64_t value)
-{
-	__sync_fetch_and_add(ptr, value);
-}
+/*****************************************************************************
+ * Operations on 64-bit atomics
+ * odp_atomic64_load_rlx
+ * odp_atomic64_store_rlx
+ * odp_atomic64_fetch_add_rlx
+ * odp_atomic64_add_rlx
+ *****************************************************************************/
 
 /**
- * Fetch and add atomic uint64
+ * Relaxed atomic load of 64-bit atomic variable
+ * @note Relaxed memory model, no barriers.
  *
- * @param ptr    An atomic variable
- * @param value  A value to be added to the variable
+ * @param ptr   Pointer to a 64-bit atomic variable
  *
- * @return Value of the variable before the operation
+ * @return Value of the atomic variable
  */
-
-#if defined __powerpc__ && !defined __powerpc64__
-static inline uint64_t odp_atomic_fetch_add_u64(odp_atomic_u64_t *ptr,
-						uint64_t value)
+static inline uint64_t odp_atomic64_load_rlx(odp_atomic64_t *ptr)
 {
-	return __sync_fetch_and_add((odp_atomic_u32_t *)ptr,
-				    (uint32_t)value);
-}
+#if defined __arm__ /* A32/T32 ISA */
+	uint64_t val;
+	__asm __volatile("ldrexd %0, %H0, [%1]\n\t"
+			 "clrex" /* Clear exclusive access monitor */
+		 : "=&r"(val)
+				 : "r"(&ptr->v)
+				 : );
+	return val;
+#elif defined __x86_64__ || defined __aarch64__ || defined __mips64__
+	/* Read of aligned quad/double word is atomic */
+	return ptr->v;
 #else
-static inline uint64_t odp_atomic_fetch_add_u64(odp_atomic_u64_t *ptr,
-						uint64_t value)
-{
-	return __sync_fetch_and_add(ptr, value);
-}
+#warning odp_atomic64_load_rlx() may not be efficiently implemented
+	return __sync_fetch_and_or(&ptr->v, 0);
 #endif
-/**
- * Subtract atomic uint64
- *
- * @param ptr    An atomic variable
- * @param value  A value to be subtracted from the variable
- *
- */
-static inline void odp_atomic_sub_u64(odp_atomic_u64_t *ptr, uint64_t value)
-{
-	__sync_fetch_and_sub(ptr, value);
 }
 
 /**
- * Fetch and subtract atomic uint64
- *
- * @param ptr    An atomic variable
- * @param value  A value to be subtracted from the variable
- *
- * @return Value of the variable before the operation
- */
-#if defined __powerpc__ && !defined __powerpc64__
-static inline uint64_t odp_atomic_fetch_sub_u64(odp_atomic_u64_t *ptr,
-						uint64_t value)
-{
-	return __sync_fetch_and_sub((odp_atomic_u32_t *)ptr,
-				    (uint32_t)value);
-}
+ * Relaxed atomic store of 64-bit atomic variable
+ * @note Relaxed memory model, no barriers.
+ *
+ * @param ptr  Pointer to a 64-bit atomic variable
+ * @param val  Value to write to the atomic variable
+ */
+static inline void odp_atomic64_store_rlx(odp_atomic64_t *ptr,
+		uint64_t val)
+{
+#if defined __arm__ /* A32/T32 ISA */
+	uint64_t old_val;
+	int status;
+	do {
+		/* Read atomic variable exclusively so we can write to it
+		 * later */
+		__asm __volatile("ldrexd %0, %H0, [%1]"
+		 : "=&r"(old_val)
+					 : "r"(&ptr->v)
+					 : "memory");
+		(void)old_val; /* Ignore old value */
+		/* Attempt to write the new value */
+		__asm __volatile("strexd %0, %1, %H1, [%2]"
+		 : "=&r"(status)
+					 : "r"(val), "r"(&ptr->v)
+					 : "memory");
+	} while (odp_unlikely(status != 0)); /* Retry until write succeeds */
+#elif defined __x86_64__ || defined __aarch64__ || defined __mips64__
+	/* Write of aligned quad/double word is atomic */
+	ptr->v = val;
 #else
-static inline uint64_t odp_atomic_fetch_sub_u64(odp_atomic_u64_t *ptr,
-						uint64_t value)
-{
-	return __sync_fetch_and_sub(ptr, value);
-}
+#warning odp_atomic64_store_rlx() may not be efficiently implemented
+	/* This is actually an atomic exchange operation */
+	(void)__sync_lock_test_and_set(&ptr->v, val);
 #endif
-/**
- * Fetch and increment atomic uint64 by 1
- *
- * @param ptr    An atomic variable
- *
- * @return Value of the variable before the operation
- */
-static inline uint64_t odp_atomic_fetch_inc_u64(odp_atomic_u64_t *ptr)
-{
-	return odp_atomic_fetch_add_u64(ptr, 1);
-}
-
-/**
- * Increment atomic uint64 by 1
- *
- * @param ptr    An atomic variable
- *
- */
-static inline void odp_atomic_inc_u64(odp_atomic_u64_t *ptr)
-{
-	odp_atomic_fetch_add_u64(ptr, 1);
-}
-
-/**
- * Fetch and decrement atomic uint64 by 1
- *
- * @param ptr    An atomic variable
- *
- * @return Value of the variable before the operation
- */
-static inline uint64_t odp_atomic_fetch_dec_u64(odp_atomic_u64_t *ptr)
-{
-	return odp_atomic_fetch_sub_u64(ptr, 1);
 }
 
 /**
- * Decrement atomic uint64 by 1
- *
- * @param ptr    An atomic variable
- *
- */
-static inline void odp_atomic_dec_u64(odp_atomic_u64_t *ptr)
-{
-	odp_atomic_fetch_sub_u64(ptr, 1);
+ * Atomic fetch and add to 64-bit atomic variable
+ * @note Relaxed memory model, no barriers.
+ *
+ * @param ptr   Pointer to a 64-bit atomic variable
+ * @param incr  The value to be added to the atomic variable
+ *
+ * @return Value of the atomic variable before the addition
+ */
+static inline uint64_t odp_atomic64_fetch_add_rlx(odp_atomic64_t *ptr,
+		uint64_t incr)
+{
+#if defined __arm__ /* A32/T32 ISA */
+	uint64_t old_val, new_val;
+	int status;
+	do {
+		__asm __volatile("ldrexd %0, %H0, [%1]"
+		 : "=&r"(old_val)
+					 : "r"(&ptr->v)
+					 : "memory");
+		new_val = old_val + incr;
+		__asm __volatile("strexd %0, %1, %H1, [%2]"
+		 : "=&r"(status)
+					 : "r"(new_val), "r"(&ptr->v)
+					 : "memory");
+	} while (odp_unlikely(status != 0)); /* Retry until write succeeds */
+	return old_val;
+#elif defined __aarch64__
+	uint64_t old_val, new_val;
+	int status;
+	do {
+		__asm __volatile("ldxr %x0, [%1]"
+		 : "=&r"(old_val)
+					 : "r"(&ptr->v)
+					 : "memory");
+		new_val = old_val + incr;
+		__asm __volatile("stxr %w0, %x1, [%2]"
+		 : "=&r"(status)
+					 : "r"(new_val), "r"(&ptr->v)
+					 : "memory");
+	} while (odp_unlikely(status != 0)); /* Retry until write succeeds */
+	return old_val;
+#elif defined __mips64__
+	uint64_t old_val, new_val;
+	do {
+		__asm __volatile("lld %0, 0(%1)"
+		 : "=&r"(old_val)
+					 : "r"(&ptr->v)
+					 : "memory");
+		new_val = old_val + incr;
+		__asm __volatile("scd %0, 0(%1)"
+		 : "+&r"(new_val)
+					 : "r"(&ptr->v)
+					 : "memory");
+	} while (odp_unlikely(new_val == 0));
+	return old_val;
+#elif defined __x86_64__
+	/* Generates good code on x86_64 */
+	return __sync_fetch_and_add(&ptr->v, incr);
+#else
+#warning odp_atomic64_fetch_add_rlx() may not be efficiently implemented
+	return __sync_fetch_and_add(&ptr->v, incr);
+#endif
 }
 
 /**
- * Atomic compare and set for 64bit
+ * Atomic add to 64-bit atomic variable
+ * @note Relaxed memory model, no barriers.
  *
- * @param dst destination location into which the value will be written.
- * @param exp expected value.
- * @param src new value.
- * @return Non-zero on success; 0 on failure.
+ * @param ptr   Pointer to a 64-bit atomic variable
+ * @param incr  The value to be added to the atomic variable
  */
-static inline int
-odp_atomic_cmpset_u64(odp_atomic_u64_t *dst, uint64_t exp, uint64_t src)
+static inline void odp_atomic64_add_rlx(odp_atomic64_t *ptr, uint64_t incr)
 {
-	return __sync_bool_compare_and_swap(dst, exp, src);
+	(void)odp_atomic64_fetch_add_rlx(ptr, incr);
 }
 
 #ifdef __cplusplus
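
A usage sketch for the compare-and-swap above (the helper is hypothetical,
not part of the patch): because the function returns the observed old value
rather than a boolean, the retry loop can reuse that value instead of
re-reading the variable.

/* Atomically raise *ptr to at least val (sketch) */
static inline void atomic32_max_rlx(odp_atomic32_t *ptr, uint32_t val)
{
	uint32_t old = odp_atomic32_load_rlx(ptr);
	while (old < val) {
		uint32_t prev = odp_atomic32_cmp_and_swap_rlx(ptr, old, val);
		if (prev == old)
			break; /* swap succeeded */
		old = prev; /* swap failed; retry with the value we saw */
	}
}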
diff --git a/platform/linux-generic/include/api/odp_barrier.h b/platform/linux-generic/include/api/odp_barrier.h
index a7b3215..f8eae9a 100644
--- a/platform/linux-generic/include/api/odp_barrier.h
+++ b/platform/linux-generic/include/api/odp_barrier.h
@@ -27,18 +27,18 @@  extern "C" {
  * ODP execution barrier
  */
 typedef struct odp_barrier_t {
-	int              count;  /**< @private Thread count */
-	odp_atomic_int_t bar;    /**< @private Barrier counter */
+	uint32_t       num_threads;  /**< @private Thread count (constant) */
+	odp_atomic32_t in_barrier;   /**< @private Threads in barrier */
 } odp_barrier_t;
 
 
 /**
  * Init barrier with thread count
  *
- * @param barrier    Barrier
- * @param count      Thread count
+ * @param barrier     Barrier
+ * @param num_threads Number of threads which share the barrier
  */
-void odp_barrier_init_count(odp_barrier_t *barrier, int count);
+void odp_barrier_init(odp_barrier_t *barrier, uint32_t num_threads);
 
 
 /**
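
Typical usage of the renamed API (a sketch, not part of the patch):
initialise the barrier once with the number of participating threads, then
have every worker call sync at each phase boundary.

static odp_barrier_t start_barrier; /* shared by all workers */

static void *worker(void *arg)
{
	/* ...per-thread initialisation... */
	odp_barrier_sync(&start_barrier); /* blocks until all have arrived */
	/* ...all workers proceed together from here... */
	return arg;
}

/* In the main thread, before launching num_workers workers:
 *	odp_barrier_init(&start_barrier, num_workers);
 */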
diff --git a/platform/linux-generic/include/api/odp_rwlock.h b/platform/linux-generic/include/api/odp_rwlock.h
index 252ebb2..ff8a9a2 100644
--- a/platform/linux-generic/include/api/odp_rwlock.h
+++ b/platform/linux-generic/include/api/odp_rwlock.h
@@ -10,26 +10,30 @@ 
 /**
  * @file
  *
- * ODP RW Locks
+ * ODP read/write lock
+ * RW lock supports multiple concurrent readers but only one (exclusive) writer.
  */
 
+#include <odp_atomic.h>
+
 #ifdef __cplusplus
 extern "C" {
 #endif
 
 /**
  * The odp_rwlock_t type.
- * write lock count is -1,
- * read lock count > 0
+ * write lock is ~0U
+ * read lock count >0 && <~0U
  */
 typedef struct {
-	volatile int32_t cnt; /**< -1 Write lock,
-				> 0 for Read lock. */
+	odp_atomic32_t cnt; /**< == 0: unlocked,
+				 == ~0: locked for write,
+				 > 0 number of concurrent read locks */
 } odp_rwlock_t;
 
 
 /**
- * Initialize the rwlock to an unlocked state.
+ * Initialize the rwlock to the unlocked state.
  *
  * @param rwlock pointer to the RW Lock.
  */
@@ -50,14 +54,14 @@  void odp_rwlock_read_lock(odp_rwlock_t *rwlock);
 void odp_rwlock_read_unlock(odp_rwlock_t *rwlock);
 
 /**
- * Aquire a write lock.
+ * Acquire the write lock.
  *
  * @param rwlock pointer to a RW Lock.
  */
 void odp_rwlock_write_lock(odp_rwlock_t *rwlock);
 
 /**
- * Release a write lock.
+ * Release the write lock.
  *
  * @param rwlock pointer to a RW Lock.
  */
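
A reader-lock acquire consistent with the cnt encoding above might look like
the sketch below (the odp_rwlock.c changes are not part of this hunk and may
differ; a production version would also want acquire ordering on success):

#include <stdbool.h>

static void rwlock_read_lock_sketch(odp_rwlock_t *rwlock)
{
	uint32_t cnt;
	bool ok;

	do {
		cnt = odp_atomic32_load_rlx(&rwlock->cnt);
		if (cnt == (uint32_t)~0) { /* writer holds the lock */
			odp_spin();
			ok = false;
			continue;
		}
		/* Try to register one more concurrent reader */
		ok = odp_atomic32_cmp_and_swap_rlx(&rwlock->cnt,
						   cnt, cnt + 1) == cnt;
	} while (!ok);
}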
diff --git a/platform/linux-generic/include/api/odp_ticketlock.h b/platform/linux-generic/include/api/odp_ticketlock.h
index 6277a18..c4b5e34 100644
--- a/platform/linux-generic/include/api/odp_ticketlock.h
+++ b/platform/linux-generic/include/api/odp_ticketlock.h
@@ -27,8 +27,8 @@  extern "C" {
  * ODP ticketlock
  */
 typedef struct odp_ticketlock_t {
-	odp_atomic_u32_t  next_ticket; /**< @private Next ticket */
-	volatile uint32_t cur_ticket;  /**< @private Current ticket */
+	odp_atomic32_t next_ticket; /**< @private Next ticket */
+	odp_atomic32_t cur_ticket;  /**< @private Current ticket */
 } odp_ticketlock_t;
 
 
diff --git a/platform/linux-generic/include/odp_buffer_internal.h b/platform/linux-generic/include/odp_buffer_internal.h
index 2002b51..530ab96 100644
--- a/platform/linux-generic/include/odp_buffer_internal.h
+++ b/platform/linux-generic/include/odp_buffer_internal.h
@@ -88,7 +88,7 @@  typedef struct odp_buffer_hdr_t {
 	uint32_t                 index;	     /* buf index in the pool */
 	size_t                   size;       /* max data size */
 	size_t                   cur_offset; /* current offset */
-	odp_atomic_int_t         ref_count;  /* reference count */
+	odp_atomic32_t           ref_count;  /* reference count */
 	odp_buffer_scatter_t     scatter;    /* Scatter/gather list */
 	int                      type;       /* type of next header */
 	odp_buffer_pool_t        pool_hdl;   /* buffer pool handle */
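
The atomic ref_count enables the classic reference-counting pattern (a
sketch, not part of the patch; buffer_free() is a hypothetical helper):

static void buffer_ref(odp_buffer_hdr_t *hdr)
{
	/* Relaxed is enough: taking a reference publishes nothing */
	odp_atomic32_add_rlx(&hdr->ref_count, 1);
}

static void buffer_unref(odp_buffer_hdr_t *hdr)
{
	/* Release: our writes to the buffer happen-before the recycling.
	 * A fully general version also needs acquire ordering before the
	 * free so the last owner observes all other threads' writes. */
	if (odp_atomic32_fetch_add_rls(&hdr->ref_count, (uint32_t)-1) == 1)
		buffer_free(hdr); /* last reference dropped */
}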
diff --git a/platform/linux-generic/odp_barrier.c b/platform/linux-generic/odp_barrier.c
index a82b294..6c3b884 100644
--- a/platform/linux-generic/odp_barrier.c
+++ b/platform/linux-generic/odp_barrier.c
@@ -8,41 +8,48 @@ 
 #include <odp_sync.h>
 #include <odp_spin_internal.h>
 
-void odp_barrier_init_count(odp_barrier_t *barrier, int count)
+void odp_barrier_init(odp_barrier_t *barrier, uint32_t num_threads)
 {
-	barrier->count = count;
-	barrier->bar = 0;
-	odp_sync_stores();
+	barrier->num_threads = num_threads; /* Constant after initialisation */
+	odp_atomic32_store_rls(&barrier->in_barrier, 0);
 }
 
 /*
  * Efficient barrier_sync -
  *
  *   Barriers are initialized with a count of the number of callers
- *   that must sync on the barrier before any may proceed.
+ *   that must sync on (enter) the barrier before any may proceed (exit).
  *
  *   To avoid race conditions and to permit the barrier to be fully
  *   reusable, the barrier value cycles between 0..2*count-1. When
- *   synchronizing the wasless variable simply tracks which half of
+ *   synchronizing, the waslow variable simply tracks which half of
  *   the cycle the barrier was in upon entry.  Exit is when the
  *   barrier crosses to the other half of the cycle.
  */
 
 void odp_barrier_sync(odp_barrier_t *barrier)
 {
-	int count;
-	int wasless;
+	uint32_t count;
+	bool waslow;
 
-	odp_sync_stores();
-	wasless = barrier->bar < barrier->count;
-	count = odp_atomic_fetch_inc_int(&barrier->bar);
+	/* FIXME do we need acquire barrier as well? */
+	/* Increase threads in_barrier count, this will automatically release
+	 * the other threads when lower/upper range is switched */
+	count = odp_atomic32_fetch_add_rls(&barrier->in_barrier, 1);
+	/* Compute lower or higher range indicator */
+	waslow = count < barrier->num_threads;
 
-	if (count == 2*barrier->count-1) {
-		barrier->bar = 0;
-	} else {
-		while ((barrier->bar < barrier->count) == wasless)
-			odp_spin();
+	/* Check if in_barrier count has "wrapped" */
+	if (count == 2 * barrier->num_threads - 1) {
+		/* Manually wrap the counter */
+		odp_atomic32_add_rls(&barrier->in_barrier,
+				     (uint32_t)(-2*(int)barrier->num_threads));
+		/* We don't need to wait below, return immediately */
+		return;
+	}
+	/* Wait for counter to change half */
+	while ((odp_atomic32_load_rlx(&barrier->in_barrier) <
+	       barrier->num_threads) == waslow) {
+		odp_spin();
 	}
-
-	odp_mem_barrier();
 }
diff --git a/platform/linux-generic/odp_buffer.c b/platform/linux-generic/odp_buffer.c
index e54e0e7..a5939f3 100644
--- a/platform/linux-generic/odp_buffer.c
+++ b/platform/linux-generic/odp_buffer.c
@@ -73,7 +73,8 @@  int odp_buffer_snprint(char *str, size_t n, odp_buffer_t buf)
 	len += snprintf(&str[len], n-len,
 			"  cur_offset   %zu\n",       hdr->cur_offset);
 	len += snprintf(&str[len], n-len,
-			"  ref_count    %i\n",        hdr->ref_count);
+			"  ref_count    %u\n",
+			odp_atomic32_load_rlx(&hdr->ref_count));
 	len += snprintf(&str[len], n-len,
 			"  type         %i\n",        hdr->type);
 	len += snprintf(&str[len], n-len,
diff --git a/platform/linux-generic/odp_crypto.c b/platform/linux-generic/odp_crypto.c
index b37ad6b..d9fff10 100644
--- a/platform/linux-generic/odp_crypto.c
+++ b/platform/linux-generic/odp_crypto.c
@@ -26,7 +26,7 @@ 
 #define MAX_SESSIONS 32
 
 typedef struct {
-	odp_atomic_u32_t next;
+	odp_atomic32_t   next;
 	uint32_t         max;
 	odp_crypto_generic_session_t sessions[0];
 } odp_crypto_global_t;
@@ -58,7 +58,7 @@  odp_crypto_generic_session_t *alloc_session(void)
 	uint32_t idx;
 	odp_crypto_generic_session_t *session = NULL;
 
-	idx = odp_atomic_fetch_inc_u32(&global->next);
+	idx = odp_atomic32_fetch_add_rlx(&global->next, 1);
 	if (idx < global->max) {
 		session = &global->sessions[idx];
 		session->index = idx;
diff --git a/platform/linux-generic/odp_queue.c b/platform/linux-generic/odp_queue.c
index 1318bcd..08c0d29 100644
--- a/platform/linux-generic/odp_queue.c
+++ b/platform/linux-generic/odp_queue.c
@@ -214,8 +214,13 @@  int odp_queue_set_context(odp_queue_t handle, void *context)
 {
 	queue_entry_t *queue;
 	queue = queue_to_qentry(handle);
+	/* Setting a new queue context can be viewed as a release operation:
+	 * all writes to the context must be observable before the context
+	 * is made observable */
 	odp_sync_stores();
-	queue->s.param.context = context;
+	queue->s.param.context = context; /* Store-release */
+	/* Ensure queue modification is globally visible before we return
+	 * and the application might cause the queue to be scheduled */
 	odp_sync_stores();
 	return 0;
 }
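
The odp_ring.c changes below depend on acquire/release pairing between
producer and consumer. The essential pattern, reduced to a standalone
sketch (names are hypothetical):

#define RING_SZ 1024u			/* power of two */

static void *slots[RING_SZ];		/* ring storage */
static odp_atomic32_t prod_tail;	/* producer publication index */

static void producer_publish(void *obj, uint32_t head)
{
	slots[head & (RING_SZ - 1)] = obj; /* plain payload write */
	/* Release: the payload write cannot be reordered after this store */
	odp_atomic32_store_rls(&prod_tail, head + 1);
}

static void *consumer_take(uint32_t head)
{
	/* Acquire: the payload read cannot be reordered before this load */
	while (odp_atomic32_load_acq(&prod_tail) == head)
		odp_spin(); /* wait for a new entry to be published */
	return slots[head & (RING_SZ - 1)];
}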
diff --git a/platform/linux-generic/odp_ring.c b/platform/linux-generic/odp_ring.c
index 632aa66..d1ec825 100644
--- a/platform/linux-generic/odp_ring.c
+++ b/platform/linux-generic/odp_ring.c
@@ -187,10 +187,10 @@  odph_ring_create(const char *name, unsigned count, unsigned flags)
 		r->cons.size = count;
 		r->prod.mask = count-1;
 		r->cons.mask = count-1;
-		r->prod.head = 0;
-		r->cons.head = 0;
-		r->prod.tail = 0;
-		r->cons.tail = 0;
+		odp_atomic32_store_rlx(&r->prod.head, 0);
+		odp_atomic32_store_rlx(&r->cons.head, 0);
+		odp_atomic32_store_rlx(&r->prod.tail, 0);
+		odp_atomic32_store_rlx(&r->cons.tail, 0);
 
 		TAILQ_INSERT_TAIL(&odp_ring_list, r, next);
 	} else {
@@ -227,7 +227,7 @@  int __odph_ring_mp_do_enqueue(odph_ring_t *r, void * const *obj_table,
 	uint32_t prod_head, prod_next;
 	uint32_t cons_tail, free_entries;
 	const unsigned max = n;
-	int success;
+	bool ok;
 	unsigned i;
 	uint32_t mask = r->prod.mask;
 	int ret;
@@ -237,8 +237,8 @@  int __odph_ring_mp_do_enqueue(odph_ring_t *r, void * const *obj_table,
 		/* Reset n to the initial burst count */
 		n = max;
 
-		prod_head = r->prod.head;
-		cons_tail = r->cons.tail;
+		prod_head = odp_atomic32_load_rlx(&r->prod.head);
+		cons_tail = odp_atomic32_load_acq(&r->cons.tail);
 		/* The subtraction is done between two unsigned 32-bit values
 		 * (the result is always modulo 32 bits even if we have
 		 * prod_head > cons_tail). So 'free_entries' is always between 0
@@ -259,13 +259,13 @@  int __odph_ring_mp_do_enqueue(odph_ring_t *r, void * const *obj_table,
 		}
 
 		prod_next = prod_head + n;
-		success = odp_atomic_cmpset_u32(&r->prod.head, prod_head,
-					      prod_next);
-	} while (odp_unlikely(success == 0));
+		ok = odp_atomic32_cmp_and_swap_rlx(&r->prod.head,
+						   prod_head,
+						   prod_next) == prod_head;
+	} while (odp_unlikely(!ok));
 
 	/* write entries in ring */
 	ENQUEUE_PTRS();
-	odp_mem_barrier();
 
 	/* if we exceed the watermark */
 	if (odp_unlikely(((mask + 1) - free_entries + n) > r->prod.watermark)) {
@@ -279,10 +279,10 @@  int __odph_ring_mp_do_enqueue(odph_ring_t *r, void * const *obj_table,
 	 * If there are other enqueues in progress that preceded us,
 	 * we need to wait for them to complete
 	 */
-	while (odp_unlikely(r->prod.tail != prod_head))
+	while (odp_unlikely(odp_atomic32_load_rlx(&r->prod.tail) != prod_head))
 		odp_spin();
 
-	r->prod.tail = prod_next;
+	odp_atomic32_store_rls(&r->prod.tail, prod_next);
 	return ret;
 }
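
The multi-producer scheme above is: reserve a slot range with a CAS on
prod.head, write the entries, then publish prod.tail in reservation order
with a store-release. Roughly, in C11 (hypothetical names; the free-space
check against the consumer index is omitted for brevity):

#include <stdatomic.h>
#include <stdint.h>

static _Atomic uint32_t prod_head, prod_tail;

void mp_publish(uint32_t n)
{
	uint32_t old = atomic_load_explicit(&prod_head,
					    memory_order_relaxed);
	/* Reserve n slots; relaxed is enough, only the reservation
	 * itself must be atomic ('old' is reloaded on CAS failure) */
	while (!atomic_compare_exchange_weak_explicit(&prod_head, &old,
						      old + n,
						      memory_order_relaxed,
						      memory_order_relaxed))
		; /* retry */

	/* ... write the n ring entries here ... */

	/* Wait for producers that reserved before us to publish */
	while (atomic_load_explicit(&prod_tail,
				    memory_order_relaxed) != old)
		; /* spin */
	/* Release: entry writes become visible before the new tail */
	atomic_store_explicit(&prod_tail, old + n, memory_order_release);
}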
 
@@ -298,8 +298,8 @@  int __odph_ring_sp_do_enqueue(odph_ring_t *r, void * const *obj_table,
 	uint32_t mask = r->prod.mask;
 	int ret;
 
-	prod_head = r->prod.head;
-	cons_tail = r->cons.tail;
+	prod_head = odp_atomic32_load_rlx(&r->prod.head);
+	cons_tail = odp_atomic32_load_acq(&r->cons.tail);
 	/* The subtraction is done between two unsigned 32-bit values
 	 * (the result is always modulo 32 bits even if we have
 	 * prod_head > cons_tail). So 'free_entries' is always between 0
@@ -320,11 +320,10 @@  int __odph_ring_sp_do_enqueue(odph_ring_t *r, void * const *obj_table,
 	}
 
 	prod_next = prod_head + n;
-	r->prod.head = prod_next;
+	odp_atomic32_store_rlx(&r->prod.head, prod_next);
 
 	/* write entries in ring */
 	ENQUEUE_PTRS();
-	odp_mem_barrier();
 
 	/* if we exceed the watermark */
 	if (odp_unlikely(((mask + 1) - free_entries + n) > r->prod.watermark)) {
@@ -334,7 +333,7 @@  int __odph_ring_sp_do_enqueue(odph_ring_t *r, void * const *obj_table,
 		ret = (behavior == ODPH_RING_QUEUE_FIXED) ? 0 : n;
 	}
 
-	r->prod.tail = prod_next;
+	odp_atomic32_store_rls(&r->prod.tail, prod_next);
 	return ret;
 }
 
@@ -348,7 +347,7 @@  int __odph_ring_mc_do_dequeue(odph_ring_t *r, void **obj_table,
 	uint32_t cons_head, prod_tail;
 	uint32_t cons_next, entries;
 	const unsigned max = n;
-	int success;
+	bool ok;
 	unsigned i;
 	uint32_t mask = r->prod.mask;
 
@@ -357,8 +356,8 @@  int __odph_ring_mc_do_dequeue(odph_ring_t *r, void **obj_table,
 		/* Restore n as it may change every loop */
 		n = max;
 
-		cons_head = r->cons.head;
-		prod_tail = r->prod.tail;
+		cons_head = odp_atomic32_load_rlx(&r->cons.head);
+		prod_tail = odp_atomic32_load_acq(&r->prod.tail);
 		/* The subtraction is done between two unsigned 32-bit values
 		 * (the result is always modulo 32 bits even if we have
 		 * cons_head > prod_tail). So 'entries' is always between 0
@@ -378,22 +377,22 @@  int __odph_ring_mc_do_dequeue(odph_ring_t *r, void **obj_table,
 		}
 
 		cons_next = cons_head + n;
-		success = odp_atomic_cmpset_u32(&r->cons.head, cons_head,
-					      cons_next);
-	} while (odp_unlikely(success == 0));
+		ok = odp_atomic32_cmp_and_swap_rlx(&r->cons.head,
+						   cons_head,
+						   cons_next) == cons_head;
+	} while (odp_unlikely(!ok));
 
 	/* copy in table */
 	DEQUEUE_PTRS();
-	odp_mem_barrier();
 
 	/*
 	 * If there are other dequeues in progress that preceded us,
 	 * we need to wait for them to complete
 	 */
-	while (odp_unlikely(r->cons.tail != cons_head))
+	while (odp_unlikely(odp_atomic32_load_rlx(&r->cons.tail) != cons_head))
 		odp_spin();
 
-	r->cons.tail = cons_next;
+	odp_atomic32_store_rls(&r->cons.tail, cons_next);
 
 	return behavior == ODPH_RING_QUEUE_FIXED ? 0 : n;
 }
@@ -409,8 +408,8 @@  int __odph_ring_sc_do_dequeue(odph_ring_t *r, void **obj_table,
 	unsigned i;
 	uint32_t mask = r->prod.mask;
 
-	cons_head = r->cons.head;
-	prod_tail = r->prod.tail;
+	cons_head = odp_atomic32_load_rlx(&r->cons.head);
+	prod_tail = odp_atomic32_load_acq(&r->prod.tail);
 	/* The subtraction is done between two unsigned 32-bit values
 	 * (the result is always modulo 32 bits even if we have
 	 * cons_head > prod_tail). So 'entries' is always between 0
@@ -429,13 +428,12 @@  int __odph_ring_sc_do_dequeue(odph_ring_t *r, void **obj_table,
 	}
 
 	cons_next = cons_head + n;
-	r->cons.head = cons_next;
+	odp_atomic32_store_rlx(&r->cons.head, cons_next);
 
 	/* copy in table */
 	DEQUEUE_PTRS();
-	odp_mem_barrier();
 
-	r->cons.tail = cons_next;
+	odp_atomic32_store_rls(&r->cons.tail, cons_next);
 	return behavior == ODPH_RING_QUEUE_FIXED ? 0 : n;
 }
 
@@ -482,8 +480,8 @@  int odph_ring_sc_dequeue_bulk(odph_ring_t *r, void **obj_table, unsigned n)
  */
 int odph_ring_full(const odph_ring_t *r)
 {
-	uint32_t prod_tail = r->prod.tail;
-	uint32_t cons_tail = r->cons.tail;
+	uint32_t prod_tail = odp_atomic32_load_rlx(&r->prod.tail);
+	uint32_t cons_tail = odp_atomic32_load_rlx(&r->cons.tail);
 	return (((cons_tail - prod_tail - 1) & r->prod.mask) == 0);
 }
 
@@ -492,8 +490,8 @@  int odph_ring_full(const odph_ring_t *r)
  */
 int odph_ring_empty(const odph_ring_t *r)
 {
-	uint32_t prod_tail = r->prod.tail;
-	uint32_t cons_tail = r->cons.tail;
+	uint32_t prod_tail = odp_atomic32_load_rlx(&r->prod.tail);
+	uint32_t cons_tail = odp_atomic32_load_rlx(&r->cons.tail);
 	return !!(cons_tail == prod_tail);
 }
 
@@ -502,8 +500,8 @@  int odph_ring_empty(const odph_ring_t *r)
  */
 unsigned odph_ring_count(const odph_ring_t *r)
 {
-	uint32_t prod_tail = r->prod.tail;
-	uint32_t cons_tail = r->cons.tail;
+	uint32_t prod_tail = odp_atomic32_load_rlx(&r->prod.tail);
+	uint32_t cons_tail = odp_atomic32_load_rlx(&r->cons.tail);
 	return (prod_tail - cons_tail) & r->prod.mask;
 }
 
@@ -512,8 +510,8 @@  unsigned odph_ring_count(const odph_ring_t *r)
  */
 unsigned odph_ring_free_count(const odph_ring_t *r)
 {
-	uint32_t prod_tail = r->prod.tail;
-	uint32_t cons_tail = r->cons.tail;
+	uint32_t prod_tail = odp_atomic32_load_rlx(&r->prod.tail);
+	uint32_t cons_tail = odp_atomic32_load_rlx(&r->cons.tail);
 	return (cons_tail - prod_tail - 1) & r->prod.mask;
 }
 
@@ -523,10 +521,10 @@  void odph_ring_dump(const odph_ring_t *r)
 	ODP_DBG("ring <%s>@%p\n", r->name, r);
 	ODP_DBG("  flags=%x\n", r->flags);
 	ODP_DBG("  size=%"PRIu32"\n", r->prod.size);
-	ODP_DBG("  ct=%"PRIu32"\n", r->cons.tail);
-	ODP_DBG("  ch=%"PRIu32"\n", r->cons.head);
-	ODP_DBG("  pt=%"PRIu32"\n", r->prod.tail);
-	ODP_DBG("  ph=%"PRIu32"\n", r->prod.head);
+	ODP_DBG("  ct=%"PRIu32"\n", odp_atomic32_load_rlx(&r->cons.tail));
+	ODP_DBG("  ch=%"PRIu32"\n", odp_atomic32_load_rlx(&r->cons.head));
+	ODP_DBG("  pt=%"PRIu32"\n", odp_atomic32_load_rlx(&r->prod.tail));
+	ODP_DBG("  ph=%"PRIu32"\n", odp_atomic32_load_rlx(&r->prod.head));
 	ODP_DBG("  used=%u\n", odph_ring_count(r));
 	ODP_DBG("  avail=%u\n", odph_ring_free_count(r));
 	if (r->prod.watermark == r->prod.size)
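
Throughout the ring code, the acquire load of the opposite index (cons.tail
for producers, prod.tail for consumers) pairs with the release store that
publishes it; that pairing is what replaces the removed odp_mem_barrier()
after ENQUEUE_PTRS()/DEQUEUE_PTRS(). A single-producer/single-consumer
sketch of the same pairing (hypothetical names, hypothetical power-of-two
SIZE):

#include <stdatomic.h>
#include <stdint.h>

#define SIZE 256 /* must be a power of two */
static void *slots[SIZE];
static _Atomic uint32_t head, tail; /* free-running counters */

int spsc_enqueue(void *p)
{
	uint32_t t = atomic_load_explicit(&tail, memory_order_relaxed);
	uint32_t h = atomic_load_explicit(&head, memory_order_acquire);
	if (t - h == SIZE)
		return -1;		/* full */
	slots[t & (SIZE - 1)] = p;	/* write the entry ... */
	atomic_store_explicit(&tail, t + 1,
			      memory_order_release); /* ... then publish */
	return 0;
}

int spsc_dequeue(void **p)
{
	uint32_t h = atomic_load_explicit(&head, memory_order_relaxed);
	uint32_t t = atomic_load_explicit(&tail, memory_order_acquire);
	if (t == h)
		return -1;		/* empty */
	*p = slots[h & (SIZE - 1)];	/* read the entry ... */
	atomic_store_explicit(&head, h + 1,
			      memory_order_release); /* ... then free slot */
	return 0;
}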
diff --git a/platform/linux-generic/odp_rwlock.c b/platform/linux-generic/odp_rwlock.c
index 11c8dd7..ba0a7ca 100644
--- a/platform/linux-generic/odp_rwlock.c
+++ b/platform/linux-generic/odp_rwlock.c
@@ -4,58 +4,56 @@ 
  * SPDX-License-Identifier:     BSD-3-Clause
  */
 
+#include <stdbool.h>
 #include <odp_atomic.h>
 #include <odp_rwlock.h>
-
 #include <odp_spin_internal.h>
 
 void odp_rwlock_init(odp_rwlock_t *rwlock)
 {
-	rwlock->cnt = 0;
+	odp_atomic32_store_rlx(&rwlock->cnt, 0);
 }
 
 void odp_rwlock_read_lock(odp_rwlock_t *rwlock)
 {
-	int32_t cnt;
-	int  is_locked = 0;
-
-	while (is_locked == 0) {
-		cnt = rwlock->cnt;
+	bool gotit = false; /* must be initialized: 'continue' re-tests it */
+	do {
+		uint32_t cnt = odp_atomic32_load_acq(&rwlock->cnt);
 		/* waiting for read lock */
-		if (cnt < 0) {
+		if ((int32_t)cnt < 0) {
 			odp_spin();
 			continue;
 		}
-		is_locked = odp_atomic_cmpset_u32(
-					(volatile uint32_t *)&rwlock->cnt,
-					      cnt, cnt + 1);
-	}
+		/* Attempt to acquire a read lock by incrementing the count */
+		gotit = odp_atomic32_cmp_and_swap_rlx(&rwlock->cnt,
+						      cnt, cnt + 1) == cnt;
+	} while (!gotit);
 }
 
 void odp_rwlock_read_unlock(odp_rwlock_t *rwlock)
 {
-	odp_atomic_dec_u32((odp_atomic_u32_t *)(intptr_t)&rwlock->cnt);
+	/* Release one read lock by subtracting 1 */
+	odp_atomic32_add_rls(&rwlock->cnt, (uint32_t)-1);
 }
 
 void odp_rwlock_write_lock(odp_rwlock_t *rwlock)
 {
-	int32_t cnt;
-	int is_locked = 0;
-
-	while (is_locked == 0) {
-		cnt = rwlock->cnt;
-		/* lock aquired, wait */
+	bool gotit = false; /* must be initialized: 'continue' re-tests it */
+	do {
+		uint32_t cnt = odp_atomic32_load_acq(&rwlock->cnt);
 		if (cnt != 0) {
+			/* Lock is busy */
 			odp_spin();
 			continue;
 		}
-		is_locked = odp_atomic_cmpset_u32(
-					(volatile uint32_t *)&rwlock->cnt,
-					      0, -1);
-	}
+		/* Attempt to take write lock */
+		gotit = odp_atomic32_cmp_and_swap_rlx(&rwlock->cnt, 0,
+						      (uint32_t)-1) == 0;
+	} while (!gotit);
 }
 
 void odp_rwlock_write_unlock(odp_rwlock_t *rwlock)
 {
-	odp_atomic_inc_u32((odp_atomic_u32_t *)(intptr_t)&rwlock->cnt);
+	/* Release the write lock by adding 1 */
+	odp_atomic32_add_rls(&rwlock->cnt, 1);
 }
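
A minimal C11 sketch of the same counter protocol (cnt > 0 counts readers,
(uint32_t)-1 marks a writer). It places the acquire on the successful CAS,
the conventional equivalent of the load_acq used above; names are
illustrative:

#include <stdatomic.h>
#include <stdint.h>

static _Atomic uint32_t cnt;

void read_lock_sketch(void)
{
	for (;;) {
		uint32_t c = atomic_load_explicit(&cnt,
						  memory_order_relaxed);
		if ((int32_t)c < 0)
			continue; /* writer active, retry */
		/* Acquire on success pairs with release in the unlocks */
		if (atomic_compare_exchange_weak_explicit(&cnt, &c, c + 1,
				memory_order_acquire, memory_order_relaxed))
			return;
	}
}

void read_unlock_sketch(void)
{
	/* Release: critical-section accesses complete before the
	 * reader count drops */
	atomic_fetch_sub_explicit(&cnt, 1, memory_order_release);
}

void write_lock_sketch(void)
{
	for (;;) {
		uint32_t expected = 0;
		if (atomic_compare_exchange_weak_explicit(&cnt, &expected,
				(uint32_t)-1,
				memory_order_acquire, memory_order_relaxed))
			return;
		/* busy (readers or a writer hold it), retry */
	}
}

void write_unlock_sketch(void)
{
	/* (uint32_t)-1 + 1 == 0: back to the unlocked state */
	atomic_fetch_add_explicit(&cnt, 1, memory_order_release);
}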
diff --git a/platform/linux-generic/odp_thread.c b/platform/linux-generic/odp_thread.c
index b869b27..569b235 100644
--- a/platform/linux-generic/odp_thread.c
+++ b/platform/linux-generic/odp_thread.c
@@ -31,7 +31,7 @@  typedef struct {
 
 typedef struct {
 	thread_state_t   thr[ODP_CONFIG_MAX_THREADS];
-	odp_atomic_int_t num;
+	odp_atomic32_t   num;
 
 } thread_globals_t;
 
@@ -67,7 +67,7 @@  static int thread_id(void)
 	int id;
 	int cpu;
 
-	id = odp_atomic_fetch_add_int(&thread_globals->num, 1);
+	id = (int)odp_atomic32_fetch_add_rlx(&thread_globals->num, 1);
 
 	if (id >= ODP_CONFIG_MAX_THREADS) {
 		ODP_ERR("Too many threads\n");
@@ -77,7 +77,7 @@  static int thread_id(void)
 	cpu = sched_getcpu();
 
 	if (cpu < 0) {
-		ODP_ERR("getcpu failed\n");
+		ODP_ERR("sched_getcpu failed\n");
 		return -1;
 	}
 
diff --git a/platform/linux-generic/odp_ticketlock.c b/platform/linux-generic/odp_ticketlock.c
index be5b885..cadc0e0 100644
--- a/platform/linux-generic/odp_ticketlock.c
+++ b/platform/linux-generic/odp_ticketlock.c
@@ -12,9 +12,8 @@ 
 
 void odp_ticketlock_init(odp_ticketlock_t *ticketlock)
 {
-	ticketlock->next_ticket = 0;
-	ticketlock->cur_ticket  = 0;
-	odp_sync_stores();
+	odp_atomic32_store_rlx(&ticketlock->next_ticket, 0);
+	odp_atomic32_store_rlx(&ticketlock->cur_ticket, 0);
 }
 
 
@@ -22,30 +21,14 @@  void odp_ticketlock_lock(odp_ticketlock_t *ticketlock)
 {
 	uint32_t ticket;
 
-	ticket = odp_atomic_fetch_inc_u32(&ticketlock->next_ticket);
+	ticket = odp_atomic32_fetch_add_rlx(&ticketlock->next_ticket, 1);
 
-	while (ticket != ticketlock->cur_ticket)
+	while (ticket != odp_atomic32_load_acq(&ticketlock->cur_ticket))
 		odp_spin();
-
-	odp_mem_barrier();
 }
 
 
 void odp_ticketlock_unlock(odp_ticketlock_t *ticketlock)
 {
-	odp_sync_stores();
-
-	ticketlock->cur_ticket++;
-
-#if defined __OCTEON__
-	odp_sync_stores();
-#else
-	odp_mem_barrier();
-#endif
-}
-
-
-int odp_ticketlock_is_locked(odp_ticketlock_t *ticketlock)
-{
-	return ticketlock->cur_ticket != ticketlock->next_ticket;
+	odp_atomic32_add_rls(&ticketlock->cur_ticket, 1);
 }
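
With the explicit barriers gone, this reduces to the textbook C11 ticket
lock; a minimal sketch with illustrative names:

#include <stdatomic.h>
#include <stdint.h>

typedef struct {
	_Atomic uint32_t next_ticket;
	_Atomic uint32_t cur_ticket;
} tkt_sketch_t;

void tkt_lock(tkt_sketch_t *tl)
{
	/* Taking a ticket only reserves our turn, so relaxed is fine */
	uint32_t ticket = atomic_fetch_add_explicit(&tl->next_ticket, 1,
						    memory_order_relaxed);
	/* Acquire when we observe our turn: pairs with the release
	 * in tkt_unlock(), ordering the critical section */
	while (atomic_load_explicit(&tl->cur_ticket,
				    memory_order_acquire) != ticket)
		; /* spin */
}

void tkt_unlock(tkt_sketch_t *tl)
{
	atomic_fetch_add_explicit(&tl->cur_ticket, 1,
				  memory_order_release);
}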
diff --git a/platform/linux-generic/odp_timer.c b/platform/linux-generic/odp_timer.c
index 313c713..938429f 100644
--- a/platform/linux-generic/odp_timer.c
+++ b/platform/linux-generic/odp_timer.c
@@ -32,8 +32,8 @@  typedef struct {
 
 typedef struct {
 	int               allocated;
-	volatile int      active;
-	volatile uint64_t cur_tick;
+	odp_atomic32_t    active;
+	odp_atomic64_t    cur_tick;
 	timer_t           timerid;
 	odp_timer_t       timer_hdl;
 	odp_buffer_pool_t pool;
@@ -150,16 +150,14 @@  static void notify_function(union sigval sigval)
 
 	timer = sigval.sival_ptr;
 
-	if (timer->active == 0) {
+	if (odp_atomic32_load_rlx(&timer->active) == 0) {
 		ODP_DBG("Timer (%u) not active\n", timer->timer_hdl);
 		return;
 	}
 
 	/* ODP_DBG("Tick\n"); */
 
-	cur_tick = timer->cur_tick++;
-
-	odp_sync_stores();
+	cur_tick = odp_atomic64_fetch_add_rlx(&timer->cur_tick, 1);
 
 	tick = &timer->tick[cur_tick % MAX_TICKS];
 
@@ -318,8 +316,7 @@  odp_timer_t odp_timer_create(const char *name, odp_buffer_pool_t pool,
 		timer->tick[i].list = NULL;
 	}
 
-	timer->active = 1;
-	odp_sync_stores();
+	odp_atomic32_store_rls(&timer->active, 1);
 
 	timer_start(timer);
 
@@ -340,7 +337,7 @@  odp_timer_tmo_t odp_timer_absolute_tmo(odp_timer_t timer_hdl, uint64_t tmo_tick,
 	id = (int)timer_hdl - 1;
 	timer = &odp_timer.timer[id];
 
-	cur_tick = timer->cur_tick;
+	cur_tick = odp_atomic64_load_rlx(&timer->cur_tick);
 	if (tmo_tick <= cur_tick) {
 		ODP_DBG("timeout too close\n");
 		return ODP_TIMER_TMO_INVALID;
@@ -416,7 +413,7 @@  uint64_t odp_timer_current_tick(odp_timer_t timer_hdl)
 	uint32_t id;
 
 	id = timer_hdl - 1;
-	return odp_timer.timer[id].cur_tick;
+	return odp_atomic64_load_rlx(&odp_timer.timer[id].cur_tick);
 }
 
 odp_timeout_t odp_timeout_from_buffer(odp_buffer_t buf)
diff --git a/test/api_test/odp_atomic_test.c b/test/api_test/odp_atomic_test.c
index 9019d4f..4d27b32 100644
--- a/test/api_test/odp_atomic_test.c
+++ b/test/api_test/odp_atomic_test.c
@@ -10,17 +10,14 @@ 
 #include <odp_common.h>
 #include <odp_atomic_test.h>
 
-static odp_atomic_int_t a32;
-static odp_atomic_u32_t a32u;
-static odp_atomic_u64_t a64u;
+static odp_atomic32_t a32u;
+static odp_atomic64_t a64u;
 
-static odp_atomic_int_t numthrds;
+static odp_barrier_t barrier;
 
 static const char * const test_name[] = {
 	"dummy",
 	"test atomic basic ops add/sub/inc/dec",
-	"test atomic inc/dec of signed word",
-	"test atomic add/sub of signed word",
 	"test atomic inc/dec of unsigned word",
 	"test atomic add/sub of unsigned word",
 	"test atomic inc/dec of unsigned double word",
@@ -31,39 +28,29 @@  static struct timeval tv0[MAX_WORKERS], tv1[MAX_WORKERS];
 
 static void usage(void)
 {
-	printf("\n./odp_atomic -t <testcase> -n <num of pthread>,\n\n"
+	printf("\n./odp_atomic -t <testcase> -n <num of threads>\n\n"
 	       "\t<testcase> is\n"
 	       "\t\t1 - Test mix(does inc,dec,add,sub on 32/64 bit)\n"
-	       "\t\t2 - Test inc dec of signed word\n"
-	       "\t\t3 - Test add sub of signed word\n"
-	       "\t\t4 - Test inc dec of unsigned word\n"
-	       "\t\t5 - Test add sub of unsigned word\n"
-	       "\t\t6 - Test inc dec of double word\n"
-	       "\t\t7 - Test add sub of double word\n"
-	       "\t<num of pthread> is optional\n"
-	       "\t\t<1 - 31> - no of pthreads to start\n"
+	       "\t\t2 - Test inc dec of unsigned word\n"
+	       "\t\t3 - Test add sub of unsigned word\n"
+	       "\t\t4 - Test inc dec of double word\n"
+	       "\t\t5 - Test add sub of double word\n"
+	       "\t<num of threads> is optional\n"
+	       "\t\t<1 - 31> - no of threads to start\n"
 	       "\t\tif user doesn't specify this option, then\n"
-	       "\t\tno of pthreads created is equivalent to no of cores\n"
+	       "\t\tno of threads created is equivalent to no of cores\n"
 	       "\t\tavailable in the system\n"
 	       "\tExample usage:\n"
 	       "\t\t./odp_atomic -t 2\n"
 	       "\t\t./odp_atomic -t 3 -n 12\n");
 }
 
-void test_atomic_inc_32(void)
-{
-	int i;
-
-	for (i = 0; i < CNT; i++)
-		odp_atomic_inc_int(&a32);
-}
-
 void test_atomic_inc_u32(void)
 {
 	int i;
 
 	for (i = 0; i < CNT; i++)
-		odp_atomic_inc_u32(&a32u);
+		odp_atomic32_add_rlx(&a32u, 1);
 }
 
 void test_atomic_inc_64(void)
@@ -71,15 +58,7 @@  void test_atomic_inc_64(void)
 	int i;
 
 	for (i = 0; i < CNT; i++)
-		odp_atomic_inc_u64(&a64u);
-}
-
-void test_atomic_dec_32(void)
-{
-	int i;
-
-	for (i = 0; i < CNT; i++)
-		odp_atomic_dec_int(&a32);
+		odp_atomic64_add_rlx(&a64u, 1);
 }
 
 void test_atomic_dec_u32(void)
@@ -87,7 +66,7 @@  void test_atomic_dec_u32(void)
 	int i;
 
 	for (i = 0; i < CNT; i++)
-		odp_atomic_dec_u32(&a32u);
+		odp_atomic32_add_rlx(&a32u, (uint32_t)-1);
 }
 
 void test_atomic_dec_64(void)
@@ -95,15 +74,7 @@  void test_atomic_dec_64(void)
 	int i;
 
 	for (i = 0; i < CNT; i++)
-		odp_atomic_dec_u64(&a64u);
-}
-
-void test_atomic_add_32(void)
-{
-	int i;
-
-	for (i = 0; i < (CNT / ADD_SUB_CNT); i++)
-		odp_atomic_fetch_add_int(&a32, ADD_SUB_CNT);
+		odp_atomic64_add_rlx(&a64u, (uint64_t)-1);
 }
 
 void test_atomic_add_u32(void)
@@ -111,7 +82,7 @@  void test_atomic_add_u32(void)
 	int i;
 
 	for (i = 0; i < (CNT / ADD_SUB_CNT); i++)
-		odp_atomic_fetch_add_u32(&a32u, ADD_SUB_CNT);
+		odp_atomic32_fetch_add_rlx(&a32u, ADD_SUB_CNT);
 }
 
 void test_atomic_add_64(void)
@@ -119,15 +90,7 @@  void test_atomic_add_64(void)
 	int i;
 
 	for (i = 0; i < (CNT / ADD_SUB_CNT); i++)
-		odp_atomic_fetch_add_u64(&a64u, ADD_SUB_CNT);
-}
-
-void test_atomic_sub_32(void)
-{
-	int i;
-
-	for (i = 0; i < (CNT / ADD_SUB_CNT); i++)
-		odp_atomic_fetch_sub_int(&a32, ADD_SUB_CNT);
+		odp_atomic64_fetch_add_rlx(&a64u, ADD_SUB_CNT);
 }
 
 void test_atomic_sub_u32(void)
@@ -135,7 +98,7 @@  void test_atomic_sub_u32(void)
 	int i;
 
 	for (i = 0; i < (CNT / ADD_SUB_CNT); i++)
-		odp_atomic_fetch_sub_u32(&a32u, ADD_SUB_CNT);
+		odp_atomic32_fetch_add_rlx(&a32u, -ADD_SUB_CNT);
 }
 
 void test_atomic_sub_64(void)
@@ -143,19 +106,7 @@  void test_atomic_sub_64(void)
 	int i;
 
 	for (i = 0; i < (CNT / ADD_SUB_CNT); i++)
-		odp_atomic_fetch_sub_u64(&a64u, ADD_SUB_CNT);
-}
-
-void test_atomic_inc_dec_32(void)
-{
-	test_atomic_inc_32();
-	test_atomic_dec_32();
-}
-
-void test_atomic_add_sub_32(void)
-{
-	test_atomic_add_32();
-	test_atomic_sub_32();
+		odp_atomic64_fetch_add_rlx(&a64u, -ADD_SUB_CNT);
 }
 
 void test_atomic_inc_dec_u32(void)
@@ -188,11 +139,6 @@  void test_atomic_add_sub_64(void)
  */
 void test_atomic_basic(void)
 {
-	test_atomic_inc_32();
-	test_atomic_dec_32();
-	test_atomic_add_32();
-	test_atomic_sub_32();
-
 	test_atomic_inc_u32();
 	test_atomic_dec_u32();
 	test_atomic_add_u32();
@@ -206,31 +152,24 @@  void test_atomic_basic(void)
 
 void test_atomic_init(void)
 {
-	odp_atomic_init_int(&a32);
-	odp_atomic_init_u32(&a32u);
-	odp_atomic_init_u64(&a64u);
+	odp_atomic32_store_rlx(&a32u, 0);
+	odp_atomic64_store_rlx(&a64u, 0);
 }
 
 void test_atomic_store(void)
 {
-	odp_atomic_store_int(&a32, S32_INIT_VAL);
-	odp_atomic_store_u32(&a32u, U32_INIT_VAL);
-	odp_atomic_store_u64(&a64u, U64_INIT_VAL);
+	odp_atomic32_store_rlx(&a32u, U32_INIT_VAL);
+	odp_atomic64_store_rlx(&a64u, U64_INIT_VAL);
 }
 
 int test_atomic_validate(void)
 {
-	if (odp_atomic_load_int(&a32) != S32_INIT_VAL) {
-		ODP_ERR("Atomic signed 32 usual functions failed\n");
-		return -1;
-	}
-
-	if (odp_atomic_load_u32(&a32u) != U32_INIT_VAL) {
+	if (odp_atomic32_load_rlx(&a32u) != U32_INIT_VAL) {
 		ODP_ERR("Atomic u32 usual functions failed\n");
 		return -1;
 	}
 
-	if (odp_atomic_load_u64(&a64u) != U64_INIT_VAL) {
+	if (odp_atomic64_load_rlx(&a64u) != U64_INIT_VAL) {
 		ODP_ERR("Atomic u64 usual functions failed\n");
 		return -1;
 	}
@@ -247,11 +186,8 @@  static void *run_thread(void *arg)
 
 	ODP_DBG("Thread %i starts\n", thr);
 
-	odp_atomic_inc_int(&numthrds);
-
-	/* Wait here until all pthreads are created */
-	while (*(volatile int *)&numthrds < parg->numthrds)
-		;
+	/* Wait here until all threads have arrived */
+	odp_barrier_sync(&barrier);
 
 	gettimeofday(&tv0[thr], NULL);
 
@@ -259,12 +195,6 @@  static void *run_thread(void *arg)
 	case TEST_MIX:
 		test_atomic_basic();
 		break;
-	case TEST_INC_DEC_S32:
-		test_atomic_inc_dec_32();
-		break;
-	case TEST_ADD_SUB_S32:
-		test_atomic_add_sub_32();
-		break;
 	case TEST_INC_DEC_U32:
 		test_atomic_inc_dec_u32();
 		break;
@@ -327,7 +257,6 @@  int main(int argc, char *argv[])
 	if (pthrdnum == 0)
 		pthrdnum = odp_sys_core_count();
 
-	odp_atomic_init_int(&numthrds);
 	test_atomic_init();
 	test_atomic_store();
 
@@ -342,6 +271,7 @@  int main(int argc, char *argv[])
 		usage();
 		goto err_exit;
 	}
+	odp_barrier_init(&barrier, pthrdnum);
 	odp_test_thread_create(run_thread, &thrdarg);
 
 	odp_test_thread_exit(&thrdarg);
diff --git a/test/api_test/odp_atomic_test.h b/test/api_test/odp_atomic_test.h
index 7814da5..aaa9d34 100644
--- a/test/api_test/odp_atomic_test.h
+++ b/test/api_test/odp_atomic_test.h
@@ -18,14 +18,11 @@ 
 #define ADD_SUB_CNT	5
 
 #define	CNT 500000
-#define	S32_INIT_VAL	(1UL << 10)
 #define	U32_INIT_VAL	(1UL << 10)
 #define	U64_INIT_VAL	(1ULL << 33)
 
 typedef enum {
 	TEST_MIX = 1, /* Must be first test case num */
-	TEST_INC_DEC_S32,
-	TEST_ADD_SUB_S32,
 	TEST_INC_DEC_U32,
 	TEST_ADD_SUB_U32,
 	TEST_INC_DEC_64,
@@ -34,16 +31,10 @@  typedef enum {
 } odp_test_atomic_t;
 
 
-void test_atomic_inc_dec_32(void);
-void test_atomic_add_sub_32(void);
 void test_atomic_inc_dec_u32(void);
 void test_atomic_add_sub_u32(void);
 void test_atomic_inc_dec_64(void);
 void test_atomic_add_sub_64(void);
-void test_atomic_inc_32(void);
-void test_atomic_dec_32(void);
-void test_atomic_add_32(void);
-void test_atomic_sub_32(void);
 void test_atomic_inc_u32(void);
 void test_atomic_dec_u32(void);
 void test_atomic_add_u32(void);