Message ID | 20170428120958.17526-10-petri.savolainen@linaro.org |
---|---|
State | Accepted |
Commit | fbe34c754b7abe15100779ce9833a7f200517d9f |
Headers | show |
Series | Use HW time counter | expand |
On 04/28 15:09:56, Petri Savolainen wrote: > Use 64 bit HW time counter when available. It is used on > x86 when invariant TSC CPU flag indicates that TSC frequency > is constant. Otherwise, the system time is used as before. Direct > HW time counter usage avoids system call, and related latency > and performance issues. > > Signed-off-by: Petri Savolainen <petri.savolainen@linaro.org> > --- > platform/linux-generic/Makefile.am | 1 + > platform/linux-generic/arch/arm/odp_cpu_arch.c | 16 ++ > platform/linux-generic/arch/default/odp_cpu_arch.c | 16 ++ > platform/linux-generic/arch/mips64/odp_cpu_arch.c | 16 ++ > platform/linux-generic/arch/powerpc/odp_cpu_arch.c | 16 ++ > platform/linux-generic/arch/x86/cpu_flags.c | 9 + > platform/linux-generic/arch/x86/odp_cpu_arch.c | 59 ++++ > .../include/odp/api/plat/time_types.h | 23 +- > platform/linux-generic/include/odp_time_internal.h | 24 ++ > platform/linux-generic/odp_time.c | 300 ++++++++++++++++----- > 10 files changed, 415 insertions(+), 65 deletions(-) > create mode 100644 platform/linux-generic/include/odp_time_internal.h > > diff --git a/platform/linux-generic/Makefile.am b/platform/linux-generic/Makefile.am > index ab74c14c..cd7afba2 100644 > --- a/platform/linux-generic/Makefile.am > +++ b/platform/linux-generic/Makefile.am > @@ -172,6 +172,7 @@ noinst_HEADERS = \ > ${srcdir}/include/odp_schedule_if.h \ > ${srcdir}/include/odp_sorted_list_internal.h \ > ${srcdir}/include/odp_shm_internal.h \ > + ${srcdir}/include/odp_time_internal.h \ > ${srcdir}/include/odp_timer_internal.h \ > ${srcdir}/include/odp_timer_wheel_internal.h \ > ${srcdir}/include/odp_traffic_mngr_internal.h \ > diff --git a/platform/linux-generic/arch/arm/odp_cpu_arch.c b/platform/linux-generic/arch/arm/odp_cpu_arch.c > index 2ac223e0..c31f9084 100644 > --- a/platform/linux-generic/arch/arm/odp_cpu_arch.c > +++ b/platform/linux-generic/arch/arm/odp_cpu_arch.c > @@ -13,6 +13,7 @@ > #include <odp/api/hints.h> > #include <odp/api/system_info.h> > 
#include <odp_debug_internal.h> > +#include <odp_time_internal.h> > > #define GIGA 1000000000 > > @@ -46,3 +47,18 @@ uint64_t odp_cpu_cycles_resolution(void) > { > return 1; > } > + > +int cpu_has_global_time(void) > +{ > + return 0; > +} > + > +uint64_t cpu_global_time(void) > +{ > + return 0; > +} > + > +uint64_t cpu_global_time_freq(void) > +{ > + return 0; > +} > diff --git a/platform/linux-generic/arch/default/odp_cpu_arch.c b/platform/linux-generic/arch/default/odp_cpu_arch.c > index 2ac223e0..c31f9084 100644 > --- a/platform/linux-generic/arch/default/odp_cpu_arch.c > +++ b/platform/linux-generic/arch/default/odp_cpu_arch.c > @@ -13,6 +13,7 @@ > #include <odp/api/hints.h> > #include <odp/api/system_info.h> > #include <odp_debug_internal.h> > +#include <odp_time_internal.h> > > #define GIGA 1000000000 > > @@ -46,3 +47,18 @@ uint64_t odp_cpu_cycles_resolution(void) > { > return 1; > } > + > +int cpu_has_global_time(void) > +{ > + return 0; > +} > + > +uint64_t cpu_global_time(void) > +{ > + return 0; > +} > + > +uint64_t cpu_global_time_freq(void) > +{ > + return 0; > +} > diff --git a/platform/linux-generic/arch/mips64/odp_cpu_arch.c b/platform/linux-generic/arch/mips64/odp_cpu_arch.c > index 646acf9c..f7eafa0f 100644 > --- a/platform/linux-generic/arch/mips64/odp_cpu_arch.c > +++ b/platform/linux-generic/arch/mips64/odp_cpu_arch.c > @@ -7,6 +7,7 @@ > #include <odp/api/cpu.h> > #include <odp/api/hints.h> > #include <odp/api/system_info.h> > +#include <odp_time_internal.h> > > uint64_t odp_cpu_cycles(void) > { > @@ -29,3 +30,18 @@ uint64_t odp_cpu_cycles_resolution(void) > { > return 1; > } > + > +int cpu_has_global_time(void) > +{ > + return 0; > +} > + > +uint64_t cpu_global_time(void) > +{ > + return 0; > +} > + > +uint64_t cpu_global_time_freq(void) > +{ > + return 0; > +} > diff --git a/platform/linux-generic/arch/powerpc/odp_cpu_arch.c b/platform/linux-generic/arch/powerpc/odp_cpu_arch.c > index 2ac223e0..c31f9084 100644 > --- 
a/platform/linux-generic/arch/powerpc/odp_cpu_arch.c > +++ b/platform/linux-generic/arch/powerpc/odp_cpu_arch.c > @@ -13,6 +13,7 @@ > #include <odp/api/hints.h> > #include <odp/api/system_info.h> > #include <odp_debug_internal.h> > +#include <odp_time_internal.h> > > #define GIGA 1000000000 > > @@ -46,3 +47,18 @@ uint64_t odp_cpu_cycles_resolution(void) > { > return 1; > } > + > +int cpu_has_global_time(void) > +{ > + return 0; > +} > + > +uint64_t cpu_global_time(void) > +{ > + return 0; > +} > + > +uint64_t cpu_global_time_freq(void) > +{ > + return 0; > +} > diff --git a/platform/linux-generic/arch/x86/cpu_flags.c b/platform/linux-generic/arch/x86/cpu_flags.c > index 954dac27..a492a35b 100644 > --- a/platform/linux-generic/arch/x86/cpu_flags.c > +++ b/platform/linux-generic/arch/x86/cpu_flags.c > @@ -39,6 +39,7 @@ > > #include <arch/x86/cpu_flags.h> > #include <odp_debug_internal.h> > +#include <odp_time_internal.h> > #include <stdio.h> > #include <stdint.h> > > @@ -357,3 +358,11 @@ void cpu_flags_print_all(void) > str[len] = '\0'; > ODP_PRINT("%s", str); > } > + > +int cpu_has_global_time(void) > +{ > + if (cpu_get_flag_enabled(RTE_CPUFLAG_INVTSC) > 0) > + return 1; > + > + return 0; > +} > diff --git a/platform/linux-generic/arch/x86/odp_cpu_arch.c b/platform/linux-generic/arch/x86/odp_cpu_arch.c > index c8cf27b6..9ba601a3 100644 > --- a/platform/linux-generic/arch/x86/odp_cpu_arch.c > +++ b/platform/linux-generic/arch/x86/odp_cpu_arch.c > @@ -3,7 +3,14 @@ > * > * SPDX-License-Identifier: BSD-3-Clause > */ > + > +#include <odp_posix_extensions.h> > + > #include <odp/api/cpu.h> > +#include <odp_time_internal.h> > +#include <odp_debug_internal.h> > + > +#include <time.h> > > uint64_t odp_cpu_cycles(void) > { > @@ -31,3 +38,55 @@ uint64_t odp_cpu_cycles_resolution(void) > { > return 1; > } > + > +uint64_t cpu_global_time(void) > +{ > + return odp_cpu_cycles(); > +} > + > +#define SEC_IN_NS 1000000000ULL > + > +/* Measure TSC frequency. 
Frequency information registers are defined for x86, > + * but those are often not enumerated. */ > +uint64_t cpu_global_time_freq(void) > +{ > + struct timespec sleep, ts1, ts2; > + uint64_t t1, t2, ts_nsec, cycles, hz; > + int i; > + uint64_t avg = 0; > + int rounds = 4; > + > + for (i = 0; i < rounds; i++) { > + sleep.tv_sec = 0; > + sleep.tv_nsec = SEC_IN_NS / 10; > + > + if (clock_gettime(CLOCK_MONOTONIC_RAW, &ts1)) { > + ODP_DBG("clock_gettime failed\n"); > + return 0; > + } > + > + t1 = cpu_global_time(); > + > + if (nanosleep(&sleep, NULL) < 0) { > + ODP_DBG("nanosleep failed\n"); > + return 0; > + } > + > + if (clock_gettime(CLOCK_MONOTONIC_RAW, &ts2)) { > + ODP_DBG("clock_gettime failed\n"); > + return 0; > + } > + > + t2 = cpu_global_time(); > + > + ts_nsec = (ts2.tv_sec - ts1.tv_sec) * SEC_IN_NS; > + ts_nsec += ts2.tv_nsec - ts1.tv_nsec; > + > + cycles = t2 - t1; > + > + hz = (cycles * SEC_IN_NS) / ts_nsec; > + avg += hz; > + } > + > + return avg / rounds; > +} > diff --git a/platform/linux-generic/include/odp/api/plat/time_types.h b/platform/linux-generic/include/odp/api/plat/time_types.h > index 4847f3b1..1cafb1f7 100644 > --- a/platform/linux-generic/include/odp/api/plat/time_types.h > +++ b/platform/linux-generic/include/odp/api/plat/time_types.h > @@ -26,11 +26,28 @@ extern "C" { > * the linux timespec structure, which is dependent on POSIX extension level. 
> */ > typedef struct odp_time_t { > - int64_t tv_sec; /**< @internal Seconds */ > - int64_t tv_nsec; /**< @internal Nanoseconds */ > + union { > + /** @internal Posix timespec */ > + struct { > + /** @internal Seconds */ > + int64_t tv_sec; > + > + /** @internal Nanoseconds */ > + int64_t tv_nsec; > + } spec; > + > + /** @internal HW time counter */ > + struct { > + /** @internal Counter value */ > + uint64_t count; > + > + /** @internal Reserved */ > + uint64_t reserved; > + } hw; > + }; > } odp_time_t; > > -#define ODP_TIME_NULL ((odp_time_t){0, 0}) > +#define ODP_TIME_NULL ((odp_time_t){.spec = {0, 0} }) > > /** > * @} > diff --git a/platform/linux-generic/include/odp_time_internal.h b/platform/linux-generic/include/odp_time_internal.h > new file mode 100644 > index 00000000..99ac7977 > --- /dev/null > +++ b/platform/linux-generic/include/odp_time_internal.h > @@ -0,0 +1,24 @@ > +/* Copyright (c) 2017, Linaro Limited > + * All rights reserved. > + * > + * SPDX-License-Identifier: BSD-3-Clause > + */ > + > +#ifndef ODP_TIME_INTERNAL_H_ > +#define ODP_TIME_INTERNAL_H_ > + > +#ifdef __cplusplus > +extern "C" { > +#endif > + > +#include <stdint.h> > + > +int cpu_has_global_time(void); > +uint64_t cpu_global_time(void); > +uint64_t cpu_global_time_freq(void); > + > +#ifdef __cplusplus > +} > +#endif > + > +#endif > diff --git a/platform/linux-generic/odp_time.c b/platform/linux-generic/odp_time.c > index 0e5966c0..ac82175d 100644 > --- a/platform/linux-generic/odp_time.c > +++ b/platform/linux-generic/odp_time.c > @@ -10,36 +10,39 @@ > #include <odp/api/time.h> > #include <odp/api/hints.h> > #include <odp_debug_internal.h> > +#include <odp_time_internal.h> > +#include <string.h> > +#include <inttypes.h> > > -static odp_time_t start_time; > +typedef struct time_global_t { > + odp_time_t start_time; > + int use_hw; > + uint64_t hw_start; > + uint64_t hw_freq_hz; > +} time_global_t; > > -static inline > -uint64_t time_to_ns(odp_time_t time) > -{ > - uint64_t ns; > - > 
- ns = time.tv_sec * ODP_TIME_SEC_IN_NS; > - ns += time.tv_nsec; > +static time_global_t global; > > - return ns; > -} > +/* > + * Posix timespec based functions > + */ > > -static inline odp_time_t time_diff(odp_time_t t2, odp_time_t t1) > +static inline odp_time_t time_spec_diff(odp_time_t t2, odp_time_t t1) > { > odp_time_t time; > > - time.tv_sec = t2.tv_sec - t1.tv_sec; > - time.tv_nsec = t2.tv_nsec - t1.tv_nsec; > + time.spec.tv_sec = t2.spec.tv_sec - t1.spec.tv_sec; > + time.spec.tv_nsec = t2.spec.tv_nsec - t1.spec.tv_nsec; > > - if (time.tv_nsec < 0) { > - time.tv_nsec += ODP_TIME_SEC_IN_NS; > - --time.tv_sec; > + if (time.spec.tv_nsec < 0) { > + time.spec.tv_nsec += ODP_TIME_SEC_IN_NS; > + --time.spec.tv_sec; > } > > return time; > } > > -static inline odp_time_t time_local(void) > +static inline odp_time_t time_spec_cur(void) > { > int ret; > odp_time_t time; > @@ -49,77 +52,234 @@ static inline odp_time_t time_local(void) > if (odp_unlikely(ret != 0)) > ODP_ABORT("clock_gettime failed\n"); > > - time.tv_sec = sys_time.tv_sec; > - time.tv_nsec = sys_time.tv_nsec; > + time.spec.tv_sec = sys_time.tv_sec; > + time.spec.tv_nsec = sys_time.tv_nsec; > > - return time_diff(time, start_time); > + return time_spec_diff(time, global.start_time); > } > > -static inline int time_cmp(odp_time_t t2, odp_time_t t1) > +static inline uint64_t time_spec_res(void) > { > - if (t2.tv_sec < t1.tv_sec) > + int ret; > + struct timespec tres; > + > + ret = clock_getres(CLOCK_MONOTONIC_RAW, &tres); > + if (odp_unlikely(ret != 0)) > + ODP_ABORT("clock_getres failed\n"); > + > + return ODP_TIME_SEC_IN_NS / (uint64_t)tres.tv_nsec; > +} > + > +static inline int time_spec_cmp(odp_time_t t2, odp_time_t t1) > +{ > + if (t2.spec.tv_sec < t1.spec.tv_sec) > return -1; > > - if (t2.tv_sec > t1.tv_sec) > + if (t2.spec.tv_sec > t1.spec.tv_sec) > return 1; > > - return t2.tv_nsec - t1.tv_nsec; > + return t2.spec.tv_nsec - t1.spec.tv_nsec; > } > > -static inline odp_time_t time_sum(odp_time_t 
t1, odp_time_t t2) > +static inline odp_time_t time_spec_sum(odp_time_t t1, odp_time_t t2) > { > odp_time_t time; > > - time.tv_sec = t2.tv_sec + t1.tv_sec; > - time.tv_nsec = t2.tv_nsec + t1.tv_nsec; > + time.spec.tv_sec = t2.spec.tv_sec + t1.spec.tv_sec; > + time.spec.tv_nsec = t2.spec.tv_nsec + t1.spec.tv_nsec; > > - if (time.tv_nsec >= (long)ODP_TIME_SEC_IN_NS) { > - time.tv_nsec -= ODP_TIME_SEC_IN_NS; > - ++time.tv_sec; > + if (time.spec.tv_nsec >= (long)ODP_TIME_SEC_IN_NS) { > + time.spec.tv_nsec -= ODP_TIME_SEC_IN_NS; > + ++time.spec.tv_sec; > } > > return time; > } > > -static inline odp_time_t time_local_from_ns(uint64_t ns) > +static inline uint64_t time_spec_to_ns(odp_time_t time) > +{ > + uint64_t ns; > + > + ns = time.spec.tv_sec * ODP_TIME_SEC_IN_NS; > + ns += time.spec.tv_nsec; > + > + return ns; > +} > + > +static inline odp_time_t time_spec_from_ns(uint64_t ns) > { > odp_time_t time; > > - time.tv_sec = ns / ODP_TIME_SEC_IN_NS; > - time.tv_nsec = ns - time.tv_sec * ODP_TIME_SEC_IN_NS; > + time.spec.tv_sec = ns / ODP_TIME_SEC_IN_NS; > + time.spec.tv_nsec = ns - time.spec.tv_sec * ODP_TIME_SEC_IN_NS; > > return time; > } > > -static inline void time_wait_until(odp_time_t time) > +/* > + * HW time counter based functions > + */ > + > +static inline odp_time_t time_hw_cur(void) > { > - odp_time_t cur; > + odp_time_t time; > > - do { > - cur = time_local(); > - } while (time_cmp(time, cur) > 0); > + time.hw.count = cpu_global_time() - global.hw_start; Computing the offset is unnecessarily expensive. The simplest and lowest overhead solution is to just store the value read from HW and convert at a later point in time. But, this no longer represents what odp_time_t represents. That is why I introduced odp_tick_t in the timer RFC and design doc posted to the list *several* times. 
> + > + return time; > } > > -static inline uint64_t time_local_res(void) > +static inline uint64_t time_hw_res(void) > { > - int ret; > - struct timespec tres; > + /* Promise a bit lower resolution than average cycle counter > + * frequency */ > + return global.hw_freq_hz / 10; > +} > > - ret = clock_getres(CLOCK_MONOTONIC_RAW, &tres); > - if (odp_unlikely(ret != 0)) > - ODP_ABORT("clock_getres failed\n"); > +static inline int time_hw_cmp(odp_time_t t2, odp_time_t t1) > +{ > + if (odp_likely(t2.hw.count > t1.hw.count)) > + return 1; > > - return ODP_TIME_SEC_IN_NS / (uint64_t)tres.tv_nsec; > + if (t2.hw.count < t1.hw.count) > + return -1; > + > + return 0; > +} > + > +static inline odp_time_t time_hw_diff(odp_time_t t2, odp_time_t t1) > +{ > + odp_time_t time; > + > + time.hw.count = t2.hw.count - t1.hw.count; > + > + return time; > +} > + > +static inline odp_time_t time_hw_sum(odp_time_t t1, odp_time_t t2) > +{ > + odp_time_t time; > + > + time.hw.count = t1.hw.count + t2.hw.count; > + > + return time; > +} > + > +static inline uint64_t time_hw_to_ns(odp_time_t time) > +{ > + uint64_t nsec; > + uint64_t freq_hz = global.hw_freq_hz; > + uint64_t count = time.hw.count; > + uint64_t sec = 0; > + > + if (count >= freq_hz) { > + sec = count / freq_hz; > + count = count - sec * freq_hz; > + } > + > + nsec = (ODP_TIME_SEC_IN_NS * count) / freq_hz; > + > + return (sec * ODP_TIME_SEC_IN_NS) + nsec; > +} > + > +static inline odp_time_t time_hw_from_ns(uint64_t ns) > +{ > + odp_time_t time; > + uint64_t count; > + uint64_t freq_hz = global.hw_freq_hz; > + uint64_t sec = 0; > + > + if (ns >= ODP_TIME_SEC_IN_NS) { > + sec = ns / ODP_TIME_SEC_IN_NS; > + ns = ns - sec * ODP_TIME_SEC_IN_NS; > + } > + > + count = sec * freq_hz; > + count += (ns * freq_hz) / ODP_TIME_SEC_IN_NS; > + > + time.hw.reserved = 0; > + time.hw.count = count; > + > + return time; > +} > + > +/* > + * Common functions > + */ > + > +static inline odp_time_t time_cur(void) > +{ > + if (global.use_hw) > + 
return time_hw_cur(); > + > + return time_spec_cur(); > +} > + > +static inline uint64_t time_res(void) > +{ > + if (global.use_hw) > + return time_hw_res(); > + > + return time_spec_res(); > +} > + > +static inline int time_cmp(odp_time_t t2, odp_time_t t1) > +{ > + if (global.use_hw) > + return time_hw_cmp(t2, t1); > + > + return time_spec_cmp(t2, t1); > +} > + > +static inline odp_time_t time_diff(odp_time_t t2, odp_time_t t1) > +{ > + if (global.use_hw) > + return time_hw_diff(t2, t1); > + > + return time_spec_diff(t2, t1); > +} > + > +static inline odp_time_t time_sum(odp_time_t t1, odp_time_t t2) > +{ > + if (global.use_hw) > + return time_hw_sum(t1, t2); > + > + return time_spec_sum(t1, t2); > +} > + > +static inline uint64_t time_to_ns(odp_time_t time) > +{ > + if (global.use_hw) > + return time_hw_to_ns(time); > + > + return time_spec_to_ns(time); > +} > + > +static inline odp_time_t time_from_ns(uint64_t ns) > +{ > + if (global.use_hw) > + return time_hw_from_ns(ns); > + > + return time_spec_from_ns(ns); > +} > + > +static inline void time_wait_until(odp_time_t time) > +{ > + odp_time_t cur; > + > + do { > + cur = time_cur(); > + } while (time_cmp(time, cur) > 0); > } > > odp_time_t odp_time_local(void) > { > - return time_local(); > + return time_cur(); > } > > odp_time_t odp_time_global(void) > { > - return time_local(); > + return time_cur(); > } > > odp_time_t odp_time_diff(odp_time_t t2, odp_time_t t1) > @@ -134,12 +294,12 @@ uint64_t odp_time_to_ns(odp_time_t time) > > odp_time_t odp_time_local_from_ns(uint64_t ns) > { > - return time_local_from_ns(ns); > + return time_from_ns(ns); > } > > odp_time_t odp_time_global_from_ns(uint64_t ns) > { > - return time_local_from_ns(ns); > + return time_from_ns(ns); > } > > int odp_time_cmp(odp_time_t t2, odp_time_t t1) > @@ -154,18 +314,18 @@ odp_time_t odp_time_sum(odp_time_t t1, odp_time_t t2) > > uint64_t odp_time_local_res(void) > { > - return time_local_res(); > + return time_res(); > } > > uint64_t 
odp_time_global_res(void) > { > - return time_local_res(); > + return time_res(); > } > > void odp_time_wait_ns(uint64_t ns) > { > - odp_time_t cur = time_local(); > - odp_time_t wait = time_local_from_ns(ns); > + odp_time_t cur = time_cur(); > + odp_time_t wait = time_from_ns(ns); > odp_time_t end_time = time_sum(cur, wait); > > time_wait_until(end_time); > @@ -178,15 +338,31 @@ void odp_time_wait_until(odp_time_t time) > > int odp_time_init_global(void) > { > - int ret; > - struct timespec time; > - > - ret = clock_gettime(CLOCK_MONOTONIC_RAW, &time); > - if (ret) { > - start_time = ODP_TIME_NULL; > - } else { > - start_time.tv_sec = time.tv_sec; > - start_time.tv_nsec = time.tv_nsec; > + struct timespec sys_time; > + int ret = 0; > + > + memset(&global, 0, sizeof(time_global_t)); > + > + if (cpu_has_global_time()) { > + global.use_hw = 1; > + global.hw_freq_hz = cpu_global_time_freq(); > + > + if (global.hw_freq_hz == 0) > + return -1; > + > + printf("HW time counter freq: %" PRIu64 " hz\n\n", > + global.hw_freq_hz); > + > + global.hw_start = cpu_global_time(); > + return 0; > + } > + > + global.start_time = ODP_TIME_NULL; > + > + ret = clock_gettime(CLOCK_MONOTONIC_RAW, &sys_time); > + if (ret == 0) { > + global.start_time.spec.tv_sec = sys_time.tv_sec; > + global.start_time.spec.tv_nsec = sys_time.tv_nsec; > } > > return ret; > -- > 2.11.0 >
> > +static inline odp_time_t time_hw_cur(void) > > { > > - odp_time_t cur; > > + odp_time_t time; > > > > - do { > > - cur = time_local(); > > - } while (time_cmp(time, cur) > 0); > > + time.hw.count = cpu_global_time() - global.hw_start; > > Computing the offset is unnecessarily expensive. The simplest and lowest > overhead solution is to just store the value read from HW and convert > at a later point in time. But, this no longer represents what odp_time_t > represents. That is why I introduced odp_tick_t in the timer RFC and > design doc posted to the list *several* times. The purpose of this set is not to change the API, but to optimize the implementation. The optimal solution would be to zero the HW counter in ODP time init and then return the register value here. If this function is called often, global.hw_start stays in L1 cache (it's a constant) and the overhead of the subtraction is a matter of a CPU cycle or two. An API change for a CPU cycle or two is not economical. It does not matter much in practice whether the subtraction is done here or during conversion to nsec. The other changes of this set matter more: TSC counter vs system call, and 128 bits vs 64 bits storage (for memory footprint). Also, further optimizations are always possible, but this level of change is needed for the current API to use the HW counter and pack timespec into 64 bits. For example, a next step could be to inline these functions. -Petri
On 05/02 16:03:45, Savolainen, Petri (Nokia - FI/Espoo) wrote: > > > +static inline odp_time_t time_hw_cur(void) > > > { > > > - odp_time_t cur; > > > + odp_time_t time; > > > > > > - do { > > > - cur = time_local(); > > > - } while (time_cmp(time, cur) > 0); > > > + time.hw.count = cpu_global_time() - global.hw_start; > > > > Computing the offset is unnecessarily expensive. The simplest and lowest > > overhead solution is to just store the value read from HW and convert > > at a later point in time. But, this no longer represents what odp_time_t > > represents. That is why I introduced odp_tick_t in the timer RFC and > > design doc posted to the list *several* times. > > Purpose of this set is not to change the API, but optimize the implementation. The point is that you can do better if you don't focus on optimizing what exists today. > The most optimal solution would be to zero the HW counter in ODP time init and then here return the register value. That is only if you think about the previous design. > If this function is called often, global.hw_start stays in L1 cache (it's a constant) and overhead of the subtract is a matter of a CPU cycle or two. API change for a CPU cycle or two is not economical. It does not matter too much in practice, if the subtract is done here or during conversion to nsec. The other changes of this set matter more: TSC counter vs system call, and 128 bits vs 64 bits storage (for memory foot print). Also further optimizations are always possible, but this level of changes are needed for the current API to use HW counter and pack timespec into 64 bits. For example, a next step could be to inline these functions. Why didn't you articulate these concerns earlier? There have been plenty of times where I have posted code, a doc, and benchmarks indicating that I am active in this area. I can understand that apps are not flexible enough to adapt to an API change, but that should not gate contributions and collaboration. > -Petri > >
diff --git a/platform/linux-generic/Makefile.am b/platform/linux-generic/Makefile.am index ab74c14c..cd7afba2 100644 --- a/platform/linux-generic/Makefile.am +++ b/platform/linux-generic/Makefile.am @@ -172,6 +172,7 @@ noinst_HEADERS = \ ${srcdir}/include/odp_schedule_if.h \ ${srcdir}/include/odp_sorted_list_internal.h \ ${srcdir}/include/odp_shm_internal.h \ + ${srcdir}/include/odp_time_internal.h \ ${srcdir}/include/odp_timer_internal.h \ ${srcdir}/include/odp_timer_wheel_internal.h \ ${srcdir}/include/odp_traffic_mngr_internal.h \ diff --git a/platform/linux-generic/arch/arm/odp_cpu_arch.c b/platform/linux-generic/arch/arm/odp_cpu_arch.c index 2ac223e0..c31f9084 100644 --- a/platform/linux-generic/arch/arm/odp_cpu_arch.c +++ b/platform/linux-generic/arch/arm/odp_cpu_arch.c @@ -13,6 +13,7 @@ #include <odp/api/hints.h> #include <odp/api/system_info.h> #include <odp_debug_internal.h> +#include <odp_time_internal.h> #define GIGA 1000000000 @@ -46,3 +47,18 @@ uint64_t odp_cpu_cycles_resolution(void) { return 1; } + +int cpu_has_global_time(void) +{ + return 0; +} + +uint64_t cpu_global_time(void) +{ + return 0; +} + +uint64_t cpu_global_time_freq(void) +{ + return 0; +} diff --git a/platform/linux-generic/arch/default/odp_cpu_arch.c b/platform/linux-generic/arch/default/odp_cpu_arch.c index 2ac223e0..c31f9084 100644 --- a/platform/linux-generic/arch/default/odp_cpu_arch.c +++ b/platform/linux-generic/arch/default/odp_cpu_arch.c @@ -13,6 +13,7 @@ #include <odp/api/hints.h> #include <odp/api/system_info.h> #include <odp_debug_internal.h> +#include <odp_time_internal.h> #define GIGA 1000000000 @@ -46,3 +47,18 @@ uint64_t odp_cpu_cycles_resolution(void) { return 1; } + +int cpu_has_global_time(void) +{ + return 0; +} + +uint64_t cpu_global_time(void) +{ + return 0; +} + +uint64_t cpu_global_time_freq(void) +{ + return 0; +} diff --git a/platform/linux-generic/arch/mips64/odp_cpu_arch.c b/platform/linux-generic/arch/mips64/odp_cpu_arch.c index 646acf9c..f7eafa0f 100644 
--- a/platform/linux-generic/arch/mips64/odp_cpu_arch.c +++ b/platform/linux-generic/arch/mips64/odp_cpu_arch.c @@ -7,6 +7,7 @@ #include <odp/api/cpu.h> #include <odp/api/hints.h> #include <odp/api/system_info.h> +#include <odp_time_internal.h> uint64_t odp_cpu_cycles(void) { @@ -29,3 +30,18 @@ uint64_t odp_cpu_cycles_resolution(void) { return 1; } + +int cpu_has_global_time(void) +{ + return 0; +} + +uint64_t cpu_global_time(void) +{ + return 0; +} + +uint64_t cpu_global_time_freq(void) +{ + return 0; +} diff --git a/platform/linux-generic/arch/powerpc/odp_cpu_arch.c b/platform/linux-generic/arch/powerpc/odp_cpu_arch.c index 2ac223e0..c31f9084 100644 --- a/platform/linux-generic/arch/powerpc/odp_cpu_arch.c +++ b/platform/linux-generic/arch/powerpc/odp_cpu_arch.c @@ -13,6 +13,7 @@ #include <odp/api/hints.h> #include <odp/api/system_info.h> #include <odp_debug_internal.h> +#include <odp_time_internal.h> #define GIGA 1000000000 @@ -46,3 +47,18 @@ uint64_t odp_cpu_cycles_resolution(void) { return 1; } + +int cpu_has_global_time(void) +{ + return 0; +} + +uint64_t cpu_global_time(void) +{ + return 0; +} + +uint64_t cpu_global_time_freq(void) +{ + return 0; +} diff --git a/platform/linux-generic/arch/x86/cpu_flags.c b/platform/linux-generic/arch/x86/cpu_flags.c index 954dac27..a492a35b 100644 --- a/platform/linux-generic/arch/x86/cpu_flags.c +++ b/platform/linux-generic/arch/x86/cpu_flags.c @@ -39,6 +39,7 @@ #include <arch/x86/cpu_flags.h> #include <odp_debug_internal.h> +#include <odp_time_internal.h> #include <stdio.h> #include <stdint.h> @@ -357,3 +358,11 @@ void cpu_flags_print_all(void) str[len] = '\0'; ODP_PRINT("%s", str); } + +int cpu_has_global_time(void) +{ + if (cpu_get_flag_enabled(RTE_CPUFLAG_INVTSC) > 0) + return 1; + + return 0; +} diff --git a/platform/linux-generic/arch/x86/odp_cpu_arch.c b/platform/linux-generic/arch/x86/odp_cpu_arch.c index c8cf27b6..9ba601a3 100644 --- a/platform/linux-generic/arch/x86/odp_cpu_arch.c +++ 
b/platform/linux-generic/arch/x86/odp_cpu_arch.c @@ -3,7 +3,14 @@ * * SPDX-License-Identifier: BSD-3-Clause */ + +#include <odp_posix_extensions.h> + #include <odp/api/cpu.h> +#include <odp_time_internal.h> +#include <odp_debug_internal.h> + +#include <time.h> uint64_t odp_cpu_cycles(void) { @@ -31,3 +38,55 @@ uint64_t odp_cpu_cycles_resolution(void) { return 1; } + +uint64_t cpu_global_time(void) +{ + return odp_cpu_cycles(); +} + +#define SEC_IN_NS 1000000000ULL + +/* Measure TSC frequency. Frequency information registers are defined for x86, + * but those are often not enumerated. */ +uint64_t cpu_global_time_freq(void) +{ + struct timespec sleep, ts1, ts2; + uint64_t t1, t2, ts_nsec, cycles, hz; + int i; + uint64_t avg = 0; + int rounds = 4; + + for (i = 0; i < rounds; i++) { + sleep.tv_sec = 0; + sleep.tv_nsec = SEC_IN_NS / 10; + + if (clock_gettime(CLOCK_MONOTONIC_RAW, &ts1)) { + ODP_DBG("clock_gettime failed\n"); + return 0; + } + + t1 = cpu_global_time(); + + if (nanosleep(&sleep, NULL) < 0) { + ODP_DBG("nanosleep failed\n"); + return 0; + } + + if (clock_gettime(CLOCK_MONOTONIC_RAW, &ts2)) { + ODP_DBG("clock_gettime failed\n"); + return 0; + } + + t2 = cpu_global_time(); + + ts_nsec = (ts2.tv_sec - ts1.tv_sec) * SEC_IN_NS; + ts_nsec += ts2.tv_nsec - ts1.tv_nsec; + + cycles = t2 - t1; + + hz = (cycles * SEC_IN_NS) / ts_nsec; + avg += hz; + } + + return avg / rounds; +} diff --git a/platform/linux-generic/include/odp/api/plat/time_types.h b/platform/linux-generic/include/odp/api/plat/time_types.h index 4847f3b1..1cafb1f7 100644 --- a/platform/linux-generic/include/odp/api/plat/time_types.h +++ b/platform/linux-generic/include/odp/api/plat/time_types.h @@ -26,11 +26,28 @@ extern "C" { * the linux timespec structure, which is dependent on POSIX extension level. 
*/ typedef struct odp_time_t { - int64_t tv_sec; /**< @internal Seconds */ - int64_t tv_nsec; /**< @internal Nanoseconds */ + union { + /** @internal Posix timespec */ + struct { + /** @internal Seconds */ + int64_t tv_sec; + + /** @internal Nanoseconds */ + int64_t tv_nsec; + } spec; + + /** @internal HW time counter */ + struct { + /** @internal Counter value */ + uint64_t count; + + /** @internal Reserved */ + uint64_t reserved; + } hw; + }; } odp_time_t; -#define ODP_TIME_NULL ((odp_time_t){0, 0}) +#define ODP_TIME_NULL ((odp_time_t){.spec = {0, 0} }) /** * @} diff --git a/platform/linux-generic/include/odp_time_internal.h b/platform/linux-generic/include/odp_time_internal.h new file mode 100644 index 00000000..99ac7977 --- /dev/null +++ b/platform/linux-generic/include/odp_time_internal.h @@ -0,0 +1,24 @@ +/* Copyright (c) 2017, Linaro Limited + * All rights reserved. + * + * SPDX-License-Identifier: BSD-3-Clause + */ + +#ifndef ODP_TIME_INTERNAL_H_ +#define ODP_TIME_INTERNAL_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +#include <stdint.h> + +int cpu_has_global_time(void); +uint64_t cpu_global_time(void); +uint64_t cpu_global_time_freq(void); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/platform/linux-generic/odp_time.c b/platform/linux-generic/odp_time.c index 0e5966c0..ac82175d 100644 --- a/platform/linux-generic/odp_time.c +++ b/platform/linux-generic/odp_time.c @@ -10,36 +10,39 @@ #include <odp/api/time.h> #include <odp/api/hints.h> #include <odp_debug_internal.h> +#include <odp_time_internal.h> +#include <string.h> +#include <inttypes.h> -static odp_time_t start_time; +typedef struct time_global_t { + odp_time_t start_time; + int use_hw; + uint64_t hw_start; + uint64_t hw_freq_hz; +} time_global_t; -static inline -uint64_t time_to_ns(odp_time_t time) -{ - uint64_t ns; - - ns = time.tv_sec * ODP_TIME_SEC_IN_NS; - ns += time.tv_nsec; +static time_global_t global; - return ns; -} +/* + * Posix timespec based functions + */ -static inline 
odp_time_t time_diff(odp_time_t t2, odp_time_t t1) +static inline odp_time_t time_spec_diff(odp_time_t t2, odp_time_t t1) { odp_time_t time; - time.tv_sec = t2.tv_sec - t1.tv_sec; - time.tv_nsec = t2.tv_nsec - t1.tv_nsec; + time.spec.tv_sec = t2.spec.tv_sec - t1.spec.tv_sec; + time.spec.tv_nsec = t2.spec.tv_nsec - t1.spec.tv_nsec; - if (time.tv_nsec < 0) { - time.tv_nsec += ODP_TIME_SEC_IN_NS; - --time.tv_sec; + if (time.spec.tv_nsec < 0) { + time.spec.tv_nsec += ODP_TIME_SEC_IN_NS; + --time.spec.tv_sec; } return time; } -static inline odp_time_t time_local(void) +static inline odp_time_t time_spec_cur(void) { int ret; odp_time_t time; @@ -49,77 +52,234 @@ static inline odp_time_t time_local(void) if (odp_unlikely(ret != 0)) ODP_ABORT("clock_gettime failed\n"); - time.tv_sec = sys_time.tv_sec; - time.tv_nsec = sys_time.tv_nsec; + time.spec.tv_sec = sys_time.tv_sec; + time.spec.tv_nsec = sys_time.tv_nsec; - return time_diff(time, start_time); + return time_spec_diff(time, global.start_time); } -static inline int time_cmp(odp_time_t t2, odp_time_t t1) +static inline uint64_t time_spec_res(void) { - if (t2.tv_sec < t1.tv_sec) + int ret; + struct timespec tres; + + ret = clock_getres(CLOCK_MONOTONIC_RAW, &tres); + if (odp_unlikely(ret != 0)) + ODP_ABORT("clock_getres failed\n"); + + return ODP_TIME_SEC_IN_NS / (uint64_t)tres.tv_nsec; +} + +static inline int time_spec_cmp(odp_time_t t2, odp_time_t t1) +{ + if (t2.spec.tv_sec < t1.spec.tv_sec) return -1; - if (t2.tv_sec > t1.tv_sec) + if (t2.spec.tv_sec > t1.spec.tv_sec) return 1; - return t2.tv_nsec - t1.tv_nsec; + return t2.spec.tv_nsec - t1.spec.tv_nsec; } -static inline odp_time_t time_sum(odp_time_t t1, odp_time_t t2) +static inline odp_time_t time_spec_sum(odp_time_t t1, odp_time_t t2) { odp_time_t time; - time.tv_sec = t2.tv_sec + t1.tv_sec; - time.tv_nsec = t2.tv_nsec + t1.tv_nsec; + time.spec.tv_sec = t2.spec.tv_sec + t1.spec.tv_sec; + time.spec.tv_nsec = t2.spec.tv_nsec + t1.spec.tv_nsec; - if (time.tv_nsec >= 
(long)ODP_TIME_SEC_IN_NS) { - time.tv_nsec -= ODP_TIME_SEC_IN_NS; - ++time.tv_sec; + if (time.spec.tv_nsec >= (long)ODP_TIME_SEC_IN_NS) { + time.spec.tv_nsec -= ODP_TIME_SEC_IN_NS; + ++time.spec.tv_sec; } return time; } -static inline odp_time_t time_local_from_ns(uint64_t ns) +static inline uint64_t time_spec_to_ns(odp_time_t time) +{ + uint64_t ns; + + ns = time.spec.tv_sec * ODP_TIME_SEC_IN_NS; + ns += time.spec.tv_nsec; + + return ns; +} + +static inline odp_time_t time_spec_from_ns(uint64_t ns) { odp_time_t time; - time.tv_sec = ns / ODP_TIME_SEC_IN_NS; - time.tv_nsec = ns - time.tv_sec * ODP_TIME_SEC_IN_NS; + time.spec.tv_sec = ns / ODP_TIME_SEC_IN_NS; + time.spec.tv_nsec = ns - time.spec.tv_sec * ODP_TIME_SEC_IN_NS; return time; } -static inline void time_wait_until(odp_time_t time) +/* + * HW time counter based functions + */ + +static inline odp_time_t time_hw_cur(void) { - odp_time_t cur; + odp_time_t time; - do { - cur = time_local(); - } while (time_cmp(time, cur) > 0); + time.hw.count = cpu_global_time() - global.hw_start; + + return time; } -static inline uint64_t time_local_res(void) +static inline uint64_t time_hw_res(void) { - int ret; - struct timespec tres; + /* Promise a bit lower resolution than average cycle counter + * frequency */ + return global.hw_freq_hz / 10; +} - ret = clock_getres(CLOCK_MONOTONIC_RAW, &tres); - if (odp_unlikely(ret != 0)) - ODP_ABORT("clock_getres failed\n"); +static inline int time_hw_cmp(odp_time_t t2, odp_time_t t1) +{ + if (odp_likely(t2.hw.count > t1.hw.count)) + return 1; - return ODP_TIME_SEC_IN_NS / (uint64_t)tres.tv_nsec; + if (t2.hw.count < t1.hw.count) + return -1; + + return 0; +} + +static inline odp_time_t time_hw_diff(odp_time_t t2, odp_time_t t1) +{ + odp_time_t time; + + time.hw.count = t2.hw.count - t1.hw.count; + + return time; +} + +static inline odp_time_t time_hw_sum(odp_time_t t1, odp_time_t t2) +{ + odp_time_t time; + + time.hw.count = t1.hw.count + t2.hw.count; + + return time; +} + +static 
inline uint64_t time_hw_to_ns(odp_time_t time) +{ + uint64_t nsec; + uint64_t freq_hz = global.hw_freq_hz; + uint64_t count = time.hw.count; + uint64_t sec = 0; + + if (count >= freq_hz) { + sec = count / freq_hz; + count = count - sec * freq_hz; + } + + nsec = (ODP_TIME_SEC_IN_NS * count) / freq_hz; + + return (sec * ODP_TIME_SEC_IN_NS) + nsec; +} + +static inline odp_time_t time_hw_from_ns(uint64_t ns) +{ + odp_time_t time; + uint64_t count; + uint64_t freq_hz = global.hw_freq_hz; + uint64_t sec = 0; + + if (ns >= ODP_TIME_SEC_IN_NS) { + sec = ns / ODP_TIME_SEC_IN_NS; + ns = ns - sec * ODP_TIME_SEC_IN_NS; + } + + count = sec * freq_hz; + count += (ns * freq_hz) / ODP_TIME_SEC_IN_NS; + + time.hw.reserved = 0; + time.hw.count = count; + + return time; +} + +/* + * Common functions + */ + +static inline odp_time_t time_cur(void) +{ + if (global.use_hw) + return time_hw_cur(); + + return time_spec_cur(); +} + +static inline uint64_t time_res(void) +{ + if (global.use_hw) + return time_hw_res(); + + return time_spec_res(); +} + +static inline int time_cmp(odp_time_t t2, odp_time_t t1) +{ + if (global.use_hw) + return time_hw_cmp(t2, t1); + + return time_spec_cmp(t2, t1); +} + +static inline odp_time_t time_diff(odp_time_t t2, odp_time_t t1) +{ + if (global.use_hw) + return time_hw_diff(t2, t1); + + return time_spec_diff(t2, t1); +} + +static inline odp_time_t time_sum(odp_time_t t1, odp_time_t t2) +{ + if (global.use_hw) + return time_hw_sum(t1, t2); + + return time_spec_sum(t1, t2); +} + +static inline uint64_t time_to_ns(odp_time_t time) +{ + if (global.use_hw) + return time_hw_to_ns(time); + + return time_spec_to_ns(time); +} + +static inline odp_time_t time_from_ns(uint64_t ns) +{ + if (global.use_hw) + return time_hw_from_ns(ns); + + return time_spec_from_ns(ns); +} + +static inline void time_wait_until(odp_time_t time) +{ + odp_time_t cur; + + do { + cur = time_cur(); + } while (time_cmp(time, cur) > 0); } odp_time_t odp_time_local(void) { - return time_local(); 
+ return time_cur(); } odp_time_t odp_time_global(void) { - return time_local(); + return time_cur(); } odp_time_t odp_time_diff(odp_time_t t2, odp_time_t t1) @@ -134,12 +294,12 @@ uint64_t odp_time_to_ns(odp_time_t time) odp_time_t odp_time_local_from_ns(uint64_t ns) { - return time_local_from_ns(ns); + return time_from_ns(ns); } odp_time_t odp_time_global_from_ns(uint64_t ns) { - return time_local_from_ns(ns); + return time_from_ns(ns); } int odp_time_cmp(odp_time_t t2, odp_time_t t1) @@ -154,18 +314,18 @@ odp_time_t odp_time_sum(odp_time_t t1, odp_time_t t2) uint64_t odp_time_local_res(void) { - return time_local_res(); + return time_res(); } uint64_t odp_time_global_res(void) { - return time_local_res(); + return time_res(); } void odp_time_wait_ns(uint64_t ns) { - odp_time_t cur = time_local(); - odp_time_t wait = time_local_from_ns(ns); + odp_time_t cur = time_cur(); + odp_time_t wait = time_from_ns(ns); odp_time_t end_time = time_sum(cur, wait); time_wait_until(end_time); @@ -178,15 +338,31 @@ void odp_time_wait_until(odp_time_t time) int odp_time_init_global(void) { - int ret; - struct timespec time; - - ret = clock_gettime(CLOCK_MONOTONIC_RAW, &time); - if (ret) { - start_time = ODP_TIME_NULL; - } else { - start_time.tv_sec = time.tv_sec; - start_time.tv_nsec = time.tv_nsec; + struct timespec sys_time; + int ret = 0; + + memset(&global, 0, sizeof(time_global_t)); + + if (cpu_has_global_time()) { + global.use_hw = 1; + global.hw_freq_hz = cpu_global_time_freq(); + + if (global.hw_freq_hz == 0) + return -1; + + printf("HW time counter freq: %" PRIu64 " hz\n\n", + global.hw_freq_hz); + + global.hw_start = cpu_global_time(); + return 0; + } + + global.start_time = ODP_TIME_NULL; + + ret = clock_gettime(CLOCK_MONOTONIC_RAW, &sys_time); + if (ret == 0) { + global.start_time.spec.tv_sec = sys_time.tv_sec; + global.start_time.spec.tv_nsec = sys_time.tv_nsec; } return ret;
Use the 64-bit HW time counter when available. It is used on x86 when the invariant TSC CPU flag indicates that the TSC frequency is constant. Otherwise, the system time is used as before. Using the HW time counter directly avoids a system call and the related latency and performance issues. Signed-off-by: Petri Savolainen <petri.savolainen@linaro.org> --- platform/linux-generic/Makefile.am | 1 + platform/linux-generic/arch/arm/odp_cpu_arch.c | 16 ++ platform/linux-generic/arch/default/odp_cpu_arch.c | 16 ++ platform/linux-generic/arch/mips64/odp_cpu_arch.c | 16 ++ platform/linux-generic/arch/powerpc/odp_cpu_arch.c | 16 ++ platform/linux-generic/arch/x86/cpu_flags.c | 9 + platform/linux-generic/arch/x86/odp_cpu_arch.c | 59 ++++ .../include/odp/api/plat/time_types.h | 23 +- platform/linux-generic/include/odp_time_internal.h | 24 ++ platform/linux-generic/odp_time.c | 300 ++++++++++++++++----- 10 files changed, 415 insertions(+), 65 deletions(-) create mode 100644 platform/linux-generic/include/odp_time_internal.h -- 2.11.0