Message ID | 20240215081449.848220-11-richard.henderson@linaro.org |
---|---|
State | New |
Headers | show |
Series | Optimize buffer_is_zero | expand |
Richard Henderson <richard.henderson@linaro.org> writes: > Signed-off-by: Richard Henderson <richard.henderson@linaro.org> > --- > > RFC because I've not benchmarked this on real hw, only run it > through qemu for validation. I think we have an a64fx is the TCWG lab you could probably run the tests on if you want. Otherwise I might be able to spin up a Graviton on AWS to run a measurement. Do we have a benchmark test to run? > > --- > host/include/aarch64/host/cpuinfo.h | 1 + > util/bufferiszero.c | 49 +++++++++++++++++++++++++++++ > util/cpuinfo-aarch64.c | 1 + > meson.build | 13 ++++++++ > 4 files changed, 64 insertions(+) > > diff --git a/host/include/aarch64/host/cpuinfo.h b/host/include/aarch64/host/cpuinfo.h > index fe671534e4..b4b816cd07 100644 > --- a/host/include/aarch64/host/cpuinfo.h > +++ b/host/include/aarch64/host/cpuinfo.h > @@ -12,6 +12,7 @@ > #define CPUINFO_AES (1u << 3) > #define CPUINFO_PMULL (1u << 4) > #define CPUINFO_BTI (1u << 5) > +#define CPUINFO_SVE (1u << 6) > > /* Initialized with a constructor. */ > extern unsigned cpuinfo; > diff --git a/util/bufferiszero.c b/util/bufferiszero.c > index 2809b09225..af64c9c224 100644 > --- a/util/bufferiszero.c > +++ b/util/bufferiszero.c > @@ -270,13 +270,62 @@ static bool buffer_is_zero_simd(const void *buf, size_t len) > return vaddvq_u32(vceqzq_u32(t0)) == -4; > } > > +#ifdef CONFIG_SVE_OPT > +#include <arm_sve.h> > + > +#ifndef __ARM_FEATURE_SVE > +__attribute__((target("+sve"))) > +#endif > +static bool buffer_is_zero_sve(const void *buf, size_t len) > +{ > + svbool_t p, t = svptrue_b8(); > + size_t i, n; > + > + /* > + * For the first vector, align to 16 -- reading 1 to 256 bytes. > + * Note this routine is only called with len >= 256, which is the > + * architectural maximum vector length: the first vector always fits. > + */ > + i = 0; > + n = QEMU_ALIGN_PTR_DOWN(buf + svcntb(), 16) - buf; > + p = svwhilelt_b8(i, n); > + > + do { > + svuint8_t d = svld1_u8(p, buf + i); > + > + p = svcmpne_n_u8(t, d, 0); > + if (unlikely(svptest_any(t, p))) { > + return false; > + } > + i += n; > + n = svcntb(); > + p = svwhilelt_b8(i, len); > + } while (svptest_any(t, p)); > + > + return true; > +} > +#endif /* CONFIG_SVE_OPT */ > + > static biz_accel_fn const accel_table[] = { > buffer_is_zero_int_ge256, > buffer_is_zero_simd, > +#ifdef CONFIG_SVE_OPT > + buffer_is_zero_sve, > +#endif > }; > > +#ifdef CONFIG_SVE_OPT > +static unsigned accel_index; > +static void __attribute__((constructor)) init_accel(void) > +{ > + accel_index = (cpuinfo & CPUINFO_SVE ? 2 : 1); > + buffer_is_zero_accel = accel_table[accel_index]; > +} > +#define INIT_ACCEL NULL > +#else > static unsigned accel_index = 1; > #define INIT_ACCEL buffer_is_zero_simd > +#endif /* CONFIG_SVE_OPT */ > > bool test_buffer_is_zero_next_accel(void) > { > diff --git a/util/cpuinfo-aarch64.c b/util/cpuinfo-aarch64.c > index 4c8a005715..a1e22ea66e 100644 > --- a/util/cpuinfo-aarch64.c > +++ b/util/cpuinfo-aarch64.c > @@ -61,6 +61,7 @@ unsigned __attribute__((constructor)) cpuinfo_init(void) > info |= (hwcap & HWCAP_USCAT ? CPUINFO_LSE2 : 0); > info |= (hwcap & HWCAP_AES ? CPUINFO_AES : 0); > info |= (hwcap & HWCAP_PMULL ? CPUINFO_PMULL : 0); > + info |= (hwcap & HWCAP_SVE ? CPUINFO_SVE : 0); > > unsigned long hwcap2 = qemu_getauxval(AT_HWCAP2); > info |= (hwcap2 & HWCAP2_BTI ? CPUINFO_BTI : 0); > diff --git a/meson.build b/meson.build > index c1dc83e4c0..89a8241bc0 100644 > --- a/meson.build > +++ b/meson.build > @@ -2822,6 +2822,18 @@ config_host_data.set('CONFIG_ARM_AES_BUILTIN', cc.compiles(''' > void foo(uint8x16_t *p) { *p = vaesmcq_u8(*p); } > ''')) > > +config_host_data.set('CONFIG_SVE_OPT', cc.compiles(''' > + #include <arm_sve.h> > + #ifndef __ARM_FEATURE_SVE > + __attribute__((target("+sve"))) > + #endif > + void foo(void *p) { > + svbool_t t = svptrue_b8(); > + svuint8_t d = svld1_u8(t, p); > + svptest_any(t, svcmpne_n_u8(t, d, 0)); > + } > + ''')) > + > have_pvrdma = get_option('pvrdma') \ > .require(rdma.found(), error_message: 'PVRDMA requires OpenFabrics libraries') \ > .require(cc.compiles(gnu_source_prefix + ''' > @@ -4232,6 +4244,7 @@ summary_info += {'memory allocator': get_option('malloc')} > summary_info += {'avx2 optimization': config_host_data.get('CONFIG_AVX2_OPT')} > summary_info += {'avx512bw optimization': config_host_data.get('CONFIG_AVX512BW_OPT')} > summary_info += {'avx512f optimization': config_host_data.get('CONFIG_AVX512F_OPT')} > +summary_info += {'sve optimization': config_host_data.get('CONFIG_SVE_OPT')} > summary_info += {'gcov': get_option('b_coverage')} > summary_info += {'thread sanitizer': get_option('tsan')} > summary_info += {'CFI support': get_option('cfi')}
Richard Henderson <richard.henderson@linaro.org> writes: > Signed-off-by: Richard Henderson <richard.henderson@linaro.org> > --- > > RFC because I've not benchmarked this on real hw, only run it > through qemu for validation. > <snip> > > +#ifdef CONFIG_SVE_OPT > +static unsigned accel_index; > +static void __attribute__((constructor)) init_accel(void) > +{ > + accel_index = (cpuinfo & CPUINFO_SVE ? 2 : 1); > + buffer_is_zero_accel = accel_table[accel_index]; > +} This really needs to be: - accel_index = (cpuinfo & CPUINFO_SVE ? 2 : 1); + unsigned info = cpuinfo_init(); + accel_index = (info & CPUINFO_SVE ? 2 : 1); because otherwise you are relying on constructor initialisation order and on the Graviton 3 I built on it wasn't detecting the SVE. With that I get this from "perf record ./tests/unit/test-bufferiszero -m thorough" 51.17% test-bufferisze test-bufferiszero [.] buffer_is_zero_sve 18.92% test-bufferisze test-bufferiszero [.] buffer_is_zero_simd 18.02% test-bufferisze test-bufferiszero [.] buffer_is_zero_int_ge256 7.67% test-bufferisze test-bufferiszero [.] buffer_is_zero_ool 4.09% test-bufferisze test-bufferiszero [.] test_1 but as I mentioned before it would be nice to have a proper benchmark for the buffer utils as I'm sure the unit test would be prone to noise.
diff --git a/host/include/aarch64/host/cpuinfo.h b/host/include/aarch64/host/cpuinfo.h index fe671534e4..b4b816cd07 100644 --- a/host/include/aarch64/host/cpuinfo.h +++ b/host/include/aarch64/host/cpuinfo.h @@ -12,6 +12,7 @@ #define CPUINFO_AES (1u << 3) #define CPUINFO_PMULL (1u << 4) #define CPUINFO_BTI (1u << 5) +#define CPUINFO_SVE (1u << 6) /* Initialized with a constructor. */ extern unsigned cpuinfo; diff --git a/util/bufferiszero.c b/util/bufferiszero.c index 2809b09225..af64c9c224 100644 --- a/util/bufferiszero.c +++ b/util/bufferiszero.c @@ -270,13 +270,62 @@ static bool buffer_is_zero_simd(const void *buf, size_t len) return vaddvq_u32(vceqzq_u32(t0)) == -4; } +#ifdef CONFIG_SVE_OPT +#include <arm_sve.h> + +#ifndef __ARM_FEATURE_SVE +__attribute__((target("+sve"))) +#endif +static bool buffer_is_zero_sve(const void *buf, size_t len) +{ + svbool_t p, t = svptrue_b8(); + size_t i, n; + + /* + * For the first vector, align to 16 -- reading 1 to 256 bytes. + * Note this routine is only called with len >= 256, which is the + * architectural maximum vector length: the first vector always fits. + */ + i = 0; + n = QEMU_ALIGN_PTR_DOWN(buf + svcntb(), 16) - buf; + p = svwhilelt_b8(i, n); + + do { + svuint8_t d = svld1_u8(p, buf + i); + + p = svcmpne_n_u8(t, d, 0); + if (unlikely(svptest_any(t, p))) { + return false; + } + i += n; + n = svcntb(); + p = svwhilelt_b8(i, len); + } while (svptest_any(t, p)); + + return true; +} +#endif /* CONFIG_SVE_OPT */ + static biz_accel_fn const accel_table[] = { buffer_is_zero_int_ge256, buffer_is_zero_simd, +#ifdef CONFIG_SVE_OPT + buffer_is_zero_sve, +#endif }; +#ifdef CONFIG_SVE_OPT +static unsigned accel_index; +static void __attribute__((constructor)) init_accel(void) +{ + accel_index = (cpuinfo & CPUINFO_SVE ? 2 : 1); + buffer_is_zero_accel = accel_table[accel_index]; +} +#define INIT_ACCEL NULL +#else static unsigned accel_index = 1; #define INIT_ACCEL buffer_is_zero_simd +#endif /* CONFIG_SVE_OPT */ bool test_buffer_is_zero_next_accel(void) { diff --git a/util/cpuinfo-aarch64.c b/util/cpuinfo-aarch64.c index 4c8a005715..a1e22ea66e 100644 --- a/util/cpuinfo-aarch64.c +++ b/util/cpuinfo-aarch64.c @@ -61,6 +61,7 @@ unsigned __attribute__((constructor)) cpuinfo_init(void) info |= (hwcap & HWCAP_USCAT ? CPUINFO_LSE2 : 0); info |= (hwcap & HWCAP_AES ? CPUINFO_AES : 0); info |= (hwcap & HWCAP_PMULL ? CPUINFO_PMULL : 0); + info |= (hwcap & HWCAP_SVE ? CPUINFO_SVE : 0); unsigned long hwcap2 = qemu_getauxval(AT_HWCAP2); info |= (hwcap2 & HWCAP2_BTI ? CPUINFO_BTI : 0); diff --git a/meson.build b/meson.build index c1dc83e4c0..89a8241bc0 100644 --- a/meson.build +++ b/meson.build @@ -2822,6 +2822,18 @@ config_host_data.set('CONFIG_ARM_AES_BUILTIN', cc.compiles(''' void foo(uint8x16_t *p) { *p = vaesmcq_u8(*p); } ''')) +config_host_data.set('CONFIG_SVE_OPT', cc.compiles(''' + #include <arm_sve.h> + #ifndef __ARM_FEATURE_SVE + __attribute__((target("+sve"))) + #endif + void foo(void *p) { + svbool_t t = svptrue_b8(); + svuint8_t d = svld1_u8(t, p); + svptest_any(t, svcmpne_n_u8(t, d, 0)); + } + ''')) + have_pvrdma = get_option('pvrdma') \ .require(rdma.found(), error_message: 'PVRDMA requires OpenFabrics libraries') \ .require(cc.compiles(gnu_source_prefix + ''' @@ -4232,6 +4244,7 @@ summary_info += {'memory allocator': get_option('malloc')} summary_info += {'avx2 optimization': config_host_data.get('CONFIG_AVX2_OPT')} summary_info += {'avx512bw optimization': config_host_data.get('CONFIG_AVX512BW_OPT')} summary_info += {'avx512f optimization': config_host_data.get('CONFIG_AVX512F_OPT')} +summary_info += {'sve optimization': config_host_data.get('CONFIG_SVE_OPT')} summary_info += {'gcov': get_option('b_coverage')} summary_info += {'thread sanitizer': get_option('tsan')} summary_info += {'CFI support': get_option('cfi')}
Signed-off-by: Richard Henderson <richard.henderson@linaro.org> --- RFC because I've not benchmarked this on real hw, only run it through qemu for validation. --- host/include/aarch64/host/cpuinfo.h | 1 + util/bufferiszero.c | 49 +++++++++++++++++++++++++++++ util/cpuinfo-aarch64.c | 1 + meson.build | 13 ++++++++ 4 files changed, 64 insertions(+)