diff mbox series

[RFC,v4,10/10] util/bufferiszero: Add sve acceleration for aarch64

Message ID 20240215081449.848220-11-richard.henderson@linaro.org
State New
Headers show
Series Optimize buffer_is_zero | expand

Commit Message

Richard Henderson Feb. 15, 2024, 8:14 a.m. UTC
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---

RFC because I've not benchmarked this on real hw, only run it
through qemu for validation.

---
 host/include/aarch64/host/cpuinfo.h |  1 +
 util/bufferiszero.c                 | 49 +++++++++++++++++++++++++++++
 util/cpuinfo-aarch64.c              |  1 +
 meson.build                         | 13 ++++++++
 4 files changed, 64 insertions(+)

Comments

Alex Bennée Feb. 16, 2024, 9:33 a.m. UTC | #1
Richard Henderson <richard.henderson@linaro.org> writes:

> Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
> ---
>
> RFC because I've not benchmarked this on real hw, only run it
> through qemu for validation.

I think we have an a64fx is the TCWG lab you could probably run the
tests on if you want. Otherwise I might be able to spin up a Graviton
on AWS to run a measurement. Do we have a benchmark test to run?

>
> ---
>  host/include/aarch64/host/cpuinfo.h |  1 +
>  util/bufferiszero.c                 | 49 +++++++++++++++++++++++++++++
>  util/cpuinfo-aarch64.c              |  1 +
>  meson.build                         | 13 ++++++++
>  4 files changed, 64 insertions(+)
>
> diff --git a/host/include/aarch64/host/cpuinfo.h b/host/include/aarch64/host/cpuinfo.h
> index fe671534e4..b4b816cd07 100644
> --- a/host/include/aarch64/host/cpuinfo.h
> +++ b/host/include/aarch64/host/cpuinfo.h
> @@ -12,6 +12,7 @@
>  #define CPUINFO_AES             (1u << 3)
>  #define CPUINFO_PMULL           (1u << 4)
>  #define CPUINFO_BTI             (1u << 5)
> +#define CPUINFO_SVE             (1u << 6)
>  
>  /* Initialized with a constructor. */
>  extern unsigned cpuinfo;
> diff --git a/util/bufferiszero.c b/util/bufferiszero.c
> index 2809b09225..af64c9c224 100644
> --- a/util/bufferiszero.c
> +++ b/util/bufferiszero.c
> @@ -270,13 +270,62 @@ static bool buffer_is_zero_simd(const void *buf, size_t len)
>      return vaddvq_u32(vceqzq_u32(t0)) == -4;
>  }
>  
> +#ifdef CONFIG_SVE_OPT
> +#include <arm_sve.h>
> +
> +#ifndef __ARM_FEATURE_SVE
> +__attribute__((target("+sve")))
> +#endif
> +static bool buffer_is_zero_sve(const void *buf, size_t len)
> +{
> +    svbool_t p, t = svptrue_b8();
> +    size_t i, n;
> +
> +    /*
> +     * For the first vector, align to 16 -- reading 1 to 256 bytes.
> +     * Note this routine is only called with len >= 256, which is the
> +     * architectural maximum vector length: the first vector always fits.
> +     */
> +    i = 0;
> +    n = QEMU_ALIGN_PTR_DOWN(buf + svcntb(), 16) - buf;
> +    p = svwhilelt_b8(i, n);
> +
> +    do {
> +        svuint8_t d = svld1_u8(p, buf + i);
> +
> +        p = svcmpne_n_u8(t, d, 0);
> +        if (unlikely(svptest_any(t, p))) {
> +            return false;
> +        }
> +        i += n;
> +        n = svcntb();
> +        p = svwhilelt_b8(i, len);
> +    } while (svptest_any(t, p));
> +
> +    return true;
> +}
> +#endif /* CONFIG_SVE_OPT */
> +
>  static biz_accel_fn const accel_table[] = {
>      buffer_is_zero_int_ge256,
>      buffer_is_zero_simd,
> +#ifdef CONFIG_SVE_OPT
> +    buffer_is_zero_sve,
> +#endif
>  };
>  
> +#ifdef CONFIG_SVE_OPT
> +static unsigned accel_index;
> +static void __attribute__((constructor)) init_accel(void)
> +{
> +    accel_index = (cpuinfo & CPUINFO_SVE ? 2 : 1);
> +    buffer_is_zero_accel = accel_table[accel_index];
> +}
> +#define INIT_ACCEL NULL
> +#else
>  static unsigned accel_index = 1;
>  #define INIT_ACCEL buffer_is_zero_simd
> +#endif /* CONFIG_SVE_OPT */
>  
>  bool test_buffer_is_zero_next_accel(void)
>  {
> diff --git a/util/cpuinfo-aarch64.c b/util/cpuinfo-aarch64.c
> index 4c8a005715..a1e22ea66e 100644
> --- a/util/cpuinfo-aarch64.c
> +++ b/util/cpuinfo-aarch64.c
> @@ -61,6 +61,7 @@ unsigned __attribute__((constructor)) cpuinfo_init(void)
>      info |= (hwcap & HWCAP_USCAT ? CPUINFO_LSE2 : 0);
>      info |= (hwcap & HWCAP_AES ? CPUINFO_AES : 0);
>      info |= (hwcap & HWCAP_PMULL ? CPUINFO_PMULL : 0);
> +    info |= (hwcap & HWCAP_SVE ? CPUINFO_SVE : 0);
>  
>      unsigned long hwcap2 = qemu_getauxval(AT_HWCAP2);
>      info |= (hwcap2 & HWCAP2_BTI ? CPUINFO_BTI : 0);
> diff --git a/meson.build b/meson.build
> index c1dc83e4c0..89a8241bc0 100644
> --- a/meson.build
> +++ b/meson.build
> @@ -2822,6 +2822,18 @@ config_host_data.set('CONFIG_ARM_AES_BUILTIN', cc.compiles('''
>      void foo(uint8x16_t *p) { *p = vaesmcq_u8(*p); }
>    '''))
>  
> +config_host_data.set('CONFIG_SVE_OPT', cc.compiles('''
> +    #include <arm_sve.h>
> +    #ifndef __ARM_FEATURE_SVE
> +    __attribute__((target("+sve")))
> +    #endif
> +    void foo(void *p) {
> +        svbool_t t = svptrue_b8();
> +        svuint8_t d = svld1_u8(t, p);
> +        svptest_any(t, svcmpne_n_u8(t, d, 0));
> +    }
> +  '''))
> +
>  have_pvrdma = get_option('pvrdma') \
>    .require(rdma.found(), error_message: 'PVRDMA requires OpenFabrics libraries') \
>    .require(cc.compiles(gnu_source_prefix + '''
> @@ -4232,6 +4244,7 @@ summary_info += {'memory allocator':  get_option('malloc')}
>  summary_info += {'avx2 optimization': config_host_data.get('CONFIG_AVX2_OPT')}
>  summary_info += {'avx512bw optimization': config_host_data.get('CONFIG_AVX512BW_OPT')}
>  summary_info += {'avx512f optimization': config_host_data.get('CONFIG_AVX512F_OPT')}
> +summary_info += {'sve optimization': config_host_data.get('CONFIG_SVE_OPT')}
>  summary_info += {'gcov':              get_option('b_coverage')}
>  summary_info += {'thread sanitizer':  get_option('tsan')}
>  summary_info += {'CFI support':       get_option('cfi')}
Alex Bennée Feb. 16, 2024, 11:05 a.m. UTC | #2
Richard Henderson <richard.henderson@linaro.org> writes:

> Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
> ---
>
> RFC because I've not benchmarked this on real hw, only run it
> through qemu for validation.
>
<snip>
>  
> +#ifdef CONFIG_SVE_OPT
> +static unsigned accel_index;
> +static void __attribute__((constructor)) init_accel(void)
> +{
> +    accel_index = (cpuinfo & CPUINFO_SVE ? 2 : 1);
> +    buffer_is_zero_accel = accel_table[accel_index];
> +}

This really needs to be:

  -    accel_index = (cpuinfo & CPUINFO_SVE ? 2 : 1);
  +    unsigned info = cpuinfo_init();
  +    accel_index = (info & CPUINFO_SVE ? 2 : 1);

because otherwise you are relying on constructor initialisation order
and on the Graviton 3 I built on it wasn't detecting the SVE. With that I
get this from "perf record ./tests/unit/test-bufferiszero -m thorough"

  51.17%  test-bufferisze  test-bufferiszero      [.] buffer_is_zero_sve
  18.92%  test-bufferisze  test-bufferiszero      [.] buffer_is_zero_simd
  18.02%  test-bufferisze  test-bufferiszero      [.] buffer_is_zero_int_ge256
   7.67%  test-bufferisze  test-bufferiszero      [.] buffer_is_zero_ool
   4.09%  test-bufferisze  test-bufferiszero      [.] test_1

but as I mentioned before it would be nice to have a proper benchmark
for the buffer utils as I'm sure the unit test would be prone to noise.
diff mbox series

Patch

diff --git a/host/include/aarch64/host/cpuinfo.h b/host/include/aarch64/host/cpuinfo.h
index fe671534e4..b4b816cd07 100644
--- a/host/include/aarch64/host/cpuinfo.h
+++ b/host/include/aarch64/host/cpuinfo.h
@@ -12,6 +12,7 @@ 
 #define CPUINFO_AES             (1u << 3)
 #define CPUINFO_PMULL           (1u << 4)
 #define CPUINFO_BTI             (1u << 5)
+#define CPUINFO_SVE             (1u << 6)
 
 /* Initialized with a constructor. */
 extern unsigned cpuinfo;
diff --git a/util/bufferiszero.c b/util/bufferiszero.c
index 2809b09225..af64c9c224 100644
--- a/util/bufferiszero.c
+++ b/util/bufferiszero.c
@@ -270,13 +270,62 @@  static bool buffer_is_zero_simd(const void *buf, size_t len)
     return vaddvq_u32(vceqzq_u32(t0)) == -4;
 }
 
+#ifdef CONFIG_SVE_OPT
+#include <arm_sve.h>
+
+#ifndef __ARM_FEATURE_SVE
+__attribute__((target("+sve")))
+#endif
+static bool buffer_is_zero_sve(const void *buf, size_t len)
+{
+    svbool_t p, t = svptrue_b8();
+    size_t i, n;
+
+    /*
+     * For the first vector, align to 16 -- reading 1 to 256 bytes.
+     * Note this routine is only called with len >= 256, which is the
+     * architectural maximum vector length: the first vector always fits.
+     */
+    i = 0;
+    n = QEMU_ALIGN_PTR_DOWN(buf + svcntb(), 16) - buf;
+    p = svwhilelt_b8(i, n);
+
+    do {
+        svuint8_t d = svld1_u8(p, buf + i);
+
+        p = svcmpne_n_u8(t, d, 0);
+        if (unlikely(svptest_any(t, p))) {
+            return false;
+        }
+        i += n;
+        n = svcntb();
+        p = svwhilelt_b8(i, len);
+    } while (svptest_any(t, p));
+
+    return true;
+}
+#endif /* CONFIG_SVE_OPT */
+
 static biz_accel_fn const accel_table[] = {
     buffer_is_zero_int_ge256,
     buffer_is_zero_simd,
+#ifdef CONFIG_SVE_OPT
+    buffer_is_zero_sve,
+#endif
 };
 
+#ifdef CONFIG_SVE_OPT
+static unsigned accel_index;
+static void __attribute__((constructor)) init_accel(void)
+{
+    accel_index = (cpuinfo & CPUINFO_SVE ? 2 : 1);
+    buffer_is_zero_accel = accel_table[accel_index];
+}
+#define INIT_ACCEL NULL
+#else
 static unsigned accel_index = 1;
 #define INIT_ACCEL buffer_is_zero_simd
+#endif /* CONFIG_SVE_OPT */
 
 bool test_buffer_is_zero_next_accel(void)
 {
diff --git a/util/cpuinfo-aarch64.c b/util/cpuinfo-aarch64.c
index 4c8a005715..a1e22ea66e 100644
--- a/util/cpuinfo-aarch64.c
+++ b/util/cpuinfo-aarch64.c
@@ -61,6 +61,7 @@  unsigned __attribute__((constructor)) cpuinfo_init(void)
     info |= (hwcap & HWCAP_USCAT ? CPUINFO_LSE2 : 0);
     info |= (hwcap & HWCAP_AES ? CPUINFO_AES : 0);
     info |= (hwcap & HWCAP_PMULL ? CPUINFO_PMULL : 0);
+    info |= (hwcap & HWCAP_SVE ? CPUINFO_SVE : 0);
 
     unsigned long hwcap2 = qemu_getauxval(AT_HWCAP2);
     info |= (hwcap2 & HWCAP2_BTI ? CPUINFO_BTI : 0);
diff --git a/meson.build b/meson.build
index c1dc83e4c0..89a8241bc0 100644
--- a/meson.build
+++ b/meson.build
@@ -2822,6 +2822,18 @@  config_host_data.set('CONFIG_ARM_AES_BUILTIN', cc.compiles('''
     void foo(uint8x16_t *p) { *p = vaesmcq_u8(*p); }
   '''))
 
+config_host_data.set('CONFIG_SVE_OPT', cc.compiles('''
+    #include <arm_sve.h>
+    #ifndef __ARM_FEATURE_SVE
+    __attribute__((target("+sve")))
+    #endif
+    void foo(void *p) {
+        svbool_t t = svptrue_b8();
+        svuint8_t d = svld1_u8(t, p);
+        svptest_any(t, svcmpne_n_u8(t, d, 0));
+    }
+  '''))
+
 have_pvrdma = get_option('pvrdma') \
   .require(rdma.found(), error_message: 'PVRDMA requires OpenFabrics libraries') \
   .require(cc.compiles(gnu_source_prefix + '''
@@ -4232,6 +4244,7 @@  summary_info += {'memory allocator':  get_option('malloc')}
 summary_info += {'avx2 optimization': config_host_data.get('CONFIG_AVX2_OPT')}
 summary_info += {'avx512bw optimization': config_host_data.get('CONFIG_AVX512BW_OPT')}
 summary_info += {'avx512f optimization': config_host_data.get('CONFIG_AVX512F_OPT')}
+summary_info += {'sve optimization': config_host_data.get('CONFIG_SVE_OPT')}
 summary_info += {'gcov':              get_option('b_coverage')}
 summary_info += {'thread sanitizer':  get_option('tsan')}
 summary_info += {'CFI support':       get_option('cfi')}