diff mbox series

[057/114] target/arm: Move sve zip high_ofs into simd_data

Message ID 20220527181907.189259-58-richard.henderson@linaro.org
State Superseded
Headers show
Series target/arm: Rewrite sve feature tests | expand

Commit Message

Richard Henderson May 27, 2022, 6:18 p.m. UTC
This is in line with how we treat uzp, and will
eliminate the special case code during translation.

Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 target/arm/sve_helper.c    |  6 ++++--
 target/arm/translate-sve.c | 12 ++++++------
 2 files changed, 10 insertions(+), 8 deletions(-)

Comments

Idan Horowitz Oct. 27, 2022, 5:29 p.m. UTC | #1
On Fri, 27 May 2022 at 22:05, Richard Henderson
<richard.henderson@linaro.org> wrote:
>
> This is in line with how we treat uzp, and will
> eliminate the special case code during translation.
>
> Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
> ---

Sorry, a bit late, but I believe this change broke the implementation
of the ZIP2 SVE instructions:

>  target/arm/sve_helper.c    |  6 ++++--
>  target/arm/translate-sve.c | 12 ++++++------
>  2 files changed, 10 insertions(+), 8 deletions(-)
>
> diff --git a/target/arm/sve_helper.c b/target/arm/sve_helper.c
> index e0f9aa9983..3bdcd4ce9d 100644
> --- a/target/arm/sve_helper.c
> +++ b/target/arm/sve_helper.c
> @@ -3382,6 +3382,7 @@ void HELPER(sve_punpk_p)(void *vd, void *vn, uint32_t pred_desc)
>  void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)       \
>  {                                                                    \
>      intptr_t oprsz = simd_oprsz(desc);                               \
> +    intptr_t odd_ofs = simd_data(desc);                              \
>      intptr_t i, oprsz_2 = oprsz / 2;                                 \
>      ARMVectorReg tmp_n, tmp_m;                                       \
>      /* We produce output faster than we consume input.               \
> @@ -3393,8 +3394,9 @@ void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)       \
>          vm = memcpy(&tmp_m, vm, oprsz_2);                            \
>      }                                                                \

Unlike the for loop below, the two memcpy calls above do not take
odd_ofs into account. As an optimization, these memcpy calls only copy
half of the vector, so when the instruction uses the same vector
register for two of the operands and odd_ofs is non-zero, we end up
reading the upper bytes of tmp_n/tmp_m, which are uninitialized
garbage. (Ironically, that is a good thing in this case, since the
non-deterministic incorrect behaviour was easier to find than
deterministic wrong results would have been.)

>      for (i = 0; i < oprsz_2; i += sizeof(TYPE)) {                    \
> -        *(TYPE *)(vd + H(2 * i + 0)) = *(TYPE *)(vn + H(i));         \
> -        *(TYPE *)(vd + H(2 * i + sizeof(TYPE))) = *(TYPE *)(vm + H(i)); \
> +        *(TYPE *)(vd + H(2 * i + 0)) = *(TYPE *)(vn + odd_ofs + H(i)); \
> +        *(TYPE *)(vd + H(2 * i + sizeof(TYPE))) =                    \
> +            *(TYPE *)(vm + odd_ofs + H(i));                          \
>      }                                                                \
>      if (sizeof(TYPE) == 16 && unlikely(oprsz & 16)) {                \
>          memset(vd + oprsz - 16, 0, 16);                              \
> diff --git a/target/arm/translate-sve.c b/target/arm/translate-sve.c
> index 1e6bcedb9d..c2ced3e2bb 100644
> --- a/target/arm/translate-sve.c
> +++ b/target/arm/translate-sve.c
> @@ -2298,9 +2298,9 @@ static bool do_zip(DisasContext *s, arg_rrr_esz *a, bool high)
>          unsigned vsz = vec_full_reg_size(s);
>          unsigned high_ofs = high ? vsz / 2 : 0;
>          tcg_gen_gvec_3_ool(vec_full_reg_offset(s, a->rd),
> -                           vec_full_reg_offset(s, a->rn) + high_ofs,
> -                           vec_full_reg_offset(s, a->rm) + high_ofs,
> -                           vsz, vsz, 0, fns[a->esz]);
> +                           vec_full_reg_offset(s, a->rn),
> +                           vec_full_reg_offset(s, a->rm),
> +                           vsz, vsz, high_ofs, fns[a->esz]);
>      }
>      return true;
>  }
> @@ -2324,9 +2324,9 @@ static bool do_zip_q(DisasContext *s, arg_rrr_esz *a, bool high)
>          unsigned vsz = vec_full_reg_size(s);
>          unsigned high_ofs = high ? QEMU_ALIGN_DOWN(vsz, 32) / 2 : 0;
>          tcg_gen_gvec_3_ool(vec_full_reg_offset(s, a->rd),
> -                           vec_full_reg_offset(s, a->rn) + high_ofs,
> -                           vec_full_reg_offset(s, a->rm) + high_ofs,
> -                           vsz, vsz, 0, gen_helper_sve2_zip_q);
> +                           vec_full_reg_offset(s, a->rn),
> +                           vec_full_reg_offset(s, a->rm),
> +                           vsz, vsz, high_ofs, gen_helper_sve2_zip_q);
>      }
>      return true;
>  }
> --
> 2.34.1
>
>

Best Regards, Idan Horowitz
diff mbox series

Patch

diff --git a/target/arm/sve_helper.c b/target/arm/sve_helper.c
index e0f9aa9983..3bdcd4ce9d 100644
--- a/target/arm/sve_helper.c
+++ b/target/arm/sve_helper.c
@@ -3382,6 +3382,7 @@  void HELPER(sve_punpk_p)(void *vd, void *vn, uint32_t pred_desc)
 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)       \
 {                                                                    \
     intptr_t oprsz = simd_oprsz(desc);                               \
+    intptr_t odd_ofs = simd_data(desc);                              \
     intptr_t i, oprsz_2 = oprsz / 2;                                 \
     ARMVectorReg tmp_n, tmp_m;                                       \
     /* We produce output faster than we consume input.               \
@@ -3393,8 +3394,9 @@  void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)       \
         vm = memcpy(&tmp_m, vm, oprsz_2);                            \
     }                                                                \
     for (i = 0; i < oprsz_2; i += sizeof(TYPE)) {                    \
-        *(TYPE *)(vd + H(2 * i + 0)) = *(TYPE *)(vn + H(i));         \
-        *(TYPE *)(vd + H(2 * i + sizeof(TYPE))) = *(TYPE *)(vm + H(i)); \
+        *(TYPE *)(vd + H(2 * i + 0)) = *(TYPE *)(vn + odd_ofs + H(i)); \
+        *(TYPE *)(vd + H(2 * i + sizeof(TYPE))) =                    \
+            *(TYPE *)(vm + odd_ofs + H(i));                          \
     }                                                                \
     if (sizeof(TYPE) == 16 && unlikely(oprsz & 16)) {                \
         memset(vd + oprsz - 16, 0, 16);                              \
diff --git a/target/arm/translate-sve.c b/target/arm/translate-sve.c
index 1e6bcedb9d..c2ced3e2bb 100644
--- a/target/arm/translate-sve.c
+++ b/target/arm/translate-sve.c
@@ -2298,9 +2298,9 @@  static bool do_zip(DisasContext *s, arg_rrr_esz *a, bool high)
         unsigned vsz = vec_full_reg_size(s);
         unsigned high_ofs = high ? vsz / 2 : 0;
         tcg_gen_gvec_3_ool(vec_full_reg_offset(s, a->rd),
-                           vec_full_reg_offset(s, a->rn) + high_ofs,
-                           vec_full_reg_offset(s, a->rm) + high_ofs,
-                           vsz, vsz, 0, fns[a->esz]);
+                           vec_full_reg_offset(s, a->rn),
+                           vec_full_reg_offset(s, a->rm),
+                           vsz, vsz, high_ofs, fns[a->esz]);
     }
     return true;
 }
@@ -2324,9 +2324,9 @@  static bool do_zip_q(DisasContext *s, arg_rrr_esz *a, bool high)
         unsigned vsz = vec_full_reg_size(s);
         unsigned high_ofs = high ? QEMU_ALIGN_DOWN(vsz, 32) / 2 : 0;
         tcg_gen_gvec_3_ool(vec_full_reg_offset(s, a->rd),
-                           vec_full_reg_offset(s, a->rn) + high_ofs,
-                           vec_full_reg_offset(s, a->rm) + high_ofs,
-                           vsz, vsz, 0, gen_helper_sve2_zip_q);
+                           vec_full_reg_offset(s, a->rn),
+                           vec_full_reg_offset(s, a->rm),
+                           vsz, vsz, high_ofs, gen_helper_sve2_zip_q);
     }
     return true;
 }