Message ID | 20220527181907.189259-58-richard.henderson@linaro.org
---|---
State | Superseded
Series | target/arm: Rewrite sve feature tests
On Fri, 27 May 2022 at 22:05, Richard Henderson
<richard.henderson@linaro.org> wrote:
>
> This is in line with how we treat uzp, and will
> eliminate the special case code during translation.
>
> Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
> ---

Sorry, a bit late, but I believe this change broke the implementation
of the ZIP2 SVE instructions:

>  target/arm/sve_helper.c    |  6 ++++--
>  target/arm/translate-sve.c | 12 ++++++------
>  2 files changed, 10 insertions(+), 8 deletions(-)
>
> diff --git a/target/arm/sve_helper.c b/target/arm/sve_helper.c
> index e0f9aa9983..3bdcd4ce9d 100644
> --- a/target/arm/sve_helper.c
> +++ b/target/arm/sve_helper.c
> @@ -3382,6 +3382,7 @@ void HELPER(sve_punpk_p)(void *vd, void *vn, uint32_t pred_desc)
>  void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)       \
>  {                                                                    \
>      intptr_t oprsz = simd_oprsz(desc);                               \
> +    intptr_t odd_ofs = simd_data(desc);                              \
>      intptr_t i, oprsz_2 = oprsz / 2;                                 \
>      ARMVectorReg tmp_n, tmp_m;                                       \
>      /* We produce output faster than we consume input.               \
> @@ -3393,8 +3394,9 @@ void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)       \
>          vm = memcpy(&tmp_m, vm, oprsz_2);                            \
>      }                                                                \

Unlike the for loop below, we are not taking the odd_ofs into account
in the 2 memcpys above. As an optimization these memcpys only copy half
of the vector, so when the instruction is using the same vector register
for 2 of the operands, and odd_ofs is not 0, we end up reading the upper
bytes of tmp_n/tmp_m, which are uninitialized garbage.
(Which is ironically a good thing in this case, since non-deterministic
incorrect behaviour was easier to find than deterministic wrong results)

>      for (i = 0; i < oprsz_2; i += sizeof(TYPE)) {                    \
> -        *(TYPE *)(vd + H(2 * i + 0)) = *(TYPE *)(vn + H(i));         \
> -        *(TYPE *)(vd + H(2 * i + sizeof(TYPE))) = *(TYPE *)(vm + H(i)); \
> +        *(TYPE *)(vd + H(2 * i + 0)) = *(TYPE *)(vn + odd_ofs + H(i)); \
> +        *(TYPE *)(vd + H(2 * i + sizeof(TYPE))) =                    \
> +            *(TYPE *)(vm + odd_ofs + H(i));                          \
>      }                                                                \
>      if (sizeof(TYPE) == 16 && unlikely(oprsz & 16)) {                \
>          memset(vd + oprsz - 16, 0, 16);                              \
> diff --git a/target/arm/translate-sve.c b/target/arm/translate-sve.c
> index 1e6bcedb9d..c2ced3e2bb 100644
> --- a/target/arm/translate-sve.c
> +++ b/target/arm/translate-sve.c
> @@ -2298,9 +2298,9 @@ static bool do_zip(DisasContext *s, arg_rrr_esz *a, bool high)
>          unsigned vsz = vec_full_reg_size(s);
>          unsigned high_ofs = high ? vsz / 2 : 0;
>          tcg_gen_gvec_3_ool(vec_full_reg_offset(s, a->rd),
> -                           vec_full_reg_offset(s, a->rn) + high_ofs,
> -                           vec_full_reg_offset(s, a->rm) + high_ofs,
> -                           vsz, vsz, 0, fns[a->esz]);
> +                           vec_full_reg_offset(s, a->rn),
> +                           vec_full_reg_offset(s, a->rm),
> +                           vsz, vsz, high_ofs, fns[a->esz]);
>      }
>      return true;
>  }
> @@ -2324,9 +2324,9 @@ static bool do_zip_q(DisasContext *s, arg_rrr_esz *a, bool high)
>          unsigned vsz = vec_full_reg_size(s);
>          unsigned high_ofs = high ? QEMU_ALIGN_DOWN(vsz, 32) / 2 : 0;
>          tcg_gen_gvec_3_ool(vec_full_reg_offset(s, a->rd),
> -                           vec_full_reg_offset(s, a->rn) + high_ofs,
> -                           vec_full_reg_offset(s, a->rm) + high_ofs,
> -                           vsz, vsz, 0, gen_helper_sve2_zip_q);
> +                           vec_full_reg_offset(s, a->rn),
> +                           vec_full_reg_offset(s, a->rm),
> +                           vsz, vsz, high_ofs, gen_helper_sve2_zip_q);
>      }
>      return true;
>  }
> --
> 2.34.1
>

Best Regards,
Idan Horowitz
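[Editor's note: a minimal, self-contained C model of the failure mode described above. It is not QEMU code; `zip_buggy`, the 16-byte vector size `VSZ`, and the byte element size are invented for illustration.]

```c
#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define VSZ 16  /* illustrative vector size in bytes */

/* Simplified model of the helper: interleave elements starting at
 * odd_ofs in n and m into d (odd_ofs == VSZ / 2 models ZIP2). */
static void zip_buggy(uint8_t *d, uint8_t *n, uint8_t *m, size_t odd_ofs)
{
    uint8_t tmp_n[VSZ], tmp_m[VSZ];
    size_t i, half = VSZ / 2;

    /* The defensive copies only save the *low* half of each aliased
     * input, mirroring the memcpy(..., oprsz_2) in the real helper. */
    if (d == n) {
        n = memcpy(tmp_n, n, half);
    }
    if (d == m) {
        m = memcpy(tmp_m, m, half);
    }
    for (i = 0; i < half; i++) {
        /* With odd_ofs != 0 these reads land in the high half, which
         * for an aliased operand is uninitialized tmp_n/tmp_m bytes. */
        d[2 * i + 0] = n[odd_ofs + i];
        d[2 * i + 1] = m[odd_ofs + i];
    }
}

int main(void)
{
    uint8_t v[VSZ];
    for (int i = 0; i < VSZ; i++) {
        v[i] = i;
    }
    /* Models ZIP2 z0.b, z0.b, z0.b: destination aliases both sources. */
    zip_buggy(v, v, v, VSZ / 2);
    for (int i = 0; i < VSZ; i++) {
        printf("%02x ", v[i]);
    }
    printf("\n");
    return 0;
}
```

With the destination aliasing both sources, this prints indeterminate bytes where `08 08 09 09 ... 0f 0f` would be expected, which is the non-deterministic behaviour the report mentions.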
This is in line with how we treat uzp, and will
eliminate the special case code during translation.

Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 target/arm/sve_helper.c    |  6 ++++--
 target/arm/translate-sve.c | 12 ++++++------
 2 files changed, 10 insertions(+), 8 deletions(-)

diff --git a/target/arm/sve_helper.c b/target/arm/sve_helper.c
index e0f9aa9983..3bdcd4ce9d 100644
--- a/target/arm/sve_helper.c
+++ b/target/arm/sve_helper.c
@@ -3382,6 +3382,7 @@ void HELPER(sve_punpk_p)(void *vd, void *vn, uint32_t pred_desc)
 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)       \
 {                                                                    \
     intptr_t oprsz = simd_oprsz(desc);                               \
+    intptr_t odd_ofs = simd_data(desc);                              \
     intptr_t i, oprsz_2 = oprsz / 2;                                 \
     ARMVectorReg tmp_n, tmp_m;                                       \
     /* We produce output faster than we consume input.               \
@@ -3393,8 +3394,9 @@ void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)       \
         vm = memcpy(&tmp_m, vm, oprsz_2);                            \
     }                                                                \
     for (i = 0; i < oprsz_2; i += sizeof(TYPE)) {                    \
-        *(TYPE *)(vd + H(2 * i + 0)) = *(TYPE *)(vn + H(i));         \
-        *(TYPE *)(vd + H(2 * i + sizeof(TYPE))) = *(TYPE *)(vm + H(i)); \
+        *(TYPE *)(vd + H(2 * i + 0)) = *(TYPE *)(vn + odd_ofs + H(i)); \
+        *(TYPE *)(vd + H(2 * i + sizeof(TYPE))) =                    \
+            *(TYPE *)(vm + odd_ofs + H(i));                          \
     }                                                                \
     if (sizeof(TYPE) == 16 && unlikely(oprsz & 16)) {                \
         memset(vd + oprsz - 16, 0, 16);                              \
diff --git a/target/arm/translate-sve.c b/target/arm/translate-sve.c
index 1e6bcedb9d..c2ced3e2bb 100644
--- a/target/arm/translate-sve.c
+++ b/target/arm/translate-sve.c
@@ -2298,9 +2298,9 @@ static bool do_zip(DisasContext *s, arg_rrr_esz *a, bool high)
         unsigned vsz = vec_full_reg_size(s);
         unsigned high_ofs = high ? vsz / 2 : 0;
         tcg_gen_gvec_3_ool(vec_full_reg_offset(s, a->rd),
-                           vec_full_reg_offset(s, a->rn) + high_ofs,
-                           vec_full_reg_offset(s, a->rm) + high_ofs,
-                           vsz, vsz, 0, fns[a->esz]);
+                           vec_full_reg_offset(s, a->rn),
+                           vec_full_reg_offset(s, a->rm),
+                           vsz, vsz, high_ofs, fns[a->esz]);
     }
     return true;
 }
@@ -2324,9 +2324,9 @@ static bool do_zip_q(DisasContext *s, arg_rrr_esz *a, bool high)
         unsigned vsz = vec_full_reg_size(s);
         unsigned high_ofs = high ? QEMU_ALIGN_DOWN(vsz, 32) / 2 : 0;
         tcg_gen_gvec_3_ool(vec_full_reg_offset(s, a->rd),
-                           vec_full_reg_offset(s, a->rn) + high_ofs,
-                           vec_full_reg_offset(s, a->rm) + high_ofs,
-                           vsz, vsz, 0, gen_helper_sve2_zip_q);
+                           vec_full_reg_offset(s, a->rn),
+                           vec_full_reg_offset(s, a->rm),
+                           vsz, vsz, high_ofs, gen_helper_sve2_zip_q);
     }
     return true;
 }
--
2.34.1
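[Editor's note: for completeness, one possible shape of a correction, reusing the illustrative model above. This is a sketch only, under the assumption that widening the defensive copies is acceptable; it is not necessarily the fix that was eventually merged upstream.]

```c
/* Same model as zip_buggy above, but the defensive copies also cover
 * the bytes at odd_ofs, so an aliased operand's high half is saved
 * before the output overwrites it. */
static void zip_fixed(uint8_t *d, uint8_t *n, uint8_t *m, size_t odd_ofs)
{
    uint8_t tmp_n[VSZ], tmp_m[VSZ];
    size_t i, half = VSZ / 2;

    if (d == n) {
        n = memcpy(tmp_n, n, half + odd_ofs);
    }
    if (d == m) {
        m = memcpy(tmp_m, m, half + odd_ofs);
    }
    for (i = 0; i < half; i++) {
        d[2 * i + 0] = n[odd_ofs + i];
        d[2 * i + 1] = m[odd_ofs + i];
    }
}
```

With `odd_ofs` equal to half the register, the copy covers the full register, so `zip_fixed(v, v, v, VSZ / 2)` on the same input produces the expected `08 08 09 09 ... 0f 0f`; with `odd_ofs == 0` (the ZIP1 case) it degenerates to the original half-register copy.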