Message ID: 20200918183751.2787647-52-richard.henderson@linaro.org
State:      Superseded
Series:     target/arm: Implement SVE2
On 2020/9/19 2:37, Richard Henderson wrote:
> For SVE, we potentially have a 4th argument coming from the
> movprfx instruction.  Currently we do not optimize movprfx,
> so the problem is not visible.

Hi Richard,

I am a little confused. If the instruction is not immediately preceded by a
MOVPRFX instruction, the addend will still be used. Is that right?

Zhiwei

[...]
On 2020/9/19 2:37, Richard Henderson wrote:
> For SVE, we potentially have a 4th argument coming from the
> movprfx instruction.  Currently we do not optimize movprfx,
> so the problem is not visible.

[...]

> -    for (i = 0; i < opr_sz_8 ; i += 2) {
> +    for (i = 0; i < opr_sz_8; i += 2) {
>          uint64_t d0, d1;
>
> -        d0 = n[i * 4 + 0] * (uint64_t)m_indexed[i * 4 + 0];
> +        d0 = a[i + 0];

Add once.

> +        d0 += n[i * 4 + 0] * (uint64_t)m_indexed[i * 4 + 0];
>          d0 += n[i * 4 + 1] * (uint64_t)m_indexed[i * 4 + 1];
>          d0 += n[i * 4 + 2] * (uint64_t)m_indexed[i * 4 + 2];
>          d0 += n[i * 4 + 3] * (uint64_t)m_indexed[i * 4 + 3];
> -        d1 = n[i * 4 + 4] * (uint64_t)m_indexed[i * 4 + 0];
> +
> +        d1 = a[i + 1];
> +        d1 += n[i * 4 + 4] * (uint64_t)m_indexed[i * 4 + 0];
>          d1 += n[i * 4 + 5] * (uint64_t)m_indexed[i * 4 + 1];
>          d1 += n[i * 4 + 6] * (uint64_t)m_indexed[i * 4 + 2];
>          d1 += n[i * 4 + 7] * (uint64_t)m_indexed[i * 4 + 3];
> @@ -555,7 +570,6 @@ void HELPER(gvec_udot_idx_h)(void *vd, void *vn, void *vm, uint32_t desc)
>          d[i + 0] += d0;

Add twice.

I think it is wrong here. Do you think so?

Zhiwei
On 9/23/20 3:01 AM, LIU Zhiwei wrote:
> On 2020/9/19 2:37, Richard Henderson wrote:
>> For SVE, we potentially have a 4th argument coming from the
>> movprfx instruction.  Currently we do not optimize movprfx,
>> so the problem is not visible.
>
> Hi Richard,
>
> I am a little confused. If the instruction is not immediately preceded by a
> MOVPRFX instruction, the addend will still be used. Is that right?

If movprfx is not used, then the addend register will be the same as the
destination register.

r~
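In other words, when there is no movprfx the translator simply passes rd as
ra, and the 4-operand helper degenerates to the old in-place accumulate. A
minimal, standalone C sketch of that property (the function name and the test
values here are hypothetical illustrations, not the actual QEMU helper):

    #include <stdint.h>
    #include <stdio.h>

    /* Hypothetical stand-in for the byte dot product step, not the QEMU
     * helper itself: per 32-bit lane, d[i] = a[i] + dot(n, m). */
    static void sdot_b_sketch(int32_t *d, const int8_t *n, const int8_t *m,
                              const int32_t *a, int lanes)
    {
        for (int i = 0; i < lanes; ++i) {
            d[i] = a[i]
                 + n[i * 4 + 0] * m[i * 4 + 0]
                 + n[i * 4 + 1] * m[i * 4 + 1]
                 + n[i * 4 + 2] * m[i * 4 + 2]
                 + n[i * 4 + 3] * m[i * 4 + 3];
        }
    }

    int main(void)
    {
        int8_t n[8] = { 1, 1, 1, 1, 2, 2, 2, 2 };
        int8_t m[8] = { 1, 2, 3, 4, 1, 2, 3, 4 };
        int32_t d[2] = { 100, 200 };

        /* Passing d itself as the addend reproduces the old in-place
         * accumulate, which is what the translator does without movprfx. */
        sdot_b_sketch(d, n, m, d, 2);
        printf("%d %d\n", d[0], d[1]);  /* prints: 110 220 */
        return 0;
    }

Lane 0 accumulates 1*1 + 1*2 + 1*3 + 1*4 = 10 onto the old value 100, exactly
as the 3-operand version did.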
On 2020/9/23 22:46, Richard Henderson wrote:
> On 9/23/20 3:01 AM, LIU Zhiwei wrote:
>> I am a little confused. If the instruction is not immediately preceded by a
>> MOVPRFX instruction, the addend will still be used. Is that right?
>
> If movprfx is not used, then the addend register will be the same as the
> destination register.

Got it.

Could you look again at the definitions of HELPER(gvec_udot_idx_h) and
HELPER(gvec_sdot_idx_h)? I think they are wrong there; the code sequence is
like this:

    d0 = a[i + 0]
    // dot calculation
    d[i + 0] += d0

When the addend is the destination register, there is no reason to add the
destination register twice.

Best Regards,
Zhiwei
On 9/23/20 6:48 AM, LIU Zhiwei wrote:
>> +    for (i = 0; i < opr_sz_8; i += 2) {
>>          uint64_t d0, d1;
>> -        d0 = n[i * 4 + 0] * (uint64_t)m_indexed[i * 4 + 0];
>> +        d0 = a[i + 0];
> Add once.
>> +        d0 += n[i * 4 + 0] * (uint64_t)m_indexed[i * 4 + 0];
>>          d0 += n[i * 4 + 1] * (uint64_t)m_indexed[i * 4 + 1];
>>          d0 += n[i * 4 + 2] * (uint64_t)m_indexed[i * 4 + 2];
>>          d0 += n[i * 4 + 3] * (uint64_t)m_indexed[i * 4 + 3];
>> -        d1 = n[i * 4 + 4] * (uint64_t)m_indexed[i * 4 + 0];
>> +
>> +        d1 = a[i + 1];
>> +        d1 += n[i * 4 + 4] * (uint64_t)m_indexed[i * 4 + 0];
>>          d1 += n[i * 4 + 5] * (uint64_t)m_indexed[i * 4 + 1];
>>          d1 += n[i * 4 + 6] * (uint64_t)m_indexed[i * 4 + 2];
>>          d1 += n[i * 4 + 7] * (uint64_t)m_indexed[i * 4 + 3];
>> @@ -555,7 +570,6 @@ void HELPER(gvec_udot_idx_h)(void *vd, void *vn, void
>> *vm, uint32_t desc)
>>          d[i + 0] += d0;
> Add twice.
>
> I think it is wrong here. Do you think so?

Yep.  Thanks for noticing.

r~
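For completeness, the arithmetic consequence of the leftover += is easy to
see in isolation: in gvec_sdot_idx_h/gvec_udot_idx_h the patch folds the
addend into d0/d1 but still writes the result back with +=, so the old
destination value is counted twice whenever the addend aliases the
destination. A minimal standalone C illustration (the values are
hypothetical):

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
        int64_t a   = 100;  /* addend lane; equals the old destination value
                             * as long as movprfx is not optimized */
        int64_t dot = 10;   /* freshly computed dot product for this lane */

        int64_t d0 = a + dot;      /* what the patched loop computes */

        int64_t d_buggy = a + d0;  /* leftover "d[i] += d0": a enters twice */
        int64_t d_fixed = d0;      /* intended "d[i] = d0" */

        printf("buggy = %lld, fixed = %lld\n",
               (long long)d_buggy, (long long)d_fixed);  /* 210 vs 110 */
        return 0;
    }

The correct result for the lane is 110; the unfixed code produces 210 because
the old destination value enters once through a[i] and again through the +=.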
diff --git a/target/arm/helper.h b/target/arm/helper.h
index 8294055cab..97222bd256 100644
--- a/target/arm/helper.h
+++ b/target/arm/helper.h
@@ -595,15 +595,19 @@ DEF_HELPER_FLAGS_5(sve2_sqrdmlah_d, TCG_CALL_NO_RWG,
 DEF_HELPER_FLAGS_5(sve2_sqrdmlsh_d, TCG_CALL_NO_RWG,
                    void, ptr, ptr, ptr, ptr, i32)
 
-DEF_HELPER_FLAGS_4(gvec_sdot_b, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
-DEF_HELPER_FLAGS_4(gvec_udot_b, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
-DEF_HELPER_FLAGS_4(gvec_sdot_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
-DEF_HELPER_FLAGS_4(gvec_udot_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(gvec_sdot_b, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(gvec_udot_b, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(gvec_sdot_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(gvec_udot_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32)
 
-DEF_HELPER_FLAGS_4(gvec_sdot_idx_b, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
-DEF_HELPER_FLAGS_4(gvec_udot_idx_b, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
-DEF_HELPER_FLAGS_4(gvec_sdot_idx_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
-DEF_HELPER_FLAGS_4(gvec_udot_idx_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(gvec_sdot_idx_b, TCG_CALL_NO_RWG,
+                   void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(gvec_udot_idx_b, TCG_CALL_NO_RWG,
+                   void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(gvec_sdot_idx_h, TCG_CALL_NO_RWG,
+                   void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(gvec_udot_idx_h, TCG_CALL_NO_RWG,
+                   void, ptr, ptr, ptr, ptr, i32)
 
 DEF_HELPER_FLAGS_5(gvec_fcaddh, TCG_CALL_NO_RWG,
                    void, ptr, ptr, ptr, ptr, i32)
diff --git a/target/arm/sve.decode b/target/arm/sve.decode
index 0688dae450..5815ba9b1c 100644
--- a/target/arm/sve.decode
+++ b/target/arm/sve.decode
@@ -756,12 +756,13 @@ UMIN_zzi        00100101 .. 101 011 110 ........ .....  @rdn_i8u
 MUL_zzi         00100101 .. 110 000 110 ........ .....  @rdn_i8s
 
 # SVE integer dot product (unpredicated)
-DOT_zzz         01000100 1 sz:1 0 rm:5 00000 u:1 rn:5 rd:5  ra=%reg_movprfx
+DOT_zzzz        01000100 1 sz:1 0 rm:5 00000 u:1 rn:5 rd:5 \
+                ra=%reg_movprfx
 
 # SVE integer dot product (indexed)
-DOT_zzx         01000100 101 index:2 rm:3 00000 u:1 rn:5 rd:5 \
+DOT_zzxw        01000100 101 index:2 rm:3 00000 u:1 rn:5 rd:5 \
                 sz=0 ra=%reg_movprfx
-DOT_zzx         01000100 111 index:1 rm:4 00000 u:1 rn:5 rd:5 \
+DOT_zzxw        01000100 111 index:1 rm:4 00000 u:1 rn:5 rd:5 \
                 sz=1 ra=%reg_movprfx
 
 # SVE floating-point complex add (predicated)
diff --git a/target/arm/translate-a64.c b/target/arm/translate-a64.c
index 76e54c1a4e..1a9251db67 100644
--- a/target/arm/translate-a64.c
+++ b/target/arm/translate-a64.c
@@ -692,6 +692,17 @@ static void gen_gvec_op3_qc(DisasContext *s, bool is_q, int rd, int rn,
     tcg_temp_free_ptr(qc_ptr);
 }
 
+/* Expand a 4-operand operation using an out-of-line helper. */
+static void gen_gvec_op4_ool(DisasContext *s, bool is_q, int rd, int rn,
+                             int rm, int ra, int data, gen_helper_gvec_4 *fn)
+{
+    tcg_gen_gvec_4_ool(vec_full_reg_offset(s, rd),
+                       vec_full_reg_offset(s, rn),
+                       vec_full_reg_offset(s, rm),
+                       vec_full_reg_offset(s, ra),
+                       is_q ? 16 : 8, vec_full_reg_size(s), data, fn);
+}
+
 /* Set ZF and NF based on a 64 bit result.  This is alas fiddlier
  * than the 32 bit equivalent.
  */
@@ -12202,7 +12213,7 @@ static void disas_simd_three_reg_same_extra(DisasContext *s, uint32_t insn)
         return;
 
     case 0x2: /* SDOT / UDOT */
-        gen_gvec_op3_ool(s, is_q, rd, rn, rm, 0,
+        gen_gvec_op4_ool(s, is_q, rd, rn, rm, rd, 0,
                          u ? gen_helper_gvec_udot_b : gen_helper_gvec_sdot_b);
         return;
 
@@ -13461,7 +13472,7 @@ static void disas_simd_indexed(DisasContext *s, uint32_t insn)
     switch (16 * u + opcode) {
     case 0x0e: /* SDOT */
     case 0x1e: /* UDOT */
-        gen_gvec_op3_ool(s, is_q, rd, rn, rm, index,
+        gen_gvec_op4_ool(s, is_q, rd, rn, rm, rd, index,
                          u ? gen_helper_gvec_udot_idx_b
                            : gen_helper_gvec_sdot_idx_b);
         return;
diff --git a/target/arm/translate-sve.c b/target/arm/translate-sve.c
index 5e8291e44b..66303dac54 100644
--- a/target/arm/translate-sve.c
+++ b/target/arm/translate-sve.c
@@ -3804,28 +3804,29 @@ DO_ZZI(UMIN, umin)
 
 #undef DO_ZZI
 
-static bool trans_DOT_zzz(DisasContext *s, arg_DOT_zzz *a)
+static bool trans_DOT_zzzz(DisasContext *s, arg_DOT_zzzz *a)
 {
-    static gen_helper_gvec_3 * const fns[2][2] = {
+    static gen_helper_gvec_4 * const fns[2][2] = {
         { gen_helper_gvec_sdot_b, gen_helper_gvec_sdot_h },
         { gen_helper_gvec_udot_b, gen_helper_gvec_udot_h }
     };
 
     if (sve_access_check(s)) {
-        gen_gvec_ool_zzz(s, fns[a->u][a->sz], a->rd, a->rn, a->rm, 0);
+        gen_gvec_ool_zzzz(s, fns[a->u][a->sz], a->rd, a->rn, a->rm, a->ra, 0);
     }
     return true;
 }
 
-static bool trans_DOT_zzx(DisasContext *s, arg_DOT_zzx *a)
+static bool trans_DOT_zzxw(DisasContext *s, arg_DOT_zzxw *a)
 {
-    static gen_helper_gvec_3 * const fns[2][2] = {
+    static gen_helper_gvec_4 * const fns[2][2] = {
         { gen_helper_gvec_sdot_idx_b, gen_helper_gvec_sdot_idx_h },
         { gen_helper_gvec_udot_idx_b, gen_helper_gvec_udot_idx_h }
     };
 
     if (sve_access_check(s)) {
-        gen_gvec_ool_zzz(s, fns[a->u][a->sz], a->rd, a->rn, a->rm, a->index);
+        gen_gvec_ool_zzzz(s, fns[a->u][a->sz], a->rd, a->rn, a->rm,
+                          a->ra, a->index);
     }
     return true;
 }
diff --git a/target/arm/vec_helper.c b/target/arm/vec_helper.c
index 32a4403256..d7ef31915b 100644
--- a/target/arm/vec_helper.c
+++ b/target/arm/vec_helper.c
@@ -375,71 +375,76 @@ void HELPER(sve2_sqrdmlsh_d)(void *vd, void *vn, void *vm,
  * All elements are treated equally, no matter where they are.
  */
 
-void HELPER(gvec_sdot_b)(void *vd, void *vn, void *vm, uint32_t desc)
+void HELPER(gvec_sdot_b)(void *vd, void *vn, void *vm, void *va, uint32_t desc)
 {
     intptr_t i, opr_sz = simd_oprsz(desc);
-    uint32_t *d = vd;
+    int32_t *d = vd, *a = va;
     int8_t *n = vn, *m = vm;
 
     for (i = 0; i < opr_sz / 4; ++i) {
-        d[i] += n[i * 4 + 0] * m[i * 4 + 0]
-              + n[i * 4 + 1] * m[i * 4 + 1]
-              + n[i * 4 + 2] * m[i * 4 + 2]
-              + n[i * 4 + 3] * m[i * 4 + 3];
+        d[i] = (a[i] +
+                n[i * 4 + 0] * m[i * 4 + 0] +
+                n[i * 4 + 1] * m[i * 4 + 1] +
+                n[i * 4 + 2] * m[i * 4 + 2] +
+                n[i * 4 + 3] * m[i * 4 + 3]);
     }
     clear_tail(d, opr_sz, simd_maxsz(desc));
 }
 
-void HELPER(gvec_udot_b)(void *vd, void *vn, void *vm, uint32_t desc)
+void HELPER(gvec_udot_b)(void *vd, void *vn, void *vm, void *va, uint32_t desc)
 {
     intptr_t i, opr_sz = simd_oprsz(desc);
-    uint32_t *d = vd;
+    uint32_t *d = vd, *a = va;
     uint8_t *n = vn, *m = vm;
 
     for (i = 0; i < opr_sz / 4; ++i) {
-        d[i] += n[i * 4 + 0] * m[i * 4 + 0]
-              + n[i * 4 + 1] * m[i * 4 + 1]
-              + n[i * 4 + 2] * m[i * 4 + 2]
-              + n[i * 4 + 3] * m[i * 4 + 3];
+        d[i] = (a[i] +
+                n[i * 4 + 0] * m[i * 4 + 0] +
+                n[i * 4 + 1] * m[i * 4 + 1] +
+                n[i * 4 + 2] * m[i * 4 + 2] +
+                n[i * 4 + 3] * m[i * 4 + 3]);
     }
     clear_tail(d, opr_sz, simd_maxsz(desc));
 }
 
-void HELPER(gvec_sdot_h)(void *vd, void *vn, void *vm, uint32_t desc)
+void HELPER(gvec_sdot_h)(void *vd, void *vn, void *vm, void *va, uint32_t desc)
 {
     intptr_t i, opr_sz = simd_oprsz(desc);
-    uint64_t *d = vd;
+    int64_t *d = vd, *a = va;
    int16_t *n = vn, *m = vm;
 
     for (i = 0; i < opr_sz / 8; ++i) {
-        d[i] += (int64_t)n[i * 4 + 0] * m[i * 4 + 0]
-              + (int64_t)n[i * 4 + 1] * m[i * 4 + 1]
-              + (int64_t)n[i * 4 + 2] * m[i * 4 + 2]
-              + (int64_t)n[i * 4 + 3] * m[i * 4 + 3];
+        d[i] = (a[i] +
+                (int64_t)n[i * 4 + 0] * m[i * 4 + 0] +
+                (int64_t)n[i * 4 + 1] * m[i * 4 + 1] +
+                (int64_t)n[i * 4 + 2] * m[i * 4 + 2] +
+                (int64_t)n[i * 4 + 3] * m[i * 4 + 3]);
     }
     clear_tail(d, opr_sz, simd_maxsz(desc));
 }
 
-void HELPER(gvec_udot_h)(void *vd, void *vn, void *vm, uint32_t desc)
+void HELPER(gvec_udot_h)(void *vd, void *vn, void *vm, void *va, uint32_t desc)
 {
     intptr_t i, opr_sz = simd_oprsz(desc);
-    uint64_t *d = vd;
+    uint64_t *d = vd, *a = va;
     uint16_t *n = vn, *m = vm;
 
     for (i = 0; i < opr_sz / 8; ++i) {
-        d[i] += (uint64_t)n[i * 4 + 0] * m[i * 4 + 0]
-              + (uint64_t)n[i * 4 + 1] * m[i * 4 + 1]
-              + (uint64_t)n[i * 4 + 2] * m[i * 4 + 2]
-              + (uint64_t)n[i * 4 + 3] * m[i * 4 + 3];
+        d[i] = (a[i] +
+                (uint64_t)n[i * 4 + 0] * m[i * 4 + 0] +
+                (uint64_t)n[i * 4 + 1] * m[i * 4 + 1] +
+                (uint64_t)n[i * 4 + 2] * m[i * 4 + 2] +
+                (uint64_t)n[i * 4 + 3] * m[i * 4 + 3]);
     }
     clear_tail(d, opr_sz, simd_maxsz(desc));
 }
 
-void HELPER(gvec_sdot_idx_b)(void *vd, void *vn, void *vm, uint32_t desc)
+void HELPER(gvec_sdot_idx_b)(void *vd, void *vn, void *vm,
+                             void *va, uint32_t desc)
 {
     intptr_t i, segend, opr_sz = simd_oprsz(desc), opr_sz_4 = opr_sz / 4;
     intptr_t index = simd_data(desc);
-    uint32_t *d = vd;
+    int32_t *d = vd, *a = va;
     int8_t *n = vn;
     int8_t *m_indexed = (int8_t *)vm + index * 4;
 
@@ -455,10 +460,11 @@ void HELPER(gvec_sdot_idx_b)(void *vd, void *vn, void *vm, uint32_t desc)
         int8_t m3 = m_indexed[i * 4 + 3];
 
         do {
-            d[i] += n[i * 4 + 0] * m0
-                  + n[i * 4 + 1] * m1
-                  + n[i * 4 + 2] * m2
-                  + n[i * 4 + 3] * m3;
+            d[i] = (a[i] +
+                    n[i * 4 + 0] * m0 +
+                    n[i * 4 + 1] * m1 +
+                    n[i * 4 + 2] * m2 +
+                    n[i * 4 + 3] * m3);
         } while (++i < segend);
         segend = i + 4;
     } while (i < opr_sz_4);
@@ -466,11 +472,12 @@ void HELPER(gvec_sdot_idx_b)(void *vd, void *vn, void *vm, uint32_t desc)
     clear_tail(d, opr_sz, simd_maxsz(desc));
 }
 
-void HELPER(gvec_udot_idx_b)(void *vd, void *vn, void *vm, uint32_t desc)
+void HELPER(gvec_udot_idx_b)(void *vd, void *vn, void *vm,
+                             void *va, uint32_t desc)
 {
     intptr_t i, segend, opr_sz = simd_oprsz(desc), opr_sz_4 = opr_sz / 4;
     intptr_t index = simd_data(desc);
-    uint32_t *d = vd;
+    uint32_t *d = vd, *a = va;
     uint8_t *n = vn;
     uint8_t *m_indexed = (uint8_t *)vm + index * 4;
 
@@ -486,10 +493,11 @@ void HELPER(gvec_udot_idx_b)(void *vd, void *vn, void *vm, uint32_t desc)
         uint8_t m3 = m_indexed[i * 4 + 3];
 
         do {
-            d[i] += n[i * 4 + 0] * m0
-                  + n[i * 4 + 1] * m1
-                  + n[i * 4 + 2] * m2
-                  + n[i * 4 + 3] * m3;
+            d[i] = (a[i] +
+                    n[i * 4 + 0] * m0 +
+                    n[i * 4 + 1] * m1 +
+                    n[i * 4 + 2] * m2 +
+                    n[i * 4 + 3] * m3);
         } while (++i < segend);
         segend = i + 4;
     } while (i < opr_sz_4);
@@ -497,11 +505,12 @@ void HELPER(gvec_udot_idx_b)(void *vd, void *vn, void *vm, uint32_t desc)
     clear_tail(d, opr_sz, simd_maxsz(desc));
 }
 
-void HELPER(gvec_sdot_idx_h)(void *vd, void *vn, void *vm, uint32_t desc)
+void HELPER(gvec_sdot_idx_h)(void *vd, void *vn, void *vm,
+                             void *va, uint32_t desc)
 {
     intptr_t i, opr_sz = simd_oprsz(desc), opr_sz_8 = opr_sz / 8;
     intptr_t index = simd_data(desc);
-    uint64_t *d = vd;
+    int64_t *d = vd, *a = va;
     int16_t *n = vn;
     int16_t *m_indexed = (int16_t *)vm + index * 4;
 
@@ -509,14 +518,17 @@ void HELPER(gvec_sdot_idx_h)(void *vd, void *vn, void *vm, uint32_t desc)
      * Process the entire segment all at once, writing back the results
     * only after we've consumed all of the inputs.
      */
-    for (i = 0; i < opr_sz_8 ; i += 2) {
-        uint64_t d0, d1;
+    for (i = 0; i < opr_sz_8; i += 2) {
+        int64_t d0, d1;
 
-        d0 = n[i * 4 + 0] * (int64_t)m_indexed[i * 4 + 0];
+        d0 = a[i + 0];
+        d0 += n[i * 4 + 0] * (int64_t)m_indexed[i * 4 + 0];
         d0 += n[i * 4 + 1] * (int64_t)m_indexed[i * 4 + 1];
         d0 += n[i * 4 + 2] * (int64_t)m_indexed[i * 4 + 2];
         d0 += n[i * 4 + 3] * (int64_t)m_indexed[i * 4 + 3];
-        d1 = n[i * 4 + 4] * (int64_t)m_indexed[i * 4 + 0];
+
+        d1 = a[i + 1];
+        d1 += n[i * 4 + 4] * (int64_t)m_indexed[i * 4 + 0];
         d1 += n[i * 4 + 5] * (int64_t)m_indexed[i * 4 + 1];
         d1 += n[i * 4 + 6] * (int64_t)m_indexed[i * 4 + 2];
         d1 += n[i * 4 + 7] * (int64_t)m_indexed[i * 4 + 3];
@@ -524,15 +536,15 @@ void HELPER(gvec_sdot_idx_h)(void *vd, void *vn, void *vm, uint32_t desc)
         d[i + 0] += d0;
         d[i + 1] += d1;
     }
-
     clear_tail(d, opr_sz, simd_maxsz(desc));
 }
 
-void HELPER(gvec_udot_idx_h)(void *vd, void *vn, void *vm, uint32_t desc)
+void HELPER(gvec_udot_idx_h)(void *vd, void *vn, void *vm,
+                             void *va, uint32_t desc)
 {
     intptr_t i, opr_sz = simd_oprsz(desc), opr_sz_8 = opr_sz / 8;
     intptr_t index = simd_data(desc);
-    uint64_t *d = vd;
+    uint64_t *d = vd, *a = va;
     uint16_t *n = vn;
     uint16_t *m_indexed = (uint16_t *)vm + index * 4;
 
@@ -540,14 +552,17 @@ void HELPER(gvec_udot_idx_h)(void *vd, void *vn, void *vm, uint32_t desc)
      * Process the entire segment all at once, writing back the results
      * only after we've consumed all of the inputs.
      */
-    for (i = 0; i < opr_sz_8 ; i += 2) {
+    for (i = 0; i < opr_sz_8; i += 2) {
         uint64_t d0, d1;
 
-        d0 = n[i * 4 + 0] * (uint64_t)m_indexed[i * 4 + 0];
+        d0 = a[i + 0];
+        d0 += n[i * 4 + 0] * (uint64_t)m_indexed[i * 4 + 0];
         d0 += n[i * 4 + 1] * (uint64_t)m_indexed[i * 4 + 1];
         d0 += n[i * 4 + 2] * (uint64_t)m_indexed[i * 4 + 2];
         d0 += n[i * 4 + 3] * (uint64_t)m_indexed[i * 4 + 3];
-        d1 = n[i * 4 + 4] * (uint64_t)m_indexed[i * 4 + 0];
+
+        d1 = a[i + 1];
+        d1 += n[i * 4 + 4] * (uint64_t)m_indexed[i * 4 + 0];
         d1 += n[i * 4 + 5] * (uint64_t)m_indexed[i * 4 + 1];
         d1 += n[i * 4 + 6] * (uint64_t)m_indexed[i * 4 + 2];
         d1 += n[i * 4 + 7] * (uint64_t)m_indexed[i * 4 + 3];
@@ -555,7 +570,6 @@ void HELPER(gvec_udot_idx_h)(void *vd, void *vn, void *vm, uint32_t desc)
         d[i + 0] += d0;
         d[i + 1] += d1;
     }
-
     clear_tail(d, opr_sz, simd_maxsz(desc));
 }
 
diff --git a/target/arm/translate-neon.c.inc b/target/arm/translate-neon.c.inc
index 4d1a292981..7efe3d9556 100644
--- a/target/arm/translate-neon.c.inc
+++ b/target/arm/translate-neon.c.inc
@@ -240,7 +240,7 @@ static bool trans_VCADD(DisasContext *s, arg_VCADD *a)
 static bool trans_VDOT(DisasContext *s, arg_VDOT *a)
 {
     int opr_sz;
-    gen_helper_gvec_3 *fn_gvec;
+    gen_helper_gvec_4 *fn_gvec;
 
     if (!dc_isar_feature(aa32_dp, s)) {
         return false;
@@ -262,9 +262,10 @@ static bool trans_VDOT(DisasContext *s, arg_VDOT *a)
 
     opr_sz = (1 + a->q) * 8;
     fn_gvec = a->u ? gen_helper_gvec_udot_b : gen_helper_gvec_sdot_b;
-    tcg_gen_gvec_3_ool(vfp_reg_offset(1, a->vd),
+    tcg_gen_gvec_4_ool(vfp_reg_offset(1, a->vd),
                       vfp_reg_offset(1, a->vn),
                        vfp_reg_offset(1, a->vm),
+                       vfp_reg_offset(1, a->vd),
                        opr_sz, opr_sz, 0, fn_gvec);
     return true;
 }
@@ -342,7 +343,7 @@ static bool trans_VCMLA_scalar(DisasContext *s, arg_VCMLA_scalar *a)
 
 static bool trans_VDOT_scalar(DisasContext *s, arg_VDOT_scalar *a)
 {
-    gen_helper_gvec_3 *fn_gvec;
+    gen_helper_gvec_4 *fn_gvec;
     int opr_sz;
     TCGv_ptr fpst;
 
@@ -367,9 +368,10 @@ static bool trans_VDOT_scalar(DisasContext *s, arg_VDOT_scalar *a)
     fn_gvec = a->u ? gen_helper_gvec_udot_idx_b : gen_helper_gvec_sdot_idx_b;
     opr_sz = (1 + a->q) * 8;
     fpst = fpstatus_ptr(FPST_STD);
-    tcg_gen_gvec_3_ool(vfp_reg_offset(1, a->vd),
+    tcg_gen_gvec_4_ool(vfp_reg_offset(1, a->vd),
                        vfp_reg_offset(1, a->vn),
                        vfp_reg_offset(1, a->rm),
+                       vfp_reg_offset(1, a->vd),
                        opr_sz, opr_sz, a->index, fn_gvec);
     tcg_temp_free_ptr(fpst);
     return true;
For SVE, we potentially have a 4th argument coming from the
movprfx instruction.  Currently we do not optimize movprfx,
so the problem is not visible.

Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 target/arm/helper.h             |  20 +++---
 target/arm/sve.decode           |   7 +-
 target/arm/translate-a64.c      |  15 ++++-
 target/arm/translate-sve.c      |  13 ++--
 target/arm/vec_helper.c         | 112 ++++++++++++++++++--------------
 target/arm/translate-neon.c.inc |  10 +--
 6 files changed, 105 insertions(+), 72 deletions(-)