Message ID | 20171004184325.24157-7-richard.henderson@linaro.org |
---|---|
State | New |
Headers | show |
Series | ARM v8.1 simd + v8.3 complex insns | expand |
Richard Henderson <richard.henderson@linaro.org> writes: > Signed-off-by: Richard Henderson <richard.henderson@linaro.org> > --- > target/arm/translate.c | 83 ++++++++++++++++++++++++++++++++++++++------------ > 1 file changed, 64 insertions(+), 19 deletions(-) > > diff --git a/target/arm/translate.c b/target/arm/translate.c > index ab1a12a1b8..0cd58710b3 100644 > --- a/target/arm/translate.c > +++ b/target/arm/translate.c > @@ -25,6 +25,7 @@ > #include "disas/disas.h" > #include "exec/exec-all.h" > #include "tcg-op.h" > +#include "tcg-op-gvec.h" > #include "qemu/log.h" > #include "qemu/bitops.h" > #include "arm_ldst.h" > @@ -5334,9 +5335,9 @@ static void gen_neon_narrow_op(int op, int u, int size, > #define NEON_3R_VPMAX 20 > #define NEON_3R_VPMIN 21 > #define NEON_3R_VQDMULH_VQRDMULH 22 > -#define NEON_3R_VPADD 23 > +#define NEON_3R_VPADD_VQRDMLAH 23 > #define NEON_3R_SHA 24 /* SHA1C,SHA1P,SHA1M,SHA1SU0,SHA256H{2},SHA256SU1 */ > -#define NEON_3R_VFM 25 /* VFMA, VFMS : float fused multiply-add */ > +#define NEON_3R_VFM_VQRDMLSH 25 /* VFMA, VFMS : float fused multiply-add */ > #define NEON_3R_FLOAT_ARITH 26 /* float VADD, VSUB, VPADD, VABD */ > #define NEON_3R_FLOAT_MULTIPLY 27 /* float VMLA, VMLS, VMUL */ > #define NEON_3R_FLOAT_CMP 28 /* float VCEQ, VCGE, VCGT */ > @@ -5368,9 +5369,9 @@ static const uint8_t neon_3r_sizes[] = { > [NEON_3R_VPMAX] = 0x7, > [NEON_3R_VPMIN] = 0x7, > [NEON_3R_VQDMULH_VQRDMULH] = 0x6, > - [NEON_3R_VPADD] = 0x7, > + [NEON_3R_VPADD_VQRDMLAH] = 0x7, > [NEON_3R_SHA] = 0xf, /* size field encodes op type */ > - [NEON_3R_VFM] = 0x5, /* size bit 1 encodes op */ > + [NEON_3R_VFM_VQRDMLSH] = 0x7, /* For VFM, size bit 1 encodes op */ > [NEON_3R_FLOAT_ARITH] = 0x5, /* size bit 1 encodes op */ > [NEON_3R_FLOAT_MULTIPLY] = 0x5, /* size bit 1 encodes op */ > [NEON_3R_FLOAT_CMP] = 0x5, /* size bit 1 encodes op */ > @@ -5556,6 +5557,7 @@ static const uint8_t neon_2rm_sizes[] = { > > static int disas_neon_data_insn(DisasContext *s, uint32_t insn) > { > + 
void (*fn_gvec_ptr)(TCGv_ptr, TCGv_ptr, TCGv_ptr, TCGv_ptr, TCGv_i32); > int op; > int q; > int rd, rn, rm; > @@ -5600,12 +5602,12 @@ static int disas_neon_data_insn(DisasContext *s, uint32_t insn) > if (q && ((rd | rn | rm) & 1)) { > return 1; > } > - /* > - * The SHA-1/SHA-256 3-register instructions require special treatment > - * here, as their size field is overloaded as an op type selector, and > - * they all consume their input in a single pass. > - */ > - if (op == NEON_3R_SHA) { > + switch (op) { > + case NEON_3R_SHA: > + /* The SHA-1/SHA-256 3-register instructions require special > + * treatment here, as their size field is overloaded as an > + * op type selector, and they all consume their input in a > + * single pass. */ > if (!q) { > return 1; > } > @@ -5642,6 +5644,53 @@ static int disas_neon_data_insn(DisasContext *s, uint32_t insn) > tcg_temp_free_i32(tmp2); > tcg_temp_free_i32(tmp3); > return 0; > + > + case NEON_3R_VPADD_VQRDMLAH: > + if (!u) { > + break; /* VPADD */ > + } > + /* VQRDMLAH */ > + switch (size) { > + case 1: > + fn_gvec_ptr = gen_helper_gvec_qrdmlah_s16; > + break; > + case 2: > + fn_gvec_ptr = gen_helper_gvec_qrdmlah_s32; > + break; > + default: > + return 1; > + } > + do_vqrdmlx: > + if (arm_dc_feature(s, ARM_FEATURE_V8_1_SIMD)) { > + int opr_sz = (1 + q) * 8; > + tcg_gen_gvec_3_ptr(vfp_reg_offset(1, rd), > + vfp_reg_offset(1, rn), > + vfp_reg_offset(1, rm), cpu_env, > + opr_sz, opr_sz, 0, fn_gvec_ptr); > + return 0; > + } > + return 1; > + > + case NEON_3R_VFM_VQRDMLSH: > + if (!u) { > + /* VFM, VFMS */ > + if ((5 & (1 << size)) == 0) { > + return 1; > + } > + break; > + } > + /* VQRDMLSH */ > + switch (size) { > + case 1: > + fn_gvec_ptr = gen_helper_gvec_qrdmlsh_s16; > + break; > + case 2: > + fn_gvec_ptr = gen_helper_gvec_qrdmlsh_s32; > + break; > + default: > + return 1; > + } > + goto do_vqrdmlx; Could we not take the opportunity to re-factor out the common bit rather than make this mega function even more byzantine? 
> } > if (size == 3 && op != NEON_3R_LOGIC) { > /* 64-bit element instructions. */ > @@ -5727,11 +5776,7 @@ static int disas_neon_data_insn(DisasContext *s, uint32_t insn) > rm = rtmp; > } > break; > - case NEON_3R_VPADD: > - if (u) { > - return 1; > - } > - /* Fall through */ > + case NEON_3R_VPADD_VQRDMLAH: > case NEON_3R_VPMAX: > case NEON_3R_VPMIN: > pairwise = 1; > @@ -5765,8 +5810,8 @@ static int disas_neon_data_insn(DisasContext *s, uint32_t insn) > return 1; > } > break; > - case NEON_3R_VFM: > - if (!arm_dc_feature(s, ARM_FEATURE_VFP4) || u) { > + case NEON_3R_VFM_VQRDMLSH: > + if (!arm_dc_feature(s, ARM_FEATURE_VFP4)) { > return 1; > } > break; > @@ -5963,7 +6008,7 @@ static int disas_neon_data_insn(DisasContext *s, uint32_t insn) > } > } > break; > - case NEON_3R_VPADD: > + case NEON_3R_VPADD_VQRDMLAH: > switch (size) { > case 0: gen_helper_neon_padd_u8(tmp, tmp, tmp2); break; > case 1: gen_helper_neon_padd_u16(tmp, tmp, tmp2); break; > @@ -6062,7 +6107,7 @@ static int disas_neon_data_insn(DisasContext *s, uint32_t insn) > } > } > break; > - case NEON_3R_VFM: > + case NEON_3R_VFM_VQRDMLSH: > { > /* VFMA, VFMS: fused multiply-add */ > TCGv_ptr fpstatus = get_fpstatus_ptr(1); -- Alex Bennée
On 11/13/2017 05:55 PM, Alex Bennée wrote: >> + case NEON_3R_VFM_VQRDMLSH: >> + if (!u) { >> + /* VFM, VFMS */ >> + if ((5 & (1 << size)) == 0) { >> + return 1; >> + } >> + break; >> + } >> + /* VQRDMLSH */ >> + switch (size) { >> + case 1: >> + fn_gvec_ptr = gen_helper_gvec_qrdmlsh_s16; >> + break; >> + case 2: >> + fn_gvec_ptr = gen_helper_gvec_qrdmlsh_s32; >> + break; >> + default: >> + return 1; >> + } >> + goto do_vqrdmlx; > Could we not take the opportunity to re-factor out the common bit rather > than make this mega function even more byzantine? What, specifically, did you have in mind? r~
Richard Henderson <richard.henderson@linaro.org> writes: > On 11/13/2017 05:55 PM, Alex Bennée wrote: >>> + case NEON_3R_VFM_VQRDMLSH: >>> + if (!u) { >>> + /* VFM, VFMS */ >>> + if ((5 & (1 << size)) == 0) { >>> + return 1; >>> + } >>> + break; >>> + } >>> + /* VQRDMLSH */ >>> + switch (size) { >>> + case 1: >>> + fn_gvec_ptr = gen_helper_gvec_qrdmlsh_s16; >>> + break; >>> + case 2: >>> + fn_gvec_ptr = gen_helper_gvec_qrdmlsh_s32; >>> + break; >>> + default: >>> + return 1; >>> + } >>> + goto do_vqrdmlx; >> Could we not take the opportunity to re-factor out the common bit rather >> than make this mega > > What, specifically, did you have in mind? Something like: translate: use helper to avoid goto shenanigans 1 file changed, 18 insertions(+), 17 deletions(-) target/arm/translate.c | 35 ++++++++++++++++++----------------- modified target/arm/translate.c @@ -5576,6 +5576,20 @@ static const uint8_t neon_2rm_sizes[] = { [NEON_2RM_VCVT_UF] = 0x4, }; +/* expand v8.1 simd helper */ +static int do_qrdml(DisasContext *s, gen_helper_gvec_3_ptr *fn, int q, int rd, int rn, int rm) +{ + if (arm_dc_feature(s, ARM_FEATURE_V8_1_SIMD)) { + int opr_sz = (1 + q) * 8; + tcg_gen_gvec_3_ptr(vfp_reg_offset(1, rd), + vfp_reg_offset(1, rn), + vfp_reg_offset(1, rm), cpu_env, + opr_sz, opr_sz, 0, fn); + return 0; + } + return 1; +} + /* Translate a NEON data processing instruction. Return nonzero if the instruction is invalid. We process data in a mixture of 32-bit and 64-bit chunks. 
@@ -5583,7 +5597,6 @@ static const uint8_t neon_2rm_sizes[] = { static int disas_neon_data_insn(DisasContext *s, uint32_t insn) { - void (*fn_gvec_ptr)(TCGv_ptr, TCGv_ptr, TCGv_ptr, TCGv_ptr, TCGv_i32); int op; int q; int rd, rn, rm; @@ -5678,24 +5691,13 @@ static int disas_neon_data_insn(DisasContext *s, uint32_t insn) /* VQRDMLAH */ switch (size) { case 1: - fn_gvec_ptr = gen_helper_gvec_qrdmlah_s16; - break; + return do_qrdml(s, gen_helper_gvec_qrdmlah_s16, q, rd, rn, rm); case 2: - fn_gvec_ptr = gen_helper_gvec_qrdmlah_s32; + return do_qrdml(s, gen_helper_gvec_qrdmlah_s32, q, rd, rn, rm); break; default: return 1; } - do_vqrdmlx: - if (arm_dc_feature(s, ARM_FEATURE_V8_1_SIMD)) { - int opr_sz = (1 + q) * 8; - tcg_gen_gvec_3_ptr(vfp_reg_offset(1, rd), - vfp_reg_offset(1, rn), - vfp_reg_offset(1, rm), cpu_env, - opr_sz, opr_sz, 0, fn_gvec_ptr); - return 0; - } - return 1; case NEON_3R_VFM_VQRDMLSH: if (!u) { @@ -5708,15 +5710,14 @@ static int disas_neon_data_insn(DisasContext *s, uint32_t insn) /* VQRDMLSH */ switch (size) { case 1: - fn_gvec_ptr = gen_helper_gvec_qrdmlsh_s16; + return do_qrdml(s, gen_helper_gvec_qrdmlsh_s16, q, rd, rn, rm); break; case 2: - fn_gvec_ptr = gen_helper_gvec_qrdmlsh_s32; + return do_qrdml(s, gen_helper_gvec_qrdmlsh_s32, q, rd, rn, rm); break; default: return 1; } - goto do_vqrdmlx; } if (size == 3 && op != NEON_3R_LOGIC) { /* 64-bit element instructions. */ -- Alex Bennée
On 11/14/2017 11:06 AM, Alex Bennée wrote: > > Richard Henderson <richard.henderson@linaro.org> writes: > >> On 11/13/2017 05:55 PM, Alex Bennée wrote: >>>> + case NEON_3R_VFM_VQRDMLSH: >>>> + if (!u) { >>>> + /* VFM, VFMS */ >>>> + if ((5 & (1 << size)) == 0) { >>>> + return 1; >>>> + } >>>> + break; >>>> + } >>>> + /* VQRDMLSH */ >>>> + switch (size) { >>>> + case 1: >>>> + fn_gvec_ptr = gen_helper_gvec_qrdmlsh_s16; >>>> + break; >>>> + case 2: >>>> + fn_gvec_ptr = gen_helper_gvec_qrdmlsh_s32; >>>> + break; >>>> + default: >>>> + return 1; >>>> + } >>>> + goto do_vqrdmlx; >>> Could we not take the opportunity to re-factor out the common bit rather >>> than make this mega >> >> What, specifically, did you have in mind? > > Something like: > > translate: use helper to avoid goto shenanigans Thanks, this certainly looks better. r~ > > 1 file changed, 18 insertions(+), 17 deletions(-) > target/arm/translate.c | 35 ++++++++++++++++++----------------- > > modified target/arm/translate.c > @@ -5576,6 +5576,20 @@ static const uint8_t neon_2rm_sizes[] = { > [NEON_2RM_VCVT_UF] = 0x4, > }; > > +/* expand v8.1 simd helper */ > +static int do_qrdml(DisasContext *s, gen_helper_gvec_3_ptr *fn, int q, int rd, int rn, int rm) > +{ > + if (arm_dc_feature(s, ARM_FEATURE_V8_1_SIMD)) { > + int opr_sz = (1 + q) * 8; > + tcg_gen_gvec_3_ptr(vfp_reg_offset(1, rd), > + vfp_reg_offset(1, rn), > + vfp_reg_offset(1, rm), cpu_env, > + opr_sz, opr_sz, 0, fn); > + return 0; > + } > + return 1; > +} > + > /* Translate a NEON data processing instruction. Return nonzero if the > instruction is invalid. > We process data in a mixture of 32-bit and 64-bit chunks. 
> @@ -5583,7 +5597,6 @@ static const uint8_t neon_2rm_sizes[] = { > > static int disas_neon_data_insn(DisasContext *s, uint32_t insn) > { > - void (*fn_gvec_ptr)(TCGv_ptr, TCGv_ptr, TCGv_ptr, TCGv_ptr, TCGv_i32); > int op; > int q; > int rd, rn, rm; > @@ -5678,24 +5691,13 @@ static int disas_neon_data_insn(DisasContext *s, uint32_t insn) > /* VQRDMLAH */ > switch (size) { > case 1: > - fn_gvec_ptr = gen_helper_gvec_qrdmlah_s16; > - break; > + return do_qrdml(s, gen_helper_gvec_qrdmlah_s16, q, rd, rn, rm); > case 2: > - fn_gvec_ptr = gen_helper_gvec_qrdmlah_s32; > + return do_qrdml(s, gen_helper_gvec_qrdmlah_s32, q, rd, rn, rm); > break; > default: > return 1; > } > - do_vqrdmlx: > - if (arm_dc_feature(s, ARM_FEATURE_V8_1_SIMD)) { > - int opr_sz = (1 + q) * 8; > - tcg_gen_gvec_3_ptr(vfp_reg_offset(1, rd), > - vfp_reg_offset(1, rn), > - vfp_reg_offset(1, rm), cpu_env, > - opr_sz, opr_sz, 0, fn_gvec_ptr); > - return 0; > - } > - return 1; > > case NEON_3R_VFM_VQRDMLSH: > if (!u) { > @@ -5708,15 +5710,14 @@ static int disas_neon_data_insn(DisasContext *s, uint32_t insn) > /* VQRDMLSH */ > switch (size) { > case 1: > - fn_gvec_ptr = gen_helper_gvec_qrdmlsh_s16; > + return do_qrdml(s, gen_helper_gvec_qrdmlsh_s16, q, rd, rn, rm); > break; > case 2: > - fn_gvec_ptr = gen_helper_gvec_qrdmlsh_s32; > + return do_qrdml(s, gen_helper_gvec_qrdmlsh_s32, q, rd, rn, rm); > break; > default: > return 1; > } > - goto do_vqrdmlx; > } > if (size == 3 && op != NEON_3R_LOGIC) { > /* 64-bit element instructions. */ > > > -- > Alex Bennée >
diff --git a/target/arm/translate.c b/target/arm/translate.c index ab1a12a1b8..0cd58710b3 100644 --- a/target/arm/translate.c +++ b/target/arm/translate.c @@ -25,6 +25,7 @@ #include "disas/disas.h" #include "exec/exec-all.h" #include "tcg-op.h" +#include "tcg-op-gvec.h" #include "qemu/log.h" #include "qemu/bitops.h" #include "arm_ldst.h" @@ -5334,9 +5335,9 @@ static void gen_neon_narrow_op(int op, int u, int size, #define NEON_3R_VPMAX 20 #define NEON_3R_VPMIN 21 #define NEON_3R_VQDMULH_VQRDMULH 22 -#define NEON_3R_VPADD 23 +#define NEON_3R_VPADD_VQRDMLAH 23 #define NEON_3R_SHA 24 /* SHA1C,SHA1P,SHA1M,SHA1SU0,SHA256H{2},SHA256SU1 */ -#define NEON_3R_VFM 25 /* VFMA, VFMS : float fused multiply-add */ +#define NEON_3R_VFM_VQRDMLSH 25 /* VFMA, VFMS : float fused multiply-add */ #define NEON_3R_FLOAT_ARITH 26 /* float VADD, VSUB, VPADD, VABD */ #define NEON_3R_FLOAT_MULTIPLY 27 /* float VMLA, VMLS, VMUL */ #define NEON_3R_FLOAT_CMP 28 /* float VCEQ, VCGE, VCGT */ @@ -5368,9 +5369,9 @@ static const uint8_t neon_3r_sizes[] = { [NEON_3R_VPMAX] = 0x7, [NEON_3R_VPMIN] = 0x7, [NEON_3R_VQDMULH_VQRDMULH] = 0x6, - [NEON_3R_VPADD] = 0x7, + [NEON_3R_VPADD_VQRDMLAH] = 0x7, [NEON_3R_SHA] = 0xf, /* size field encodes op type */ - [NEON_3R_VFM] = 0x5, /* size bit 1 encodes op */ + [NEON_3R_VFM_VQRDMLSH] = 0x7, /* For VFM, size bit 1 encodes op */ [NEON_3R_FLOAT_ARITH] = 0x5, /* size bit 1 encodes op */ [NEON_3R_FLOAT_MULTIPLY] = 0x5, /* size bit 1 encodes op */ [NEON_3R_FLOAT_CMP] = 0x5, /* size bit 1 encodes op */ @@ -5556,6 +5557,7 @@ static const uint8_t neon_2rm_sizes[] = { static int disas_neon_data_insn(DisasContext *s, uint32_t insn) { + void (*fn_gvec_ptr)(TCGv_ptr, TCGv_ptr, TCGv_ptr, TCGv_ptr, TCGv_i32); int op; int q; int rd, rn, rm; @@ -5600,12 +5602,12 @@ static int disas_neon_data_insn(DisasContext *s, uint32_t insn) if (q && ((rd | rn | rm) & 1)) { return 1; } - /* - * The SHA-1/SHA-256 3-register instructions require special treatment - * here, as their size field is 
overloaded as an op type selector, and - * they all consume their input in a single pass. - */ - if (op == NEON_3R_SHA) { + switch (op) { + case NEON_3R_SHA: + /* The SHA-1/SHA-256 3-register instructions require special + * treatment here, as their size field is overloaded as an + * op type selector, and they all consume their input in a + * single pass. */ if (!q) { return 1; } @@ -5642,6 +5644,53 @@ static int disas_neon_data_insn(DisasContext *s, uint32_t insn) tcg_temp_free_i32(tmp2); tcg_temp_free_i32(tmp3); return 0; + + case NEON_3R_VPADD_VQRDMLAH: + if (!u) { + break; /* VPADD */ + } + /* VQRDMLAH */ + switch (size) { + case 1: + fn_gvec_ptr = gen_helper_gvec_qrdmlah_s16; + break; + case 2: + fn_gvec_ptr = gen_helper_gvec_qrdmlah_s32; + break; + default: + return 1; + } + do_vqrdmlx: + if (arm_dc_feature(s, ARM_FEATURE_V8_1_SIMD)) { + int opr_sz = (1 + q) * 8; + tcg_gen_gvec_3_ptr(vfp_reg_offset(1, rd), + vfp_reg_offset(1, rn), + vfp_reg_offset(1, rm), cpu_env, + opr_sz, opr_sz, 0, fn_gvec_ptr); + return 0; + } + return 1; + + case NEON_3R_VFM_VQRDMLSH: + if (!u) { + /* VFM, VFMS */ + if ((5 & (1 << size)) == 0) { + return 1; + } + break; + } + /* VQRDMLSH */ + switch (size) { + case 1: + fn_gvec_ptr = gen_helper_gvec_qrdmlsh_s16; + break; + case 2: + fn_gvec_ptr = gen_helper_gvec_qrdmlsh_s32; + break; + default: + return 1; + } + goto do_vqrdmlx; } if (size == 3 && op != NEON_3R_LOGIC) { /* 64-bit element instructions. 
*/ @@ -5727,11 +5776,7 @@ static int disas_neon_data_insn(DisasContext *s, uint32_t insn) rm = rtmp; } break; - case NEON_3R_VPADD: - if (u) { - return 1; - } - /* Fall through */ + case NEON_3R_VPADD_VQRDMLAH: case NEON_3R_VPMAX: case NEON_3R_VPMIN: pairwise = 1; @@ -5765,8 +5810,8 @@ static int disas_neon_data_insn(DisasContext *s, uint32_t insn) return 1; } break; - case NEON_3R_VFM: - if (!arm_dc_feature(s, ARM_FEATURE_VFP4) || u) { + case NEON_3R_VFM_VQRDMLSH: + if (!arm_dc_feature(s, ARM_FEATURE_VFP4)) { return 1; } break; @@ -5963,7 +6008,7 @@ static int disas_neon_data_insn(DisasContext *s, uint32_t insn) } } break; - case NEON_3R_VPADD: + case NEON_3R_VPADD_VQRDMLAH: switch (size) { case 0: gen_helper_neon_padd_u8(tmp, tmp, tmp2); break; case 1: gen_helper_neon_padd_u16(tmp, tmp, tmp2); break; @@ -6062,7 +6107,7 @@ static int disas_neon_data_insn(DisasContext *s, uint32_t insn) } } break; - case NEON_3R_VFM: + case NEON_3R_VFM_VQRDMLSH: { /* VFMA, VFMS: fused multiply-add */ TCGv_ptr fpstatus = get_fpstatus_ptr(1);
Signed-off-by: Richard Henderson <richard.henderson@linaro.org> --- target/arm/translate.c | 83 ++++++++++++++++++++++++++++++++++++++------------ 1 file changed, 64 insertions(+), 19 deletions(-) -- 2.13.6