Message ID | 20211218194250.247633-12-richard.henderson@linaro.org |
---|---|
State | Superseded |
Headers | show |
Series | tcg: vector improvements | expand |
Richard Henderson <richard.henderson@linaro.org> writes: > AVX512VL has VPROLD and VPROLQ, layered onto the same > opcode as PSHIFTD, but requires EVEX encoding and W. > > Signed-off-by: Richard Henderson <richard.henderson@linaro.org> > --- > tcg/i386/tcg-target.h | 2 +- > tcg/i386/tcg-target.c.inc | 15 +++++++++++++-- > 2 files changed, 14 insertions(+), 3 deletions(-) > > diff --git a/tcg/i386/tcg-target.h b/tcg/i386/tcg-target.h > index 12d098ad6c..38c09fd66c 100644 > --- a/tcg/i386/tcg-target.h > +++ b/tcg/i386/tcg-target.h > @@ -195,7 +195,7 @@ extern bool have_movbe; > #define TCG_TARGET_HAS_not_vec 0 > #define TCG_TARGET_HAS_neg_vec 0 > #define TCG_TARGET_HAS_abs_vec 1 > -#define TCG_TARGET_HAS_roti_vec 0 > +#define TCG_TARGET_HAS_roti_vec have_avx512vl > #define TCG_TARGET_HAS_rots_vec 0 > #define TCG_TARGET_HAS_rotv_vec 0 > #define TCG_TARGET_HAS_shi_vec 1 > diff --git a/tcg/i386/tcg-target.c.inc b/tcg/i386/tcg-target.c.inc > index c4e6f2e5ea..5ab7c4c0fa 100644 > --- a/tcg/i386/tcg-target.c.inc > +++ b/tcg/i386/tcg-target.c.inc > @@ -361,7 +361,7 @@ static bool tcg_target_const_match(int64_t val, TCGType type, int ct) > #define OPC_PSHUFLW (0x70 | P_EXT | P_SIMDF2) > #define OPC_PSHUFHW (0x70 | P_EXT | P_SIMDF3) > #define OPC_PSHIFTW_Ib (0x71 | P_EXT | P_DATA16) /* /2 /6 /4 */ > -#define OPC_PSHIFTD_Ib (0x72 | P_EXT | P_DATA16) /* /2 /6 /4 */ > +#define OPC_PSHIFTD_Ib (0x72 | P_EXT | P_DATA16) /* /1 /2 /6 /4 */ > #define OPC_PSHIFTQ_Ib (0x73 | P_EXT | P_DATA16) /* /2 /6 /4 */ > #define OPC_PSLLW (0xf1 | P_EXT | P_DATA16) > #define OPC_PSLLD (0xf2 | P_EXT | P_DATA16) > @@ -2906,6 +2906,14 @@ static void tcg_out_vec_op(TCGContext *s, TCGOpcode opc, > insn |= P_VEXW | P_EVEX; > } > sub = 4; > + goto gen_shift; > + case INDEX_op_rotli_vec: > + insn = OPC_PSHIFTD_Ib | P_EVEX; /* VPROL[DQ] */ > + if (vece == MO_64) { > + insn |= P_VEXW; > + } > + sub = 1; > + goto gen_shift; This could just be a /* fall-through */ although given the large amount of gotos the switch statement is gathering I'm not sure it makes too much difference. Is there any reason why gen_shift couldn't be pushed into a helper function so we just had: static void tcg_out_vec_shift(s, vece, insn, sub, a0, a1, a2) { tcg_debug_assert(vece != MO_8); if (type == TCG_TYPE_V256) { insn |= P_VEXL; } tcg_out_vex_modrm(s, insn, sub, a0, a1); tcg_out8(s, a2); } ... case INDEX_op_rotli_vec: insn = OPC_PSHIFTD_Ib | P_EVEX; /* VPROL[DQ] */ if (vece == MO_64) { insn |= P_VEXW; } tcg_out_vec_shift(s, vece, insn, 1, a0, a1, a2); break; Surely the compiler would inline if needed (and even if it didn't it the code generation that critical we care about a few cycles)?
On 2/3/22 01:05, Alex Bennée wrote: > Is there any reason why gen_shift couldn't be pushed into a helper > function so we just had: > > static void tcg_out_vec_shift(s, vece, insn, sub, a0, a1, a2) { > tcg_debug_assert(vece != MO_8); > if (type == TCG_TYPE_V256) { > insn |= P_VEXL; > } > tcg_out_vex_modrm(s, insn, sub, a0, a1); > tcg_out8(s, a2); > } > > ... > > case INDEX_op_rotli_vec: > insn = OPC_PSHIFTD_Ib | P_EVEX; /* VPROL[DQ] */ > if (vece == MO_64) { > insn |= P_VEXW; > } > tcg_out_vec_shift(s, vece, insn, 1, a0, a1, a2); > break; > > Surely the compiler would inline if needed (and even if it didn't it the > code generation that critical we care about a few cycles)? Yes, I suppose I could pull out a helper or two. Just one of those things where something used to be cleaner, and then the code grew. r~
diff --git a/tcg/i386/tcg-target.h b/tcg/i386/tcg-target.h index 12d098ad6c..38c09fd66c 100644 --- a/tcg/i386/tcg-target.h +++ b/tcg/i386/tcg-target.h @@ -195,7 +195,7 @@ extern bool have_movbe; #define TCG_TARGET_HAS_not_vec 0 #define TCG_TARGET_HAS_neg_vec 0 #define TCG_TARGET_HAS_abs_vec 1 -#define TCG_TARGET_HAS_roti_vec 0 +#define TCG_TARGET_HAS_roti_vec have_avx512vl #define TCG_TARGET_HAS_rots_vec 0 #define TCG_TARGET_HAS_rotv_vec 0 #define TCG_TARGET_HAS_shi_vec 1 diff --git a/tcg/i386/tcg-target.c.inc b/tcg/i386/tcg-target.c.inc index c4e6f2e5ea..5ab7c4c0fa 100644 --- a/tcg/i386/tcg-target.c.inc +++ b/tcg/i386/tcg-target.c.inc @@ -361,7 +361,7 @@ static bool tcg_target_const_match(int64_t val, TCGType type, int ct) #define OPC_PSHUFLW (0x70 | P_EXT | P_SIMDF2) #define OPC_PSHUFHW (0x70 | P_EXT | P_SIMDF3) #define OPC_PSHIFTW_Ib (0x71 | P_EXT | P_DATA16) /* /2 /6 /4 */ -#define OPC_PSHIFTD_Ib (0x72 | P_EXT | P_DATA16) /* /2 /6 /4 */ +#define OPC_PSHIFTD_Ib (0x72 | P_EXT | P_DATA16) /* /1 /2 /6 /4 */ #define OPC_PSHIFTQ_Ib (0x73 | P_EXT | P_DATA16) /* /2 /6 /4 */ #define OPC_PSLLW (0xf1 | P_EXT | P_DATA16) #define OPC_PSLLD (0xf2 | P_EXT | P_DATA16) @@ -2906,6 +2906,14 @@ static void tcg_out_vec_op(TCGContext *s, TCGOpcode opc, insn |= P_VEXW | P_EVEX; } sub = 4; + goto gen_shift; + case INDEX_op_rotli_vec: + insn = OPC_PSHIFTD_Ib | P_EVEX; /* VPROL[DQ] */ + if (vece == MO_64) { + insn |= P_VEXW; + } + sub = 1; + goto gen_shift; gen_shift: tcg_debug_assert(vece != MO_8); if (type == TCG_TYPE_V256) { @@ -3195,6 +3203,7 @@ static TCGConstraintSetIndex tcg_target_op_def(TCGOpcode op) case INDEX_op_shli_vec: case INDEX_op_shri_vec: case INDEX_op_sari_vec: + case INDEX_op_rotli_vec: case INDEX_op_x86_psrldq_vec: return C_O1_I1(x, x); @@ -3216,11 +3225,13 @@ int tcg_can_emit_vec_op(TCGOpcode opc, TCGType type, unsigned vece) case INDEX_op_xor_vec: case INDEX_op_andc_vec: return 1; - case INDEX_op_rotli_vec: case INDEX_op_cmp_vec: case INDEX_op_cmpsel_vec: return -1; + case INDEX_op_rotli_vec: + return have_avx512vl && vece >= MO_32 ? 1 : -1; + case INDEX_op_shli_vec: case INDEX_op_shri_vec: /* We must expand the operation for MO_8. */
AVX512VL has VPROLD and VPROLQ, layered onto the same opcode as PSHIFTD, but requires EVEX encoding and W. Signed-off-by: Richard Henderson <richard.henderson@linaro.org> --- tcg/i386/tcg-target.h | 2 +- tcg/i386/tcg-target.c.inc | 15 +++++++++++++-- 2 files changed, 14 insertions(+), 3 deletions(-)