Message ID | 20170817230114.3655-9-richard.henderson@linaro.org |
---|---|
State | New |
Series | TCG vectorization and example conversion |
Richard Henderson <richard.henderson@linaro.org> writes: > Signed-off-by: Richard Henderson <richard.henderson@linaro.org> > --- > tcg/i386/tcg-target.h | 46 +++++- > tcg/tcg-opc.h | 12 +- > tcg/i386/tcg-target.inc.c | 382 ++++++++++++++++++++++++++++++++++++++++++---- > 3 files changed, 399 insertions(+), 41 deletions(-) > > diff --git a/tcg/i386/tcg-target.h b/tcg/i386/tcg-target.h > index e512648c95..147f82062b 100644 > --- a/tcg/i386/tcg-target.h > +++ b/tcg/i386/tcg-target.h > @@ -30,11 +30,10 @@ > > #ifdef __x86_64__ > # define TCG_TARGET_REG_BITS 64 > -# define TCG_TARGET_NB_REGS 16 > #else > # define TCG_TARGET_REG_BITS 32 > -# define TCG_TARGET_NB_REGS 8 > #endif > +# define TCG_TARGET_NB_REGS 24 > > typedef enum { > TCG_REG_EAX = 0, > @@ -56,6 +55,19 @@ typedef enum { > TCG_REG_R13, > TCG_REG_R14, > TCG_REG_R15, > + > + /* SSE registers; 64-bit has access to 8 more, but we won't > + need more than a few and using only the first 8 minimizes > + the need for a rex prefix on the sse instructions. */ > + TCG_REG_XMM0, > + TCG_REG_XMM1, > + TCG_REG_XMM2, > + TCG_REG_XMM3, > + TCG_REG_XMM4, > + TCG_REG_XMM5, > + TCG_REG_XMM6, > + TCG_REG_XMM7, > + > TCG_REG_RAX = TCG_REG_EAX, > TCG_REG_RCX = TCG_REG_ECX, > TCG_REG_RDX = TCG_REG_EDX, > @@ -79,6 +91,17 @@ extern bool have_bmi1; > extern bool have_bmi2; > extern bool have_popcnt; > > +#ifdef __SSE2__ > +#define have_sse2 true > +#else > +extern bool have_sse2; > +#endif > +#ifdef __AVX2__ > +#define have_avx2 true > +#else > +extern bool have_avx2; > +#endif > + > /* optional instructions */ > #define TCG_TARGET_HAS_div2_i32 1 > #define TCG_TARGET_HAS_rot_i32 1 > @@ -147,6 +170,25 @@ extern bool have_popcnt; > #define TCG_TARGET_HAS_mulsh_i64 0 > #endif > > +#define TCG_TARGET_HAS_v64 have_sse2 > +#define TCG_TARGET_HAS_v128 have_sse2 > +#define TCG_TARGET_HAS_v256 have_avx2 > + > +#define TCG_TARGET_HAS_andc_v64 TCG_TARGET_HAS_v64 > +#define TCG_TARGET_HAS_orc_v64 0 > +#define TCG_TARGET_HAS_not_v64 0 > +#define TCG_TARGET_HAS_neg_v64 0 > + > +#define TCG_TARGET_HAS_andc_v128 TCG_TARGET_HAS_v128 > +#define TCG_TARGET_HAS_orc_v128 0 > +#define TCG_TARGET_HAS_not_v128 0 > +#define TCG_TARGET_HAS_neg_v128 0 > + > +#define TCG_TARGET_HAS_andc_v256 TCG_TARGET_HAS_v256 > +#define TCG_TARGET_HAS_orc_v256 0 > +#define TCG_TARGET_HAS_not_v256 0 > +#define TCG_TARGET_HAS_neg_v256 0 > + > #define TCG_TARGET_deposit_i32_valid(ofs, len) \ > (have_bmi2 || \ > ((ofs) == 0 && (len) == 8) || \ > diff --git a/tcg/tcg-opc.h b/tcg/tcg-opc.h > index b1445a4c24..b84cd584fb 100644 > --- a/tcg/tcg-opc.h > +++ b/tcg/tcg-opc.h > @@ -212,13 +212,13 @@ DEF(qemu_st_i64, 0, TLADDR_ARGS + DATA64_ARGS, 1, > /* Host integer vector operations. */ > /* These opcodes are required whenever the base vector size is enabled. 
*/ > > -DEF(mov_v64, 1, 1, 0, IMPL(TCG_TARGET_HAS_v64)) > -DEF(mov_v128, 1, 1, 0, IMPL(TCG_TARGET_HAS_v128)) > -DEF(mov_v256, 1, 1, 0, IMPL(TCG_TARGET_HAS_v256)) > +DEF(mov_v64, 1, 1, 0, TCG_OPF_NOT_PRESENT) > +DEF(mov_v128, 1, 1, 0, TCG_OPF_NOT_PRESENT) > +DEF(mov_v256, 1, 1, 0, TCG_OPF_NOT_PRESENT) > > -DEF(movi_v64, 1, 0, 1, IMPL(TCG_TARGET_HAS_v64)) > -DEF(movi_v128, 1, 0, 1, IMPL(TCG_TARGET_HAS_v128)) > -DEF(movi_v256, 1, 0, 1, IMPL(TCG_TARGET_HAS_v256)) > +DEF(movi_v64, 1, 0, 1, TCG_OPF_NOT_PRESENT) > +DEF(movi_v128, 1, 0, 1, TCG_OPF_NOT_PRESENT) > +DEF(movi_v256, 1, 0, 1, TCG_OPF_NOT_PRESENT) > > DEF(ld_v64, 1, 1, 1, IMPL(TCG_TARGET_HAS_v64)) > DEF(ld_v128, 1, 1, 1, IMPL(TCG_TARGET_HAS_v128)) > diff --git a/tcg/i386/tcg-target.inc.c b/tcg/i386/tcg-target.inc.c > index aeefb72aa0..0e01b54aa0 100644 > --- a/tcg/i386/tcg-target.inc.c > +++ b/tcg/i386/tcg-target.inc.c > @@ -31,7 +31,9 @@ static const char * const tcg_target_reg_names[TCG_TARGET_NB_REGS] = { > "%r8", "%r9", "%r10", "%r11", "%r12", "%r13", "%r14", "%r15", > #else > "%eax", "%ecx", "%edx", "%ebx", "%esp", "%ebp", "%esi", "%edi", > + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, > #endif > + "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", > }; > #endif > > @@ -61,6 +63,14 @@ static const int tcg_target_reg_alloc_order[] = { > TCG_REG_EDX, > TCG_REG_EAX, > #endif > + TCG_REG_XMM0, > + TCG_REG_XMM1, > + TCG_REG_XMM2, > + TCG_REG_XMM3, > + TCG_REG_XMM4, > + TCG_REG_XMM5, > + TCG_REG_XMM6, > + TCG_REG_XMM7, > }; > > static const int tcg_target_call_iarg_regs[] = { > @@ -94,7 +104,7 @@ static const int tcg_target_call_oarg_regs[] = { > #define TCG_CT_CONST_I32 0x400 > #define TCG_CT_CONST_WSZ 0x800 > > -/* Registers used with L constraint, which are the first argument > +/* Registers used with L constraint, which are the first argument > registers on x86_64, and two random call clobbered registers on > i386. */ > #if TCG_TARGET_REG_BITS == 64 > @@ -127,6 +137,16 @@ bool have_bmi1; > bool have_bmi2; > bool have_popcnt; > > +#ifndef have_sse2 > +bool have_sse2; > +#endif > +#ifdef have_avx2 > +#define have_avx1 have_avx2 > +#else > +static bool have_avx1; > +bool have_avx2; > +#endif > + > #ifdef CONFIG_CPUID_H > static bool have_movbe; > static bool have_lzcnt; > @@ -215,6 +235,10 @@ static const char *target_parse_constraint(TCGArgConstraint *ct, > /* With TZCNT/LZCNT, we can have operand-size as an input. 
*/ > ct->ct |= TCG_CT_CONST_WSZ; > break; > + case 'x': > + ct->ct |= TCG_CT_REG; > + tcg_regset_set32(ct->u.regs, 0, 0xff0000); > + break; > > /* qemu_ld/st address constraint */ > case 'L': > @@ -292,6 +316,7 @@ static inline int tcg_target_const_match(tcg_target_long val, TCGType type, > #endif > #define P_SIMDF3 0x20000 /* 0xf3 opcode prefix */ > #define P_SIMDF2 0x40000 /* 0xf2 opcode prefix */ > +#define P_VEXL 0x80000 /* Set VEX.L = 1 */ > > #define OPC_ARITH_EvIz (0x81) > #define OPC_ARITH_EvIb (0x83) > @@ -324,13 +349,31 @@ static inline int tcg_target_const_match(tcg_target_long val, TCGType type, > #define OPC_MOVL_Iv (0xb8) > #define OPC_MOVBE_GyMy (0xf0 | P_EXT38) > #define OPC_MOVBE_MyGy (0xf1 | P_EXT38) > +#define OPC_MOVDQA_GyMy (0x6f | P_EXT | P_DATA16) > +#define OPC_MOVDQA_MyGy (0x7f | P_EXT | P_DATA16) > +#define OPC_MOVDQU_GyMy (0x6f | P_EXT | P_SIMDF3) > +#define OPC_MOVDQU_MyGy (0x7f | P_EXT | P_SIMDF3) > +#define OPC_MOVQ_GyMy (0x7e | P_EXT | P_SIMDF3) > +#define OPC_MOVQ_MyGy (0xd6 | P_EXT | P_DATA16) > #define OPC_MOVSBL (0xbe | P_EXT) > #define OPC_MOVSWL (0xbf | P_EXT) > #define OPC_MOVSLQ (0x63 | P_REXW) > #define OPC_MOVZBL (0xb6 | P_EXT) > #define OPC_MOVZWL (0xb7 | P_EXT) > +#define OPC_PADDB (0xfc | P_EXT | P_DATA16) > +#define OPC_PADDW (0xfd | P_EXT | P_DATA16) > +#define OPC_PADDD (0xfe | P_EXT | P_DATA16) > +#define OPC_PADDQ (0xd4 | P_EXT | P_DATA16) > +#define OPC_PAND (0xdb | P_EXT | P_DATA16) > +#define OPC_PANDN (0xdf | P_EXT | P_DATA16) > #define OPC_PDEP (0xf5 | P_EXT38 | P_SIMDF2) > #define OPC_PEXT (0xf5 | P_EXT38 | P_SIMDF3) > +#define OPC_POR (0xeb | P_EXT | P_DATA16) > +#define OPC_PSUBB (0xf8 | P_EXT | P_DATA16) > +#define OPC_PSUBW (0xf9 | P_EXT | P_DATA16) > +#define OPC_PSUBD (0xfa | P_EXT | P_DATA16) > +#define OPC_PSUBQ (0xfb | P_EXT | P_DATA16) > +#define OPC_PXOR (0xef | P_EXT | P_DATA16) > #define OPC_POP_r32 (0x58) > #define OPC_POPCNT (0xb8 | P_EXT | P_SIMDF3) > #define OPC_PUSH_r32 (0x50) > @@ -500,7 +543,8 @@ static void tcg_out_modrm(TCGContext *s, int opc, int r, int rm) > tcg_out8(s, 0xc0 | (LOWREGMASK(r) << 3) | LOWREGMASK(rm)); > } > > -static void tcg_out_vex_pfx_opc(TCGContext *s, int opc, int r, int v, int rm) > +static void tcg_out_vex_pfx_opc(TCGContext *s, int opc, int r, int v, > + int rm, int index) > { > int tmp; > > @@ -515,14 +559,16 @@ static void tcg_out_vex_pfx_opc(TCGContext *s, int opc, int r, int v, int rm) > } else if (opc & P_EXT) { > tmp = 1; > } else { > - tcg_abort(); > + g_assert_not_reached(); > } > - tmp |= 0x40; /* VEX.X */ > tmp |= (r & 8 ? 0 : 0x80); /* VEX.R */ > + tmp |= (index & 8 ? 0 : 0x40); /* VEX.X */ > tmp |= (rm & 8 ? 0 : 0x20); /* VEX.B */ > tcg_out8(s, tmp); > > tmp = (opc & P_REXW ? 0x80 : 0); /* VEX.W */ > + tmp |= (opc & P_VEXL ? 
0x04 : 0); /* VEX.L */ > + > /* VEX.pp */ > if (opc & P_DATA16) { > tmp |= 1; /* 0x66 */ > @@ -538,7 +584,7 @@ static void tcg_out_vex_pfx_opc(TCGContext *s, int opc, int r, int v, int rm) > > static void tcg_out_vex_modrm(TCGContext *s, int opc, int r, int v, int rm) > { > - tcg_out_vex_pfx_opc(s, opc, r, v, rm); > + tcg_out_vex_pfx_opc(s, opc, r, v, rm, 0); > tcg_out8(s, 0xc0 | (LOWREGMASK(r) << 3) | LOWREGMASK(rm)); > } > > @@ -565,7 +611,7 @@ static void tcg_out_opc_pool_imm(TCGContext *s, int opc, int r, > static void tcg_out_vex_pool_imm(TCGContext *s, int opc, int r, int v, > tcg_target_ulong data) > { > - tcg_out_vex_pfx_opc(s, opc, r, v, 0); > + tcg_out_vex_pfx_opc(s, opc, r, v, 0, 0); > tcg_out_sfx_pool_imm(s, r, data); > } > > @@ -574,8 +620,8 @@ static void tcg_out_vex_pool_imm(TCGContext *s, int opc, int r, int v, > mode for absolute addresses, ~RM is the size of the immediate operand > that will follow the instruction. */ > > -static void tcg_out_modrm_sib_offset(TCGContext *s, int opc, int r, int rm, > - int index, int shift, intptr_t offset) > +static void tcg_out_sib_offset(TCGContext *s, int r, int rm, int index, > + int shift, intptr_t offset) > { > int mod, len; > > @@ -586,7 +632,6 @@ static void tcg_out_modrm_sib_offset(TCGContext *s, int opc, int r, int rm, > intptr_t pc = (intptr_t)s->code_ptr + 5 + ~rm; > intptr_t disp = offset - pc; > if (disp == (int32_t)disp) { > - tcg_out_opc(s, opc, r, 0, 0); > tcg_out8(s, (LOWREGMASK(r) << 3) | 5); > tcg_out32(s, disp); > return; > @@ -596,7 +641,6 @@ static void tcg_out_modrm_sib_offset(TCGContext *s, int opc, int r, int rm, > use of the MODRM+SIB encoding and is therefore larger than > rip-relative addressing. */ > if (offset == (int32_t)offset) { > - tcg_out_opc(s, opc, r, 0, 0); > tcg_out8(s, (LOWREGMASK(r) << 3) | 4); > tcg_out8(s, (4 << 3) | 5); > tcg_out32(s, offset); > @@ -604,10 +648,9 @@ static void tcg_out_modrm_sib_offset(TCGContext *s, int opc, int r, int rm, > } > > /* ??? The memory isn't directly addressable. */ > - tcg_abort(); > + g_assert_not_reached(); > } else { > /* Absolute address. */ > - tcg_out_opc(s, opc, r, 0, 0); > tcg_out8(s, (r << 3) | 5); > tcg_out32(s, offset); > return; > @@ -630,7 +673,6 @@ static void tcg_out_modrm_sib_offset(TCGContext *s, int opc, int r, int rm, > that would be used for %esp is the escape to the two byte form. */ > if (index < 0 && LOWREGMASK(rm) != TCG_REG_ESP) { > /* Single byte MODRM format. */ > - tcg_out_opc(s, opc, r, rm, 0); > tcg_out8(s, mod | (LOWREGMASK(r) << 3) | LOWREGMASK(rm)); > } else { > /* Two byte MODRM+SIB format. */ > @@ -644,7 +686,6 @@ static void tcg_out_modrm_sib_offset(TCGContext *s, int opc, int r, int rm, > tcg_debug_assert(index != TCG_REG_ESP); > } > > - tcg_out_opc(s, opc, r, rm, index); > tcg_out8(s, mod | (LOWREGMASK(r) << 3) | 4); > tcg_out8(s, (shift << 6) | (LOWREGMASK(index) << 3) | LOWREGMASK(rm)); > } > @@ -656,6 +697,21 @@ static void tcg_out_modrm_sib_offset(TCGContext *s, int opc, int r, int rm, > } > } > > +static void tcg_out_modrm_sib_offset(TCGContext *s, int opc, int r, int rm, > + int index, int shift, intptr_t offset) > +{ > + tcg_out_opc(s, opc, r, rm < 0 ? 0 : rm, index < 0 ? 0 : index); > + tcg_out_sib_offset(s, r, rm, index, shift, offset); > +} > + > +static void tcg_out_vex_modrm_sib_offset(TCGContext *s, int opc, int r, int v, > + int rm, int index, int shift, > + intptr_t offset) > +{ > + tcg_out_vex_pfx_opc(s, opc, r, v, rm < 0 ? 0 : rm, index < 0 ? 
0 : index); > + tcg_out_sib_offset(s, r, rm, index, shift, offset); > +} > + > /* A simplification of the above with no index or shift. */ > static inline void tcg_out_modrm_offset(TCGContext *s, int opc, int r, > int rm, intptr_t offset) > @@ -663,6 +719,31 @@ static inline void tcg_out_modrm_offset(TCGContext *s, int opc, int r, > tcg_out_modrm_sib_offset(s, opc, r, rm, -1, 0, offset); > } > > +static inline void tcg_out_vex_modrm_offset(TCGContext *s, int opc, int r, > + int v, int rm, intptr_t offset) > +{ > + tcg_out_vex_modrm_sib_offset(s, opc, r, v, rm, -1, 0, offset); > +} > + > +static void tcg_out_maybe_vex_modrm(TCGContext *s, int opc, int r, int rm) > +{ > + if (have_avx1) { > + tcg_out_vex_modrm(s, opc, r, 0, rm); > + } else { > + tcg_out_modrm(s, opc, r, rm); > + } > +} > + > +static void tcg_out_maybe_vex_modrm_offset(TCGContext *s, int opc, int r, > + int rm, intptr_t offset) > +{ > + if (have_avx1) { > + tcg_out_vex_modrm_offset(s, opc, r, 0, rm, offset); > + } else { > + tcg_out_modrm_offset(s, opc, r, rm, offset); > + } > +} > + > /* Generate dest op= src. Uses the same ARITH_* codes as tgen_arithi. */ > static inline void tgen_arithr(TCGContext *s, int subop, int dest, int src) > { > @@ -673,12 +754,32 @@ static inline void tgen_arithr(TCGContext *s, int subop, int dest, int src) > tcg_out_modrm(s, OPC_ARITH_GvEv + (subop << 3) + ext, dest, src); > } > > -static inline void tcg_out_mov(TCGContext *s, TCGType type, > - TCGReg ret, TCGReg arg) > +static void tcg_out_mov(TCGContext *s, TCGType type, TCGReg ret, TCGReg arg) > { > if (arg != ret) { > - int opc = OPC_MOVL_GvEv + (type == TCG_TYPE_I64 ? P_REXW : 0); > - tcg_out_modrm(s, opc, ret, arg); > + int opc = 0; > + > + switch (type) { > + case TCG_TYPE_I64: > + opc = P_REXW; > + /* fallthru */ > + case TCG_TYPE_I32: > + opc |= OPC_MOVL_GvEv; > + tcg_out_modrm(s, opc, ret, arg); > + break; > + > + case TCG_TYPE_V256: > + opc = P_VEXL; > + /* fallthru */ > + case TCG_TYPE_V128: > + case TCG_TYPE_V64: > + opc |= OPC_MOVDQA_GyMy; > + tcg_out_maybe_vex_modrm(s, opc, ret, arg); > + break; > + > + default: > + g_assert_not_reached(); > + } > } > } > > @@ -687,6 +788,27 @@ static void tcg_out_movi(TCGContext *s, TCGType type, > { > tcg_target_long diff; > > + switch (type) { > + case TCG_TYPE_I32: > + case TCG_TYPE_I64: > + break; > + > + case TCG_TYPE_V64: > + case TCG_TYPE_V128: > + case TCG_TYPE_V256: > + /* ??? Revisit this as the implementation progresses. */ > + tcg_debug_assert(arg == 0); > + if (have_avx1) { > + tcg_out_vex_modrm(s, OPC_PXOR, ret, ret, ret); > + } else { > + tcg_out_modrm(s, OPC_PXOR, ret, ret); > + } > + return; > + > + default: > + g_assert_not_reached(); > + } > + > if (arg == 0) { > tgen_arithr(s, ARITH_XOR, ret, ret); > return; > @@ -750,18 +872,54 @@ static inline void tcg_out_pop(TCGContext *s, int reg) > tcg_out_opc(s, OPC_POP_r32 + LOWREGMASK(reg), 0, reg, 0); > } > > -static inline void tcg_out_ld(TCGContext *s, TCGType type, TCGReg ret, > - TCGReg arg1, intptr_t arg2) > +static void tcg_out_ld(TCGContext *s, TCGType type, TCGReg ret, > + TCGReg arg1, intptr_t arg2) > { > - int opc = OPC_MOVL_GvEv + (type == TCG_TYPE_I64 ? 
P_REXW : 0); > - tcg_out_modrm_offset(s, opc, ret, arg1, arg2); > + switch (type) { > + case TCG_TYPE_I64: > + tcg_out_modrm_offset(s, OPC_MOVL_GvEv | P_REXW, ret, arg1, arg2); > + break; > + case TCG_TYPE_I32: > + tcg_out_modrm_offset(s, OPC_MOVL_GvEv, ret, arg1, arg2); > + break; > + case TCG_TYPE_V64: > + tcg_out_maybe_vex_modrm_offset(s, OPC_MOVQ_GyMy, ret, arg1, arg2); > + break; > + case TCG_TYPE_V128: > + tcg_out_maybe_vex_modrm_offset(s, OPC_MOVDQU_GyMy, ret, arg1, arg2); > + break; > + case TCG_TYPE_V256: > + tcg_out_vex_modrm_offset(s, OPC_MOVDQU_GyMy | P_VEXL, > + ret, 0, arg1, arg2); > + break; > + default: > + g_assert_not_reached(); > + } > } > > -static inline void tcg_out_st(TCGContext *s, TCGType type, TCGReg arg, > - TCGReg arg1, intptr_t arg2) > +static void tcg_out_st(TCGContext *s, TCGType type, TCGReg arg, > + TCGReg arg1, intptr_t arg2) > { > - int opc = OPC_MOVL_EvGv + (type == TCG_TYPE_I64 ? P_REXW : 0); > - tcg_out_modrm_offset(s, opc, arg, arg1, arg2); > + switch (type) { > + case TCG_TYPE_I64: > + tcg_out_modrm_offset(s, OPC_MOVL_EvGv | P_REXW, arg, arg1, arg2); > + break; > + case TCG_TYPE_I32: > + tcg_out_modrm_offset(s, OPC_MOVL_EvGv, arg, arg1, arg2); > + break; > + case TCG_TYPE_V64: > + tcg_out_maybe_vex_modrm_offset(s, OPC_MOVQ_MyGy, arg, arg1, arg2); > + break; > + case TCG_TYPE_V128: > + tcg_out_maybe_vex_modrm_offset(s, OPC_MOVDQU_MyGy, arg, arg1, arg2); > + break; > + case TCG_TYPE_V256: > + tcg_out_vex_modrm_offset(s, OPC_MOVDQU_MyGy | P_VEXL, > + arg, 0, arg1, arg2); > + break; > + default: > + g_assert_not_reached(); > + } > } > > static bool tcg_out_sti(TCGContext *s, TCGType type, TCGArg val, > @@ -773,6 +931,8 @@ static bool tcg_out_sti(TCGContext *s, TCGType type, TCGArg val, > return false; > } > rexw = P_REXW; > + } else if (type != TCG_TYPE_I32) { > + return false; > } > tcg_out_modrm_offset(s, OPC_MOVL_EvIz | rexw, 0, base, ofs); > tcg_out32(s, val); > @@ -1914,6 +2074,15 @@ static inline void tcg_out_op(TCGContext *s, TCGOpcode opc, > case glue(glue(INDEX_op_, x), _i32) > #endif > > +#define OP_128_256(x) \ > + case glue(glue(INDEX_op_, x), _v256): \ > + rexw = P_VEXL; /* FALLTHRU */ \ > + case glue(glue(INDEX_op_, x), _v128) > + > +#define OP_64_128_256(x) \ > + OP_128_256(x): \ > + case glue(glue(INDEX_op_, x), _v64) > + > /* Hoist the loads of the most common arguments. 
*/ > a0 = args[0]; > a1 = args[1]; > @@ -2379,19 +2548,94 @@ static inline void tcg_out_op(TCGContext *s, TCGOpcode opc, > } > break; > > + OP_64_128_256(add8): > + c = OPC_PADDB; > + goto gen_simd; > + OP_64_128_256(add16): > + c = OPC_PADDW; > + goto gen_simd; > + OP_64_128_256(add32): > + c = OPC_PADDD; > + goto gen_simd; > + OP_128_256(add64): > + c = OPC_PADDQ; > + goto gen_simd; > + OP_64_128_256(sub8): > + c = OPC_PSUBB; > + goto gen_simd; > + OP_64_128_256(sub16): > + c = OPC_PSUBW; > + goto gen_simd; > + OP_64_128_256(sub32): > + c = OPC_PSUBD; > + goto gen_simd; > + OP_128_256(sub64): > + c = OPC_PSUBQ; > + goto gen_simd; > + OP_64_128_256(and): > + c = OPC_PAND; > + goto gen_simd; > + OP_64_128_256(andc): > + c = OPC_PANDN; > + goto gen_simd; > + OP_64_128_256(or): > + c = OPC_POR; > + goto gen_simd; > + OP_64_128_256(xor): > + c = OPC_PXOR; > + gen_simd: > + if (have_avx1) { > + tcg_out_vex_modrm(s, c, a0, a1, a2); > + } else { > + tcg_out_modrm(s, c, a0, a2); > + } > + break; > + > + case INDEX_op_ld_v64: > + c = TCG_TYPE_V64; > + goto gen_simd_ld; > + case INDEX_op_ld_v128: > + c = TCG_TYPE_V128; > + goto gen_simd_ld; > + case INDEX_op_ld_v256: > + c = TCG_TYPE_V256; > + gen_simd_ld: > + tcg_out_ld(s, c, a0, a1, a2); > + break; > + > + case INDEX_op_st_v64: > + c = TCG_TYPE_V64; > + goto gen_simd_st; > + case INDEX_op_st_v128: > + c = TCG_TYPE_V128; > + goto gen_simd_st; > + case INDEX_op_st_v256: > + c = TCG_TYPE_V256; > + gen_simd_st: > + tcg_out_st(s, c, a0, a1, a2); > + break; > + > case INDEX_op_mb: > tcg_out_mb(s, a0); > break; > case INDEX_op_mov_i32: /* Always emitted via tcg_out_mov. */ > case INDEX_op_mov_i64: > + case INDEX_op_mov_v64: > + case INDEX_op_mov_v128: > + case INDEX_op_mov_v256: > case INDEX_op_movi_i32: /* Always emitted via tcg_out_movi. */ > case INDEX_op_movi_i64: > + case INDEX_op_movi_v64: > + case INDEX_op_movi_v128: > + case INDEX_op_movi_v256: > case INDEX_op_call: /* Always emitted via tcg_out_call. 
*/ > default: > tcg_abort(); > } > > #undef OP_32_64 > +#undef OP_128_256 > +#undef OP_64_128_256 > } > > static const TCGTargetOpDef *tcg_target_op_def(TCGOpcode op) > @@ -2417,6 +2661,9 @@ static const TCGTargetOpDef *tcg_target_op_def(TCGOpcode op) > = { .args_ct_str = { "r", "r", "L", "L" } }; > static const TCGTargetOpDef L_L_L_L > = { .args_ct_str = { "L", "L", "L", "L" } }; > + static const TCGTargetOpDef x_0_x = { .args_ct_str = { "x", "0", "x" } }; > + static const TCGTargetOpDef x_x_x = { .args_ct_str = { "x", "x", "x" } }; > + static const TCGTargetOpDef x_r = { .args_ct_str = { "x", "r" } }; > > switch (op) { > case INDEX_op_goto_ptr: > @@ -2620,6 +2867,52 @@ static const TCGTargetOpDef *tcg_target_op_def(TCGOpcode op) > return &s2; > } > > + case INDEX_op_ld_v64: > + case INDEX_op_ld_v128: > + case INDEX_op_ld_v256: > + case INDEX_op_st_v64: > + case INDEX_op_st_v128: > + case INDEX_op_st_v256: > + return &x_r; > + > + case INDEX_op_add8_v64: > + case INDEX_op_add8_v128: > + case INDEX_op_add16_v64: > + case INDEX_op_add16_v128: > + case INDEX_op_add32_v64: > + case INDEX_op_add32_v128: > + case INDEX_op_add64_v128: > + case INDEX_op_sub8_v64: > + case INDEX_op_sub8_v128: > + case INDEX_op_sub16_v64: > + case INDEX_op_sub16_v128: > + case INDEX_op_sub32_v64: > + case INDEX_op_sub32_v128: > + case INDEX_op_sub64_v128: > + case INDEX_op_and_v64: > + case INDEX_op_and_v128: > + case INDEX_op_andc_v64: > + case INDEX_op_andc_v128: > + case INDEX_op_or_v64: > + case INDEX_op_or_v128: > + case INDEX_op_xor_v64: > + case INDEX_op_xor_v128: > + return have_avx1 ? &x_x_x : &x_0_x; > + > + case INDEX_op_add8_v256: > + case INDEX_op_add16_v256: > + case INDEX_op_add32_v256: > + case INDEX_op_add64_v256: > + case INDEX_op_sub8_v256: > + case INDEX_op_sub16_v256: > + case INDEX_op_sub32_v256: > + case INDEX_op_sub64_v256: > + case INDEX_op_and_v256: > + case INDEX_op_andc_v256: > + case INDEX_op_or_v256: > + case INDEX_op_xor_v256: > + return &x_x_x; > + > default: > break; > } > @@ -2725,9 +3018,16 @@ static void tcg_out_nop_fill(tcg_insn_unit *p, int count) > static void tcg_target_init(TCGContext *s) > { > #ifdef CONFIG_CPUID_H > - unsigned a, b, c, d; > + unsigned a, b, c, d, b7 = 0; > int max = __get_cpuid_max(0, 0); > > + if (max >= 7) { > + /* BMI1 is available on AMD Piledriver and Intel Haswell CPUs. */ > + __cpuid_count(7, 0, a, b7, c, d); > + have_bmi1 = (b7 & bit_BMI) != 0; > + have_bmi2 = (b7 & bit_BMI2) != 0; > + } > + > if (max >= 1) { > __cpuid(1, a, b, c, d); > #ifndef have_cmov > @@ -2736,17 +3036,26 @@ static void tcg_target_init(TCGContext *s) > available, we'll use a small forward branch. */ > have_cmov = (d & bit_CMOV) != 0; > #endif > +#ifndef have_sse2 > + have_sse2 = (d & bit_SSE2) != 0; > +#endif > /* MOVBE is only available on Intel Atom and Haswell CPUs, so we > need to probe for it. */ > have_movbe = (c & bit_MOVBE) != 0; > have_popcnt = (c & bit_POPCNT) != 0; > - } > > - if (max >= 7) { > - /* BMI1 is available on AMD Piledriver and Intel Haswell CPUs. */ > - __cpuid_count(7, 0, a, b, c, d); > - have_bmi1 = (b & bit_BMI) != 0; > - have_bmi2 = (b & bit_BMI2) != 0; > +#ifndef have_avx2 > + /* There are a number of things we must check before we can be > + sure of not hitting invalid opcode. 
*/ > + if (c & bit_OSXSAVE) { > + unsigned xcrl, xcrh; > + asm ("xgetbv" : "=a" (xcrl), "=d" (xcrh) : "c" (0)); > + if (xcrl & 6 == 6) { My picky compiler complains: /home/alex/lsrc/qemu/qemu.git/tcg/i386/tcg-target.inc.c: In function ‘tcg_target_init’: /home/alex/lsrc/qemu/qemu.git/tcg/i386/tcg-target.inc.c:3053:22: error: suggest parentheses around comparison in operand of ‘&’ [-Werror=parentheses] if (xcrl & 6 == 6) { > + have_avx1 = (c & bit_AVX) != 0; > + have_avx2 = (b7 & bit_AVX2) != 0; > + } > + } > +#endif > } > > max = __get_cpuid_max(0x8000000, 0); > @@ -2763,6 +3072,13 @@ static void tcg_target_init(TCGContext *s) > } else { > tcg_regset_set32(tcg_target_available_regs[TCG_TYPE_I32], 0, 0xff); > } > + if (have_sse2) { > + tcg_regset_set32(tcg_target_available_regs[TCG_TYPE_V64], 0, 0xff0000); > + tcg_regset_set32(tcg_target_available_regs[TCG_TYPE_V128], 0, 0xff0000); > + } > + if (have_avx2) { > + tcg_regset_set32(tcg_target_available_regs[TCG_TYPE_V256], 0, 0xff0000); > + } > > tcg_regset_clear(tcg_target_call_clobber_regs); > tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_EAX); -- Alex Bennée
On 08/22/2017 06:15 AM, Alex Bennée wrote: >> +#ifndef have_avx2 >> + /* There are a number of things we must check before we can be >> + sure of not hitting invalid opcode. */ >> + if (c & bit_OSXSAVE) { >> + unsigned xcrl, xcrh; >> + asm ("xgetbv" : "=a" (xcrl), "=d" (xcrh) : "c" (0)); >> + if (xcrl & 6 == 6) { > > My picky compiler complains: > > /home/alex/lsrc/qemu/qemu.git/tcg/i386/tcg-target.inc.c: In function ‘tcg_target_init’: > /home/alex/lsrc/qemu/qemu.git/tcg/i386/tcg-target.inc.c:3053:22: error: suggest parentheses around comparison in operand of ‘&’ [-Werror=parentheses] > if (xcrl & 6 == 6) { Bah. I forgot that my default build uses -march=native, and my laptop has AVX2, so this bit wouldn't have been compile tested at all. Fixed on the branch. r~
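The precedence problem flagged above is that in C "==" binds tighter than "&", so "xcrl & 6 == 6" parses as "xcrl & (6 == 6)", i.e. "xcrl & 1", which tests only the x87 state bit of XCR0 instead of bits 1 and 2 (SSE and AVX state). A minimal sketch of the parenthesized check, assuming the fix on the branch is just the added parentheses:

    if (c & bit_OSXSAVE) {
        unsigned xcrl, xcrh;
        asm ("xgetbv" : "=a" (xcrl), "=d" (xcrh) : "c" (0));
        /* XCR0 bits 1 (SSE) and 2 (AVX) must both be enabled by the OS
           before AVX instructions can be executed without faulting.  */
        if ((xcrl & 6) == 6) {
            have_avx1 = (c & bit_AVX) != 0;
            have_avx2 = (b7 & bit_AVX2) != 0;
        }
    }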
Richard Henderson <richard.henderson@linaro.org> writes: > Signed-off-by: Richard Henderson <richard.henderson@linaro.org> > --- > tcg/i386/tcg-target.h | 46 +++++- > tcg/tcg-opc.h | 12 +- > tcg/i386/tcg-target.inc.c | 382 ++++++++++++++++++++++++++++++++++++++++++---- > 3 files changed, 399 insertions(+), 41 deletions(-) > > diff --git a/tcg/i386/tcg-target.h b/tcg/i386/tcg-target.h > index e512648c95..147f82062b 100644 > --- a/tcg/i386/tcg-target.h > +++ b/tcg/i386/tcg-target.h > @@ -30,11 +30,10 @@ > > #ifdef __x86_64__ > # define TCG_TARGET_REG_BITS 64 > -# define TCG_TARGET_NB_REGS 16 > #else > # define TCG_TARGET_REG_BITS 32 > -# define TCG_TARGET_NB_REGS 8 > #endif > +# define TCG_TARGET_NB_REGS 24 > > typedef enum { > TCG_REG_EAX = 0, > @@ -56,6 +55,19 @@ typedef enum { > TCG_REG_R13, > TCG_REG_R14, > TCG_REG_R15, > + > + /* SSE registers; 64-bit has access to 8 more, but we won't > + need more than a few and using only the first 8 minimizes > + the need for a rex prefix on the sse instructions. */ > + TCG_REG_XMM0, > + TCG_REG_XMM1, > + TCG_REG_XMM2, > + TCG_REG_XMM3, > + TCG_REG_XMM4, > + TCG_REG_XMM5, > + TCG_REG_XMM6, > + TCG_REG_XMM7, > + > TCG_REG_RAX = TCG_REG_EAX, > TCG_REG_RCX = TCG_REG_ECX, > TCG_REG_RDX = TCG_REG_EDX, > @@ -79,6 +91,17 @@ extern bool have_bmi1; > extern bool have_bmi2; > extern bool have_popcnt; > > +#ifdef __SSE2__ > +#define have_sse2 true > +#else > +extern bool have_sse2; > +#endif > +#ifdef __AVX2__ > +#define have_avx2 true > +#else > +extern bool have_avx2; > +#endif > + > /* optional instructions */ > #define TCG_TARGET_HAS_div2_i32 1 > #define TCG_TARGET_HAS_rot_i32 1 > @@ -147,6 +170,25 @@ extern bool have_popcnt; > #define TCG_TARGET_HAS_mulsh_i64 0 > #endif > > +#define TCG_TARGET_HAS_v64 have_sse2 > +#define TCG_TARGET_HAS_v128 have_sse2 > +#define TCG_TARGET_HAS_v256 have_avx2 > + > +#define TCG_TARGET_HAS_andc_v64 TCG_TARGET_HAS_v64 > +#define TCG_TARGET_HAS_orc_v64 0 > +#define TCG_TARGET_HAS_not_v64 0 > +#define TCG_TARGET_HAS_neg_v64 0 > + > +#define TCG_TARGET_HAS_andc_v128 TCG_TARGET_HAS_v128 > +#define TCG_TARGET_HAS_orc_v128 0 > +#define TCG_TARGET_HAS_not_v128 0 > +#define TCG_TARGET_HAS_neg_v128 0 > + > +#define TCG_TARGET_HAS_andc_v256 TCG_TARGET_HAS_v256 > +#define TCG_TARGET_HAS_orc_v256 0 > +#define TCG_TARGET_HAS_not_v256 0 > +#define TCG_TARGET_HAS_neg_v256 0 > + > #define TCG_TARGET_deposit_i32_valid(ofs, len) \ > (have_bmi2 || \ > ((ofs) == 0 && (len) == 8) || \ > diff --git a/tcg/tcg-opc.h b/tcg/tcg-opc.h > index b1445a4c24..b84cd584fb 100644 > --- a/tcg/tcg-opc.h > +++ b/tcg/tcg-opc.h > @@ -212,13 +212,13 @@ DEF(qemu_st_i64, 0, TLADDR_ARGS + DATA64_ARGS, 1, > /* Host integer vector operations. */ > /* These opcodes are required whenever the base vector size is enabled. */ > > -DEF(mov_v64, 1, 1, 0, IMPL(TCG_TARGET_HAS_v64)) > -DEF(mov_v128, 1, 1, 0, IMPL(TCG_TARGET_HAS_v128)) > -DEF(mov_v256, 1, 1, 0, IMPL(TCG_TARGET_HAS_v256)) > +DEF(mov_v64, 1, 1, 0, TCG_OPF_NOT_PRESENT) > +DEF(mov_v128, 1, 1, 0, TCG_OPF_NOT_PRESENT) > +DEF(mov_v256, 1, 1, 0, TCG_OPF_NOT_PRESENT) > > -DEF(movi_v64, 1, 0, 1, IMPL(TCG_TARGET_HAS_v64)) > -DEF(movi_v128, 1, 0, 1, IMPL(TCG_TARGET_HAS_v128)) > -DEF(movi_v256, 1, 0, 1, IMPL(TCG_TARGET_HAS_v256)) > +DEF(movi_v64, 1, 0, 1, TCG_OPF_NOT_PRESENT) > +DEF(movi_v128, 1, 0, 1, TCG_OPF_NOT_PRESENT) > +DEF(movi_v256, 1, 0, 1, TCG_OPF_NOT_PRESENT) I don't follow, isn't the point of IMPL(TCG_TARGET_HAS_foo) to allow the definition when the backend adds #define TCG_TARGET_HAS_foo 1? 
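For comparison, the plain integer moves are already declared with TCG_OPF_NOT_PRESENT rather than an IMPL() condition, because mov/movi are emitted directly by the register allocator via tcg_out_mov()/tcg_out_movi() and are never selected through the opcode table (see the "Always emitted via tcg_out_mov" cases later in this patch). The integer declarations in tcg/tcg-opc.h look roughly like this around this era of the tree (exact flags may differ):

    DEF(mov_i32, 1, 1, 0, TCG_OPF_NOT_PRESENT)
    DEF(movi_i32, 1, 0, 1, TCG_OPF_NOT_PRESENT)
    DEF(mov_i64, 1, 1, 0, TCG_OPF_64BIT | TCG_OPF_NOT_PRESENT)
    DEF(movi_i64, 1, 0, 1, TCG_OPF_64BIT | TCG_OPF_NOT_PRESENT)

The vector mov/movi opcodes in this patch appear to follow the same pattern.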
> > DEF(ld_v64, 1, 1, 1, IMPL(TCG_TARGET_HAS_v64)) > DEF(ld_v128, 1, 1, 1, IMPL(TCG_TARGET_HAS_v128)) > diff --git a/tcg/i386/tcg-target.inc.c b/tcg/i386/tcg-target.inc.c > index aeefb72aa0..0e01b54aa0 100644 > --- a/tcg/i386/tcg-target.inc.c > +++ b/tcg/i386/tcg-target.inc.c > @@ -31,7 +31,9 @@ static const char * const tcg_target_reg_names[TCG_TARGET_NB_REGS] = { > "%r8", "%r9", "%r10", "%r11", "%r12", "%r13", "%r14", "%r15", > #else > "%eax", "%ecx", "%edx", "%ebx", "%esp", "%ebp", "%esi", "%edi", > + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, > #endif > + "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", > }; > #endif > > @@ -61,6 +63,14 @@ static const int tcg_target_reg_alloc_order[] = { > TCG_REG_EDX, > TCG_REG_EAX, > #endif > + TCG_REG_XMM0, > + TCG_REG_XMM1, > + TCG_REG_XMM2, > + TCG_REG_XMM3, > + TCG_REG_XMM4, > + TCG_REG_XMM5, > + TCG_REG_XMM6, > + TCG_REG_XMM7, > }; > > static const int tcg_target_call_iarg_regs[] = { > @@ -94,7 +104,7 @@ static const int tcg_target_call_oarg_regs[] = { > #define TCG_CT_CONST_I32 0x400 > #define TCG_CT_CONST_WSZ 0x800 > > -/* Registers used with L constraint, which are the first argument > +/* Registers used with L constraint, which are the first argument > registers on x86_64, and two random call clobbered registers on > i386. */ > #if TCG_TARGET_REG_BITS == 64 > @@ -127,6 +137,16 @@ bool have_bmi1; > bool have_bmi2; > bool have_popcnt; > > +#ifndef have_sse2 > +bool have_sse2; > +#endif > +#ifdef have_avx2 > +#define have_avx1 have_avx2 > +#else > +static bool have_avx1; > +bool have_avx2; > +#endif > + > #ifdef CONFIG_CPUID_H > static bool have_movbe; > static bool have_lzcnt; > @@ -215,6 +235,10 @@ static const char *target_parse_constraint(TCGArgConstraint *ct, > /* With TZCNT/LZCNT, we can have operand-size as an input. */ > ct->ct |= TCG_CT_CONST_WSZ; > break; > + case 'x': > + ct->ct |= TCG_CT_REG; > + tcg_regset_set32(ct->u.regs, 0, 0xff0000); > + break; The documentation on constraints in the README is fairly minimal and we keep adding target specific ones so perhaps a single line comment here for clarity? 
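A one-line comment of the kind suggested might look like this (illustrative only, not necessarily what v2 will carry):

    case 'x':
        /* A vector (SSE/AVX xmm) register.  */
        ct->ct |= TCG_CT_REG;
        tcg_regset_set32(ct->u.regs, 0, 0xff0000);
        break;

The 0xff0000 mask selects registers 16-23 of the enlarged register enum, i.e. TCG_REG_XMM0 through TCG_REG_XMM7.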
> > /* qemu_ld/st address constraint */ > case 'L': > @@ -292,6 +316,7 @@ static inline int tcg_target_const_match(tcg_target_long val, TCGType type, > #endif > #define P_SIMDF3 0x20000 /* 0xf3 opcode prefix */ > #define P_SIMDF2 0x40000 /* 0xf2 opcode prefix */ > +#define P_VEXL 0x80000 /* Set VEX.L = 1 */ > > #define OPC_ARITH_EvIz (0x81) > #define OPC_ARITH_EvIb (0x83) > @@ -324,13 +349,31 @@ static inline int tcg_target_const_match(tcg_target_long val, TCGType type, > #define OPC_MOVL_Iv (0xb8) > #define OPC_MOVBE_GyMy (0xf0 | P_EXT38) > #define OPC_MOVBE_MyGy (0xf1 | P_EXT38) > +#define OPC_MOVDQA_GyMy (0x6f | P_EXT | P_DATA16) > +#define OPC_MOVDQA_MyGy (0x7f | P_EXT | P_DATA16) > +#define OPC_MOVDQU_GyMy (0x6f | P_EXT | P_SIMDF3) > +#define OPC_MOVDQU_MyGy (0x7f | P_EXT | P_SIMDF3) > +#define OPC_MOVQ_GyMy (0x7e | P_EXT | P_SIMDF3) > +#define OPC_MOVQ_MyGy (0xd6 | P_EXT | P_DATA16) > #define OPC_MOVSBL (0xbe | P_EXT) > #define OPC_MOVSWL (0xbf | P_EXT) > #define OPC_MOVSLQ (0x63 | P_REXW) > #define OPC_MOVZBL (0xb6 | P_EXT) > #define OPC_MOVZWL (0xb7 | P_EXT) > +#define OPC_PADDB (0xfc | P_EXT | P_DATA16) > +#define OPC_PADDW (0xfd | P_EXT | P_DATA16) > +#define OPC_PADDD (0xfe | P_EXT | P_DATA16) > +#define OPC_PADDQ (0xd4 | P_EXT | P_DATA16) > +#define OPC_PAND (0xdb | P_EXT | P_DATA16) > +#define OPC_PANDN (0xdf | P_EXT | P_DATA16) > #define OPC_PDEP (0xf5 | P_EXT38 | P_SIMDF2) > #define OPC_PEXT (0xf5 | P_EXT38 | P_SIMDF3) > +#define OPC_POR (0xeb | P_EXT | P_DATA16) > +#define OPC_PSUBB (0xf8 | P_EXT | P_DATA16) > +#define OPC_PSUBW (0xf9 | P_EXT | P_DATA16) > +#define OPC_PSUBD (0xfa | P_EXT | P_DATA16) > +#define OPC_PSUBQ (0xfb | P_EXT | P_DATA16) > +#define OPC_PXOR (0xef | P_EXT | P_DATA16) > #define OPC_POP_r32 (0x58) > #define OPC_POPCNT (0xb8 | P_EXT | P_SIMDF3) > #define OPC_PUSH_r32 (0x50) > @@ -500,7 +543,8 @@ static void tcg_out_modrm(TCGContext *s, int opc, int r, int rm) > tcg_out8(s, 0xc0 | (LOWREGMASK(r) << 3) | LOWREGMASK(rm)); > } > > -static void tcg_out_vex_pfx_opc(TCGContext *s, int opc, int r, int v, int rm) > +static void tcg_out_vex_pfx_opc(TCGContext *s, int opc, int r, int v, > + int rm, int index) > { > int tmp; > > @@ -515,14 +559,16 @@ static void tcg_out_vex_pfx_opc(TCGContext *s, int opc, int r, int v, int rm) > } else if (opc & P_EXT) { > tmp = 1; > } else { > - tcg_abort(); > + g_assert_not_reached(); > } > - tmp |= 0x40; /* VEX.X */ > tmp |= (r & 8 ? 0 : 0x80); /* VEX.R */ > + tmp |= (index & 8 ? 0 : 0x40); /* VEX.X */ > tmp |= (rm & 8 ? 0 : 0x20); /* VEX.B */ > tcg_out8(s, tmp); > > tmp = (opc & P_REXW ? 0x80 : 0); /* VEX.W */ > + tmp |= (opc & P_VEXL ? 
0x04 : 0); /* VEX.L */ > + > /* VEX.pp */ > if (opc & P_DATA16) { > tmp |= 1; /* 0x66 */ > @@ -538,7 +584,7 @@ static void tcg_out_vex_pfx_opc(TCGContext *s, int opc, int r, int v, int rm) > > static void tcg_out_vex_modrm(TCGContext *s, int opc, int r, int v, int rm) > { > - tcg_out_vex_pfx_opc(s, opc, r, v, rm); > + tcg_out_vex_pfx_opc(s, opc, r, v, rm, 0); > tcg_out8(s, 0xc0 | (LOWREGMASK(r) << 3) | LOWREGMASK(rm)); > } > > @@ -565,7 +611,7 @@ static void tcg_out_opc_pool_imm(TCGContext *s, int opc, int r, > static void tcg_out_vex_pool_imm(TCGContext *s, int opc, int r, int v, > tcg_target_ulong data) > { > - tcg_out_vex_pfx_opc(s, opc, r, v, 0); > + tcg_out_vex_pfx_opc(s, opc, r, v, 0, 0); > tcg_out_sfx_pool_imm(s, r, data); > } > > @@ -574,8 +620,8 @@ static void tcg_out_vex_pool_imm(TCGContext *s, int opc, int r, int v, > mode for absolute addresses, ~RM is the size of the immediate operand > that will follow the instruction. */ > > -static void tcg_out_modrm_sib_offset(TCGContext *s, int opc, int r, int rm, > - int index, int shift, intptr_t offset) > +static void tcg_out_sib_offset(TCGContext *s, int r, int rm, int index, > + int shift, intptr_t offset) > { > int mod, len; > > @@ -586,7 +632,6 @@ static void tcg_out_modrm_sib_offset(TCGContext *s, int opc, int r, int rm, > intptr_t pc = (intptr_t)s->code_ptr + 5 + ~rm; > intptr_t disp = offset - pc; > if (disp == (int32_t)disp) { > - tcg_out_opc(s, opc, r, 0, 0); > tcg_out8(s, (LOWREGMASK(r) << 3) | 5); > tcg_out32(s, disp); > return; > @@ -596,7 +641,6 @@ static void tcg_out_modrm_sib_offset(TCGContext *s, int opc, int r, int rm, > use of the MODRM+SIB encoding and is therefore larger than > rip-relative addressing. */ > if (offset == (int32_t)offset) { > - tcg_out_opc(s, opc, r, 0, 0); > tcg_out8(s, (LOWREGMASK(r) << 3) | 4); > tcg_out8(s, (4 << 3) | 5); > tcg_out32(s, offset); > @@ -604,10 +648,9 @@ static void tcg_out_modrm_sib_offset(TCGContext *s, int opc, int r, int rm, > } > > /* ??? The memory isn't directly addressable. */ > - tcg_abort(); > + g_assert_not_reached(); > } else { > /* Absolute address. */ > - tcg_out_opc(s, opc, r, 0, 0); > tcg_out8(s, (r << 3) | 5); > tcg_out32(s, offset); > return; > @@ -630,7 +673,6 @@ static void tcg_out_modrm_sib_offset(TCGContext *s, int opc, int r, int rm, > that would be used for %esp is the escape to the two byte form. */ > if (index < 0 && LOWREGMASK(rm) != TCG_REG_ESP) { > /* Single byte MODRM format. */ > - tcg_out_opc(s, opc, r, rm, 0); > tcg_out8(s, mod | (LOWREGMASK(r) << 3) | LOWREGMASK(rm)); > } else { > /* Two byte MODRM+SIB format. */ > @@ -644,7 +686,6 @@ static void tcg_out_modrm_sib_offset(TCGContext *s, int opc, int r, int rm, > tcg_debug_assert(index != TCG_REG_ESP); > } > > - tcg_out_opc(s, opc, r, rm, index); > tcg_out8(s, mod | (LOWREGMASK(r) << 3) | 4); > tcg_out8(s, (shift << 6) | (LOWREGMASK(index) << 3) | LOWREGMASK(rm)); > } > @@ -656,6 +697,21 @@ static void tcg_out_modrm_sib_offset(TCGContext *s, int opc, int r, int rm, > } > } > > +static void tcg_out_modrm_sib_offset(TCGContext *s, int opc, int r, int rm, > + int index, int shift, intptr_t offset) > +{ > + tcg_out_opc(s, opc, r, rm < 0 ? 0 : rm, index < 0 ? 0 : index); > + tcg_out_sib_offset(s, r, rm, index, shift, offset); > +} > + > +static void tcg_out_vex_modrm_sib_offset(TCGContext *s, int opc, int r, int v, > + int rm, int index, int shift, > + intptr_t offset) > +{ > + tcg_out_vex_pfx_opc(s, opc, r, v, rm < 0 ? 0 : rm, index < 0 ? 
0 : index); > + tcg_out_sib_offset(s, r, rm, index, shift, offset); > +} > + > /* A simplification of the above with no index or shift. */ > static inline void tcg_out_modrm_offset(TCGContext *s, int opc, int r, > int rm, intptr_t offset) > @@ -663,6 +719,31 @@ static inline void tcg_out_modrm_offset(TCGContext *s, int opc, int r, > tcg_out_modrm_sib_offset(s, opc, r, rm, -1, 0, offset); > } > > +static inline void tcg_out_vex_modrm_offset(TCGContext *s, int opc, int r, > + int v, int rm, intptr_t offset) > +{ > + tcg_out_vex_modrm_sib_offset(s, opc, r, v, rm, -1, 0, offset); > +} > + > +static void tcg_out_maybe_vex_modrm(TCGContext *s, int opc, int r, int rm) > +{ > + if (have_avx1) { > + tcg_out_vex_modrm(s, opc, r, 0, rm); > + } else { > + tcg_out_modrm(s, opc, r, rm); > + } > +} > + > +static void tcg_out_maybe_vex_modrm_offset(TCGContext *s, int opc, int r, > + int rm, intptr_t offset) > +{ > + if (have_avx1) { > + tcg_out_vex_modrm_offset(s, opc, r, 0, rm, offset); > + } else { > + tcg_out_modrm_offset(s, opc, r, rm, offset); > + } > +} > + > /* Generate dest op= src. Uses the same ARITH_* codes as tgen_arithi. */ > static inline void tgen_arithr(TCGContext *s, int subop, int dest, int src) > { > @@ -673,12 +754,32 @@ static inline void tgen_arithr(TCGContext *s, int subop, int dest, int src) > tcg_out_modrm(s, OPC_ARITH_GvEv + (subop << 3) + ext, dest, src); > } > > -static inline void tcg_out_mov(TCGContext *s, TCGType type, > - TCGReg ret, TCGReg arg) > +static void tcg_out_mov(TCGContext *s, TCGType type, TCGReg ret, TCGReg arg) > { > if (arg != ret) { > - int opc = OPC_MOVL_GvEv + (type == TCG_TYPE_I64 ? P_REXW : 0); > - tcg_out_modrm(s, opc, ret, arg); > + int opc = 0; > + > + switch (type) { > + case TCG_TYPE_I64: > + opc = P_REXW; > + /* fallthru */ > + case TCG_TYPE_I32: > + opc |= OPC_MOVL_GvEv; > + tcg_out_modrm(s, opc, ret, arg); > + break; > + > + case TCG_TYPE_V256: > + opc = P_VEXL; > + /* fallthru */ > + case TCG_TYPE_V128: > + case TCG_TYPE_V64: > + opc |= OPC_MOVDQA_GyMy; > + tcg_out_maybe_vex_modrm(s, opc, ret, arg); > + break; > + > + default: > + g_assert_not_reached(); > + } > } > } > > @@ -687,6 +788,27 @@ static void tcg_out_movi(TCGContext *s, TCGType type, > { > tcg_target_long diff; > > + switch (type) { > + case TCG_TYPE_I32: > + case TCG_TYPE_I64: > + break; > + > + case TCG_TYPE_V64: > + case TCG_TYPE_V128: > + case TCG_TYPE_V256: > + /* ??? Revisit this as the implementation progresses. */ > + tcg_debug_assert(arg == 0); > + if (have_avx1) { > + tcg_out_vex_modrm(s, OPC_PXOR, ret, ret, ret); > + } else { > + tcg_out_modrm(s, OPC_PXOR, ret, ret); > + } > + return; > + > + default: > + g_assert_not_reached(); > + } > + > if (arg == 0) { > tgen_arithr(s, ARITH_XOR, ret, ret); > return; > @@ -750,18 +872,54 @@ static inline void tcg_out_pop(TCGContext *s, int reg) > tcg_out_opc(s, OPC_POP_r32 + LOWREGMASK(reg), 0, reg, 0); > } > > -static inline void tcg_out_ld(TCGContext *s, TCGType type, TCGReg ret, > - TCGReg arg1, intptr_t arg2) > +static void tcg_out_ld(TCGContext *s, TCGType type, TCGReg ret, > + TCGReg arg1, intptr_t arg2) > { > - int opc = OPC_MOVL_GvEv + (type == TCG_TYPE_I64 ? 
P_REXW : 0); > - tcg_out_modrm_offset(s, opc, ret, arg1, arg2); > + switch (type) { > + case TCG_TYPE_I64: > + tcg_out_modrm_offset(s, OPC_MOVL_GvEv | P_REXW, ret, arg1, arg2); > + break; > + case TCG_TYPE_I32: > + tcg_out_modrm_offset(s, OPC_MOVL_GvEv, ret, arg1, arg2); > + break; > + case TCG_TYPE_V64: > + tcg_out_maybe_vex_modrm_offset(s, OPC_MOVQ_GyMy, ret, arg1, arg2); > + break; > + case TCG_TYPE_V128: > + tcg_out_maybe_vex_modrm_offset(s, OPC_MOVDQU_GyMy, ret, arg1, arg2); > + break; > + case TCG_TYPE_V256: > + tcg_out_vex_modrm_offset(s, OPC_MOVDQU_GyMy | P_VEXL, > + ret, 0, arg1, arg2); > + break; > + default: > + g_assert_not_reached(); > + } > } > > -static inline void tcg_out_st(TCGContext *s, TCGType type, TCGReg arg, > - TCGReg arg1, intptr_t arg2) > +static void tcg_out_st(TCGContext *s, TCGType type, TCGReg arg, > + TCGReg arg1, intptr_t arg2) > { > - int opc = OPC_MOVL_EvGv + (type == TCG_TYPE_I64 ? P_REXW : 0); > - tcg_out_modrm_offset(s, opc, arg, arg1, arg2); > + switch (type) { > + case TCG_TYPE_I64: > + tcg_out_modrm_offset(s, OPC_MOVL_EvGv | P_REXW, arg, arg1, arg2); > + break; > + case TCG_TYPE_I32: > + tcg_out_modrm_offset(s, OPC_MOVL_EvGv, arg, arg1, arg2); > + break; > + case TCG_TYPE_V64: > + tcg_out_maybe_vex_modrm_offset(s, OPC_MOVQ_MyGy, arg, arg1, arg2); > + break; > + case TCG_TYPE_V128: > + tcg_out_maybe_vex_modrm_offset(s, OPC_MOVDQU_MyGy, arg, arg1, arg2); > + break; > + case TCG_TYPE_V256: > + tcg_out_vex_modrm_offset(s, OPC_MOVDQU_MyGy | P_VEXL, > + arg, 0, arg1, arg2); > + break; > + default: > + g_assert_not_reached(); > + } > } > > static bool tcg_out_sti(TCGContext *s, TCGType type, TCGArg val, > @@ -773,6 +931,8 @@ static bool tcg_out_sti(TCGContext *s, TCGType type, TCGArg val, > return false; > } > rexw = P_REXW; > + } else if (type != TCG_TYPE_I32) { > + return false; > } > tcg_out_modrm_offset(s, OPC_MOVL_EvIz | rexw, 0, base, ofs); > tcg_out32(s, val); > @@ -1914,6 +2074,15 @@ static inline void tcg_out_op(TCGContext *s, TCGOpcode opc, > case glue(glue(INDEX_op_, x), _i32) > #endif > > +#define OP_128_256(x) \ > + case glue(glue(INDEX_op_, x), _v256): \ > + rexw = P_VEXL; /* FALLTHRU */ \ > + case glue(glue(INDEX_op_, x), _v128) > + > +#define OP_64_128_256(x) \ > + OP_128_256(x): \ > + case glue(glue(INDEX_op_, x), _v64) > + > /* Hoist the loads of the most common arguments. 
*/ > a0 = args[0]; > a1 = args[1]; > @@ -2379,19 +2548,94 @@ static inline void tcg_out_op(TCGContext *s, TCGOpcode opc, > } > break; > > + OP_64_128_256(add8): > + c = OPC_PADDB; > + goto gen_simd; > + OP_64_128_256(add16): > + c = OPC_PADDW; > + goto gen_simd; > + OP_64_128_256(add32): > + c = OPC_PADDD; > + goto gen_simd; > + OP_128_256(add64): > + c = OPC_PADDQ; > + goto gen_simd; > + OP_64_128_256(sub8): > + c = OPC_PSUBB; > + goto gen_simd; > + OP_64_128_256(sub16): > + c = OPC_PSUBW; > + goto gen_simd; > + OP_64_128_256(sub32): > + c = OPC_PSUBD; > + goto gen_simd; > + OP_128_256(sub64): > + c = OPC_PSUBQ; > + goto gen_simd; > + OP_64_128_256(and): > + c = OPC_PAND; > + goto gen_simd; > + OP_64_128_256(andc): > + c = OPC_PANDN; > + goto gen_simd; > + OP_64_128_256(or): > + c = OPC_POR; > + goto gen_simd; > + OP_64_128_256(xor): > + c = OPC_PXOR; > + gen_simd: > + if (have_avx1) { > + tcg_out_vex_modrm(s, c, a0, a1, a2); > + } else { > + tcg_out_modrm(s, c, a0, a2); > + } > + break; > + > + case INDEX_op_ld_v64: > + c = TCG_TYPE_V64; > + goto gen_simd_ld; > + case INDEX_op_ld_v128: > + c = TCG_TYPE_V128; > + goto gen_simd_ld; > + case INDEX_op_ld_v256: > + c = TCG_TYPE_V256; > + gen_simd_ld: > + tcg_out_ld(s, c, a0, a1, a2); > + break; > + > + case INDEX_op_st_v64: > + c = TCG_TYPE_V64; > + goto gen_simd_st; > + case INDEX_op_st_v128: > + c = TCG_TYPE_V128; > + goto gen_simd_st; > + case INDEX_op_st_v256: > + c = TCG_TYPE_V256; > + gen_simd_st: > + tcg_out_st(s, c, a0, a1, a2); > + break; > + > case INDEX_op_mb: > tcg_out_mb(s, a0); > break; > case INDEX_op_mov_i32: /* Always emitted via tcg_out_mov. */ > case INDEX_op_mov_i64: > + case INDEX_op_mov_v64: > + case INDEX_op_mov_v128: > + case INDEX_op_mov_v256: > case INDEX_op_movi_i32: /* Always emitted via tcg_out_movi. */ > case INDEX_op_movi_i64: > + case INDEX_op_movi_v64: > + case INDEX_op_movi_v128: > + case INDEX_op_movi_v256: > case INDEX_op_call: /* Always emitted via tcg_out_call. 
*/ > default: > tcg_abort(); > } > > #undef OP_32_64 > +#undef OP_128_256 > +#undef OP_64_128_256 > } > > static const TCGTargetOpDef *tcg_target_op_def(TCGOpcode op) > @@ -2417,6 +2661,9 @@ static const TCGTargetOpDef *tcg_target_op_def(TCGOpcode op) > = { .args_ct_str = { "r", "r", "L", "L" } }; > static const TCGTargetOpDef L_L_L_L > = { .args_ct_str = { "L", "L", "L", "L" } }; > + static const TCGTargetOpDef x_0_x = { .args_ct_str = { "x", "0", "x" } }; > + static const TCGTargetOpDef x_x_x = { .args_ct_str = { "x", "x", "x" } }; > + static const TCGTargetOpDef x_r = { .args_ct_str = { "x", "r" } }; > > switch (op) { > case INDEX_op_goto_ptr: > @@ -2620,6 +2867,52 @@ static const TCGTargetOpDef *tcg_target_op_def(TCGOpcode op) > return &s2; > } > > + case INDEX_op_ld_v64: > + case INDEX_op_ld_v128: > + case INDEX_op_ld_v256: > + case INDEX_op_st_v64: > + case INDEX_op_st_v128: > + case INDEX_op_st_v256: > + return &x_r; > + > + case INDEX_op_add8_v64: > + case INDEX_op_add8_v128: > + case INDEX_op_add16_v64: > + case INDEX_op_add16_v128: > + case INDEX_op_add32_v64: > + case INDEX_op_add32_v128: > + case INDEX_op_add64_v128: > + case INDEX_op_sub8_v64: > + case INDEX_op_sub8_v128: > + case INDEX_op_sub16_v64: > + case INDEX_op_sub16_v128: > + case INDEX_op_sub32_v64: > + case INDEX_op_sub32_v128: > + case INDEX_op_sub64_v128: > + case INDEX_op_and_v64: > + case INDEX_op_and_v128: > + case INDEX_op_andc_v64: > + case INDEX_op_andc_v128: > + case INDEX_op_or_v64: > + case INDEX_op_or_v128: > + case INDEX_op_xor_v64: > + case INDEX_op_xor_v128: > + return have_avx1 ? &x_x_x : &x_0_x; > + > + case INDEX_op_add8_v256: > + case INDEX_op_add16_v256: > + case INDEX_op_add32_v256: > + case INDEX_op_add64_v256: > + case INDEX_op_sub8_v256: > + case INDEX_op_sub16_v256: > + case INDEX_op_sub32_v256: > + case INDEX_op_sub64_v256: > + case INDEX_op_and_v256: > + case INDEX_op_andc_v256: > + case INDEX_op_or_v256: > + case INDEX_op_xor_v256: > + return &x_x_x; > + > default: > break; > } > @@ -2725,9 +3018,16 @@ static void tcg_out_nop_fill(tcg_insn_unit *p, int count) > static void tcg_target_init(TCGContext *s) > { > #ifdef CONFIG_CPUID_H > - unsigned a, b, c, d; > + unsigned a, b, c, d, b7 = 0; > int max = __get_cpuid_max(0, 0); > > + if (max >= 7) { > + /* BMI1 is available on AMD Piledriver and Intel Haswell CPUs. */ > + __cpuid_count(7, 0, a, b7, c, d); > + have_bmi1 = (b7 & bit_BMI) != 0; > + have_bmi2 = (b7 & bit_BMI2) != 0; > + } > + > if (max >= 1) { > __cpuid(1, a, b, c, d); > #ifndef have_cmov > @@ -2736,17 +3036,26 @@ static void tcg_target_init(TCGContext *s) > available, we'll use a small forward branch. */ > have_cmov = (d & bit_CMOV) != 0; > #endif > +#ifndef have_sse2 > + have_sse2 = (d & bit_SSE2) != 0; > +#endif > /* MOVBE is only available on Intel Atom and Haswell CPUs, so we > need to probe for it. */ > have_movbe = (c & bit_MOVBE) != 0; > have_popcnt = (c & bit_POPCNT) != 0; > - } > > - if (max >= 7) { > - /* BMI1 is available on AMD Piledriver and Intel Haswell CPUs. */ > - __cpuid_count(7, 0, a, b, c, d); > - have_bmi1 = (b & bit_BMI) != 0; > - have_bmi2 = (b & bit_BMI2) != 0; > +#ifndef have_avx2 > + /* There are a number of things we must check before we can be > + sure of not hitting invalid opcode. 
*/ > + if (c & bit_OSXSAVE) { > + unsigned xcrl, xcrh; > + asm ("xgetbv" : "=a" (xcrl), "=d" (xcrh) : "c" (0)); > + if (xcrl & 6 == 6) { > + have_avx1 = (c & bit_AVX) != 0; > + have_avx2 = (b7 & bit_AVX2) != 0; > + } > + } > +#endif > } > > max = __get_cpuid_max(0x8000000, 0); > @@ -2763,6 +3072,13 @@ static void tcg_target_init(TCGContext *s) > } else { > tcg_regset_set32(tcg_target_available_regs[TCG_TYPE_I32], 0, 0xff); > } > + if (have_sse2) { > + tcg_regset_set32(tcg_target_available_regs[TCG_TYPE_V64], 0, 0xff0000); > + tcg_regset_set32(tcg_target_available_regs[TCG_TYPE_V128], 0, 0xff0000); > + } > + if (have_avx2) { > + tcg_regset_set32(tcg_target_available_regs[TCG_TYPE_V256], 0, 0xff0000); > + } > > tcg_regset_clear(tcg_target_call_clobber_regs); > tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_EAX); -- Alex Bennée
Alex Bennée <alex.bennee@linaro.org> writes: > Richard Henderson <richard.henderson@linaro.org> writes: > >> Signed-off-by: Richard Henderson <richard.henderson@linaro.org> <snip> Also this commit breaks RISU: qemu-aarch64 build/aarch64-linux-gnu/risu testcases.aarch64/insn_ANDSi_RES8_ANDS_RES_ANDv_ASRV__INC.risu.bin \ -t testcases.aarch64/insn_ANDSi_RES8_ANDS_RES_ANDv_ASRV__INC.risu.bin.trace Gives: mismatch detail (master : apprentice): V29 : 000000000000000005388083c1444242 vs 00000000000000002a000e0416a30018 The insn is: 37c: 6f56a29d umull2 v29.4s, v20.8h, v6.h[1] Which is odd because I didn't think we'd touched that. You can find my bundle of testcases with trace files at: http://people.linaro.org/~alex.bennee/testcases/arm64.risu/aarch64-patterns-v8dot0.tar.xz Which is used in our master RISU tracking job: https://validation.linaro.org/results/query/~alex.bennee/master-aarch64-risu-results -- Alex Bennée
On 09/08/2017 06:10 AM, Alex Bennée wrote: > Also this commit breaks RISU: > > qemu-aarch64 build/aarch64-linux-gnu/risu > testcases.aarch64/insn_ANDSi_RES8_ANDS_RES_ANDv_ASRV__INC.risu.bin \ > -t testcases.aarch64/insn_ANDSi_RES8_ANDS_RES_ANDv_ASRV__INC.risu.bin.trace > > Gives: > > mismatch detail (master : apprentice): > V29 : 000000000000000005388083c1444242 vs 00000000000000002a000e0416a30018 > > The insn is: > > 37c: 6f56a29d umull2 v29.4s, v20.8h, v6.h[1] > > Which is odd because I didn't think we'd touched that. Indeed we didn't. Still, I'll check it out next week. r~
Richard Henderson <richard.henderson@linaro.org> writes: > On 09/08/2017 06:10 AM, Alex Bennée wrote: >> Also this commit breaks RISU: >> >> qemu-aarch64 build/aarch64-linux-gnu/risu >> testcases.aarch64/insn_ANDSi_RES8_ANDS_RES_ANDv_ASRV__INC.risu.bin \ >> -t testcases.aarch64/insn_ANDSi_RES8_ANDS_RES_ANDv_ASRV__INC.risu.bin.trace >> >> Gives: >> >> mismatch detail (master : apprentice): >> V29 : 000000000000000005388083c1444242 vs 00000000000000002a000e0416a30018 >> >> The insn is: >> >> 37c: 6f56a29d umull2 v29.4s, v20.8h, v6.h[1] >> >> Which is odd because I didn't think we'd touched that. > > Indeed we didn't. Still, I'll check it out next week. OK it would help if I had objdumped the right file: 36c: 0e781fdd bic v29.8b, v30.8b, v24.8b 370: 00005af0 .inst 0x00005af0 ; undefined -- Alex Bennée
On 09/11/2017 02:07 AM, Alex Bennée wrote: > > Richard Henderson <richard.henderson@linaro.org> writes: > >> On 09/08/2017 06:10 AM, Alex Bennée wrote: >>> Also this commit breaks RISU: >>> >>> qemu-aarch64 build/aarch64-linux-gnu/risu >>> testcases.aarch64/insn_ANDSi_RES8_ANDS_RES_ANDv_ASRV__INC.risu.bin \ >>> -t testcases.aarch64/insn_ANDSi_RES8_ANDS_RES_ANDv_ASRV__INC.risu.bin.trace >>> >>> Gives: >>> >>> mismatch detail (master : apprentice): >>> V29 : 000000000000000005388083c1444242 vs 00000000000000002a000e0416a30018 >>> >>> The insn is: >>> >>> 37c: 6f56a29d umull2 v29.4s, v20.8h, v6.h[1] >>> >>> Which is odd because I didn't think we'd touched that. >> >> Indeed we didn't. Still, I'll check it out next week. > > OK it would help if I had objdumped the right file: > > 36c: 0e781fdd bic v29.8b, v30.8b, v24.8b > 370: 00005af0 .inst 0x00005af0 ; undefined Thanks. The sse pandn operand order is ... surprising. Even though I know that I still managed to get it wrong. Fixed for v2. r~
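The surprise is that pandn complements its destination operand: the SSE form "pandn dst, src" computes dst = ~dst & src, and the VEX form "vpandn dst, src1, src2" computes dst = ~src1 & src2, whereas TCG's andc is arg1 & ~arg2, so the sources must be presented in the opposite order from the other logical ops. A sketch of one possible corrected AVX emission, assuming only an operand swap (the actual v2 change may differ, and the non-AVX two-operand path would need separate handling since its destination must be tied to the complemented source):

    case INDEX_op_andc_v128:
        /* vpandn computes ~src1 & src2; TCG andc wants a1 & ~a2,
           so pass a2 as the first source operand and a1 as the second.  */
        tcg_out_vex_modrm(s, OPC_PANDN, a0, a2, a1);
        break;

The v64 case would be identical and the v256 case analogous with P_VEXL set.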
diff --git a/tcg/i386/tcg-target.h b/tcg/i386/tcg-target.h index e512648c95..147f82062b 100644 --- a/tcg/i386/tcg-target.h +++ b/tcg/i386/tcg-target.h @@ -30,11 +30,10 @@ #ifdef __x86_64__ # define TCG_TARGET_REG_BITS 64 -# define TCG_TARGET_NB_REGS 16 #else # define TCG_TARGET_REG_BITS 32 -# define TCG_TARGET_NB_REGS 8 #endif +# define TCG_TARGET_NB_REGS 24 typedef enum { TCG_REG_EAX = 0, @@ -56,6 +55,19 @@ typedef enum { TCG_REG_R13, TCG_REG_R14, TCG_REG_R15, + + /* SSE registers; 64-bit has access to 8 more, but we won't + need more than a few and using only the first 8 minimizes + the need for a rex prefix on the sse instructions. */ + TCG_REG_XMM0, + TCG_REG_XMM1, + TCG_REG_XMM2, + TCG_REG_XMM3, + TCG_REG_XMM4, + TCG_REG_XMM5, + TCG_REG_XMM6, + TCG_REG_XMM7, + TCG_REG_RAX = TCG_REG_EAX, TCG_REG_RCX = TCG_REG_ECX, TCG_REG_RDX = TCG_REG_EDX, @@ -79,6 +91,17 @@ extern bool have_bmi1; extern bool have_bmi2; extern bool have_popcnt; +#ifdef __SSE2__ +#define have_sse2 true +#else +extern bool have_sse2; +#endif +#ifdef __AVX2__ +#define have_avx2 true +#else +extern bool have_avx2; +#endif + /* optional instructions */ #define TCG_TARGET_HAS_div2_i32 1 #define TCG_TARGET_HAS_rot_i32 1 @@ -147,6 +170,25 @@ extern bool have_popcnt; #define TCG_TARGET_HAS_mulsh_i64 0 #endif +#define TCG_TARGET_HAS_v64 have_sse2 +#define TCG_TARGET_HAS_v128 have_sse2 +#define TCG_TARGET_HAS_v256 have_avx2 + +#define TCG_TARGET_HAS_andc_v64 TCG_TARGET_HAS_v64 +#define TCG_TARGET_HAS_orc_v64 0 +#define TCG_TARGET_HAS_not_v64 0 +#define TCG_TARGET_HAS_neg_v64 0 + +#define TCG_TARGET_HAS_andc_v128 TCG_TARGET_HAS_v128 +#define TCG_TARGET_HAS_orc_v128 0 +#define TCG_TARGET_HAS_not_v128 0 +#define TCG_TARGET_HAS_neg_v128 0 + +#define TCG_TARGET_HAS_andc_v256 TCG_TARGET_HAS_v256 +#define TCG_TARGET_HAS_orc_v256 0 +#define TCG_TARGET_HAS_not_v256 0 +#define TCG_TARGET_HAS_neg_v256 0 + #define TCG_TARGET_deposit_i32_valid(ofs, len) \ (have_bmi2 || \ ((ofs) == 0 && (len) == 8) || \ diff --git a/tcg/tcg-opc.h b/tcg/tcg-opc.h index b1445a4c24..b84cd584fb 100644 --- a/tcg/tcg-opc.h +++ b/tcg/tcg-opc.h @@ -212,13 +212,13 @@ DEF(qemu_st_i64, 0, TLADDR_ARGS + DATA64_ARGS, 1, /* Host integer vector operations. */ /* These opcodes are required whenever the base vector size is enabled. 
*/ -DEF(mov_v64, 1, 1, 0, IMPL(TCG_TARGET_HAS_v64)) -DEF(mov_v128, 1, 1, 0, IMPL(TCG_TARGET_HAS_v128)) -DEF(mov_v256, 1, 1, 0, IMPL(TCG_TARGET_HAS_v256)) +DEF(mov_v64, 1, 1, 0, TCG_OPF_NOT_PRESENT) +DEF(mov_v128, 1, 1, 0, TCG_OPF_NOT_PRESENT) +DEF(mov_v256, 1, 1, 0, TCG_OPF_NOT_PRESENT) -DEF(movi_v64, 1, 0, 1, IMPL(TCG_TARGET_HAS_v64)) -DEF(movi_v128, 1, 0, 1, IMPL(TCG_TARGET_HAS_v128)) -DEF(movi_v256, 1, 0, 1, IMPL(TCG_TARGET_HAS_v256)) +DEF(movi_v64, 1, 0, 1, TCG_OPF_NOT_PRESENT) +DEF(movi_v128, 1, 0, 1, TCG_OPF_NOT_PRESENT) +DEF(movi_v256, 1, 0, 1, TCG_OPF_NOT_PRESENT) DEF(ld_v64, 1, 1, 1, IMPL(TCG_TARGET_HAS_v64)) DEF(ld_v128, 1, 1, 1, IMPL(TCG_TARGET_HAS_v128)) diff --git a/tcg/i386/tcg-target.inc.c b/tcg/i386/tcg-target.inc.c index aeefb72aa0..0e01b54aa0 100644 --- a/tcg/i386/tcg-target.inc.c +++ b/tcg/i386/tcg-target.inc.c @@ -31,7 +31,9 @@ static const char * const tcg_target_reg_names[TCG_TARGET_NB_REGS] = { "%r8", "%r9", "%r10", "%r11", "%r12", "%r13", "%r14", "%r15", #else "%eax", "%ecx", "%edx", "%ebx", "%esp", "%ebp", "%esi", "%edi", + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, #endif + "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", }; #endif @@ -61,6 +63,14 @@ static const int tcg_target_reg_alloc_order[] = { TCG_REG_EDX, TCG_REG_EAX, #endif + TCG_REG_XMM0, + TCG_REG_XMM1, + TCG_REG_XMM2, + TCG_REG_XMM3, + TCG_REG_XMM4, + TCG_REG_XMM5, + TCG_REG_XMM6, + TCG_REG_XMM7, }; static const int tcg_target_call_iarg_regs[] = { @@ -94,7 +104,7 @@ static const int tcg_target_call_oarg_regs[] = { #define TCG_CT_CONST_I32 0x400 #define TCG_CT_CONST_WSZ 0x800 -/* Registers used with L constraint, which are the first argument +/* Registers used with L constraint, which are the first argument registers on x86_64, and two random call clobbered registers on i386. */ #if TCG_TARGET_REG_BITS == 64 @@ -127,6 +137,16 @@ bool have_bmi1; bool have_bmi2; bool have_popcnt; +#ifndef have_sse2 +bool have_sse2; +#endif +#ifdef have_avx2 +#define have_avx1 have_avx2 +#else +static bool have_avx1; +bool have_avx2; +#endif + #ifdef CONFIG_CPUID_H static bool have_movbe; static bool have_lzcnt; @@ -215,6 +235,10 @@ static const char *target_parse_constraint(TCGArgConstraint *ct, /* With TZCNT/LZCNT, we can have operand-size as an input. 
*/ ct->ct |= TCG_CT_CONST_WSZ; break; + case 'x': + ct->ct |= TCG_CT_REG; + tcg_regset_set32(ct->u.regs, 0, 0xff0000); + break; /* qemu_ld/st address constraint */ case 'L': @@ -292,6 +316,7 @@ static inline int tcg_target_const_match(tcg_target_long val, TCGType type, #endif #define P_SIMDF3 0x20000 /* 0xf3 opcode prefix */ #define P_SIMDF2 0x40000 /* 0xf2 opcode prefix */ +#define P_VEXL 0x80000 /* Set VEX.L = 1 */ #define OPC_ARITH_EvIz (0x81) #define OPC_ARITH_EvIb (0x83) @@ -324,13 +349,31 @@ static inline int tcg_target_const_match(tcg_target_long val, TCGType type, #define OPC_MOVL_Iv (0xb8) #define OPC_MOVBE_GyMy (0xf0 | P_EXT38) #define OPC_MOVBE_MyGy (0xf1 | P_EXT38) +#define OPC_MOVDQA_GyMy (0x6f | P_EXT | P_DATA16) +#define OPC_MOVDQA_MyGy (0x7f | P_EXT | P_DATA16) +#define OPC_MOVDQU_GyMy (0x6f | P_EXT | P_SIMDF3) +#define OPC_MOVDQU_MyGy (0x7f | P_EXT | P_SIMDF3) +#define OPC_MOVQ_GyMy (0x7e | P_EXT | P_SIMDF3) +#define OPC_MOVQ_MyGy (0xd6 | P_EXT | P_DATA16) #define OPC_MOVSBL (0xbe | P_EXT) #define OPC_MOVSWL (0xbf | P_EXT) #define OPC_MOVSLQ (0x63 | P_REXW) #define OPC_MOVZBL (0xb6 | P_EXT) #define OPC_MOVZWL (0xb7 | P_EXT) +#define OPC_PADDB (0xfc | P_EXT | P_DATA16) +#define OPC_PADDW (0xfd | P_EXT | P_DATA16) +#define OPC_PADDD (0xfe | P_EXT | P_DATA16) +#define OPC_PADDQ (0xd4 | P_EXT | P_DATA16) +#define OPC_PAND (0xdb | P_EXT | P_DATA16) +#define OPC_PANDN (0xdf | P_EXT | P_DATA16) #define OPC_PDEP (0xf5 | P_EXT38 | P_SIMDF2) #define OPC_PEXT (0xf5 | P_EXT38 | P_SIMDF3) +#define OPC_POR (0xeb | P_EXT | P_DATA16) +#define OPC_PSUBB (0xf8 | P_EXT | P_DATA16) +#define OPC_PSUBW (0xf9 | P_EXT | P_DATA16) +#define OPC_PSUBD (0xfa | P_EXT | P_DATA16) +#define OPC_PSUBQ (0xfb | P_EXT | P_DATA16) +#define OPC_PXOR (0xef | P_EXT | P_DATA16) #define OPC_POP_r32 (0x58) #define OPC_POPCNT (0xb8 | P_EXT | P_SIMDF3) #define OPC_PUSH_r32 (0x50) @@ -500,7 +543,8 @@ static void tcg_out_modrm(TCGContext *s, int opc, int r, int rm) tcg_out8(s, 0xc0 | (LOWREGMASK(r) << 3) | LOWREGMASK(rm)); } -static void tcg_out_vex_pfx_opc(TCGContext *s, int opc, int r, int v, int rm) +static void tcg_out_vex_pfx_opc(TCGContext *s, int opc, int r, int v, + int rm, int index) { int tmp; @@ -515,14 +559,16 @@ static void tcg_out_vex_pfx_opc(TCGContext *s, int opc, int r, int v, int rm) } else if (opc & P_EXT) { tmp = 1; } else { - tcg_abort(); + g_assert_not_reached(); } - tmp |= 0x40; /* VEX.X */ tmp |= (r & 8 ? 0 : 0x80); /* VEX.R */ + tmp |= (index & 8 ? 0 : 0x40); /* VEX.X */ tmp |= (rm & 8 ? 0 : 0x20); /* VEX.B */ tcg_out8(s, tmp); tmp = (opc & P_REXW ? 0x80 : 0); /* VEX.W */ + tmp |= (opc & P_VEXL ? 0x04 : 0); /* VEX.L */ + /* VEX.pp */ if (opc & P_DATA16) { tmp |= 1; /* 0x66 */ @@ -538,7 +584,7 @@ static void tcg_out_vex_pfx_opc(TCGContext *s, int opc, int r, int v, int rm) static void tcg_out_vex_modrm(TCGContext *s, int opc, int r, int v, int rm) { - tcg_out_vex_pfx_opc(s, opc, r, v, rm); + tcg_out_vex_pfx_opc(s, opc, r, v, rm, 0); tcg_out8(s, 0xc0 | (LOWREGMASK(r) << 3) | LOWREGMASK(rm)); } @@ -565,7 +611,7 @@ static void tcg_out_opc_pool_imm(TCGContext *s, int opc, int r, static void tcg_out_vex_pool_imm(TCGContext *s, int opc, int r, int v, tcg_target_ulong data) { - tcg_out_vex_pfx_opc(s, opc, r, v, 0); + tcg_out_vex_pfx_opc(s, opc, r, v, 0, 0); tcg_out_sfx_pool_imm(s, r, data); } @@ -574,8 +620,8 @@ static void tcg_out_vex_pool_imm(TCGContext *s, int opc, int r, int v, mode for absolute addresses, ~RM is the size of the immediate operand that will follow the instruction. 
-static void tcg_out_modrm_sib_offset(TCGContext *s, int opc, int r, int rm,
-                                     int index, int shift, intptr_t offset)
+static void tcg_out_sib_offset(TCGContext *s, int r, int rm, int index,
+                               int shift, intptr_t offset)
 {
     int mod, len;
 
@@ -586,7 +632,6 @@ static void tcg_out_modrm_sib_offset(TCGContext *s, int opc, int r, int rm,
             intptr_t pc = (intptr_t)s->code_ptr + 5 + ~rm;
             intptr_t disp = offset - pc;
             if (disp == (int32_t)disp) {
-                tcg_out_opc(s, opc, r, 0, 0);
                 tcg_out8(s, (LOWREGMASK(r) << 3) | 5);
                 tcg_out32(s, disp);
                 return;
@@ -596,7 +641,6 @@ static void tcg_out_modrm_sib_offset(TCGContext *s, int opc, int r, int rm,
            use of the MODRM+SIB encoding and is therefore larger than
            rip-relative addressing.  */
         if (offset == (int32_t)offset) {
-            tcg_out_opc(s, opc, r, 0, 0);
             tcg_out8(s, (LOWREGMASK(r) << 3) | 4);
             tcg_out8(s, (4 << 3) | 5);
             tcg_out32(s, offset);
@@ -604,10 +648,9 @@ static void tcg_out_modrm_sib_offset(TCGContext *s, int opc, int r, int rm,
         }
 
         /* ??? The memory isn't directly addressable.  */
-        tcg_abort();
+        g_assert_not_reached();
     } else {
         /* Absolute address.  */
-        tcg_out_opc(s, opc, r, 0, 0);
         tcg_out8(s, (r << 3) | 5);
         tcg_out32(s, offset);
         return;
@@ -630,7 +673,6 @@ static void tcg_out_modrm_sib_offset(TCGContext *s, int opc, int r, int rm,
        that would be used for %esp is the escape to the two byte form.  */
     if (index < 0 && LOWREGMASK(rm) != TCG_REG_ESP) {
         /* Single byte MODRM format.  */
-        tcg_out_opc(s, opc, r, rm, 0);
         tcg_out8(s, mod | (LOWREGMASK(r) << 3) | LOWREGMASK(rm));
     } else {
         /* Two byte MODRM+SIB format.  */
@@ -644,7 +686,6 @@ static void tcg_out_modrm_sib_offset(TCGContext *s, int opc, int r, int rm,
             tcg_debug_assert(index != TCG_REG_ESP);
         }
 
-        tcg_out_opc(s, opc, r, rm, index);
         tcg_out8(s, mod | (LOWREGMASK(r) << 3) | 4);
         tcg_out8(s, (shift << 6) | (LOWREGMASK(index) << 3) | LOWREGMASK(rm));
     }
@@ -656,6 +697,21 @@ static void tcg_out_modrm_sib_offset(TCGContext *s, int opc, int r, int rm,
     }
 }
 
+static void tcg_out_modrm_sib_offset(TCGContext *s, int opc, int r, int rm,
+                                     int index, int shift, intptr_t offset)
+{
+    tcg_out_opc(s, opc, r, rm < 0 ? 0 : rm, index < 0 ? 0 : index);
+    tcg_out_sib_offset(s, r, rm, index, shift, offset);
+}
+
+static void tcg_out_vex_modrm_sib_offset(TCGContext *s, int opc, int r, int v,
+                                         int rm, int index, int shift,
+                                         intptr_t offset)
+{
+    tcg_out_vex_pfx_opc(s, opc, r, v, rm < 0 ? 0 : rm, index < 0 ? 0 : index);
+    tcg_out_sib_offset(s, r, rm, index, shift, offset);
+}
+
 /* A simplification of the above with no index or shift.  */
 static inline void tcg_out_modrm_offset(TCGContext *s, int opc, int r,
                                         int rm, intptr_t offset)
@@ -663,6 +719,31 @@ static inline void tcg_out_modrm_offset(TCGContext *s, int opc, int r,
     tcg_out_modrm_sib_offset(s, opc, r, rm, -1, 0, offset);
 }
 
+static inline void tcg_out_vex_modrm_offset(TCGContext *s, int opc, int r,
+                                            int v, int rm, intptr_t offset)
+{
+    tcg_out_vex_modrm_sib_offset(s, opc, r, v, rm, -1, 0, offset);
+}
+
+static void tcg_out_maybe_vex_modrm(TCGContext *s, int opc, int r, int rm)
+{
+    if (have_avx1) {
+        tcg_out_vex_modrm(s, opc, r, 0, rm);
+    } else {
+        tcg_out_modrm(s, opc, r, rm);
+    }
+}
+
+static void tcg_out_maybe_vex_modrm_offset(TCGContext *s, int opc, int r,
+                                           int rm, intptr_t offset)
+{
+    if (have_avx1) {
+        tcg_out_vex_modrm_offset(s, opc, r, 0, rm, offset);
+    } else {
+        tcg_out_modrm_offset(s, opc, r, rm, offset);
+    }
+}
+
 /* Generate dest op= src.  Uses the same ARITH_* codes as tgen_arithi.  */
 static inline void tgen_arithr(TCGContext *s, int subop, int dest, int src)
 {
@@ -673,12 +754,32 @@ static inline void tgen_arithr(TCGContext *s, int subop, int dest, int src)
     tcg_out_modrm(s, OPC_ARITH_GvEv + (subop << 3) + ext, dest, src);
 }
 
-static inline void tcg_out_mov(TCGContext *s, TCGType type,
-                               TCGReg ret, TCGReg arg)
+static void tcg_out_mov(TCGContext *s, TCGType type, TCGReg ret, TCGReg arg)
 {
     if (arg != ret) {
-        int opc = OPC_MOVL_GvEv + (type == TCG_TYPE_I64 ? P_REXW : 0);
-        tcg_out_modrm(s, opc, ret, arg);
+        int opc = 0;
+
+        switch (type) {
+        case TCG_TYPE_I64:
+            opc = P_REXW;
+            /* fallthru */
+        case TCG_TYPE_I32:
+            opc |= OPC_MOVL_GvEv;
+            tcg_out_modrm(s, opc, ret, arg);
+            break;
+
+        case TCG_TYPE_V256:
+            opc = P_VEXL;
+            /* fallthru */
+        case TCG_TYPE_V128:
+        case TCG_TYPE_V64:
+            opc |= OPC_MOVDQA_GyMy;
+            tcg_out_maybe_vex_modrm(s, opc, ret, arg);
+            break;
+
+        default:
+            g_assert_not_reached();
+        }
     }
 }
 
@@ -687,6 +788,27 @@ static void tcg_out_movi(TCGContext *s, TCGType type,
 {
     tcg_target_long diff;
 
+    switch (type) {
+    case TCG_TYPE_I32:
+    case TCG_TYPE_I64:
+        break;
+
+    case TCG_TYPE_V64:
+    case TCG_TYPE_V128:
+    case TCG_TYPE_V256:
+        /* ??? Revisit this as the implementation progresses.  */
+        tcg_debug_assert(arg == 0);
+        if (have_avx1) {
+            tcg_out_vex_modrm(s, OPC_PXOR, ret, ret, ret);
+        } else {
+            tcg_out_modrm(s, OPC_PXOR, ret, ret);
+        }
+        return;
+
+    default:
+        g_assert_not_reached();
+    }
+
     if (arg == 0) {
         tgen_arithr(s, ARITH_XOR, ret, ret);
         return;
@@ -750,18 +872,54 @@ static inline void tcg_out_pop(TCGContext *s, int reg)
     tcg_out_opc(s, OPC_POP_r32 + LOWREGMASK(reg), 0, reg, 0);
 }
 
-static inline void tcg_out_ld(TCGContext *s, TCGType type, TCGReg ret,
-                              TCGReg arg1, intptr_t arg2)
+static void tcg_out_ld(TCGContext *s, TCGType type, TCGReg ret,
+                       TCGReg arg1, intptr_t arg2)
 {
-    int opc = OPC_MOVL_GvEv + (type == TCG_TYPE_I64 ? P_REXW : 0);
-    tcg_out_modrm_offset(s, opc, ret, arg1, arg2);
+    switch (type) {
+    case TCG_TYPE_I64:
+        tcg_out_modrm_offset(s, OPC_MOVL_GvEv | P_REXW, ret, arg1, arg2);
+        break;
+    case TCG_TYPE_I32:
+        tcg_out_modrm_offset(s, OPC_MOVL_GvEv, ret, arg1, arg2);
+        break;
+    case TCG_TYPE_V64:
+        tcg_out_maybe_vex_modrm_offset(s, OPC_MOVQ_GyMy, ret, arg1, arg2);
+        break;
+    case TCG_TYPE_V128:
+        tcg_out_maybe_vex_modrm_offset(s, OPC_MOVDQU_GyMy, ret, arg1, arg2);
+        break;
+    case TCG_TYPE_V256:
+        tcg_out_vex_modrm_offset(s, OPC_MOVDQU_GyMy | P_VEXL,
+                                 ret, 0, arg1, arg2);
+        break;
+    default:
+        g_assert_not_reached();
+    }
 }
 
-static inline void tcg_out_st(TCGContext *s, TCGType type, TCGReg arg,
-                              TCGReg arg1, intptr_t arg2)
+static void tcg_out_st(TCGContext *s, TCGType type, TCGReg arg,
+                       TCGReg arg1, intptr_t arg2)
 {
-    int opc = OPC_MOVL_EvGv + (type == TCG_TYPE_I64 ? P_REXW : 0);
-    tcg_out_modrm_offset(s, opc, arg, arg1, arg2);
+    switch (type) {
+    case TCG_TYPE_I64:
+        tcg_out_modrm_offset(s, OPC_MOVL_EvGv | P_REXW, arg, arg1, arg2);
+        break;
+    case TCG_TYPE_I32:
+        tcg_out_modrm_offset(s, OPC_MOVL_EvGv, arg, arg1, arg2);
+        break;
+    case TCG_TYPE_V64:
+        tcg_out_maybe_vex_modrm_offset(s, OPC_MOVQ_MyGy, arg, arg1, arg2);
+        break;
+    case TCG_TYPE_V128:
+        tcg_out_maybe_vex_modrm_offset(s, OPC_MOVDQU_MyGy, arg, arg1, arg2);
+        break;
+    case TCG_TYPE_V256:
+        tcg_out_vex_modrm_offset(s, OPC_MOVDQU_MyGy | P_VEXL,
+                                 arg, 0, arg1, arg2);
+        break;
+    default:
+        g_assert_not_reached();
+    }
 }
 
 static bool tcg_out_sti(TCGContext *s, TCGType type, TCGArg val,
@@ -773,6 +931,8 @@ static bool tcg_out_sti(TCGContext *s, TCGType type, TCGArg val,
             return false;
         }
         rexw = P_REXW;
+    } else if (type != TCG_TYPE_I32) {
+        return false;
     }
     tcg_out_modrm_offset(s, OPC_MOVL_EvIz | rexw, 0, base, ofs);
     tcg_out32(s, val);
@@ -1914,6 +2074,15 @@ static inline void tcg_out_op(TCGContext *s, TCGOpcode opc,
         case glue(glue(INDEX_op_, x), _i32)
 #endif
 
+#define OP_128_256(x) \
+        case glue(glue(INDEX_op_, x), _v256): \
+            rexw = P_VEXL; /* FALLTHRU */     \
+        case glue(glue(INDEX_op_, x), _v128)
+
+#define OP_64_128_256(x) \
+        OP_128_256(x): \
+        case glue(glue(INDEX_op_, x), _v64)
+
     /* Hoist the loads of the most common arguments.  */
     a0 = args[0];
     a1 = args[1];
@@ -2379,19 +2548,94 @@ static inline void tcg_out_op(TCGContext *s, TCGOpcode opc,
         }
         break;
 
+    OP_64_128_256(add8):
+        c = OPC_PADDB;
+        goto gen_simd;
+    OP_64_128_256(add16):
+        c = OPC_PADDW;
+        goto gen_simd;
+    OP_64_128_256(add32):
+        c = OPC_PADDD;
+        goto gen_simd;
+    OP_128_256(add64):
+        c = OPC_PADDQ;
+        goto gen_simd;
+    OP_64_128_256(sub8):
+        c = OPC_PSUBB;
+        goto gen_simd;
+    OP_64_128_256(sub16):
+        c = OPC_PSUBW;
+        goto gen_simd;
+    OP_64_128_256(sub32):
+        c = OPC_PSUBD;
+        goto gen_simd;
+    OP_128_256(sub64):
+        c = OPC_PSUBQ;
+        goto gen_simd;
+    OP_64_128_256(and):
+        c = OPC_PAND;
+        goto gen_simd;
+    OP_64_128_256(andc):
+        c = OPC_PANDN;
+        goto gen_simd;
+    OP_64_128_256(or):
+        c = OPC_POR;
+        goto gen_simd;
+    OP_64_128_256(xor):
+        c = OPC_PXOR;
+    gen_simd:
+        if (have_avx1) {
+            tcg_out_vex_modrm(s, c, a0, a1, a2);
+        } else {
+            tcg_out_modrm(s, c, a0, a2);
+        }
+        break;
+
+    case INDEX_op_ld_v64:
+        c = TCG_TYPE_V64;
+        goto gen_simd_ld;
+    case INDEX_op_ld_v128:
+        c = TCG_TYPE_V128;
+        goto gen_simd_ld;
+    case INDEX_op_ld_v256:
+        c = TCG_TYPE_V256;
+    gen_simd_ld:
+        tcg_out_ld(s, c, a0, a1, a2);
+        break;
+
+    case INDEX_op_st_v64:
+        c = TCG_TYPE_V64;
+        goto gen_simd_st;
+    case INDEX_op_st_v128:
+        c = TCG_TYPE_V128;
+        goto gen_simd_st;
+    case INDEX_op_st_v256:
+        c = TCG_TYPE_V256;
+    gen_simd_st:
+        tcg_out_st(s, c, a0, a1, a2);
+        break;
+
     case INDEX_op_mb:
         tcg_out_mb(s, a0);
         break;
     case INDEX_op_mov_i32:  /* Always emitted via tcg_out_mov.  */
     case INDEX_op_mov_i64:
+    case INDEX_op_mov_v64:
+    case INDEX_op_mov_v128:
+    case INDEX_op_mov_v256:
     case INDEX_op_movi_i32: /* Always emitted via tcg_out_movi.  */
     case INDEX_op_movi_i64:
+    case INDEX_op_movi_v64:
+    case INDEX_op_movi_v128:
+    case INDEX_op_movi_v256:
     case INDEX_op_call:     /* Always emitted via tcg_out_call.  */
     default:
         tcg_abort();
     }
 
 #undef OP_32_64
+#undef OP_128_256
+#undef OP_64_128_256
 }
 
 static const TCGTargetOpDef *tcg_target_op_def(TCGOpcode op)
@@ -2417,6 +2661,9 @@ static const TCGTargetOpDef *tcg_target_op_def(TCGOpcode op)
         = { .args_ct_str = { "r", "r", "L", "L" } };
     static const TCGTargetOpDef L_L_L_L
         = { .args_ct_str = { "L", "L", "L", "L" } };
+    static const TCGTargetOpDef x_0_x = { .args_ct_str = { "x", "0", "x" } };
+    static const TCGTargetOpDef x_x_x = { .args_ct_str = { "x", "x", "x" } };
+    static const TCGTargetOpDef x_r = { .args_ct_str = { "x", "r" } };
 
     switch (op) {
     case INDEX_op_goto_ptr:
@@ -2620,6 +2867,52 @@ static const TCGTargetOpDef *tcg_target_op_def(TCGOpcode op)
             return &s2;
         }
 
+    case INDEX_op_ld_v64:
+    case INDEX_op_ld_v128:
+    case INDEX_op_ld_v256:
+    case INDEX_op_st_v64:
+    case INDEX_op_st_v128:
+    case INDEX_op_st_v256:
+        return &x_r;
+
+    case INDEX_op_add8_v64:
+    case INDEX_op_add8_v128:
+    case INDEX_op_add16_v64:
+    case INDEX_op_add16_v128:
+    case INDEX_op_add32_v64:
+    case INDEX_op_add32_v128:
+    case INDEX_op_add64_v128:
+    case INDEX_op_sub8_v64:
+    case INDEX_op_sub8_v128:
+    case INDEX_op_sub16_v64:
+    case INDEX_op_sub16_v128:
+    case INDEX_op_sub32_v64:
+    case INDEX_op_sub32_v128:
+    case INDEX_op_sub64_v128:
+    case INDEX_op_and_v64:
+    case INDEX_op_and_v128:
+    case INDEX_op_andc_v64:
+    case INDEX_op_andc_v128:
+    case INDEX_op_or_v64:
+    case INDEX_op_or_v128:
+    case INDEX_op_xor_v64:
+    case INDEX_op_xor_v128:
+        return have_avx1 ? &x_x_x : &x_0_x;
+
+    case INDEX_op_add8_v256:
+    case INDEX_op_add16_v256:
+    case INDEX_op_add32_v256:
+    case INDEX_op_add64_v256:
+    case INDEX_op_sub8_v256:
+    case INDEX_op_sub16_v256:
+    case INDEX_op_sub32_v256:
+    case INDEX_op_sub64_v256:
+    case INDEX_op_and_v256:
+    case INDEX_op_andc_v256:
+    case INDEX_op_or_v256:
+    case INDEX_op_xor_v256:
+        return &x_x_x;
+
     default:
         break;
     }
@@ -2725,9 +3018,16 @@ static void tcg_out_nop_fill(tcg_insn_unit *p, int count)
 static void tcg_target_init(TCGContext *s)
 {
 #ifdef CONFIG_CPUID_H
-    unsigned a, b, c, d;
+    unsigned a, b, c, d, b7 = 0;
     int max = __get_cpuid_max(0, 0);
 
+    if (max >= 7) {
+        /* BMI1 is available on AMD Piledriver and Intel Haswell CPUs.  */
+        __cpuid_count(7, 0, a, b7, c, d);
+        have_bmi1 = (b7 & bit_BMI) != 0;
+        have_bmi2 = (b7 & bit_BMI2) != 0;
+    }
+
     if (max >= 1) {
         __cpuid(1, a, b, c, d);
 #ifndef have_cmov
@@ -2736,17 +3036,26 @@ static void tcg_target_init(TCGContext *s)
            available, we'll use a small forward branch.  */
         have_cmov = (d & bit_CMOV) != 0;
 #endif
+#ifndef have_sse2
+        have_sse2 = (d & bit_SSE2) != 0;
+#endif
 
         /* MOVBE is only available on Intel Atom and Haswell CPUs, so we
           need to probe for it.  */
         have_movbe = (c & bit_MOVBE) != 0;
         have_popcnt = (c & bit_POPCNT) != 0;
-    }
 
-    if (max >= 7) {
-        /* BMI1 is available on AMD Piledriver and Intel Haswell CPUs.  */
-        __cpuid_count(7, 0, a, b, c, d);
-        have_bmi1 = (b & bit_BMI) != 0;
-        have_bmi2 = (b & bit_BMI2) != 0;
+#ifndef have_avx2
+        /* There are a number of things we must check before we can be
+           sure of not hitting invalid opcode.  */
+        if (c & bit_OSXSAVE) {
+            unsigned xcrl, xcrh;
+            asm ("xgetbv" : "=a" (xcrl), "=d" (xcrh) : "c" (0));
+            if ((xcrl & 6) == 6) {
+                have_avx1 = (c & bit_AVX) != 0;
+                have_avx2 = (b7 & bit_AVX2) != 0;
+            }
+        }
+#endif
     }
 
     max = __get_cpuid_max(0x8000000, 0);
@@ -2763,6 +3072,13 @@ static void tcg_target_init(TCGContext *s)
     } else {
         tcg_regset_set32(tcg_target_available_regs[TCG_TYPE_I32], 0, 0xff);
     }
+    if (have_sse2) {
+        tcg_regset_set32(tcg_target_available_regs[TCG_TYPE_V64], 0, 0xff0000);
+        tcg_regset_set32(tcg_target_available_regs[TCG_TYPE_V128], 0, 0xff0000);
+    }
+    if (have_avx2) {
+        tcg_regset_set32(tcg_target_available_regs[TCG_TYPE_V256], 0, 0xff0000);
+    }
 
     tcg_regset_clear(tcg_target_call_clobber_regs);
     tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_EAX);
-- 
2.13.5