Message ID: 20170916023417.14599-2-richard.henderson@linaro.org
State: New
Series: TCG vectorization and example conversion

Richard Henderson <richard.henderson@linaro.org> writes:

> Nothing uses or enables them yet.
>
> Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
> ---
>  tcg/tcg-op.h  |  26 +++++++
>  tcg/tcg-opc.h |  37 ++++++++++
>  tcg/tcg.h     |  34 +++++++++
>  tcg/tcg-op.c  | 234 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
>  tcg/tcg.c     |  77 ++++++++++++++++++-
>  tcg/README    |  46 ++++++++++++
>  6 files changed, 453 insertions(+), 1 deletion(-)
>
> diff --git a/tcg/tcg.h b/tcg/tcg.h
[...]
>   * TCGv_i32 : 32 bit integer type
>   * TCGv_i64 : 64 bit integer type
>   * TCGv_ptr : a host pointer type
> + * TCGv_vec : a host vector type; the exact size is not exposed
> +              to the CPU front-end code.

Isn't this a guest vector type (which is pointed to by a host pointer)?

> diff --git a/tcg/tcg.c b/tcg/tcg.c
[...]
> +    idx = tcg_temp_new_internal(type, 0);
> +    return MAKE_TCGV_VEC(idx);
> +}
> +

A one line comment wouldn't go amiss here. This looks like we are
allocating a new temp of the same type as an existing temp?

> +TCGv_vec tcg_temp_new_vec_matching(TCGv_vec match)

> diff --git a/tcg/README b/tcg/README
[...]
> +All of the vector ops have a final constant argument that specifies the
> +length of the vector operation LEN as 64 << LEN bits.

That doesn't scan well. So would a 4 lane operation be encoded as
64 << 4? Is this because we are using the bottom bits for something?

> +* movi_vec v0, c, len
> +
> +  Copy C across the entire vector.
> +  At present the only supported values for C are 0 and -1.

I guess this is why the size is unimportant? This is for clearing or
setting the whole of the vector? What does len mean in this case?

> +* and_vec v0, v1, v2, len
> +* or_vec v0, v1, v2, len
> +* xor_vec v0, v1, v2, len
> +* andc_vec v0, v1, v2, len
> +* orc_vec v0, v1, v2, len
> +* not_vec v0, v1, len
> +
> +  Similarly, logical operations.

Similarly, logical operations with and without complement?

--
Alex Bennée
On 09/26/2017 12:28 PM, Alex Bennée wrote:
>>  * TCGv_ptr : a host pointer type
>> + * TCGv_vec : a host vector type; the exact size is not exposed
>> +              to the CPU front-end code.
>
> Isn't this a guest vector type (which is pointed to by a host pointer)?

No, it's a host vector, which we have created in response to expanding a
guest vector operation.

> A one line comment wouldn't go amiss here. This looks like we are
> allocating a new temp of the same type as an existing temp?
>
>> +TCGv_vec tcg_temp_new_vec_matching(TCGv_vec match)

Yes.

>> +All of the vector ops have a final constant argument that specifies the
>> +length of the vector operation LEN as 64 << LEN bits.
>
> That doesn't scan well. So would a 4 lane operation be encoded as
> 64 << 4? Is this because we are using the bottom bits for something?

64 << 0 = 64
64 << 1 = 128
64 << 2 = 256.

I've fixed up the wording a bit.

>> +  Copy C across the entire vector.
>> +  At present the only supported values for C are 0 and -1.
>
> I guess this is why the size is unimportant? This is for clearing or
> setting the whole of the vector? What does len mean in this case?

Yes.  Len still means the length of the whole vector.

Elsewhere there's a comment about maybe using dupi{8,16,32,64}_vec instead.
However I wanted to put that off until we do some more conversions and see
exactly what's going to be needed.

>> +* and_vec v0, v1, v2, len
>> +* or_vec v0, v1, v2, len
>> +* xor_vec v0, v1, v2, len
>> +* andc_vec v0, v1, v2, len
>> +* orc_vec v0, v1, v2, len
>> +* not_vec v0, v1, len
>> +
>> +  Similarly, logical operations.
>
> Similarly, logical operations with and without complement?

Sure.


r~
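
To make the size encoding discussed above concrete, here is a small,
self-contained C sketch (not part of the patch; the helper names are
hypothetical and exist only for illustration) of how the final LEN
argument relates to the TCG_TYPE_V64/V128/V256 values added by this
series:

#include <assert.h>
#include <stdio.h>

/* Mirrors the ordering added to TCGType in this patch: V64, V128, V256. */
enum { V64, V128, V256 };

/* The generators pass (type - TCG_TYPE_V64) as the final LEN constant,
   so the vector width in bits is recovered as 64 << LEN. */
static unsigned vec_type_to_len(unsigned type)  /* hypothetical helper */
{
    return type - V64;
}

static unsigned len_to_bits(unsigned len)       /* hypothetical helper */
{
    return 64u << len;
}

int main(void)
{
    assert(len_to_bits(vec_type_to_len(V64))  == 64);
    assert(len_to_bits(vec_type_to_len(V128)) == 128);
    assert(len_to_bits(vec_type_to_len(V256)) == 256);
    printf("LEN 0/1/2 -> %u/%u/%u bits\n",
           len_to_bits(0), len_to_bits(1), len_to_bits(2));
    return 0;
}

In the generators below this mapping is simply "type - TCG_TYPE_V64",
which is why the README describes the operation width as 64 << LEN bits.
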
diff --git a/tcg/tcg-op.h b/tcg/tcg-op.h
index 5d3278f243..b9b0b9f46f 100644
--- a/tcg/tcg-op.h
+++ b/tcg/tcg-op.h
@@ -915,6 +915,32 @@ void tcg_gen_atomic_or_fetch_i64(TCGv_i64, TCGv, TCGv_i64, TCGArg, TCGMemOp);
 void tcg_gen_atomic_xor_fetch_i32(TCGv_i32, TCGv, TCGv_i32, TCGArg, TCGMemOp);
 void tcg_gen_atomic_xor_fetch_i64(TCGv_i64, TCGv, TCGv_i64, TCGArg, TCGMemOp);
 
+void tcg_gen_mov_vec(TCGv_vec, TCGv_vec);
+void tcg_gen_movi_vec(TCGv_vec, tcg_target_long);
+void tcg_gen_add8_vec(TCGv_vec r, TCGv_vec a, TCGv_vec b);
+void tcg_gen_add16_vec(TCGv_vec r, TCGv_vec a, TCGv_vec b);
+void tcg_gen_add32_vec(TCGv_vec r, TCGv_vec a, TCGv_vec b);
+void tcg_gen_add64_vec(TCGv_vec r, TCGv_vec a, TCGv_vec b);
+void tcg_gen_sub8_vec(TCGv_vec r, TCGv_vec a, TCGv_vec b);
+void tcg_gen_sub16_vec(TCGv_vec r, TCGv_vec a, TCGv_vec b);
+void tcg_gen_sub32_vec(TCGv_vec r, TCGv_vec a, TCGv_vec b);
+void tcg_gen_sub64_vec(TCGv_vec r, TCGv_vec a, TCGv_vec b);
+void tcg_gen_and_vec(TCGv_vec r, TCGv_vec a, TCGv_vec b);
+void tcg_gen_or_vec(TCGv_vec r, TCGv_vec a, TCGv_vec b);
+void tcg_gen_xor_vec(TCGv_vec r, TCGv_vec a, TCGv_vec b);
+void tcg_gen_andc_vec(TCGv_vec r, TCGv_vec a, TCGv_vec b);
+void tcg_gen_orc_vec(TCGv_vec r, TCGv_vec a, TCGv_vec b);
+void tcg_gen_not_vec(TCGv_vec r, TCGv_vec a);
+void tcg_gen_neg8_vec(TCGv_vec r, TCGv_vec a);
+void tcg_gen_neg16_vec(TCGv_vec r, TCGv_vec a);
+void tcg_gen_neg32_vec(TCGv_vec r, TCGv_vec a);
+void tcg_gen_neg64_vec(TCGv_vec r, TCGv_vec a);
+
+void tcg_gen_ld_vec(TCGv_vec r, TCGv_ptr base, TCGArg offset);
+void tcg_gen_st_vec(TCGv_vec r, TCGv_ptr base, TCGArg offset);
+void tcg_gen_ldz_vec(TCGv_vec r, TCGv_ptr base, TCGArg offset, TCGType sz);
+void tcg_gen_stl_vec(TCGv_vec r, TCGv_ptr base, TCGArg offset, TCGType sz);
+
 #if TARGET_LONG_BITS == 64
 #define tcg_gen_movi_tl tcg_gen_movi_i64
 #define tcg_gen_mov_tl tcg_gen_mov_i64
diff --git a/tcg/tcg-opc.h b/tcg/tcg-opc.h
index 956fb1e9f3..8200184fa9 100644
--- a/tcg/tcg-opc.h
+++ b/tcg/tcg-opc.h
@@ -204,8 +204,45 @@ DEF(qemu_ld_i64, DATA64_ARGS, TLADDR_ARGS, 1,
 DEF(qemu_st_i64, 0, TLADDR_ARGS + DATA64_ARGS, 1,
     TCG_OPF_CALL_CLOBBER | TCG_OPF_SIDE_EFFECTS | TCG_OPF_64BIT)
 
+/* Host vector support.  */
+
+#define IMPLVEC \
+    IMPL(TCG_TARGET_HAS_v64 | TCG_TARGET_HAS_v128 | TCG_TARGET_HAS_v256)
+
+DEF(mov_vec, 1, 1, 1, TCG_OPF_NOT_PRESENT)
+
+/* ??? Simple, but perhaps dupiN would be more descriptive.  */
+DEF(movi_vec, 1, 0, 2, TCG_OPF_NOT_PRESENT)
+
+DEF(ld_vec, 1, 1, 2, IMPLVEC)
+DEF(ldz_vec, 1, 1, 3, IMPLVEC)
+DEF(st_vec, 0, 2, 2, IMPLVEC)
+
+DEF(add8_vec, 1, 2, 1, IMPLVEC)
+DEF(add16_vec, 1, 2, 1, IMPLVEC)
+DEF(add32_vec, 1, 2, 1, IMPLVEC)
+DEF(add64_vec, 1, 2, 1, IMPLVEC)
+
+DEF(sub8_vec, 1, 2, 1, IMPLVEC)
+DEF(sub16_vec, 1, 2, 1, IMPLVEC)
+DEF(sub32_vec, 1, 2, 1, IMPLVEC)
+DEF(sub64_vec, 1, 2, 1, IMPLVEC)
+
+DEF(neg8_vec, 1, 1, 1, IMPLVEC | IMPL(TCG_TARGET_HAS_neg_vec))
+DEF(neg16_vec, 1, 1, 1, IMPLVEC | IMPL(TCG_TARGET_HAS_neg_vec))
+DEF(neg32_vec, 1, 1, 1, IMPLVEC | IMPL(TCG_TARGET_HAS_neg_vec))
+DEF(neg64_vec, 1, 1, 1, IMPLVEC | IMPL(TCG_TARGET_HAS_neg_vec))
+
+DEF(and_vec, 1, 2, 1, IMPLVEC)
+DEF(or_vec, 1, 2, 1, IMPLVEC)
+DEF(xor_vec, 1, 2, 1, IMPLVEC)
+DEF(andc_vec, 1, 2, 1, IMPLVEC | IMPL(TCG_TARGET_HAS_andc_vec))
+DEF(orc_vec, 1, 2, 1, IMPLVEC | IMPL(TCG_TARGET_HAS_orc_vec))
+DEF(not_vec, 1, 1, 1, IMPLVEC | IMPL(TCG_TARGET_HAS_not_vec))
+
 #undef TLADDR_ARGS
 #undef DATA64_ARGS
 #undef IMPL
 #undef IMPL64
+#undef IMPLVEC
 #undef DEF
diff --git a/tcg/tcg.h b/tcg/tcg.h
index 25662c36d4..7cd356e87f 100644
--- a/tcg/tcg.h
+++ b/tcg/tcg.h
@@ -173,6 +173,16 @@ typedef uint64_t TCGRegSet;
 # error "Missing unsigned widening multiply"
 #endif
 
+#ifndef TCG_TARGET_HAS_v64
+#define TCG_TARGET_HAS_v64 0
+#define TCG_TARGET_HAS_v128 0
+#define TCG_TARGET_HAS_v256 0
+#define TCG_TARGET_HAS_neg_vec 0
+#define TCG_TARGET_HAS_not_vec 0
+#define TCG_TARGET_HAS_andc_vec 0
+#define TCG_TARGET_HAS_orc_vec 0
+#endif
+
 #ifndef TARGET_INSN_START_EXTRA_WORDS
 # define TARGET_INSN_START_WORDS 1
 #else
@@ -249,6 +259,11 @@ typedef struct TCGPool {
 typedef enum TCGType {
     TCG_TYPE_I32,
     TCG_TYPE_I64,
+
+    TCG_TYPE_V64,
+    TCG_TYPE_V128,
+    TCG_TYPE_V256,
+
     TCG_TYPE_COUNT, /* number of different types */
 
     /* An alias for the size of the host register.  */
@@ -399,6 +414,8 @@ typedef tcg_target_ulong TCGArg;
  * TCGv_i32 : 32 bit integer type
  * TCGv_i64 : 64 bit integer type
  * TCGv_ptr : a host pointer type
+ * TCGv_vec : a host vector type; the exact size is not exposed
+              to the CPU front-end code.
  * TCGv : an integer type the same size as target_ulong
          (an alias for either TCGv_i32 or TCGv_i64)
    The compiler's type checking will complain if you mix them
@@ -424,6 +441,7 @@ typedef tcg_target_ulong TCGArg;
 typedef struct TCGv_i32_d *TCGv_i32;
 typedef struct TCGv_i64_d *TCGv_i64;
 typedef struct TCGv_ptr_d *TCGv_ptr;
+typedef struct TCGv_vec_d *TCGv_vec;
 typedef TCGv_ptr TCGv_env;
 #if TARGET_LONG_BITS == 32
 #define TCGv TCGv_i32
@@ -448,6 +466,11 @@ static inline TCGv_ptr QEMU_ARTIFICIAL MAKE_TCGV_PTR(intptr_t i)
     return (TCGv_ptr)i;
 }
 
+static inline TCGv_vec QEMU_ARTIFICIAL MAKE_TCGV_VEC(intptr_t i)
+{
+    return (TCGv_vec)i;
+}
+
 static inline intptr_t QEMU_ARTIFICIAL GET_TCGV_I32(TCGv_i32 t)
 {
     return (intptr_t)t;
@@ -463,6 +486,11 @@ static inline intptr_t QEMU_ARTIFICIAL GET_TCGV_PTR(TCGv_ptr t)
     return (intptr_t)t;
 }
 
+static inline intptr_t QEMU_ARTIFICIAL GET_TCGV_VEC(TCGv_vec t)
+{
+    return (intptr_t)t;
+}
+
 #if TCG_TARGET_REG_BITS == 32
 #define TCGV_LOW(t) MAKE_TCGV_I32(GET_TCGV_I64(t))
 #define TCGV_HIGH(t) MAKE_TCGV_I32(GET_TCGV_I64(t) + 1)
@@ -471,15 +499,18 @@ static inline intptr_t QEMU_ARTIFICIAL GET_TCGV_PTR(TCGv_ptr t)
 #define TCGV_EQUAL_I32(a, b) (GET_TCGV_I32(a) == GET_TCGV_I32(b))
 #define TCGV_EQUAL_I64(a, b) (GET_TCGV_I64(a) == GET_TCGV_I64(b))
 #define TCGV_EQUAL_PTR(a, b) (GET_TCGV_PTR(a) == GET_TCGV_PTR(b))
+#define TCGV_EQUAL_VEC(a, b) (GET_TCGV_VEC(a) == GET_TCGV_VEC(b))
 
 /* Dummy definition to avoid compiler warnings.  */
 #define TCGV_UNUSED_I32(x) x = MAKE_TCGV_I32(-1)
 #define TCGV_UNUSED_I64(x) x = MAKE_TCGV_I64(-1)
 #define TCGV_UNUSED_PTR(x) x = MAKE_TCGV_PTR(-1)
+#define TCGV_UNUSED_VEC(x) x = MAKE_TCGV_VEC(-1)
 
 #define TCGV_IS_UNUSED_I32(x) (GET_TCGV_I32(x) == -1)
 #define TCGV_IS_UNUSED_I64(x) (GET_TCGV_I64(x) == -1)
 #define TCGV_IS_UNUSED_PTR(x) (GET_TCGV_PTR(x) == -1)
+#define TCGV_IS_UNUSED_VEC(x) (GET_TCGV_VEC(x) == -1)
 
 /* call flags */
 /* Helper does not read globals (either directly or through an exception).  It
@@ -790,9 +821,12 @@ TCGv_i64 tcg_global_reg_new_i64(TCGReg reg, const char *name);
 
 TCGv_i32 tcg_temp_new_internal_i32(int temp_local);
 TCGv_i64 tcg_temp_new_internal_i64(int temp_local);
+TCGv_vec tcg_temp_new_vec(TCGType type);
+TCGv_vec tcg_temp_new_vec_matching(TCGv_vec match);
 
 void tcg_temp_free_i32(TCGv_i32 arg);
 void tcg_temp_free_i64(TCGv_i64 arg);
+void tcg_temp_free_vec(TCGv_vec arg);
 
 static inline TCGv_i32 tcg_global_mem_new_i32(TCGv_ptr reg, intptr_t offset,
                                               const char *name)
diff --git a/tcg/tcg-op.c b/tcg/tcg-op.c
index 688d91755b..50b3177e5f 100644
--- a/tcg/tcg-op.c
+++ b/tcg/tcg-op.c
@@ -3072,3 +3072,237 @@ static void tcg_gen_mov2_i64(TCGv_i64 r, TCGv_i64 a, TCGv_i64 b)
 GEN_ATOMIC_HELPER(xchg, mov2, 0)
 
 #undef GEN_ATOMIC_HELPER
+
+static void tcg_gen_op2_vec(TCGOpcode opc, TCGv_vec r, TCGv_vec a)
+{
+    TCGArg ri = GET_TCGV_VEC(r);
+    TCGArg ai = GET_TCGV_VEC(a);
+    TCGTemp *rt = &tcg_ctx.temps[ri];
+    TCGTemp *at = &tcg_ctx.temps[ai];
+    TCGType type = rt->base_type;
+
+    tcg_debug_assert(at->base_type == type);
+    tcg_gen_op3(&tcg_ctx, opc, ri, ai, type - TCG_TYPE_V64);
+}
+
+static void tcg_gen_op3_vec(TCGOpcode opc, TCGv_vec r, TCGv_vec a, TCGv_vec b)
+{
+    TCGArg ri = GET_TCGV_VEC(r);
+    TCGArg ai = GET_TCGV_VEC(a);
+    TCGArg bi = GET_TCGV_VEC(b);
+    TCGTemp *rt = &tcg_ctx.temps[ri];
+    TCGTemp *at = &tcg_ctx.temps[ai];
+    TCGTemp *bt = &tcg_ctx.temps[bi];
+    TCGType type = rt->base_type;
+
+    tcg_debug_assert(at->base_type == type);
+    tcg_debug_assert(bt->base_type == type);
+    tcg_gen_op4(&tcg_ctx, opc, ri, ai, bi, type - TCG_TYPE_V64);
+}
+
+void tcg_gen_mov_vec(TCGv_vec r, TCGv_vec a)
+{
+    if (!TCGV_EQUAL_VEC(r, a)) {
+        tcg_gen_op2_vec(INDEX_op_mov_vec, r, a);
+    }
+}
+
+void tcg_gen_movi_vec(TCGv_vec r, tcg_target_long a)
+{
+    TCGArg ri = GET_TCGV_VEC(r);
+    TCGTemp *rt = &tcg_ctx.temps[ri];
+    TCGType type = rt->base_type;
+
+    tcg_debug_assert(a == 0 || a == -1);
+    tcg_gen_op3(&tcg_ctx, INDEX_op_movi_vec, ri, a, type - TCG_TYPE_V64);
+}
+
+void tcg_gen_ld_vec(TCGv_vec r, TCGv_ptr b, TCGArg o)
+{
+    TCGArg ri = GET_TCGV_VEC(r);
+    TCGArg bi = GET_TCGV_PTR(b);
+    TCGTemp *rt = &tcg_ctx.temps[ri];
+    TCGType type = rt->base_type;
+
+    tcg_gen_op4(&tcg_ctx, INDEX_op_ld_vec, ri, bi, o, type - TCG_TYPE_V64);
+}
+
+void tcg_gen_st_vec(TCGv_vec r, TCGv_ptr b, TCGArg o)
+{
+    TCGArg ri = GET_TCGV_VEC(r);
+    TCGArg bi = GET_TCGV_PTR(b);
+    TCGTemp *rt = &tcg_ctx.temps[ri];
+    TCGType type = rt->base_type;
+
+    tcg_gen_op4(&tcg_ctx, INDEX_op_st_vec, ri, bi, o, type - TCG_TYPE_V64);
+}
+
+/* Load data into a vector R from B+O using TYPE.  If R is wider than TYPE,
+   fill the high bits with zeros.  */
+void tcg_gen_ldz_vec(TCGv_vec r, TCGv_ptr b, TCGArg o, TCGType type)
+{
+    TCGArg ri = GET_TCGV_VEC(r);
+    TCGArg bi = GET_TCGV_PTR(b);
+    TCGTemp *rt = &tcg_ctx.temps[ri];
+    TCGType btype = rt->base_type;
+
+    if (type < btype) {
+        tcg_gen_op5(&tcg_ctx, INDEX_op_ldz_vec, ri, bi, o,
+                    type - TCG_TYPE_V64, btype - TCG_TYPE_V64);
+    } else {
+        tcg_debug_assert(type == btype);
+        tcg_gen_op4(&tcg_ctx, INDEX_op_ld_vec, ri, bi, o, type - TCG_TYPE_V64);
+    }
+}
+
+/* Store data from vector R into B+O using TYPE.  If R is wider than TYPE,
+   store only the low bits.  */
+void tcg_gen_stl_vec(TCGv_vec r, TCGv_ptr b, TCGArg o, TCGType type)
+{
+    TCGArg ri = GET_TCGV_VEC(r);
+    TCGArg bi = GET_TCGV_PTR(b);
+    TCGTemp *rt = &tcg_ctx.temps[ri];
+    TCGType btype = rt->base_type;
+
+    tcg_debug_assert(type <= btype);
+    tcg_gen_op4(&tcg_ctx, INDEX_op_st_vec, ri, bi, o, type - TCG_TYPE_V64);
+}
+
+void tcg_gen_add8_vec(TCGv_vec r, TCGv_vec a, TCGv_vec b)
+{
+    tcg_gen_op3_vec(INDEX_op_add8_vec, r, a, b);
+}
+
+void tcg_gen_add16_vec(TCGv_vec r, TCGv_vec a, TCGv_vec b)
+{
+    tcg_gen_op3_vec(INDEX_op_add16_vec, r, a, b);
+}
+
+void tcg_gen_add32_vec(TCGv_vec r, TCGv_vec a, TCGv_vec b)
+{
+    tcg_gen_op3_vec(INDEX_op_add32_vec, r, a, b);
+}
+
+void tcg_gen_add64_vec(TCGv_vec r, TCGv_vec a, TCGv_vec b)
+{
+    tcg_gen_op3_vec(INDEX_op_add64_vec, r, a, b);
+}
+
+void tcg_gen_sub8_vec(TCGv_vec r, TCGv_vec a, TCGv_vec b)
+{
+    tcg_gen_op3_vec(INDEX_op_sub8_vec, r, a, b);
+}
+
+void tcg_gen_sub16_vec(TCGv_vec r, TCGv_vec a, TCGv_vec b)
+{
+    tcg_gen_op3_vec(INDEX_op_sub16_vec, r, a, b);
+}
+
+void tcg_gen_sub32_vec(TCGv_vec r, TCGv_vec a, TCGv_vec b)
+{
+    tcg_gen_op3_vec(INDEX_op_sub32_vec, r, a, b);
+}
+
+void tcg_gen_sub64_vec(TCGv_vec r, TCGv_vec a, TCGv_vec b)
+{
+    tcg_gen_op3_vec(INDEX_op_sub64_vec, r, a, b);
+}
+
+void tcg_gen_and_vec(TCGv_vec r, TCGv_vec a, TCGv_vec b)
+{
+    tcg_gen_op3_vec(INDEX_op_and_vec, r, a, b);
+}
+
+void tcg_gen_or_vec(TCGv_vec r, TCGv_vec a, TCGv_vec b)
+{
+    tcg_gen_op3_vec(INDEX_op_or_vec, r, a, b);
+}
+
+void tcg_gen_xor_vec(TCGv_vec r, TCGv_vec a, TCGv_vec b)
+{
+    tcg_gen_op3_vec(INDEX_op_xor_vec, r, a, b);
+}
+
+void tcg_gen_andc_vec(TCGv_vec r, TCGv_vec a, TCGv_vec b)
+{
+    if (TCG_TARGET_HAS_andc_vec) {
+        tcg_gen_op3_vec(INDEX_op_andc_vec, r, a, b);
+    } else {
+        TCGv_vec t = tcg_temp_new_vec_matching(r);
+        tcg_gen_not_vec(t, b);
+        tcg_gen_and_vec(r, a, t);
+        tcg_temp_free_vec(t);
+    }
+}
+
+void tcg_gen_orc_vec(TCGv_vec r, TCGv_vec a, TCGv_vec b)
+{
+    if (TCG_TARGET_HAS_orc_vec) {
+        tcg_gen_op3_vec(INDEX_op_orc_vec, r, a, b);
+    } else {
+        TCGv_vec t = tcg_temp_new_vec_matching(r);
+        tcg_gen_not_vec(t, b);
+        tcg_gen_or_vec(r, a, t);
+        tcg_temp_free_vec(t);
+    }
+}
+
+void tcg_gen_not_vec(TCGv_vec r, TCGv_vec a)
+{
+    if (TCG_TARGET_HAS_not_vec) {
+        tcg_gen_op2_vec(INDEX_op_orc_vec, r, a);
+    } else {
+        TCGv_vec t = tcg_temp_new_vec_matching(r);
+        tcg_gen_movi_vec(t, -1);
+        tcg_gen_xor_vec(r, a, t);
+        tcg_temp_free_vec(t);
+    }
+}
+
+void tcg_gen_neg8_vec(TCGv_vec r, TCGv_vec a)
+{
+    if (TCG_TARGET_HAS_neg_vec) {
+        tcg_gen_op2_vec(INDEX_op_neg8_vec, r, a);
+    } else {
+        TCGv_vec t = tcg_temp_new_vec_matching(r);
+        tcg_gen_movi_vec(t, 0);
+        tcg_gen_sub8_vec(r, t, a);
+        tcg_temp_free_vec(t);
+    }
+}
+
+void tcg_gen_neg16_vec(TCGv_vec r, TCGv_vec a)
+{
+    if (TCG_TARGET_HAS_neg_vec) {
+        tcg_gen_op2_vec(INDEX_op_neg16_vec, r, a);
+    } else {
+        TCGv_vec t = tcg_temp_new_vec_matching(r);
+        tcg_gen_movi_vec(t, 0);
+        tcg_gen_sub16_vec(r, t, a);
+        tcg_temp_free_vec(t);
+    }
+}
+
+void tcg_gen_neg32_vec(TCGv_vec r, TCGv_vec a)
+{
+    if (TCG_TARGET_HAS_neg_vec) {
+        tcg_gen_op2_vec(INDEX_op_neg32_vec, r, a);
+    } else {
+        TCGv_vec t = tcg_temp_new_vec_matching(r);
+        tcg_gen_movi_vec(t, 0);
+        tcg_gen_sub32_vec(r, t, a);
+        tcg_temp_free_vec(t);
+    }
+}
+
+void tcg_gen_neg64_vec(TCGv_vec r, TCGv_vec a)
+{
+    if (TCG_TARGET_HAS_neg_vec) {
+        tcg_gen_op2_vec(INDEX_op_neg64_vec, r, a);
+    } else {
+        TCGv_vec t = tcg_temp_new_vec_matching(r);
+        tcg_gen_movi_vec(t, 0);
+        tcg_gen_sub64_vec(r, t, a);
+        tcg_temp_free_vec(t);
+    }
+}
diff --git a/tcg/tcg.c b/tcg/tcg.c
index dff9999bc6..a4d55efdf0 100644
--- a/tcg/tcg.c
+++ b/tcg/tcg.c
@@ -116,7 +116,7 @@ static int tcg_target_const_match(tcg_target_long val, TCGType type,
 static bool tcg_out_ldst_finalize(TCGContext *s);
 #endif
 
-static TCGRegSet tcg_target_available_regs[2];
+static TCGRegSet tcg_target_available_regs[TCG_TYPE_COUNT];
 static TCGRegSet tcg_target_call_clobber_regs;
 
 #if TCG_TARGET_INSN_UNIT_SIZE == 1
@@ -664,6 +664,44 @@ TCGv_i64 tcg_temp_new_internal_i64(int temp_local)
     return MAKE_TCGV_I64(idx);
 }
 
+TCGv_vec tcg_temp_new_vec(TCGType type)
+{
+    int idx;
+
+#ifdef CONFIG_DEBUG_TCG
+    switch (type) {
+    case TCG_TYPE_V64:
+        assert(TCG_TARGET_HAS_v64);
+        break;
+    case TCG_TYPE_V128:
+        assert(TCG_TARGET_HAS_v128);
+        break;
+    case TCG_TYPE_V256:
+        assert(TCG_TARGET_HAS_v256);
+        break;
+    default:
+        g_assert_not_reached();
+    }
+#endif
+
+    idx = tcg_temp_new_internal(type, 0);
+    return MAKE_TCGV_VEC(idx);
+}
+
+TCGv_vec tcg_temp_new_vec_matching(TCGv_vec match)
+{
+    TCGContext *s = &tcg_ctx;
+    int idx = GET_TCGV_VEC(match);
+    TCGTemp *ts;
+
+    tcg_debug_assert(idx >= s->nb_globals && idx < s->nb_temps);
+    ts = &s->temps[idx];
+    tcg_debug_assert(ts->temp_allocated != 0);
+
+    idx = tcg_temp_new_internal(ts->base_type, 0);
+    return MAKE_TCGV_VEC(idx);
+}
+
 static void tcg_temp_free_internal(int idx)
 {
     TCGContext *s = &tcg_ctx;
@@ -696,6 +734,11 @@ void tcg_temp_free_i64(TCGv_i64 arg)
     tcg_temp_free_internal(GET_TCGV_I64(arg));
 }
 
+void tcg_temp_free_vec(TCGv_vec arg)
+{
+    tcg_temp_free_internal(GET_TCGV_VEC(arg));
+}
+
 TCGv_i32 tcg_const_i32(int32_t val)
 {
     TCGv_i32 t0;
@@ -753,6 +796,9 @@ int tcg_check_temp_count(void)
    Test the runtime variable that controls each opcode.  */
 bool tcg_op_supported(TCGOpcode op)
 {
+    const bool have_vec
+        = TCG_TARGET_HAS_v64 | TCG_TARGET_HAS_v128 | TCG_TARGET_HAS_v256;
+
     switch (op) {
     case INDEX_op_discard:
     case INDEX_op_set_label:
@@ -966,6 +1012,35 @@ bool tcg_op_supported(TCGOpcode op)
     case INDEX_op_mulsh_i64:
         return TCG_TARGET_HAS_mulsh_i64;
 
+    case INDEX_op_mov_vec:
+    case INDEX_op_movi_vec:
+    case INDEX_op_ld_vec:
+    case INDEX_op_ldz_vec:
+    case INDEX_op_st_vec:
+    case INDEX_op_add8_vec:
+    case INDEX_op_add16_vec:
+    case INDEX_op_add32_vec:
+    case INDEX_op_add64_vec:
+    case INDEX_op_sub8_vec:
+    case INDEX_op_sub16_vec:
+    case INDEX_op_sub32_vec:
+    case INDEX_op_sub64_vec:
+    case INDEX_op_and_vec:
+    case INDEX_op_or_vec:
+    case INDEX_op_xor_vec:
+        return have_vec;
+    case INDEX_op_not_vec:
+        return have_vec && TCG_TARGET_HAS_not_vec;
+    case INDEX_op_neg8_vec:
+    case INDEX_op_neg16_vec:
+    case INDEX_op_neg32_vec:
+    case INDEX_op_neg64_vec:
+        return have_vec && TCG_TARGET_HAS_neg_vec;
+    case INDEX_op_andc_vec:
+        return have_vec && TCG_TARGET_HAS_andc_vec;
+    case INDEX_op_orc_vec:
+        return have_vec && TCG_TARGET_HAS_orc_vec;
+
     case NB_OPS:
         break;
     }
diff --git a/tcg/README b/tcg/README
index 03bfb6acd4..3bf3af67db 100644
--- a/tcg/README
+++ b/tcg/README
@@ -503,6 +503,52 @@ of the memory access.
 For a 32-bit host, qemu_ld/st_i64 is guaranteed to only be used with a
 64-bit memory access specified in flags.
 
+********* Host vector operations
+
+All of the vector ops have a final constant argument that specifies the
+length of the vector operation LEN as 64 << LEN bits.
+
+* mov_vec v0, v1, len
+* ld_vec v0, t1, len
+* st_vec v0, t1, len
+
+  Move, load and store.
+
+* movi_vec v0, c, len
+
+  Copy C across the entire vector.
+  At present the only supported values for C are 0 and -1.
+
+* add8_vec v0, v1, v2, len
+* add16_vec v0, v1, v2, len
+* add32_vec v0, v1, v2, len
+* add64_vec v0, v1, v2, len
+
+  v0 = v1 + v2, in elements of 8/16/32/64 bits, across len.
+
+* sub8_vec v0, v1, v2, len
+* sub16_vec v0, v1, v2, len
+* sub32_vec v0, v1, v2, len
+* sub64_vec v0, v1, v2, len
+
+  Similarly, v0 = v1 - v2.
+
+* neg8_vec v0, v1, len
+* neg16_vec v0, v1, len
+* neg32_vec v0, v1, len
+* neg64_vec v0, v1, len
+
+  Similarly, v0 = -v1.
+
+* and_vec v0, v1, v2, len
+* or_vec v0, v1, v2, len
+* xor_vec v0, v1, v2, len
+* andc_vec v0, v1, v2, len
+* orc_vec v0, v1, v2, len
+* not_vec v0, v1, len
+
+  Similarly, logical operations.
+
 *********
 
 Note 1: Some shortcuts are defined when the last operand is known to be
Nothing uses or enables them yet.

Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/tcg-op.h  |  26 +++++++
 tcg/tcg-opc.h |  37 ++++++++++
 tcg/tcg.h     |  34 +++++++++
 tcg/tcg-op.c  | 234 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 tcg/tcg.c     |  77 ++++++++++++++++++-
 tcg/README    |  46 ++++++++++++
 6 files changed, 453 insertions(+), 1 deletion(-)

--
2.13.5
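
For illustration only (this patch deliberately adds no users), a front end
built on top of these interfaces might emit a 128-bit vector add along the
following lines. The wrapper function, its name, and the CPUState offsets
are hypothetical; only the tcg_gen_*_vec and tcg_temp_*_vec calls come from
the patch itself:

/* Hypothetical translator fragment using the interface declared above.
   ofs_d/ofs_n/ofs_m stand in for real CPUState vector register offsets. */
static void gen_vadd32_example(TCGv_ptr cpu_env,
                               intptr_t ofs_d, intptr_t ofs_n, intptr_t ofs_m)
{
    TCGv_vec n = tcg_temp_new_vec(TCG_TYPE_V128);
    TCGv_vec m = tcg_temp_new_vec(TCG_TYPE_V128);

    tcg_gen_ld_vec(n, cpu_env, ofs_n);        /* load guest vector N */
    tcg_gen_ld_vec(m, cpu_env, ofs_m);        /* load guest vector M */
    tcg_gen_add32_vec(n, n, m);               /* 32-bit lanes: n += m */
    tcg_gen_st_vec(n, cpu_env, ofs_d);        /* store the result to D */

    tcg_temp_free_vec(n);
    tcg_temp_free_vec(m);
}

The element width is selected by the opcode (add32 here), while the overall
vector width follows from the TCGType used to allocate the temporaries.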