@@ -137,6 +137,7 @@ typedef enum {
#define TCG_TARGET_HAS_mul_vec 1
#define TCG_TARGET_HAS_sat_vec 1
#define TCG_TARGET_HAS_minmax_vec 1
+#define TCG_TARGET_HAS_dupm_vec 0
#define TCG_TARGET_DEFAULT_MO (0)
#define TCG_TARGET_HAS_MEMORY_BSWAP 1
@@ -187,6 +187,7 @@ extern bool have_avx2;
#define TCG_TARGET_HAS_mul_vec 1
#define TCG_TARGET_HAS_sat_vec 1
#define TCG_TARGET_HAS_minmax_vec 1
+#define TCG_TARGET_HAS_dupm_vec 0
#define TCG_TARGET_deposit_i32_valid(ofs, len) \
(((ofs) == 0 && (len) == 8) || ((ofs) == 8 && (len) == 8) || \
@@ -152,6 +152,7 @@ extern bool have_isa_3_00;
#define TCG_TARGET_HAS_mul_vec 1
#define TCG_TARGET_HAS_sat_vec 1
#define TCG_TARGET_HAS_minmax_vec 1
+#define TCG_TARGET_HAS_dupm_vec 0
void flush_icache_range(uintptr_t start, uintptr_t stop);
void tb_target_set_jmp_target(uintptr_t, uintptr_t, uintptr_t);
@@ -950,6 +950,7 @@ void tcg_gen_atomic_umax_fetch_i64(TCGv_i64, TCGv, TCGv_i64, TCGArg, TCGMemOp);
void tcg_gen_mov_vec(TCGv_vec, TCGv_vec);
void tcg_gen_dup_i32_vec(unsigned vece, TCGv_vec, TCGv_i32);
void tcg_gen_dup_i64_vec(unsigned vece, TCGv_vec, TCGv_i64);
+void tcg_gen_dup_mem_vec(unsigned vece, TCGv_vec, TCGv_ptr, tcg_target_long);
void tcg_gen_dup8i_vec(TCGv_vec, uint32_t);
void tcg_gen_dup16i_vec(TCGv_vec, uint32_t);
void tcg_gen_dup32i_vec(TCGv_vec, uint32_t);
@@ -211,6 +211,7 @@ DEF(qemu_st_i64, 0, TLADDR_ARGS + DATA64_ARGS, 1,
DEF(mov_vec, 1, 1, 0, TCG_OPF_VECTOR | TCG_OPF_NOT_PRESENT)
DEF(dupi_vec, 1, 0, 1, TCG_OPF_VECTOR | TCG_OPF_NOT_PRESENT)
+DEF(dupm_vec, 1, 1, 1, TCG_OPF_VECTOR | IMPL(TCG_TARGET_HAS_dupm_vec))
DEF(dup_vec, 1, 1, 0, IMPLVEC)
DEF(dup2_vec, 1, 2, 0, IMPLVEC | IMPL(TCG_TARGET_REG_BITS == 32))
@@ -185,6 +185,7 @@ typedef uint64_t TCGRegSet;
#define TCG_TARGET_HAS_mul_vec 0
#define TCG_TARGET_HAS_sat_vec 0
#define TCG_TARGET_HAS_minmax_vec 0
+#define TCG_TARGET_HAS_dupm_vec 0
#else
#define TCG_TARGET_MAYBE_vec 1
#endif
@@ -390,6 +390,40 @@ static TCGType choose_vector_type(TCGOpcode op, unsigned vece, uint32_t size,
return 0;
}
+static void do_dup_store(TCGType type, uint32_t dofs, uint32_t oprsz,
+ uint32_t maxsz, TCGv_vec t_vec)
+{ /* Store t_vec repeatedly over oprsz bytes at cpu_env + dofs. */
+ uint32_t i = 0; /* byte offset, relative to dofs, of the next store */
+
+ switch (type) {
+ case TCG_TYPE_V256:
+ /* Recall that ARM SVE allows vector sizes that are not a
+ * power of 2, but always a multiple of 16. The intent is
+ * that e.g. size == 80 would be expanded with 2x32 + 1x16.
+ */
+ for (; i + 32 <= oprsz; i += 32) { /* whole 32-byte stores only */
+ tcg_gen_stl_vec(t_vec, cpu_env, dofs + i, TCG_TYPE_V256);
+ }
+ /* fallthru */
+ case TCG_TYPE_V128:
+ for (; i + 16 <= oprsz; i += 16) { /* resumes at i after fallthru */
+ tcg_gen_stl_vec(t_vec, cpu_env, dofs + i, TCG_TYPE_V128);
+ }
+ break;
+ case TCG_TYPE_V64:
+ for (; i < oprsz; i += 8) { /* NOTE(review): assumes oprsz % 8 == 0 */
+ tcg_gen_stl_vec(t_vec, cpu_env, dofs + i, TCG_TYPE_V64);
+ }
+ break;
+ default:
+ g_assert_not_reached(); /* type must be V64, V128 or V256 */
+ }
+
+ if (oprsz < maxsz) { /* tail [oprsz, maxsz) is left to expand_clr */
+ expand_clr(dofs + oprsz, maxsz - oprsz); /* presumably zeroes it */
+ }
+}
+
/* Set OPRSZ bytes at DOFS to replications of IN_32, IN_64 or IN_C.
* Only one of IN_32 or IN_64 may be set;
* IN_C is used if IN_32 and IN_64 are unset.
@@ -429,49 +463,11 @@ static void do_dup(unsigned vece, uint32_t dofs, uint32_t oprsz,
} else if (in_64) {
tcg_gen_dup_i64_vec(vece, t_vec, in_64);
} else {
- switch (vece) {
- case MO_8:
- tcg_gen_dup8i_vec(t_vec, in_c);
- break;
- case MO_16:
- tcg_gen_dup16i_vec(t_vec, in_c);
- break;
- case MO_32:
- tcg_gen_dup32i_vec(t_vec, in_c);
- break;
- default:
- tcg_gen_dup64i_vec(t_vec, in_c);
- break;
- }
+ tcg_gen_dupi_vec(vece, t_vec, in_c);
}
-
- i = 0;
- switch (type) {
- case TCG_TYPE_V256:
- /* Recall that ARM SVE allows vector sizes that are not a
- * power of 2, but always a multiple of 16. The intent is
- * that e.g. size == 80 would be expanded with 2x32 + 1x16.
- */
- for (; i + 32 <= oprsz; i += 32) {
- tcg_gen_stl_vec(t_vec, cpu_env, dofs + i, TCG_TYPE_V256);
- }
- /* fallthru */
- case TCG_TYPE_V128:
- for (; i + 16 <= oprsz; i += 16) {
- tcg_gen_stl_vec(t_vec, cpu_env, dofs + i, TCG_TYPE_V128);
- }
- break;
- case TCG_TYPE_V64:
- for (; i < oprsz; i += 8) {
- tcg_gen_stl_vec(t_vec, cpu_env, dofs + i, TCG_TYPE_V64);
- }
- break;
- default:
- g_assert_not_reached();
- }
-
+ do_dup_store(type, dofs, oprsz, maxsz, t_vec);
tcg_temp_free_vec(t_vec);
- goto done;
+ return;
}
/* Otherwise, inline with an integer type, unless "large". */
@@ -1287,6 +1283,16 @@ void tcg_gen_gvec_dup_i64(unsigned vece, uint32_t dofs, uint32_t oprsz,
void tcg_gen_gvec_dup_mem(unsigned vece, uint32_t dofs, uint32_t aofs,
uint32_t oprsz, uint32_t maxsz)
{
+ if (TCG_TARGET_HAS_dupm_vec) {
+ TCGType type = choose_vector_type(INDEX_op_dupm_vec, vece, oprsz, 0);
+ if (type != 0) {
+ TCGv_vec t_vec = tcg_temp_new_vec(type);
+ tcg_gen_dup_mem_vec(vece, t_vec, cpu_env, aofs);
+ do_dup_store(type, dofs, oprsz, maxsz, t_vec);
+ tcg_temp_free_vec(t_vec);
+ return;
+ }
+ }
if (vece <= MO_32) {
TCGv_i32 in = tcg_temp_new_i32();
switch (vece) {
@@ -194,6 +194,17 @@ void tcg_gen_dup_i32_vec(unsigned vece, TCGv_vec r, TCGv_i32 a)
vec_gen_2(INDEX_op_dup_vec, type, vece, ri, ai);
}
+void tcg_gen_dup_mem_vec(unsigned vece, TCGv_vec r, TCGv_ptr b,
+ tcg_target_long ofs)
+{ /* Emit an INDEX_op_dupm_vec op with arguments r, b and ofs. */
+ TCGArg ri = tcgv_vec_arg(r);
+ TCGArg bi = tcgv_ptr_arg(b);
+ TCGTemp *rt = arg_temp(ri);
+ TCGType type = rt->base_type; /* op type is taken from r's temp */
+
+ vec_gen_3(INDEX_op_dupm_vec, type, vece, ri, bi, ofs);
+}
+
static void vec_gen_ldst(TCGOpcode opc, TCGv_vec r, TCGv_ptr b, TCGArg o)
{
TCGArg ri = tcgv_vec_arg(r);
@@ -1623,6 +1623,8 @@ bool tcg_op_supported(TCGOpcode op)
case INDEX_op_smax_vec:
case INDEX_op_umax_vec:
return have_vec && TCG_TARGET_HAS_minmax_vec;
+ case INDEX_op_dupm_vec:
+ return have_vec && TCG_TARGET_HAS_dupm_vec;
default:
tcg_debug_assert(op > INDEX_op_last_generic && op < NB_OPS);
Allow the backend to expand dup from memory directly, instead of
forcing the value into a temp first.  This is especially important
if integer/vector register moves do not exist.

Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/aarch64/tcg-target.h |  1 +
 tcg/i386/tcg-target.h    |  1 +
 tcg/ppc/tcg-target.h     |  1 +
 tcg/tcg-op.h             |  1 +
 tcg/tcg-opc.h            |  1 +
 tcg/tcg.h                |  1 +
 tcg/tcg-op-gvec.c        | 88 +++++++++++++++++++++-------------------
 tcg/tcg-op-vec.c         | 11 +++++
 tcg/tcg.c                |  2 +
 9 files changed, 66 insertions(+), 41 deletions(-)

-- 
2.17.2