@@ -1124,6 +1124,7 @@ DEF_HELPER_FLAGS_5(sve_ftmad_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32)
DEF_HELPER_FLAGS_5(sve_ftmad_d, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32)
DEF_HELPER_FLAGS_4(sve_ldr, TCG_CALL_NO_WG, void, env, ptr, tl, int)
+DEF_HELPER_FLAGS_4(sve_str, TCG_CALL_NO_WG, void, env, ptr, tl, int)
DEF_HELPER_FLAGS_4(sve_ld1bb_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32)
DEF_HELPER_FLAGS_4(sve_ld2bb_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32)
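For reference, the DEF_HELPER_FLAGS_4 line above is what makes the new out-of-line store helper callable from generated code. Going by the type letters used (env, ptr, tl, int), the macro should expand to roughly the prototype sketched below, matching the HELPER(sve_str) definition later in this patch; TCG_CALL_NO_WG marks the call as not writing any TCG globals, while still being allowed to fault on the guest store.

    /* Rough sketch of the prototype generated by DEF_HELPER_FLAGS_4 above. */
    void helper_sve_str(CPUARMState *env, void *vd, target_ulong addr, int size);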
@@ -4191,7 +4191,7 @@ static bool sve_probe_page(SVEHostPage *info, bool nofault,
}
/*
- * Load contiguous data, unpredicated.
+ * Load/store contiguous data, unpredicated.
*
* Note that unpredicated load/store of vector/predicate registers
* are defined as a stream of bytes, which equates to little-endian
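Concretely, the byte-stream definition means an N-byte register image stores the same bytes as N/8 little-endian 64-bit lanes. A minimal stand-alone sketch of that equivalence (the function is hypothetical, purely illustrative):

    #include <stdint.h>

    /* Illustration only: writing a vector register as a byte stream
     * produces the same bytes as little-endian 64-bit stores. */
    static void store_as_byte_stream(uint8_t *dst, const uint64_t *lanes, int n)
    {
        for (int i = 0; i < n; ++i) {
            for (int b = 0; b < 8; ++b) {
                dst[i * 8 + b] = lanes[i] >> (b * 8);  /* byte 0 = LSB */
            }
        }
    }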
@@ -4261,6 +4261,67 @@ void HELPER(sve_ldr)(CPUARMState *env, void *vd, target_ulong addr, int size)
}
}
+void HELPER(sve_str)(CPUARMState *env, void *vd, target_ulong addr, int size)
+{
+ int mem_idx = cpu_mmu_index(env, false);
+ int in_page = -((int)addr | TARGET_PAGE_MASK);
+ uintptr_t ra = GETPC();
+ uint64_t val;
+ void *host;
+ int i;
+
+ /* Small stores are expanded inline. */
+ tcg_debug_assert(size > 2 * 8);
+
+ if (likely(size <= in_page)) {
+ host = probe_write(env, addr, size, mem_idx, ra);
+ if (likely(host != NULL)) {
+ for (i = 0; i + 8 <= size; i += 8) {
+ stq_le_p(host + i, *(uint64_t *)(vd + i));
+ }
+
+ /* Predicate store length may be any multiple of 2. */
+ if (unlikely(i != size)) {
+ val = *(uint64_t *)(vd + i);
+ if (size & 4) {
+ stl_le_p(host + i, val);
+ i += 4;
+ val >>= 32;
+ }
+ if (size & 2) {
+ stw_le_p(host + i, val);
+ }
+ }
+ return;
+ }
+ } else {
+ (void)probe_write(env, addr, in_page, mem_idx, ra);
+ (void)probe_write(env, addr + in_page, size - in_page, mem_idx, ra);
+ }
+
+ /*
+ * Note there is no endian-specific target store function, so to handle
+ * aarch64_be-linux-user we need to bswap the big-endian store.
+ */
+ for (i = 0; i + 8 <= size; i += 8) {
+ val = *(uint64_t *)(vd + i);
+ cpu_stq_data_ra(env, addr + i, le_bswap(val, 64), ra);
+ }
+
+ /* Predicate store length may be any multiple of 2. */
+ if (unlikely(i != size)) {
+ val = *(uint64_t *)(vd + i);
+ if (size & 4) {
+ cpu_stl_data_ra(env, addr + i, le_bswap(val, 32), ra);
+ i += 4;
+ val >>= 32;
+ }
+ if (size & 2) {
+ cpu_stw_data_ra(env, addr + i, le_bswap(val, 16), ra);
+ }
+ }
+}
+
/*
* Analyse contiguous data, protected by a governing predicate.
*/
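Two details of the helper above deserve a note. First, in_page = -((int)addr | TARGET_PAGE_MASK) is the number of bytes from addr to the end of its page: OR-ing with the negative page mask keeps the page-offset bits with all sign bits set, and negating yields the remaining length. Second, when the store crosses a page boundary, both pages are probed with probe_write before any byte is written, so the access faults cleanly instead of stopping half way through the first page. A stand-alone check of the in_page arithmetic (PAGE_MASK is a local stand-in for TARGET_PAGE_MASK with 4 KiB pages):

    #include <assert.h>
    #include <stdint.h>

    #define PAGE_MASK (-4096)   /* stand-in for TARGET_PAGE_MASK */

    int main(void)
    {
        uint64_t addr = 0x12345ff0;             /* 16 bytes before a page end */
        int in_page = -((int)addr | PAGE_MASK);
        assert(in_page == 16);                  /* 0x1000 - 0xff0 */
        return 0;
    }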
@@ -4430,78 +4430,52 @@ static void do_str(DisasContext *s, uint32_t vofs, int len, int rn, int imm)
int len_remain = len % 8;
int nparts = len / 8 + ctpop8(len_remain);
int midx = get_mem_index(s);
- TCGv_i64 addr, t0;
+ TCGv_i64 dirty_addr, clean_addr, t0;
+ int i;
+
+ dirty_addr = read_cpu_reg_sp(s, rn, true);
+ tcg_gen_addi_i64(dirty_addr, dirty_addr, imm);
+
+ clean_addr = gen_mte_checkN(s, dirty_addr, true, rn != 31, len, MO_8);
+
+ /* Limit tcg code expansion by doing large stores out of line. */
+ if (nparts > 4) {
+ TCGv_ptr t_rd = tcg_temp_new_ptr();
+ TCGv_i32 t_len = tcg_const_i32(len);
+
+ tcg_gen_addi_ptr(t_rd, cpu_env, vofs);
+ gen_helper_sve_str(cpu_env, t_rd, clean_addr, t_len);
+ tcg_temp_free_ptr(t_rd);
+ tcg_temp_free_i32(t_len);
+ return;
+ }
- addr = tcg_temp_new_i64();
t0 = tcg_temp_new_i64();
-
- /* Note that unpredicated load/store of vector/predicate registers
- * are defined as a stream of bytes, which equates to little-endian
- * operations on larger quantities. There is no nice way to force
- * a little-endian store for aarch64_be-linux-user out of line.
- *
- * Attempt to keep code expansion to a minimum by limiting the
- * amount of unrolling done.
- */
- if (nparts <= 4) {
- int i;
-
- for (i = 0; i < len_align; i += 8) {
- tcg_gen_ld_i64(t0, cpu_env, vofs + i);
- tcg_gen_addi_i64(addr, cpu_reg_sp(s, rn), imm + i);
- tcg_gen_qemu_st_i64(t0, addr, midx, MO_LEQ);
- }
- } else {
- TCGLabel *loop = gen_new_label();
- TCGv_ptr t2, i = tcg_const_local_ptr(0);
-
- gen_set_label(loop);
-
- t2 = tcg_temp_new_ptr();
- tcg_gen_add_ptr(t2, cpu_env, i);
- tcg_gen_ld_i64(t0, t2, vofs);
-
- /* Minimize the number of local temps that must be re-read from
- * the stack each iteration. Instead, re-compute values other
- * than the loop counter.
- */
- tcg_gen_addi_ptr(t2, i, imm);
- tcg_gen_extu_ptr_i64(addr, t2);
- tcg_gen_add_i64(addr, addr, cpu_reg_sp(s, rn));
- tcg_temp_free_ptr(t2);
-
- tcg_gen_qemu_st_i64(t0, addr, midx, MO_LEQ);
-
- tcg_gen_addi_ptr(i, i, 8);
-
- tcg_gen_brcondi_ptr(TCG_COND_LTU, i, len_align, loop);
- tcg_temp_free_ptr(i);
+ for (i = 0; i < len_align; i += 8) {
+ tcg_gen_ld_i64(t0, cpu_env, vofs + i);
+ tcg_gen_qemu_st_i64(t0, clean_addr, midx, MO_LEQ);
+ tcg_gen_addi_i64(clean_addr, clean_addr, 8);
}
/* Predicate register stores can be any multiple of 2. */
if (len_remain) {
tcg_gen_ld_i64(t0, cpu_env, vofs + len_align);
- tcg_gen_addi_i64(addr, cpu_reg_sp(s, rn), imm + len_align);
-
switch (len_remain) {
- case 2:
- case 4:
- case 8:
- tcg_gen_qemu_st_i64(t0, addr, midx, MO_LE | ctz32(len_remain));
- break;
-
case 6:
- tcg_gen_qemu_st_i64(t0, addr, midx, MO_LEUL);
- tcg_gen_addi_i64(addr, addr, 4);
+ tcg_gen_qemu_st_i64(t0, clean_addr, midx, MO_LEUL);
+ tcg_gen_addi_i64(clean_addr, clean_addr, 4);
tcg_gen_shri_i64(t0, t0, 32);
- tcg_gen_qemu_st_i64(t0, addr, midx, MO_LEUW);
+ /* fall through */
+ case 2:
+ tcg_gen_qemu_st_i64(t0, clean_addr, midx, MO_LEUW);
+ break;
+ case 4:
+ tcg_gen_qemu_st_i64(t0, clean_addr, midx, MO_LEUL);
break;
-
default:
g_assert_not_reached();
}
}
- tcg_temp_free_i64(addr);
tcg_temp_free_i64(t0);
}
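To see how the nparts > 4 threshold in do_str behaves across vector lengths: len is the transfer size in bytes, len_remain its sub-8-byte tail, and nparts the number of stores the inline path would emit. A worked example, with ctpop8 re-implemented locally for the sketch:

    #include <stdio.h>

    static int ctpop8(int n) { return __builtin_popcount(n & 0xff); }

    int main(void)
    {
        int lens[] = { 32, 64, 6 };  /* 256-bit Z, 512-bit Z, 48-bit P */
        for (int i = 0; i < 3; ++i) {
            int len = lens[i], len_remain = len % 8;
            int nparts = len / 8 + ctpop8(len_remain);
            printf("len=%d nparts=%d -> %s\n", len, nparts,
                   nparts > 4 ? "gen_helper_sve_str" : "inline");
        }
        return 0;
    }

So a 256-bit Z register (len = 32, nparts = 4) still unrolls inline, a 512-bit one (len = 64, nparts = 8) takes the new helper, and a 6-byte predicate store becomes a 4-byte plus a 2-byte store (nparts = 2) handled by the fall-through cases in the len_remain switch.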
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 target/arm/helper-sve.h    |  1 +
 target/arm/sve_helper.c    | 63 ++++++++++++++++++++++++++-
 target/arm/translate-sve.c | 88 ++++++++++++++------------------------
 3 files changed, 94 insertions(+), 58 deletions(-)

-- 
2.20.1