[v1,16/19] target/arm: Relax ordered/atomic alignment checks for LSE2

Message ID: 20230216030854.1212208-17-richard.henderson@linaro.org
State: New
Series: target/arm: Implement FEAT_LSE2

Commit Message

Richard Henderson Feb. 16, 2023, 3:08 a.m. UTC
FEAT_LSE2 only requires that atomic operations not cross a
16-byte boundary.  Ordered operations may be completely
unaligned if SCTLR.nAA is set.

Because this alignment check is so special, do it by hand.
Make sure not to keep TCG temps live across the branch.

Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 target/arm/helper-a64.h    |  3 ++
 target/arm/helper-a64.c    |  7 +++
 target/arm/translate-a64.c | 95 ++++++++++++++++++++++++++++++++------
 3 files changed, 92 insertions(+), 13 deletions(-)
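
For reference, the boundary check that the new check_lse2_align() emits
boils down to simple arithmetic: the offset of the access within its
16-byte granule, plus the access size, must not exceed 16.  The C sketch
below is illustrative only, not code from the patch; the helper name
lse2_crosses_16() is invented for the example.

    #include <stdbool.h>
    #include <stdint.h>

    /*
     * Illustrative only: does an access of 'size' bytes at 'addr + imm'
     * cross a 16-byte boundary?  This mirrors the TCG ops emitted by
     * check_lse2_align(): ((addr + imm) & 15) + size > 16.
     */
    static bool lse2_crosses_16(uint64_t addr, int imm, unsigned size)
    {
        return ((addr + imm) & 15) + size > 16;
    }

    /*
     * Example: a 4-byte access whose address has low bits 0xe spans
     * bytes 14..17 of the granule, so it crosses and must fault;
     * at 0xc it spans bytes 12..15 and is allowed.
     */

Note that 16-byte (MO_128) accesses still take the MO_ALIGN path in
check_atomic_align()/check_ordered_align() below, so this by-hand check
only ever sees sizes up to 8 bytes.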

Comments

Peter Maydell Feb. 23, 2023, 4:49 p.m. UTC | #1
On Thu, 16 Feb 2023 at 03:09, Richard Henderson
<richard.henderson@linaro.org> wrote:
>
> FEAT_LSE2 only requires that atomic operations not cross a
> 16-byte boundary.  Ordered operations may be completely
> unaligned if SCTLR.nAA is set.
>
> Because this alignment check is so special, do it by hand.
> Make sure not to keep TCG temps live across the branch.
>
> Signed-off-by: Richard Henderson <richard.henderson@linaro.org>


> +static void check_lse2_align(DisasContext *s, int rn, int imm,
> +                             bool is_write, MemOp mop)
> +{
> +    TCGv_i32 tmp;
> +    TCGv_i64 addr;
> +    TCGLabel *over_label;
> +    MMUAccessType type;
> +    int mmu_idx;
> +
> +    tmp = tcg_temp_new_i32();
> +    tcg_gen_extrl_i64_i32(tmp, cpu_reg_sp(s, rn));
> +    tcg_gen_addi_i32(tmp, tmp, imm & 15);
> +    tcg_gen_andi_i32(tmp, tmp, 15);
> +    tcg_gen_addi_i32(tmp, tmp, memop_size(mop));
> +
> +    over_label = gen_new_label();
> +    tcg_gen_brcond_i32(TCG_COND_LEU, tmp, tcg_constant_i32(16), over_label);

This brcond ends the basic block and destroys the content
of TCG temporaries, which is bad because some of the
callsites have set some of those up before calling this
function (eg gen_compare_and_swap() has called cpu_reg()
which might have created and initialized a temporary
for xZR).

Using a brcond anywhere other than directly in a top level
function where you can see it and work around this awkward
property seems rather fragile.

(Ideally there would be a variant of brcond that didn't
trash temporaries, because almost all the time that is
an annoying hazard rather than a useful property.)
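
To make the hazard concrete, a minimal sketch of the problematic pattern
under the TCG rules of the time is below (illustrative only, not code from
the patch; cond_reg and dest_reg stand in for values the caller already
holds):

    /* A temporary is created and initialized before the branch... */
    TCGv_i64 val = tcg_temp_new_i64();
    tcg_gen_movi_i64(val, 42);

    TCGLabel *skip = gen_new_label();
    /* ...the brcond ends the basic block, killing non-global temps... */
    tcg_gen_brcondi_i64(TCG_COND_EQ, cond_reg, 0, skip);

    gen_set_label(skip);
    /* ...so this use of 'val' reads an undefined value. */
    tcg_gen_mov_i64(dest_reg, val);

This is exactly the shape of a callsite that sets up registers with
cpu_reg() and then calls check_lse2_align() before using them.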

> +    tcg_temp_free_i32(tmp);
> +
> +    addr = tcg_temp_new_i64();
> +    tcg_gen_addi_i64(addr, cpu_reg_sp(s, rn), imm);
> +
> +    type = is_write ? MMU_DATA_STORE : MMU_DATA_LOAD;
> +    mmu_idx = get_mem_index(s);
> +    gen_helper_unaligned_access(cpu_env, addr, tcg_constant_i32(type),
> +                                tcg_constant_i32(mmu_idx));
> +    tcg_temp_free_i64(addr);
> +
> +    gen_set_label(over_label);
> +
> +}

thanks
-- PMM
Richard Henderson Feb. 23, 2023, 5:16 p.m. UTC | #2
On 2/23/23 06:49, Peter Maydell wrote:
> On Thu, 16 Feb 2023 at 03:09, Richard Henderson
> <richard.henderson@linaro.org> wrote:
>>
>> FEAT_LSE2 only requires that atomic operations not cross a
>> 16-byte boundary.  Ordered operations may be completely
>> unaligned if SCTLR.nAA is set.
>>
>> Because this alignment check is so special, do it by hand.
>> Make sure not to keep TCG temps live across the branch.
>>
>> Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
> 
> 
>> +static void check_lse2_align(DisasContext *s, int rn, int imm,
>> +                             bool is_write, MemOp mop)
>> +{
>> +    TCGv_i32 tmp;
>> +    TCGv_i64 addr;
>> +    TCGLabel *over_label;
>> +    MMUAccessType type;
>> +    int mmu_idx;
>> +
>> +    tmp = tcg_temp_new_i32();
>> +    tcg_gen_extrl_i64_i32(tmp, cpu_reg_sp(s, rn));
>> +    tcg_gen_addi_i32(tmp, tmp, imm & 15);
>> +    tcg_gen_andi_i32(tmp, tmp, 15);
>> +    tcg_gen_addi_i32(tmp, tmp, memop_size(mop));
>> +
>> +    over_label = gen_new_label();
>> +    tcg_gen_brcond_i32(TCG_COND_LEU, tmp, tcg_constant_i32(16), over_label);
> 
> This brcond ends the basic block and destroys the content
> of TCG temporaries, which is bad because some of the
> callsites have set some of those up before calling this
> function (eg gen_compare_and_swap() has called cpu_reg()
> which might have created and initialized a temporary
> for xZR).

xzr uses tcg_constant_i64(), which has no lifetime issues.

> 
> Using a brcond anywhere other than directly in a top level
> function where you can see it and work around this awkward
> property seems rather fragile.
> 
> (Ideally there would be a variant of brcond that didn't
> trash temporaries, because almost all the time that is
> an annoying hazard rather than a useful property.)

I've cc'd you on a patch set that fixes all the temporary lifetime stuff.

v1: https://patchew.org/QEMU/20230130205935.1157347-1-richard.henderson@linaro.org/
v2: https://patchew.org/QEMU/20230222232715.15034-1-richard.henderson@linaro.org/


r~
Peter Maydell Feb. 23, 2023, 5:22 p.m. UTC | #3
On Thu, 23 Feb 2023 at 17:16, Richard Henderson
<richard.henderson@linaro.org> wrote:
>
> On 2/23/23 06:49, Peter Maydell wrote:
> > On Thu, 16 Feb 2023 at 03:09, Richard Henderson
> > <richard.henderson@linaro.org> wrote:
> >>
> >> FEAT_LSE2 only requires that atomic operations not cross a
> >> 16-byte boundary.  Ordered operations may be completely
> >> unaligned if SCTLR.nAA is set.
> >>
> >> Because this alignment check is so special, do it by hand.
> >> Make sure not to keep TCG temps live across the branch.
> >>
> >> Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
> >
> >
> >> +static void check_lse2_align(DisasContext *s, int rn, int imm,
> >> +                             bool is_write, MemOp mop)
> >> +{
> >> +    TCGv_i32 tmp;
> >> +    TCGv_i64 addr;
> >> +    TCGLabel *over_label;
> >> +    MMUAccessType type;
> >> +    int mmu_idx;
> >> +
> >> +    tmp = tcg_temp_new_i32();
> >> +    tcg_gen_extrl_i64_i32(tmp, cpu_reg_sp(s, rn));
> >> +    tcg_gen_addi_i32(tmp, tmp, imm & 15);
> >> +    tcg_gen_andi_i32(tmp, tmp, 15);
> >> +    tcg_gen_addi_i32(tmp, tmp, memop_size(mop));
> >> +
> >> +    over_label = gen_new_label();
> >> +    tcg_gen_brcond_i32(TCG_COND_LEU, tmp, tcg_constant_i32(16), over_label);
> >
> > This brcond ends the basic block and destroys the content
> > of TCG temporaries, which is bad because some of the
> > callsites have set some of those up before calling this
> > function (eg gen_compare_and_swap() has called cpu_reg()
> > which might have created and initialized a temporary
> > for xZR).
>
> xzr uses tcg_constant_i64(), which has no lifetime issues.

Hmm? cpu_reg() calls new_tmp_a64_zero() calls new_tmp_a64()
calls tcg_temp_new_i64(). What am I missing ?
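
For readers without the tree to hand, the call chain being referred to
looked roughly like this in translate-a64.c at the time; this is a
paraphrased sketch, not a verbatim quote of the source:

    TCGv_i64 new_tmp_a64(DisasContext *s)
    {
        assert(s->tmp_a64_count < TMP_A64_MAX);
        return s->tmp_a64[s->tmp_a64_count++] = tcg_temp_new_i64();
    }

    TCGv_i64 new_tmp_a64_zero(DisasContext *s)
    {
        TCGv_i64 t = new_tmp_a64(s);
        tcg_gen_movi_i64(t, 0);
        return t;
    }

    TCGv_i64 cpu_reg(DisasContext *s, int reg)
    {
        if (reg == 31) {
            return new_tmp_a64_zero(s);
        } else {
            return cpu_X[reg];
        }
    }

So cpu_reg(s, 31) for xZR really does hand back a freshly allocated,
zero-initialized temporary rather than a tcg_constant_i64(0), which is
what makes the brcond hazard above a real one.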

> I've cc'd you on a patch set that fixes all the temporary lifetime stuff.
>
> v1: https://patchew.org/QEMU/20230130205935.1157347-1-richard.henderson@linaro.org/
> v2: https://patchew.org/QEMU/20230222232715.15034-1-richard.henderson@linaro.org/

Cool!

thanks
-- PMM
Richard Henderson Feb. 23, 2023, 5:27 p.m. UTC | #4
On 2/23/23 07:22, Peter Maydell wrote:
> On Thu, 23 Feb 2023 at 17:16, Richard Henderson
>> xzr uses tcg_constant_i64(), which has no lifetime issues.
> 
> Hmm? cpu_reg() calls new_tmp_a64_zero() calls new_tmp_a64()
> calls tcg_temp_new_i64(). What am I missing ?

Whoops, bad memory vs other target/foo/ -- aarch64 doesn't have separate 
get-register-for-source and get-register-for-destination accessors.


r~

Patch

diff --git a/target/arm/helper-a64.h b/target/arm/helper-a64.h
index ff56807247..3d5957c11f 100644
--- a/target/arm/helper-a64.h
+++ b/target/arm/helper-a64.h
@@ -110,3 +110,6 @@  DEF_HELPER_FLAGS_2(st2g_stub, TCG_CALL_NO_WG, void, env, i64)
 DEF_HELPER_FLAGS_2(ldgm, TCG_CALL_NO_WG, i64, env, i64)
 DEF_HELPER_FLAGS_3(stgm, TCG_CALL_NO_WG, void, env, i64, i64)
 DEF_HELPER_FLAGS_3(stzgm_tags, TCG_CALL_NO_WG, void, env, i64, i64)
+
+DEF_HELPER_FLAGS_4(unaligned_access, TCG_CALL_NO_WG,
+                   noreturn, env, i64, i32, i32)
diff --git a/target/arm/helper-a64.c b/target/arm/helper-a64.c
index 0972a4bdd0..abbe3f7077 100644
--- a/target/arm/helper-a64.c
+++ b/target/arm/helper-a64.c
@@ -952,3 +952,10 @@  void HELPER(dc_zva)(CPUARMState *env, uint64_t vaddr_in)
 
     memset(mem, 0, blocklen);
 }
+
+void HELPER(unaligned_access)(CPUARMState *env, uint64_t addr,
+                              uint32_t access_type, uint32_t mmu_idx)
+{
+    arm_cpu_do_unaligned_access(env_cpu(env), addr, access_type,
+                                mmu_idx, GETPC());
+}
diff --git a/target/arm/translate-a64.c b/target/arm/translate-a64.c
index 56c9d63664..78103f723d 100644
--- a/target/arm/translate-a64.c
+++ b/target/arm/translate-a64.c
@@ -310,6 +310,77 @@  TCGv_i64 gen_mte_checkN(DisasContext *s, TCGv_i64 addr, bool is_write,
     return clean_data_tbi(s, addr);
 }
 
+/*
+ * Generate the special alignment check that applies to AccType_ATOMIC
+ * and AccType_ORDERED insns under FEAT_LSE2: the access need not be
+ * naturally aligned, but it must not cross a 16-byte boundary.
+ * See AArch64.CheckAlignment().
+ */
+static void check_lse2_align(DisasContext *s, int rn, int imm,
+                             bool is_write, MemOp mop)
+{
+    TCGv_i32 tmp;
+    TCGv_i64 addr;
+    TCGLabel *over_label;
+    MMUAccessType type;
+    int mmu_idx;
+
+    tmp = tcg_temp_new_i32();
+    tcg_gen_extrl_i64_i32(tmp, cpu_reg_sp(s, rn));
+    tcg_gen_addi_i32(tmp, tmp, imm & 15);
+    tcg_gen_andi_i32(tmp, tmp, 15);
+    tcg_gen_addi_i32(tmp, tmp, memop_size(mop));
+
+    over_label = gen_new_label();
+    tcg_gen_brcond_i32(TCG_COND_LEU, tmp, tcg_constant_i32(16), over_label);
+    tcg_temp_free_i32(tmp);
+
+    addr = tcg_temp_new_i64();
+    tcg_gen_addi_i64(addr, cpu_reg_sp(s, rn), imm);
+
+    type = is_write ? MMU_DATA_STORE : MMU_DATA_LOAD;
+    mmu_idx = get_mem_index(s);
+    gen_helper_unaligned_access(cpu_env, addr, tcg_constant_i32(type),
+                                tcg_constant_i32(mmu_idx));
+    tcg_temp_free_i64(addr);
+
+    gen_set_label(over_label);
+
+}
+
+/* Handle the alignment check for AccType_ATOMIC instructions. */
+static MemOp check_atomic_align(DisasContext *s, int rn, MemOp mop)
+{
+    MemOp size = mop & MO_SIZE;
+
+    if (size == MO_8) {
+        return mop;
+    }
+    if (size >= MO_128 || !dc_isar_feature(aa64_lse2, s)) {
+        return mop | MO_ALIGN | s->be_data;
+    }
+    check_lse2_align(s, rn, 0, true, mop);
+    return mop | s->be_data;
+}
+
+/* Handle the alignment check for AccType_ORDERED instructions. */
+static MemOp check_ordered_align(DisasContext *s, int rn, int imm,
+                                 bool is_write, MemOp mop)
+{
+    MemOp size = mop & MO_SIZE;
+
+    if (size == MO_8) {
+        return mop;
+    }
+    if (size >= MO_128 || !dc_isar_feature(aa64_lse2, s)) {
+        return mop | MO_ALIGN | s->be_data;
+    }
+    if (!s->naa) {
+        check_lse2_align(s, rn, imm, is_write, mop);
+    }
+    return mop | s->be_data;
+}
+
 typedef struct DisasCompare64 {
     TCGCond cond;
     TCGv_i64 value;
@@ -2525,8 +2596,7 @@  static void gen_load_exclusive(DisasContext *s, int rt, int rt2, int rn,
     if (memop == MO_128) {
         memop |= MO_ATMAX_8;
     }
-    memop |= MO_ALIGN;
-    memop = finalize_memop(s, memop);
+    memop = check_atomic_align(s, rn, memop);
 
     s->is_ldex = true;
     dirty_addr = cpu_reg_sp(s, rn);
@@ -2663,7 +2733,7 @@  static void gen_compare_and_swap(DisasContext *s, int rs, int rt,
     if (rn == 31) {
         gen_check_sp_alignment(s);
     }
-    memop = finalize_memop(s, size | MO_ALIGN);
+    memop = check_atomic_align(s, rn, size);
     clean_addr = gen_mte_check1(s, cpu_reg_sp(s, rn), true, rn != 31, memop);
     tcg_gen_atomic_cmpxchg_i64(tcg_rs, clean_addr, tcg_rs, tcg_rt,
                                memidx, memop);
@@ -2685,7 +2755,7 @@  static void gen_compare_and_swap_pair(DisasContext *s, int rs, int rt,
     }
 
     /* This is a single atomic access, despite the "pair". */
-    memop = finalize_memop(s, (size + 1) | MO_ALIGN);
+    memop = check_atomic_align(s, rn, size + 1);
     clean_addr = gen_mte_check1(s, cpu_reg_sp(s, rn), true, rn != 31, memop);
 
     if (size == 2) {
@@ -2809,8 +2879,7 @@  static void disas_ldst_excl(DisasContext *s, uint32_t insn)
             gen_check_sp_alignment(s);
         }
         tcg_gen_mb(TCG_MO_ALL | TCG_BAR_STRL);
-        /* TODO: ARMv8.4-LSE SCTLR.nAA */
-        memop = finalize_memop(s, size | MO_ALIGN);
+        memop = check_ordered_align(s, rn, 0, true, size);
         clean_addr = gen_mte_check1(s, cpu_reg_sp(s, rn),
                                     true, rn != 31, memop);
         do_gpr_st(s, cpu_reg(s, rt), clean_addr, memop, true, rt,
@@ -2828,8 +2897,7 @@  static void disas_ldst_excl(DisasContext *s, uint32_t insn)
         if (rn == 31) {
             gen_check_sp_alignment(s);
         }
-        /* TODO: ARMv8.4-LSE SCTLR.nAA */
-        memop = finalize_memop(s, size | MO_ALIGN);
+        memop = check_ordered_align(s, rn, 0, false, size);
         clean_addr = gen_mte_check1(s, cpu_reg_sp(s, rn),
                                     false, rn != 31, memop);
         do_gpr_ld(s, cpu_reg(s, rt), clean_addr, memop, false, true,
@@ -3510,7 +3578,7 @@  static void disas_ldst_atomic(DisasContext *s, uint32_t insn,
     bool a = extract32(insn, 23, 1);
     TCGv_i64 tcg_rs, tcg_rt, clean_addr;
     AtomicThreeOpFn *fn = NULL;
-    MemOp mop = finalize_memop(s, size | MO_ALIGN);
+    MemOp mop = size;
 
     if (is_vector || !dc_isar_feature(aa64_atomics, s)) {
         unallocated_encoding(s);
@@ -3561,6 +3629,8 @@  static void disas_ldst_atomic(DisasContext *s, uint32_t insn,
     if (rn == 31) {
         gen_check_sp_alignment(s);
     }
+
+    mop = check_atomic_align(s, rn, mop);
     clean_addr = gen_mte_check1(s, cpu_reg_sp(s, rn), false, rn != 31, mop);
 
     if (o3_opc == 014) {
@@ -3685,16 +3755,13 @@  static void disas_ldst_ldapr_stlr(DisasContext *s, uint32_t insn)
     bool is_store = false;
     bool extend = false;
     bool iss_sf;
-    MemOp mop;
+    MemOp mop = size;
 
     if (!dc_isar_feature(aa64_rcpc_8_4, s)) {
         unallocated_encoding(s);
         return;
     }
 
-    /* TODO: ARMv8.4-LSE SCTLR.nAA */
-    mop = finalize_memop(s, size | MO_ALIGN);
-
     switch (opc) {
     case 0: /* STLURB */
         is_store = true;
@@ -3726,6 +3793,8 @@  static void disas_ldst_ldapr_stlr(DisasContext *s, uint32_t insn)
         gen_check_sp_alignment(s);
     }
 
+    mop = check_ordered_align(s, rn, offset, is_store, mop);
+
     dirty_addr = read_cpu_reg_sp(s, rn, 1);
     tcg_gen_addi_i64(dirty_addr, dirty_addr, offset);
     clean_addr = clean_data_tbi(s, dirty_addr);
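
Taken together, an informal summary of the alignment requirement the new
helpers impose (assuming FEAT_LSE2 is present; without it, every access
wider than a byte keeps MO_ALIGN as before):

    access type   size (bytes)   SCTLR.nAA   requirement
    ATOMIC        1              any         none
    ATOMIC        2..8           any         must not cross a 16-byte boundary
    ATOMIC        16             any         natural alignment (MO_ALIGN)
    ORDERED       1              any         none
    ORDERED       2..8           0           must not cross a 16-byte boundary
    ORDERED       2..8           1           none
    ORDERED       16             any         natural alignment (MO_ALIGN)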