| Message ID | 20230606094814.3581397-27-peter.maydell@linaro.org |
|---|---|
| State | Accepted |
| Commit | e6dd5e782becfe6d51f3575c086f5bd7162421d0 |
| Series | [PULL,01/42] arm: move KVM breakpoints helpers |
On Tue, 6 Jun 2023 10:47:58 +0100
Peter Maydell <peter.maydell@linaro.org> wrote:

> From: Richard Henderson <richard.henderson@linaro.org>
>
> Round len_align to 16 instead of 8, handling an odd 8-byte as part
> of the tail. Use MO_ATOM_NONE to indicate that all of these memory
> ops have only byte atomicity.
>
> Reviewed-by: Peter Maydell <peter.maydell@linaro.org>
> Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
> Message-id: 20230530191438.411344-8-richard.henderson@linaro.org
> Signed-off-by: Peter Maydell <peter.maydell@linaro.org>

Early in debugging, but a git bisect pointed at this patch causing:

ERROR:../../tcg/tcg.c:4317:temp_load: code should not be reached
Bail out! ERROR:../../tcg/tcg.c:4317:temp_load: code should not be reached
Aborted

just after "Run /sbin/init", on arm64 / virt running on an x86 host,
cpu max.

Reverting just this patch results in length check failures, so I
reverted as follows:

revert: Revert "target/arm: Use tcg_gen_qemu_{ld, st}_i128 in gen_sve_{ld, st}r"
revert: Revert "target/arm: Sink gen_mte_check1 into load/store_exclusive"
revert: Revert "target/arm: Load/store integer pair with one tcg operation"
revert: Revert "target/arm: Hoist finalize_memop out of do_gpr_{ld, st}"
revert: Revert "target/arm: Hoist finalize_memop out of do_fp_{ld, st}"
revert: Revert "target/arm: Pass memop to gen_mte_check1*"
revert: Revert "target/arm: Pass single_memop to gen_mte_checkN"
revert: Revert "target/arm: Check alignment in helper_mte_check"
revert: Revert "target/arm: Add SCTLR.nAA to TBFLAG_A64"
revert: Revert "target/arm: Relax ordered/atomic alignment checks for LSE2"
revert: Revert "target/arm: Move mte check for store-exclusive"

and all is good - obviously that's probably massive overkill.

Jonathan

> ---
>  target/arm/tcg/translate-sve.c | 95 +++++++++++++++++++++++++---------
>  1 file changed, 70 insertions(+), 25 deletions(-)
On 12/06/2023 16:20, Jonathan Cameron via wrote:

> On Tue, 6 Jun 2023 10:47:58 +0100
> Peter Maydell <peter.maydell@linaro.org> wrote:
>
>> From: Richard Henderson <richard.henderson@linaro.org>
>>
>> Round len_align to 16 instead of 8, handling an odd 8-byte as part
>> of the tail. Use MO_ATOM_NONE to indicate that all of these memory
>> ops have only byte atomicity.
>>
>> Reviewed-by: Peter Maydell <peter.maydell@linaro.org>
>> Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
>> Message-id: 20230530191438.411344-8-richard.henderson@linaro.org
>> Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
>
> Early in debugging, but a git bisect pointed at this patch causing:
>
> ERROR:../../tcg/tcg.c:4317:temp_load: code should not be reached
> Bail out! ERROR:../../tcg/tcg.c:4317:temp_load: code should not be reached
> Aborted
>
> just after "Run /sbin/init", on arm64 / virt running on an x86 host,
> cpu max.
>
> Reverting just this patch results in length check failures, so I
> reverted as follows:
>
> revert: Revert "target/arm: Use tcg_gen_qemu_{ld, st}_i128 in gen_sve_{ld, st}r"
> revert: Revert "target/arm: Sink gen_mte_check1 into load/store_exclusive"
> revert: Revert "target/arm: Load/store integer pair with one tcg operation"
> revert: Revert "target/arm: Hoist finalize_memop out of do_gpr_{ld, st}"
> revert: Revert "target/arm: Hoist finalize_memop out of do_fp_{ld, st}"
> revert: Revert "target/arm: Pass memop to gen_mte_check1*"
> revert: Revert "target/arm: Pass single_memop to gen_mte_checkN"
> revert: Revert "target/arm: Check alignment in helper_mte_check"
> revert: Revert "target/arm: Add SCTLR.nAA to TBFLAG_A64"
> revert: Revert "target/arm: Relax ordered/atomic alignment checks for LSE2"
> revert: Revert "target/arm: Move mte check for store-exclusive"
>
> and all is good - obviously that's probably massive overkill.
>
> Jonathan

Could this possibly be the same as
https://gitlab.com/qemu-project/qemu/-/issues/1704?

ATB,

Mark.
On Mon, 12 Jun 2023 19:40:29 +0100
Mark Cave-Ayland <mark.cave-ayland@ilande.co.uk> wrote:

> On 12/06/2023 16:20, Jonathan Cameron via wrote:
>
>> On Tue, 6 Jun 2023 10:47:58 +0100
>> Peter Maydell <peter.maydell@linaro.org> wrote:
>>
>>> From: Richard Henderson <richard.henderson@linaro.org>
>>>
>>> Round len_align to 16 instead of 8, handling an odd 8-byte as part
>>> of the tail. Use MO_ATOM_NONE to indicate that all of these memory
>>> ops have only byte atomicity.
>>>
>>> Reviewed-by: Peter Maydell <peter.maydell@linaro.org>
>>> Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
>>> Message-id: 20230530191438.411344-8-richard.henderson@linaro.org
>>> Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
>>
>> Early in debugging, but a git bisect pointed at this patch causing:
>>
>> ERROR:../../tcg/tcg.c:4317:temp_load: code should not be reached
>> Bail out! ERROR:../../tcg/tcg.c:4317:temp_load: code should not be reached
>> Aborted
>>
>> just after "Run /sbin/init", on arm64 / virt running on an x86 host,
>> cpu max.
>>
>> Reverting just this patch results in length check failures, so I
>> reverted as follows:
>>
>> revert: Revert "target/arm: Use tcg_gen_qemu_{ld, st}_i128 in gen_sve_{ld, st}r"
>> revert: Revert "target/arm: Sink gen_mte_check1 into load/store_exclusive"
>> revert: Revert "target/arm: Load/store integer pair with one tcg operation"
>> revert: Revert "target/arm: Hoist finalize_memop out of do_gpr_{ld, st}"
>> revert: Revert "target/arm: Hoist finalize_memop out of do_fp_{ld, st}"
>> revert: Revert "target/arm: Pass memop to gen_mte_check1*"
>> revert: Revert "target/arm: Pass single_memop to gen_mte_checkN"
>> revert: Revert "target/arm: Check alignment in helper_mte_check"
>> revert: Revert "target/arm: Add SCTLR.nAA to TBFLAG_A64"
>> revert: Revert "target/arm: Relax ordered/atomic alignment checks for LSE2"
>> revert: Revert "target/arm: Move mte check for store-exclusive"
>>
>> and all is good - obviously that's probably massive overkill.
>>
>> Jonathan
>
> Could this possibly be the same as
> https://gitlab.com/qemu-project/qemu/-/issues/1704?

Seems likely. I'll watch that issue for a resolution. Thanks!

Jonathan

> ATB,
>
> Mark.
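On the abort message itself: temp_load() in tcg/tcg.c materializes a temporary into a host register according to the temp's value state, and its "code should not be reached" branch fires for a temp that is consumed as a source operand before anything was ever written to it. A toy model of that dispatch (illustrative only; the names mirror tcg.c but this is not the QEMU source):

```c
#include <stdio.h>
#include <stdlib.h>

/* Value states a TCG temp can be in, modelled on tcg/tcg.c. */
typedef enum {
    TEMP_VAL_DEAD,   /* never written, or value discarded */
    TEMP_VAL_REG,    /* live in a host register */
    TEMP_VAL_MEM,    /* spilled to a stack slot */
    TEMP_VAL_CONST,  /* known constant */
} TCGTempVal;

/* Toy temp_load(): fetch a temp for use as a source operand. */
static void temp_load(TCGTempVal state)
{
    switch (state) {
    case TEMP_VAL_REG:    /* already in a register */
    case TEMP_VAL_MEM:    /* reload from the spill slot */
    case TEMP_VAL_CONST:  /* rematerialize the constant */
        return;
    case TEMP_VAL_DEAD:
    default:
        /* QEMU aborts here via g_assert_not_reached(). */
        fprintf(stderr, "temp_load: code should not be reached\n");
        abort();
    }
}

int main(void)
{
    temp_load(TEMP_VAL_REG);   /* fine: the temp holds a value */
    temp_load(TEMP_VAL_DEAD);  /* aborts: used before any write */
    return 0;
}
```

So a translation-time abort of this shape usually points at generated code that reads a freshly allocated TCGv before assigning it.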
diff --git a/target/arm/tcg/translate-sve.c b/target/arm/tcg/translate-sve.c
index d9d5810ddea..57051a4729a 100644
--- a/target/arm/tcg/translate-sve.c
+++ b/target/arm/tcg/translate-sve.c
@@ -4167,11 +4167,12 @@ TRANS_FEAT(UCVTF_dd, aa64_sve, gen_gvec_fpst_arg_zpz,
 void gen_sve_ldr(DisasContext *s, TCGv_ptr base, int vofs,
                  int len, int rn, int imm)
 {
-    int len_align = QEMU_ALIGN_DOWN(len, 8);
-    int len_remain = len % 8;
-    int nparts = len / 8 + ctpop8(len_remain);
+    int len_align = QEMU_ALIGN_DOWN(len, 16);
+    int len_remain = len % 16;
+    int nparts = len / 16 + ctpop8(len_remain);
     int midx = get_mem_index(s);
     TCGv_i64 dirty_addr, clean_addr, t0, t1;
+    TCGv_i128 t16;
 
     dirty_addr = tcg_temp_new_i64();
     tcg_gen_addi_i64(dirty_addr, cpu_reg_sp(s, rn), imm);
@@ -4188,10 +4189,16 @@ void gen_sve_ldr(DisasContext *s, TCGv_ptr base, int vofs,
         int i;
 
         t0 = tcg_temp_new_i64();
-        for (i = 0; i < len_align; i += 8) {
-            tcg_gen_qemu_ld_i64(t0, clean_addr, midx, MO_LEUQ);
+        t1 = tcg_temp_new_i64();
+        t16 = tcg_temp_new_i128();
+
+        for (i = 0; i < len_align; i += 16) {
+            tcg_gen_qemu_ld_i128(t16, clean_addr, midx,
+                                 MO_LE | MO_128 | MO_ATOM_NONE);
+            tcg_gen_extr_i128_i64(t0, t1, t16);
             tcg_gen_st_i64(t0, base, vofs + i);
-            tcg_gen_addi_i64(clean_addr, clean_addr, 8);
+            tcg_gen_st_i64(t1, base, vofs + i + 8);
+            tcg_gen_addi_i64(clean_addr, clean_addr, 16);
         }
     } else {
         TCGLabel *loop = gen_new_label();
@@ -4200,14 +4207,21 @@ void gen_sve_ldr(DisasContext *s, TCGv_ptr base, int vofs,
         tcg_gen_movi_ptr(i, 0);
         gen_set_label(loop);
 
-        t0 = tcg_temp_new_i64();
-        tcg_gen_qemu_ld_i64(t0, clean_addr, midx, MO_LEUQ);
-        tcg_gen_addi_i64(clean_addr, clean_addr, 8);
+        t16 = tcg_temp_new_i128();
+        tcg_gen_qemu_ld_i128(t16, clean_addr, midx,
+                             MO_LE | MO_128 | MO_ATOM_NONE);
+        tcg_gen_addi_i64(clean_addr, clean_addr, 16);
 
         tp = tcg_temp_new_ptr();
         tcg_gen_add_ptr(tp, base, i);
-        tcg_gen_addi_ptr(i, i, 8);
+        tcg_gen_addi_ptr(i, i, 16);
+
+        t0 = tcg_temp_new_i64();
+        t1 = tcg_temp_new_i64();
+        tcg_gen_extr_i128_i64(t0, t1, t16);
+
         tcg_gen_st_i64(t0, tp, vofs);
+        tcg_gen_st_i64(t1, tp, vofs + 8);
 
         tcg_gen_brcondi_ptr(TCG_COND_LTU, i, len_align, loop);
     }
@@ -4216,6 +4230,16 @@ void gen_sve_ldr(DisasContext *s, TCGv_ptr base, int vofs,
      * Predicate register loads can be any multiple of 2.
      * Note that we still store the entire 64-bit unit into cpu_env.
      */
+    if (len_remain >= 8) {
+        t0 = tcg_temp_new_i64();
+        tcg_gen_qemu_ld_i64(t0, clean_addr, midx, MO_LEUQ | MO_ATOM_NONE);
+        tcg_gen_st_i64(t0, base, vofs + len_align);
+        len_remain -= 8;
+        len_align += 8;
+        if (len_remain) {
+            tcg_gen_addi_i64(clean_addr, clean_addr, 8);
+        }
+    }
     if (len_remain) {
         t0 = tcg_temp_new_i64();
         switch (len_remain) {
@@ -4223,14 +4247,14 @@ void gen_sve_ldr(DisasContext *s, TCGv_ptr base, int vofs,
         case 4:
         case 8:
             tcg_gen_qemu_ld_i64(t0, clean_addr, midx,
-                                MO_LE | ctz32(len_remain));
+                                MO_LE | ctz32(len_remain) | MO_ATOM_NONE);
             break;
 
         case 6:
             t1 = tcg_temp_new_i64();
-            tcg_gen_qemu_ld_i64(t0, clean_addr, midx, MO_LEUL);
+            tcg_gen_qemu_ld_i64(t0, clean_addr, midx, MO_LEUL | MO_ATOM_NONE);
             tcg_gen_addi_i64(clean_addr, clean_addr, 4);
-            tcg_gen_qemu_ld_i64(t1, clean_addr, midx, MO_LEUW);
+            tcg_gen_qemu_ld_i64(t1, clean_addr, midx, MO_LEUW | MO_ATOM_NONE);
            tcg_gen_deposit_i64(t0, t0, t1, 32, 32);
             break;
 
@@ -4245,11 +4269,12 @@ void gen_sve_ldr(DisasContext *s, TCGv_ptr base, int vofs,
 void gen_sve_str(DisasContext *s, TCGv_ptr base, int vofs,
                  int len, int rn, int imm)
 {
-    int len_align = QEMU_ALIGN_DOWN(len, 8);
-    int len_remain = len % 8;
-    int nparts = len / 8 + ctpop8(len_remain);
+    int len_align = QEMU_ALIGN_DOWN(len, 16);
+    int len_remain = len % 16;
+    int nparts = len / 16 + ctpop8(len_remain);
     int midx = get_mem_index(s);
-    TCGv_i64 dirty_addr, clean_addr, t0;
+    TCGv_i64 dirty_addr, clean_addr, t0, t1;
+    TCGv_i128 t16;
 
     dirty_addr = tcg_temp_new_i64();
     tcg_gen_addi_i64(dirty_addr, cpu_reg_sp(s, rn), imm);
@@ -4267,10 +4292,15 @@ void gen_sve_str(DisasContext *s, TCGv_ptr base, int vofs,
         int i;
 
         t0 = tcg_temp_new_i64();
+        t1 = tcg_temp_new_i64();
+        t16 = tcg_temp_new_i128();
         for (i = 0; i < len_align; i += 8) {
             tcg_gen_ld_i64(t0, base, vofs + i);
-            tcg_gen_qemu_st_i64(t0, clean_addr, midx, MO_LEUQ);
-            tcg_gen_addi_i64(clean_addr, clean_addr, 8);
+            tcg_gen_ld_i64(t1, base, vofs + i + 8);
+            tcg_gen_concat_i64_i128(t16, t0, t1);
+            tcg_gen_qemu_st_i128(t16, clean_addr, midx,
+                                 MO_LE | MO_128 | MO_ATOM_NONE);
+            tcg_gen_addi_i64(clean_addr, clean_addr, 16);
         }
     } else {
         TCGLabel *loop = gen_new_label();
@@ -4280,18 +4310,33 @@ void gen_sve_str(DisasContext *s, TCGv_ptr base, int vofs,
         gen_set_label(loop);
 
         t0 = tcg_temp_new_i64();
+        t1 = tcg_temp_new_i64();
         tp = tcg_temp_new_ptr();
         tcg_gen_add_ptr(tp, base, i);
         tcg_gen_ld_i64(t0, tp, vofs);
-        tcg_gen_addi_ptr(i, i, 8);
+        tcg_gen_ld_i64(t1, tp, vofs + 8);
+        tcg_gen_addi_ptr(i, i, 16);
 
-        tcg_gen_qemu_st_i64(t0, clean_addr, midx, MO_LEUQ);
-        tcg_gen_addi_i64(clean_addr, clean_addr, 8);
+        t16 = tcg_temp_new_i128();
+        tcg_gen_concat_i64_i128(t16, t0, t1);
+
+        tcg_gen_qemu_st_i128(t16, clean_addr, midx, MO_LEUQ);
+        tcg_gen_addi_i64(clean_addr, clean_addr, 16);
 
         tcg_gen_brcondi_ptr(TCG_COND_LTU, i, len_align, loop);
     }
 
     /* Predicate register stores can be any multiple of 2. */
+    if (len_remain >= 8) {
+        t0 = tcg_temp_new_i64();
+        tcg_gen_st_i64(t0, base, vofs + len_align);
+        tcg_gen_qemu_st_i64(t0, clean_addr, midx, MO_LEUQ | MO_ATOM_NONE);
+        len_remain -= 8;
+        len_align += 8;
+        if (len_remain) {
+            tcg_gen_addi_i64(clean_addr, clean_addr, 8);
+        }
+    }
     if (len_remain) {
         t0 = tcg_temp_new_i64();
         tcg_gen_ld_i64(t0, base, vofs + len_align);
@@ -4301,14 +4346,14 @@ void gen_sve_str(DisasContext *s, TCGv_ptr base, int vofs,
         switch (len_remain) {
         case 2:
         case 4:
         case 8:
             tcg_gen_qemu_st_i64(t0, clean_addr, midx,
-                                MO_LE | ctz32(len_remain));
+                                MO_LE | ctz32(len_remain) | MO_ATOM_NONE);
             break;
 
         case 6:
-            tcg_gen_qemu_st_i64(t0, clean_addr, midx, MO_LEUL);
+            tcg_gen_qemu_st_i64(t0, clean_addr, midx, MO_LEUL | MO_ATOM_NONE);
             tcg_gen_addi_i64(clean_addr, clean_addr, 4);
             tcg_gen_shri_i64(t0, t0, 32);
-            tcg_gen_qemu_st_i64(t0, clean_addr, midx, MO_LEUW);
+            tcg_gen_qemu_st_i64(t0, clean_addr, midx, MO_LEUW | MO_ATOM_NONE);
             break;
 
         default:
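One hunk above is consistent with the temp_load failure mode described earlier, though this is an observation, not a confirmed diagnosis: in gen_sve_str's new `len_remain >= 8` tail, t0 is freshly allocated and then used as a source by tcg_gen_st_i64 and tcg_gen_qemu_st_i64 without ever being written. Mirroring the symmetric hunk in gen_sve_ldr, the first operation was presumably meant to be a tcg_gen_ld_i64 from cpu_env. A sketch of that suspected intent (same context and identifiers as gen_sve_str above; a guess, not the committed fix):

```c
/* Tail of gen_sve_str(): write out the odd 8-byte unit of a predicate
 * register. Suspected intent: load t0 from the CPU state first, then
 * store it to guest memory. The hunk above instead does tcg_gen_st_i64,
 * writing the still-uninitialized t0 back into cpu_env. */
if (len_remain >= 8) {
    t0 = tcg_temp_new_i64();
    tcg_gen_ld_i64(t0, base, vofs + len_align);   /* was: tcg_gen_st_i64 */
    tcg_gen_qemu_st_i64(t0, clean_addr, midx, MO_LEUQ | MO_ATOM_NONE);
    len_remain -= 8;
    len_align += 8;
    if (len_remain) {
        tcg_gen_addi_i64(clean_addr, clean_addr, 8);
    }
}
```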