@@ -151,9 +151,7 @@ typedef enum {
void tb_target_set_jmp_target(uintptr_t, uintptr_t, uintptr_t, uintptr_t);
-#ifdef CONFIG_SOFTMMU
#define TCG_TARGET_NEED_LDST_LABELS
-#endif
#define TCG_TARGET_NEED_POOL_LABELS
#endif /* AARCH64_TCG_TARGET_H */
@@ -10,6 +10,7 @@
* See the COPYING file in the top-level directory for details.
*/
+#include "../tcg-ldst.c.inc"
#include "../tcg-pool.c.inc"
#include "qemu/bitops.h"
@@ -390,6 +391,10 @@ typedef enum {
I3305_LDR_v64 = 0x5c000000,
I3305_LDR_v128 = 0x9c000000,
+    /* Load/store exclusive (used here for load-acquire / store-release) */
+ I3306_LDAR = 0x08808000 | LDST_LD << 22, /* plus MO << 30 */
+ I3306_STLR = 0x08808000 | LDST_ST << 22, /* plus MO << 30 */
+
/* Load/store register. Described here as 3.3.12, but the helper
that emits them can transform to 3.3.10 or 3.3.13. */
I3312_STRB = 0x38000000 | LDST_ST << 22 | MO_8 << 30,
@@ -443,6 +448,7 @@ typedef enum {
I3404_ANDI = 0x12000000,
I3404_ORRI = 0x32000000,
I3404_EORI = 0x52000000,
+ I3404_ANDSI = 0x72000000,
/* Move wide immediate instructions. */
I3405_MOVN = 0x12800000,
@@ -453,6 +459,9 @@ typedef enum {
I3406_ADR = 0x10000000,
I3406_ADRP = 0x90000000,
+ /* Add/subtract extended register. */
+ I3501_ADDEXT = 0x0b200000,
+
/* Add/subtract shifted register instructions (without a shift). */
I3502_ADD = 0x0b000000,
I3502_ADDS = 0x2b000000,
@@ -623,6 +632,14 @@ static void tcg_out_insn_3305(TCGContext *s, AArch64Insn insn,
tcg_out32(s, insn | (imm19 & 0x7ffff) << 5 | rt);
}
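+/*
+ * Emit a load/store exclusive group insn (here LDAR/STLR).  The size
+ * goes in bits [31:30]; rs, rt2, rn, rt fill the register fields at
+ * bits 16, 10, 5 and 0.  LDAR/STLR do not use rs and rt2, so callers
+ * pass XZR for those.
+ */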
+static void G_GNUC_UNUSED
+tcg_out_insn_3306(TCGContext *s, AArch64Insn insn, MemOp sz,
+ TCGReg rs, TCGReg rt, TCGReg rt2, TCGReg rn)
+{
+ tcg_out32(s, insn | (sz << 30) | (rs << 16) |
+ (rt2 << 10) | (rn << 5) | rt);
+}
+
static void tcg_out_insn_3201(TCGContext *s, AArch64Insn insn, TCGType ext,
TCGReg rt, int imm19)
{
@@ -705,6 +722,13 @@ static void tcg_out_insn_3406(TCGContext *s, AArch64Insn insn,
tcg_out32(s, insn | (disp & 3) << 29 | (disp & 0x1ffffc) << (5 - 2) | rd);
}
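+/*
+ * Emit an add/subtract extended register insn: rm in bits [20:16],
+ * the extension option in bits [15:13] (MO_32 -> UXTW, MO_64 -> UXTX),
+ * rn in bits [9:5], rd in bits [4:0].  Bit 31 is set for a 64-bit
+ * result register.
+ */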
+static inline void tcg_out_insn_3501(TCGContext *s, AArch64Insn insn,
+ TCGReg rd, TCGReg rn,
+ TCGReg rm, MemOp ext)
+{
+ tcg_out32(s, insn | 1 << 31 | rm << 16 | ext << 13 | rn << 5 | rd);
+}
+
/* This function is for both 3.5.2 (Add/Subtract shifted register), for
the rare occasion when we actually want to supply a shift amount. */
static inline void tcg_out_insn_3502S(TCGContext *s, AArch64Insn insn,
@@ -1328,8 +1352,9 @@ static void tcg_out_goto_long(TCGContext *s, const tcg_insn_unit *target)
if (offset == sextract64(offset, 0, 26)) {
tcg_out_insn(s, 3206, B, offset);
} else {
- tcg_out_movi(s, TCG_TYPE_I64, TCG_REG_TMP, (intptr_t)target);
- tcg_out_insn(s, 3207, BR, TCG_REG_TMP);
+ /* Choose X9 as a call-clobbered non-LR temporary. */
+ tcg_out_movi(s, TCG_TYPE_I64, TCG_REG_X9, (intptr_t)target);
+ tcg_out_insn(s, 3207, BR, TCG_REG_X9);
}
}
@@ -1541,9 +1566,14 @@ static void tcg_out_cltz(TCGContext *s, TCGType ext, TCGReg d,
}
}
-#ifdef CONFIG_SOFTMMU
-#include "../tcg-ldst.c.inc"
+static void tcg_out_adr(TCGContext *s, TCGReg rd, const void *target)
+{
+ ptrdiff_t offset = tcg_pcrel_diff(s, target);
+ tcg_debug_assert(offset == sextract64(offset, 0, 21));
+ tcg_out_insn(s, 3406, ADR, rd, offset);
+}
+
+#ifdef CONFIG_SOFTMMU
/* helper signature: helper_ret_ld_mmu(CPUState *env, target_ulong addr,
* MemOpIdx oi, uintptr_t ra)
*/
@@ -1577,13 +1607,6 @@ static void * const qemu_st_helpers[MO_SIZE + 1] = {
#endif
};
-static inline void tcg_out_adr(TCGContext *s, TCGReg rd, const void *target)
-{
- ptrdiff_t offset = tcg_pcrel_diff(s, target);
- tcg_debug_assert(offset == sextract64(offset, 0, 21));
- tcg_out_insn(s, 3406, ADR, rd, offset);
-}
-
static bool tcg_out_qemu_ld_slow_path(TCGContext *s, TCGLabelQemuLdst *lb)
{
MemOpIdx oi = lb->oi;
@@ -1714,15 +1737,85 @@ static void tcg_out_tlb_read(TCGContext *s, TCGReg addr_reg, MemOp opc,
tcg_out_insn(s, 3202, B_C, TCG_COND_NE, 0);
}
+#else
+
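+/*
+ * Emit a TST of the low address bits, and record a conditional branch
+ * to a slow path that will pass the access to helper_unaligned_{ld,st};
+ * the branch target is filled in by the ldst label machinery below.
+ */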
+static void tcg_out_test_alignment(TCGContext *s, bool is_ld, TCGReg addr_reg,
+ unsigned a_bits)
+{
+ unsigned a_mask = (1 << a_bits) - 1;
+ TCGLabelQemuLdst *label = new_ldst_label(s);
+
+ label->is_ld = is_ld;
+ label->addrlo_reg = addr_reg;
+
+ /* tst addr, #mask */
+ tcg_out_logicali(s, I3404_ANDSI, 0, TCG_REG_XZR, addr_reg, a_mask);
+
+ label->label_ptr[0] = s->code_ptr;
+
+ /* b.ne slow_path */
+ tcg_out_insn(s, 3202, B_C, TCG_COND_NE, 0);
+
+ label->raddr = tcg_splitwx_to_rx(s->code_ptr);
+}
+
+static bool tcg_out_fail_alignment(TCGContext *s, TCGLabelQemuLdst *l)
+{
+ if (!reloc_pc19(l->label_ptr[0], tcg_splitwx_to_rx(s->code_ptr))) {
+ return false;
+ }
+
+ tcg_out_mov(s, TCG_TYPE_TL, TCG_REG_X1, l->addrlo_reg);
+ tcg_out_mov(s, TCG_TYPE_PTR, TCG_REG_X0, TCG_AREG0);
+
+ /*
+ * "Tail call" to the helper, with the return address back inline,
+ * just for the clarity of the debugging traceback -- the helper
+ * cannot return.
+ */
+ tcg_out_adr(s, TCG_REG_LR, l->raddr);
+ tcg_out_goto_long(s, (const void *)(l->is_ld ? helper_unaligned_ld
+ : helper_unaligned_st));
+ return true;
+}
+
+static bool tcg_out_qemu_ld_slow_path(TCGContext *s, TCGLabelQemuLdst *l)
+{
+ return tcg_out_fail_alignment(s, l);
+}
+
+static bool tcg_out_qemu_st_slow_path(TCGContext *s, TCGLabelQemuLdst *l)
+{
+ return tcg_out_fail_alignment(s, l);
+}
+
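+/*
+ * LDAR and STLR require a naturally aligned address, so the hardware
+ * performs the alignment check for us as a side effect of the access.
+ */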
+static void tcg_out_qemu_ld_acquire(TCGContext *s, MemOp memop, TCGType ext,
+ TCGReg data_r, TCGReg addr_r)
+{
+ MemOp size = memop & MO_SIZE;
+
+ tcg_out_insn(s, 3306, LDAR, size,
+ TCG_REG_XZR, data_r, TCG_REG_XZR, addr_r);
+ if (memop & MO_SIGN) {
+ tcg_out_sxt(s, ext, size, data_r, data_r);
+ }
+}
+
+static void tcg_out_qemu_st_release(TCGContext *s, MemOp memop,
+ TCGReg data_r, TCGReg addr_r)
+{
+ MemOp size = memop & MO_SIZE;
+
+ tcg_out_insn(s, 3306, STLR, size,
+ TCG_REG_XZR, data_r, TCG_REG_XZR, addr_r);
+}
+
#endif /* CONFIG_SOFTMMU */
static void tcg_out_qemu_ld_direct(TCGContext *s, MemOp memop, TCGType ext,
TCGReg data_r, TCGReg addr_r,
TCGType otype, TCGReg off_r)
{
- /* Byte swapping is left to middle-end expansion. */
- tcg_debug_assert((memop & MO_BSWAP) == 0);
-
switch (memop & MO_SSIZE) {
case MO_UB:
tcg_out_ldst_r(s, I3312_LDRB, data_r, addr_r, otype, off_r);
@@ -1756,9 +1849,6 @@ static void tcg_out_qemu_st_direct(TCGContext *s, MemOp memop,
TCGReg data_r, TCGReg addr_r,
TCGType otype, TCGReg off_r)
{
- /* Byte swapping is left to middle-end expansion. */
- tcg_debug_assert((memop & MO_BSWAP) == 0);
-
switch (memop & MO_SIZE) {
case MO_8:
tcg_out_ldst_r(s, I3312_STRB, data_r, addr_r, otype, off_r);
@@ -1782,6 +1872,10 @@ static void tcg_out_qemu_ld(TCGContext *s, TCGReg data_reg, TCGReg addr_reg,
{
MemOp memop = get_memop(oi);
const TCGType otype = TARGET_LONG_BITS == 64 ? TCG_TYPE_I64 : TCG_TYPE_I32;
+
+ /* Byte swapping is left to middle-end expansion. */
+ tcg_debug_assert((memop & MO_BSWAP) == 0);
+
#ifdef CONFIG_SOFTMMU
unsigned mem_index = get_mmuidx(oi);
tcg_insn_unit *label_ptr;
@@ -1792,6 +1886,28 @@ static void tcg_out_qemu_ld(TCGContext *s, TCGReg data_reg, TCGReg addr_reg,
add_qemu_ldst_label(s, true, oi, ext, data_reg, addr_reg,
s->code_ptr, label_ptr);
#else /* !CONFIG_SOFTMMU */
+ unsigned a_bits = get_alignment_bits(memop);
+
+ if (a_bits) {
+ /*
+         * If alignment is required and equals the access size, then
+         * use load-acquire for the side effect of alignment checking.
+         * Despite the extra memory barrier, for a ThunderX2 host,
+         * this is about 40% faster. It is always smaller.
+ */
+ if (a_bits == (memop & MO_SIZE)) {
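+            /*
+             * LDAR/STLR have no register-offset addressing form, so
+             * fold guest_base into the address with an explicit add.
+             */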
+ if (USE_GUEST_BASE) {
+ tcg_out_insn(s, 3501, ADDEXT, TCG_REG_TMP, TCG_REG_GUEST_BASE,
+ addr_reg, TARGET_LONG_BITS == 64 ? MO_64 : MO_32);
+ addr_reg = TCG_REG_TMP;
+ }
+ tcg_out_qemu_ld_acquire(s, memop, ext, data_reg, addr_reg);
+ return;
+ }
+
+ tcg_out_test_alignment(s, true, addr_reg, a_bits);
+ }
+
if (USE_GUEST_BASE) {
tcg_out_qemu_ld_direct(s, memop, ext, data_reg,
TCG_REG_GUEST_BASE, otype, addr_reg);
@@ -1807,6 +1923,10 @@ static void tcg_out_qemu_st(TCGContext *s, TCGReg data_reg, TCGReg addr_reg,
{
MemOp memop = get_memop(oi);
const TCGType otype = TARGET_LONG_BITS == 64 ? TCG_TYPE_I64 : TCG_TYPE_I32;
+
+ /* Byte swapping is left to middle-end expansion. */
+ tcg_debug_assert((memop & MO_BSWAP) == 0);
+
#ifdef CONFIG_SOFTMMU
unsigned mem_index = get_mmuidx(oi);
tcg_insn_unit *label_ptr;
@@ -1817,6 +1937,26 @@ static void tcg_out_qemu_st(TCGContext *s, TCGReg data_reg, TCGReg addr_reg,
add_qemu_ldst_label(s, false, oi, (memop & MO_SIZE)== MO_64,
data_reg, addr_reg, s->code_ptr, label_ptr);
#else /* !CONFIG_SOFTMMU */
+ unsigned a_bits = get_alignment_bits(memop);
+
+ if (a_bits) {
+ /*
+         * If alignment is required and equals the access size, then
+         * use store-release for the side effect of alignment checking.
+ */
+ if (a_bits == (memop & MO_SIZE)) {
+ if (USE_GUEST_BASE) {
+ tcg_out_insn(s, 3501, ADDEXT, TCG_REG_TMP, TCG_REG_GUEST_BASE,
+ addr_reg, TARGET_LONG_BITS == 64 ? MO_64 : MO_32);
+ addr_reg = TCG_REG_TMP;
+ }
+ tcg_out_qemu_st_release(s, memop, data_reg, addr_reg);
+ return;
+ }
+
+ tcg_out_test_alignment(s, false, addr_reg, a_bits);
+ }
+
if (USE_GUEST_BASE) {
tcg_out_qemu_st_direct(s, memop, data_reg,
TCG_REG_GUEST_BASE, otype, addr_reg);

Use load-acquire / store-release for the normal case of alignment
matching the access size.  Otherwise, emit a test + branch sequence
invoking helper_unaligned_{ld,st}.

Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/aarch64/tcg-target.h     |   2 -
 tcg/aarch64/tcg-target.c.inc | 174 +++++++++++++++++++++++++++++++----
 2 files changed, 157 insertions(+), 19 deletions(-)

-- 
2.25.1