
[v3,38/39] tcg/arm: Use LDRD to load tlb mask+table

Message ID 20190508000641.19090-39-richard.henderson@linaro.org
State Superseded
Series tcg: Move the softmmu tlb to CPUNegativeOffsetState

Commit Message

Richard Henderson May 8, 2019, 12:06 a.m. UTC
This changes the code generation for the tlb from e.g.

	ldr      ip, [r6, #-0x10]
	ldr      r2, [r6, #-0xc]
	and      ip, ip, r4, lsr #8
	ldrd     r0, r1, [r2, ip]!
	ldr      r2, [r2, #0x18]

to

	ldrd     r0, r1, [r6, #-0x10]
	and      r0, r0, r4, lsr #8
	ldrd     r2, r3, [r1, r0]!
	ldr      r1, [r1, #0x18]

for armv7 hosts.  Rearranging the register allocation in
order to avoid overlap between the two ldrd pairs causes
the patch to be larger than it ordinarily would be.
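
A rough C sketch of the layout the first ldrd relies on (the struct and field
names follow the QEMU source that the new QEMU_BUILD_BUG_ONs below assert; the
exact field types here are an assumption, for illustration only):

    #include <stdint.h>

    struct CPUTLBEntry;                /* opaque here; defined in QEMU */

    /* Layout sketch: mask at offset 0 and table at offset 4 on a 32-bit
     * host, which is what lets "ldrd r0, r1, [r6, #-0x10]" fetch both
     * fields of env_tlb(env)->f[mmu_idx] in a single instruction. */
    struct CPUTLBDescFastSketch {
        uintptr_t mask;                /* offset 0 */
        struct CPUTLBEntry *table;     /* offset 4 */
    };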

Signed-off-by: Richard Henderson <richard.henderson@linaro.org>

---
v3: Add QEMU_BUILD_BUG_ON for mask/table ordering; comment fixes.
---
 tcg/arm/tcg-target.inc.c | 92 +++++++++++++++++++++++-----------------
 1 file changed, 53 insertions(+), 39 deletions(-)

-- 
2.17.1

Comments

Alistair Francis May 10, 2019, 9:08 p.m. UTC | #1
On Tue, May 7, 2019 at 5:32 PM Richard Henderson
<richard.henderson@linaro.org> wrote:
>
> This changes the code generation for the tlb from e.g.
>
>         ldr      ip, [r6, #-0x10]
>         ldr      r2, [r6, #-0xc]
>         and      ip, ip, r4, lsr #8
>         ldrd     r0, r1, [r2, ip]!
>         ldr      r2, [r2, #0x18]
>
> to
>
>         ldrd     r0, r1, [r6, #-0x10]
>         and      r0, r0, r4, lsr #8
>         ldrd     r2, r3, [r1, r0]!
>         ldr      r1, [r1, #0x18]
>
> for armv7 hosts.  Rearranging the register allocation in
> order to avoid overlap between the two ldrd pairs causes
> the patch to be larger than it ordinarily would be.
>
> Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
> ---
> v3: Add QEMU_BUILD_BUG_ON for mask/table ordering; comment fixes.
> ---
>  tcg/arm/tcg-target.inc.c | 92 +++++++++++++++++++++++-----------------
>  1 file changed, 53 insertions(+), 39 deletions(-)
>
> diff --git a/tcg/arm/tcg-target.inc.c b/tcg/arm/tcg-target.inc.c
> index ad32b04e13..ac813abfb8 100644
> --- a/tcg/arm/tcg-target.inc.c
> +++ b/tcg/arm/tcg-target.inc.c
> @@ -267,6 +267,7 @@ static const char *target_parse_constraint(TCGArgConstraint *ct,
>          tcg_regset_reset_reg(ct->u.regs, TCG_REG_R0);
>          tcg_regset_reset_reg(ct->u.regs, TCG_REG_R1);
>          tcg_regset_reset_reg(ct->u.regs, TCG_REG_R2);
> +        tcg_regset_reset_reg(ct->u.regs, TCG_REG_R3);
>          tcg_regset_reset_reg(ct->u.regs, TCG_REG_R14);
>  #endif
>          break;
> @@ -1224,6 +1225,10 @@ static TCGReg tcg_out_arg_reg64(TCGContext *s, TCGReg argreg,
>  QEMU_BUILD_BUG_ON(TLB_MASK_TABLE_OFS(0) > 0);
>  QEMU_BUILD_BUG_ON(TLB_MASK_TABLE_OFS(0) < -256);
>
> +/* These offsets are built into the LDRD below.  */
> +QEMU_BUILD_BUG_ON(offsetof(CPUTLBDescFast, mask) != 0);
> +QEMU_BUILD_BUG_ON(offsetof(CPUTLBDescFast, table) != 4);
> +
>  /* Load and compare a TLB entry, leaving the flags set.  Returns the register
>     containing the addend of the tlb entry.  Clobbers R0, R1, R2, TMP.  */
>
> @@ -1238,47 +1243,54 @@ static TCGReg tcg_out_tlb_read(TCGContext *s, TCGReg addrlo, TCGReg addrhi,
>      unsigned s_bits = opc & MO_SIZE;
>      unsigned a_bits = get_alignment_bits(opc);
>
> -    /* Load tlb_mask[mmu_idx] and tlb_table[mmu_idx].  */
> -    tcg_out_ld(s, TCG_TYPE_I32, TCG_REG_TMP, TCG_AREG0, mask_off);
> -    tcg_out_ld(s, TCG_TYPE_I32, TCG_REG_R2, TCG_AREG0, table_off);
> -
> -    /* Extract the tlb index from the address into TMP.  */
> -    tcg_out_dat_reg(s, COND_AL, ARITH_AND, TCG_REG_TMP, TCG_REG_TMP, addrlo,
> -                    SHIFT_IMM_LSR(TARGET_PAGE_BITS - CPU_TLB_ENTRY_BITS));
> -
>      /*
> -     * Add the tlb_table pointer, creating the CPUTLBEntry address in R2.
> -     * Load the tlb comparator into R0/R1 and the fast path addend into R2.
> +     * We don't support inline unaligned acceses, but we can easily
> +     * support overalignment checks.
>       */
> -    if (cmp_off == 0) {
> -       if (use_armv6_instructions && TARGET_LONG_BITS == 64) {
> -            tcg_out_ldrd_rwb(s, COND_AL, TCG_REG_R0, TCG_REG_R2, TCG_REG_TMP);
> -        } else {
> -            tcg_out_ld32_rwb(s, COND_AL, TCG_REG_R0, TCG_REG_R2, TCG_REG_TMP);
> -        }
> -    } else {
> -        tcg_out_dat_reg(s, COND_AL, ARITH_ADD,
> -                       TCG_REG_R2, TCG_REG_R2, TCG_REG_TMP, 0);
> -        if (use_armv6_instructions && TARGET_LONG_BITS == 64) {
> -            tcg_out_ldrd_8(s, COND_AL, TCG_REG_R0, TCG_REG_R2, cmp_off);
> -        } else {
> -            tcg_out_ld32_12(s, COND_AL, TCG_REG_R0, TCG_REG_R2, cmp_off);
> -       }
> -    }
> -    if (!use_armv6_instructions && TARGET_LONG_BITS == 64) {
> -        tcg_out_ld32_12(s, COND_AL, TCG_REG_R1, TCG_REG_R2, cmp_off + 4);
> -    }
> -
> -    /* Load the tlb addend.  */
> -    tcg_out_ld32_12(s, COND_AL, TCG_REG_R2, TCG_REG_R2,
> -                    offsetof(CPUTLBEntry, addend));
> -
> -    /* Check alignment.  We don't support inline unaligned acceses,
> -       but we can easily support overalignment checks.  */
>      if (a_bits < s_bits) {
>          a_bits = s_bits;
>      }
>
> +    /* Load env_tlb(env)->f[mmu_idx].{mask,table} into {r0,r1}.  */
> +    if (use_armv6_instructions) {
> +        tcg_out_ldrd_8(s, COND_AL, TCG_REG_R0, TCG_AREG0, fast_off);
> +    } else {
> +        tcg_out_ld(s, TCG_TYPE_I32, TCG_REG_R0, TCG_AREG0, mask_off);
> +        tcg_out_ld(s, TCG_TYPE_I32, TCG_REG_R1, TCG_AREG0, table_off);
> +    }
> +
> +    /* Extract the tlb index from the address into R0.  */
> +    tcg_out_dat_reg(s, COND_AL, ARITH_AND, TCG_REG_R0, TCG_REG_R0, addrlo,
> +                    SHIFT_IMM_LSR(TARGET_PAGE_BITS - CPU_TLB_ENTRY_BITS));
> +
> +    /*
> +     * Add the tlb_table pointer, creating the CPUTLBEntry address in R1.
> +     * Load the tlb comparator into R2/R3 and the fast path addend into R1.
> +     */
> +    if (cmp_off == 0) {
> +        if (use_armv6_instructions && TARGET_LONG_BITS == 64) {
> +            tcg_out_ldrd_rwb(s, COND_AL, TCG_REG_R2, TCG_REG_R1, TCG_REG_R0);
> +        } else {
> +            tcg_out_ld32_rwb(s, COND_AL, TCG_REG_R2, TCG_REG_R1, TCG_REG_R0);
> +        }
> +    } else {
> +        tcg_out_dat_reg(s, COND_AL, ARITH_ADD,
> +                        TCG_REG_R1, TCG_REG_R1, TCG_REG_R0, 0);
> +        if (use_armv6_instructions && TARGET_LONG_BITS == 64) {
> +            tcg_out_ldrd_8(s, COND_AL, TCG_REG_R2, TCG_REG_R1, cmp_off);
> +        } else {
> +            tcg_out_ld32_12(s, COND_AL, TCG_REG_R2, TCG_REG_R1, cmp_off);
> +        }
> +    }
> +    if (!use_armv6_instructions && TARGET_LONG_BITS == 64) {
> +        tcg_out_ld32_12(s, COND_AL, TCG_REG_R3, TCG_REG_R1, cmp_off + 4);
> +    }
> +
> +    /* Load the tlb addend.  */
> +    tcg_out_ld32_12(s, COND_AL, TCG_REG_R1, TCG_REG_R1,
> +                    offsetof(CPUTLBEntry, addend));
> +
> +    /* Check alignment, check comparators.  */
>      if (use_armv7_instructions) {
>          tcg_target_ulong mask = ~(TARGET_PAGE_MASK | ((1 << a_bits) - 1));
>          int rot = encode_imm(mask);
> @@ -1291,22 +1303,24 @@ static TCGReg tcg_out_tlb_read(TCGContext *s, TCGReg addrlo, TCGReg addrhi,
>              tcg_out_dat_reg(s, COND_AL, ARITH_BIC, TCG_REG_TMP,
>                              addrlo, TCG_REG_TMP, 0);
>          }
> -        tcg_out_dat_reg(s, COND_AL, ARITH_CMP, 0, TCG_REG_R0, TCG_REG_TMP, 0);
> +        tcg_out_dat_reg(s, COND_AL, ARITH_CMP, 0, TCG_REG_R2, TCG_REG_TMP, 0);
>      } else {
>          if (a_bits) {
>              tcg_out_dat_imm(s, COND_AL, ARITH_TST, 0, addrlo,
>                              (1 << a_bits) - 1);
>          }
> +        tcg_out_dat_reg(s, COND_AL, ARITH_MOV, TCG_REG_TMP, 0, addrlo,
> +                        SHIFT_IMM_LSR(TARGET_PAGE_BITS));
>          tcg_out_dat_reg(s, (a_bits ? COND_EQ : COND_AL), ARITH_CMP,
> -                        0, TCG_REG_R0, TCG_REG_TMP,
> +                        0, TCG_REG_R2, TCG_REG_TMP,
>                          SHIFT_IMM_LSL(TARGET_PAGE_BITS));
>      }
>
>      if (TARGET_LONG_BITS == 64) {
> -        tcg_out_dat_reg(s, COND_EQ, ARITH_CMP, 0, TCG_REG_R1, addrhi, 0);
> +        tcg_out_dat_reg(s, COND_EQ, ARITH_CMP, 0, TCG_REG_R3, addrhi, 0);

This is complex and I'm probably misunderstanding something but isn't
it possible for TCG_REG_R3 to not be set if use_armv6_instructions is
true and TARGET_LONG_BITS is 64?

Alistair

>      }
>
> -    return TCG_REG_R2;
> +    return TCG_REG_R1;
>  }
>
>  /* Record the context of a call to the out of line helper code for the slow
> --
> 2.17.1
>
>
Richard Henderson May 11, 2019, 7:13 p.m. UTC | #2
On 5/10/19 2:08 PM, Alistair Francis wrote:
>> +        if (use_armv6_instructions && TARGET_LONG_BITS == 64) {
>> +            tcg_out_ldrd_8(s, COND_AL, TCG_REG_R2, TCG_REG_R1, cmp_off);
...
>
> This is complex and I'm probably misunderstanding something but isn't
> it possible for TCG_REG_R3 to not be set if use_armv6_instructions is
> true and TARGET_LONG_BITS is 64?

No, the LDRD instruction loads data into both R2 and R2+1 = R3.


r~
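
A minimal C model of the point above, assuming a little-endian 32-bit Arm
host (illustration only, not QEMU code; the helper name is made up):

    #include <stdint.h>

    /* Rough model of "ldrd r2, r3, [r1, #cmp_off]": two consecutive 32-bit
     * words are loaded, and the second destination register is implicitly
     * Rt+1, so a load into R2 always writes R3 as well. */
    static void ldrd_model(const uint8_t *r1, int cmp_off,
                           uint32_t *r2, uint32_t *r3)
    {
        *r2 = *(const uint32_t *)(r1 + cmp_off);      /* comparator low word  */
        *r3 = *(const uint32_t *)(r1 + cmp_off + 4);  /* comparator high word */
    }
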
Alistair Francis May 13, 2019, 11:10 p.m. UTC | #3
On Sat, May 11, 2019 at 12:13 PM Richard Henderson
<richard.henderson@linaro.org> wrote:
>
> On 5/10/19 2:08 PM, Alistair Francis wrote:
> >> +        if (use_armv6_instructions && TARGET_LONG_BITS == 64) {
> >> +            tcg_out_ldrd_8(s, COND_AL, TCG_REG_R2, TCG_REG_R1, cmp_off);
> ...
> >
> > This is complex and I'm probably misunderstanding something but isn't
> > it possible for TCG_REG_R3 to not be set if use_armv6_instructions is
> > true and TARGET_LONG_BITS is 64?
>
> No, the LDRD instruction loads data into both R2 and R2+1 = R3.


Ah ok. This looks fine to me then but I don't think I fully grasp it
enough to Ack it.

Alistair

>
>
> r~

Patch

diff --git a/tcg/arm/tcg-target.inc.c b/tcg/arm/tcg-target.inc.c
index ad32b04e13..ac813abfb8 100644
--- a/tcg/arm/tcg-target.inc.c
+++ b/tcg/arm/tcg-target.inc.c
@@ -267,6 +267,7 @@  static const char *target_parse_constraint(TCGArgConstraint *ct,
         tcg_regset_reset_reg(ct->u.regs, TCG_REG_R0);
         tcg_regset_reset_reg(ct->u.regs, TCG_REG_R1);
         tcg_regset_reset_reg(ct->u.regs, TCG_REG_R2);
+        tcg_regset_reset_reg(ct->u.regs, TCG_REG_R3);
         tcg_regset_reset_reg(ct->u.regs, TCG_REG_R14);
 #endif
         break;
@@ -1224,6 +1225,10 @@  static TCGReg tcg_out_arg_reg64(TCGContext *s, TCGReg argreg,
 QEMU_BUILD_BUG_ON(TLB_MASK_TABLE_OFS(0) > 0);
 QEMU_BUILD_BUG_ON(TLB_MASK_TABLE_OFS(0) < -256);
 
+/* These offsets are built into the LDRD below.  */
+QEMU_BUILD_BUG_ON(offsetof(CPUTLBDescFast, mask) != 0);
+QEMU_BUILD_BUG_ON(offsetof(CPUTLBDescFast, table) != 4);
+
 /* Load and compare a TLB entry, leaving the flags set.  Returns the register
    containing the addend of the tlb entry.  Clobbers R0, R1, R2, TMP.  */
 
@@ -1238,47 +1243,54 @@  static TCGReg tcg_out_tlb_read(TCGContext *s, TCGReg addrlo, TCGReg addrhi,
     unsigned s_bits = opc & MO_SIZE;
     unsigned a_bits = get_alignment_bits(opc);
 
-    /* Load tlb_mask[mmu_idx] and tlb_table[mmu_idx].  */
-    tcg_out_ld(s, TCG_TYPE_I32, TCG_REG_TMP, TCG_AREG0, mask_off);
-    tcg_out_ld(s, TCG_TYPE_I32, TCG_REG_R2, TCG_AREG0, table_off);
-
-    /* Extract the tlb index from the address into TMP.  */
-    tcg_out_dat_reg(s, COND_AL, ARITH_AND, TCG_REG_TMP, TCG_REG_TMP, addrlo,
-                    SHIFT_IMM_LSR(TARGET_PAGE_BITS - CPU_TLB_ENTRY_BITS));
-
     /*
-     * Add the tlb_table pointer, creating the CPUTLBEntry address in R2.
-     * Load the tlb comparator into R0/R1 and the fast path addend into R2.
+     * We don't support inline unaligned acceses, but we can easily
+     * support overalignment checks.
      */
-    if (cmp_off == 0) {
-	if (use_armv6_instructions && TARGET_LONG_BITS == 64) {
-            tcg_out_ldrd_rwb(s, COND_AL, TCG_REG_R0, TCG_REG_R2, TCG_REG_TMP);
-        } else {
-            tcg_out_ld32_rwb(s, COND_AL, TCG_REG_R0, TCG_REG_R2, TCG_REG_TMP);
-        }
-    } else {
-        tcg_out_dat_reg(s, COND_AL, ARITH_ADD,
-		        TCG_REG_R2, TCG_REG_R2, TCG_REG_TMP, 0);
-        if (use_armv6_instructions && TARGET_LONG_BITS == 64) {
-            tcg_out_ldrd_8(s, COND_AL, TCG_REG_R0, TCG_REG_R2, cmp_off);
-        } else {
-            tcg_out_ld32_12(s, COND_AL, TCG_REG_R0, TCG_REG_R2, cmp_off);
-	}
-    }
-    if (!use_armv6_instructions && TARGET_LONG_BITS == 64) {
-        tcg_out_ld32_12(s, COND_AL, TCG_REG_R1, TCG_REG_R2, cmp_off + 4);
-    }
-
-    /* Load the tlb addend.  */
-    tcg_out_ld32_12(s, COND_AL, TCG_REG_R2, TCG_REG_R2,
-                    offsetof(CPUTLBEntry, addend));
-
-    /* Check alignment.  We don't support inline unaligned acceses,
-       but we can easily support overalignment checks.  */
     if (a_bits < s_bits) {
         a_bits = s_bits;
     }
 
+    /* Load env_tlb(env)->f[mmu_idx].{mask,table} into {r0,r1}.  */
+    if (use_armv6_instructions) {
+        tcg_out_ldrd_8(s, COND_AL, TCG_REG_R0, TCG_AREG0, fast_off);
+    } else {
+        tcg_out_ld(s, TCG_TYPE_I32, TCG_REG_R0, TCG_AREG0, mask_off);
+        tcg_out_ld(s, TCG_TYPE_I32, TCG_REG_R1, TCG_AREG0, table_off);
+    }
+
+    /* Extract the tlb index from the address into R0.  */
+    tcg_out_dat_reg(s, COND_AL, ARITH_AND, TCG_REG_R0, TCG_REG_R0, addrlo,
+                    SHIFT_IMM_LSR(TARGET_PAGE_BITS - CPU_TLB_ENTRY_BITS));
+
+    /*
+     * Add the tlb_table pointer, creating the CPUTLBEntry address in R1.
+     * Load the tlb comparator into R2/R3 and the fast path addend into R1.
+     */
+    if (cmp_off == 0) {
+        if (use_armv6_instructions && TARGET_LONG_BITS == 64) {
+            tcg_out_ldrd_rwb(s, COND_AL, TCG_REG_R2, TCG_REG_R1, TCG_REG_R0);
+        } else {
+            tcg_out_ld32_rwb(s, COND_AL, TCG_REG_R2, TCG_REG_R1, TCG_REG_R0);
+        }
+    } else {
+        tcg_out_dat_reg(s, COND_AL, ARITH_ADD,
+                        TCG_REG_R1, TCG_REG_R1, TCG_REG_R0, 0);
+        if (use_armv6_instructions && TARGET_LONG_BITS == 64) {
+            tcg_out_ldrd_8(s, COND_AL, TCG_REG_R2, TCG_REG_R1, cmp_off);
+        } else {
+            tcg_out_ld32_12(s, COND_AL, TCG_REG_R2, TCG_REG_R1, cmp_off);
+        }
+    }
+    if (!use_armv6_instructions && TARGET_LONG_BITS == 64) {
+        tcg_out_ld32_12(s, COND_AL, TCG_REG_R3, TCG_REG_R1, cmp_off + 4);
+    }
+
+    /* Load the tlb addend.  */
+    tcg_out_ld32_12(s, COND_AL, TCG_REG_R1, TCG_REG_R1,
+                    offsetof(CPUTLBEntry, addend));
+
+    /* Check alignment, check comparators.  */
     if (use_armv7_instructions) {
         tcg_target_ulong mask = ~(TARGET_PAGE_MASK | ((1 << a_bits) - 1));
         int rot = encode_imm(mask);
@@ -1291,22 +1303,24 @@  static TCGReg tcg_out_tlb_read(TCGContext *s, TCGReg addrlo, TCGReg addrhi,
             tcg_out_dat_reg(s, COND_AL, ARITH_BIC, TCG_REG_TMP,
                             addrlo, TCG_REG_TMP, 0);
         }
-        tcg_out_dat_reg(s, COND_AL, ARITH_CMP, 0, TCG_REG_R0, TCG_REG_TMP, 0);
+        tcg_out_dat_reg(s, COND_AL, ARITH_CMP, 0, TCG_REG_R2, TCG_REG_TMP, 0);
     } else {
         if (a_bits) {
             tcg_out_dat_imm(s, COND_AL, ARITH_TST, 0, addrlo,
                             (1 << a_bits) - 1);
         }
+        tcg_out_dat_reg(s, COND_AL, ARITH_MOV, TCG_REG_TMP, 0, addrlo,
+                        SHIFT_IMM_LSR(TARGET_PAGE_BITS));
         tcg_out_dat_reg(s, (a_bits ? COND_EQ : COND_AL), ARITH_CMP,
-                        0, TCG_REG_R0, TCG_REG_TMP,
+                        0, TCG_REG_R2, TCG_REG_TMP,
                         SHIFT_IMM_LSL(TARGET_PAGE_BITS));
     }
 
     if (TARGET_LONG_BITS == 64) {
-        tcg_out_dat_reg(s, COND_EQ, ARITH_CMP, 0, TCG_REG_R1, addrhi, 0);
+        tcg_out_dat_reg(s, COND_EQ, ARITH_CMP, 0, TCG_REG_R3, addrhi, 0);
     }
 
-    return TCG_REG_R2;
+    return TCG_REG_R1;
 }
 
 /* Record the context of a call to the out of line helper code for the slow