Message ID | 20190508000641.19090-39-richard.henderson@linaro.org |
---|---|
State | Superseded |
Headers | show |
Series | tcg: Move the softmmu tlb to CPUNegativeOffsetState | expand |
On Tue, May 7, 2019 at 5:32 PM Richard Henderson <richard.henderson@linaro.org> wrote: > > This changes the code generation for the tlb from e.g. > > ldr ip, [r6, #-0x10] > ldr r2, [r6, #-0xc] > and ip, ip, r4, lsr #8 > ldrd r0, r1, [r2, ip]! > ldr r2, [r2, #0x18] > > to > > ldrd r0, r1, [r6, #-0x10] > and r0, r0, r4, lsr #8 > ldrd r2, r3, [r1, r0]! > ldr r1, [r1, #0x18] > > for armv7 hosts. Rearranging the register allocation in > order to avoid overlap between the two ldrd pairs causes > the patch to be larger than it ordinarily would be. > > Signed-off-by: Richard Henderson <richard.henderson@linaro.org> > --- > v3: Add QEMU_BUILD_BUG_ON for mask/table ordering; comment fixes. > --- > tcg/arm/tcg-target.inc.c | 92 +++++++++++++++++++++++----------------- > 1 file changed, 53 insertions(+), 39 deletions(-) > > diff --git a/tcg/arm/tcg-target.inc.c b/tcg/arm/tcg-target.inc.c > index ad32b04e13..ac813abfb8 100644 > --- a/tcg/arm/tcg-target.inc.c > +++ b/tcg/arm/tcg-target.inc.c > @@ -267,6 +267,7 @@ static const char *target_parse_constraint(TCGArgConstraint *ct, > tcg_regset_reset_reg(ct->u.regs, TCG_REG_R0); > tcg_regset_reset_reg(ct->u.regs, TCG_REG_R1); > tcg_regset_reset_reg(ct->u.regs, TCG_REG_R2); > + tcg_regset_reset_reg(ct->u.regs, TCG_REG_R3); > tcg_regset_reset_reg(ct->u.regs, TCG_REG_R14); > #endif > break; > @@ -1224,6 +1225,10 @@ static TCGReg tcg_out_arg_reg64(TCGContext *s, TCGReg argreg, > QEMU_BUILD_BUG_ON(TLB_MASK_TABLE_OFS(0) > 0); > QEMU_BUILD_BUG_ON(TLB_MASK_TABLE_OFS(0) < -256); > > +/* These offsets are built into the LDRD below. */ > +QEMU_BUILD_BUG_ON(offsetof(CPUTLBDescFast, mask) != 0); > +QEMU_BUILD_BUG_ON(offsetof(CPUTLBDescFast, table) != 4); > + > /* Load and compare a TLB entry, leaving the flags set. Returns the register > containing the addend of the tlb entry. Clobbers R0, R1, R2, TMP. 
*/ > > @@ -1238,47 +1243,54 @@ static TCGReg tcg_out_tlb_read(TCGContext *s, TCGReg addrlo, TCGReg addrhi, > unsigned s_bits = opc & MO_SIZE; > unsigned a_bits = get_alignment_bits(opc); > > - /* Load tlb_mask[mmu_idx] and tlb_table[mmu_idx]. */ > - tcg_out_ld(s, TCG_TYPE_I32, TCG_REG_TMP, TCG_AREG0, mask_off); > - tcg_out_ld(s, TCG_TYPE_I32, TCG_REG_R2, TCG_AREG0, table_off); > - > - /* Extract the tlb index from the address into TMP. */ > - tcg_out_dat_reg(s, COND_AL, ARITH_AND, TCG_REG_TMP, TCG_REG_TMP, addrlo, > - SHIFT_IMM_LSR(TARGET_PAGE_BITS - CPU_TLB_ENTRY_BITS)); > - > /* > - * Add the tlb_table pointer, creating the CPUTLBEntry address in R2. > - * Load the tlb comparator into R0/R1 and the fast path addend into R2. > + * We don't support inline unaligned acceses, but we can easily > + * support overalignment checks. > */ > - if (cmp_off == 0) { > - if (use_armv6_instructions && TARGET_LONG_BITS == 64) { > - tcg_out_ldrd_rwb(s, COND_AL, TCG_REG_R0, TCG_REG_R2, TCG_REG_TMP); > - } else { > - tcg_out_ld32_rwb(s, COND_AL, TCG_REG_R0, TCG_REG_R2, TCG_REG_TMP); > - } > - } else { > - tcg_out_dat_reg(s, COND_AL, ARITH_ADD, > - TCG_REG_R2, TCG_REG_R2, TCG_REG_TMP, 0); > - if (use_armv6_instructions && TARGET_LONG_BITS == 64) { > - tcg_out_ldrd_8(s, COND_AL, TCG_REG_R0, TCG_REG_R2, cmp_off); > - } else { > - tcg_out_ld32_12(s, COND_AL, TCG_REG_R0, TCG_REG_R2, cmp_off); > - } > - } > - if (!use_armv6_instructions && TARGET_LONG_BITS == 64) { > - tcg_out_ld32_12(s, COND_AL, TCG_REG_R1, TCG_REG_R2, cmp_off + 4); > - } > - > - /* Load the tlb addend. */ > - tcg_out_ld32_12(s, COND_AL, TCG_REG_R2, TCG_REG_R2, > - offsetof(CPUTLBEntry, addend)); > - > - /* Check alignment. We don't support inline unaligned acceses, > - but we can easily support overalignment checks. */ > if (a_bits < s_bits) { > a_bits = s_bits; > } > > + /* Load env_tlb(env)->f[mmu_idx].{mask,table} into {r0,r1}. 
*/ > + if (use_armv6_instructions) { > + tcg_out_ldrd_8(s, COND_AL, TCG_REG_R0, TCG_AREG0, fast_off); > + } else { > + tcg_out_ld(s, TCG_TYPE_I32, TCG_REG_R0, TCG_AREG0, mask_off); > + tcg_out_ld(s, TCG_TYPE_I32, TCG_REG_R1, TCG_AREG0, table_off); > + } > + > + /* Extract the tlb index from the address into R0. */ > + tcg_out_dat_reg(s, COND_AL, ARITH_AND, TCG_REG_R0, TCG_REG_R0, addrlo, > + SHIFT_IMM_LSR(TARGET_PAGE_BITS - CPU_TLB_ENTRY_BITS)); > + > + /* > + * Add the tlb_table pointer, creating the CPUTLBEntry address in R1. > + * Load the tlb comparator into R2/R3 and the fast path addend into R1. > + */ > + if (cmp_off == 0) { > + if (use_armv6_instructions && TARGET_LONG_BITS == 64) { > + tcg_out_ldrd_rwb(s, COND_AL, TCG_REG_R2, TCG_REG_R1, TCG_REG_R0); > + } else { > + tcg_out_ld32_rwb(s, COND_AL, TCG_REG_R2, TCG_REG_R1, TCG_REG_R0); > + } > + } else { > + tcg_out_dat_reg(s, COND_AL, ARITH_ADD, > + TCG_REG_R1, TCG_REG_R1, TCG_REG_R0, 0); > + if (use_armv6_instructions && TARGET_LONG_BITS == 64) { > + tcg_out_ldrd_8(s, COND_AL, TCG_REG_R2, TCG_REG_R1, cmp_off); > + } else { > + tcg_out_ld32_12(s, COND_AL, TCG_REG_R2, TCG_REG_R1, cmp_off); > + } > + } > + if (!use_armv6_instructions && TARGET_LONG_BITS == 64) { > + tcg_out_ld32_12(s, COND_AL, TCG_REG_R3, TCG_REG_R1, cmp_off + 4); > + } > + > + /* Load the tlb addend. */ > + tcg_out_ld32_12(s, COND_AL, TCG_REG_R1, TCG_REG_R1, > + offsetof(CPUTLBEntry, addend)); > + > + /* Check alignment, check comparators. 
*/ > if (use_armv7_instructions) { > tcg_target_ulong mask = ~(TARGET_PAGE_MASK | ((1 << a_bits) - 1)); > int rot = encode_imm(mask); > @@ -1291,22 +1303,24 @@ static TCGReg tcg_out_tlb_read(TCGContext *s, TCGReg addrlo, TCGReg addrhi, > tcg_out_dat_reg(s, COND_AL, ARITH_BIC, TCG_REG_TMP, > addrlo, TCG_REG_TMP, 0); > } > - tcg_out_dat_reg(s, COND_AL, ARITH_CMP, 0, TCG_REG_R0, TCG_REG_TMP, 0); > + tcg_out_dat_reg(s, COND_AL, ARITH_CMP, 0, TCG_REG_R2, TCG_REG_TMP, 0); > } else { > if (a_bits) { > tcg_out_dat_imm(s, COND_AL, ARITH_TST, 0, addrlo, > (1 << a_bits) - 1); > } > + tcg_out_dat_reg(s, COND_AL, ARITH_MOV, TCG_REG_TMP, 0, addrlo, > + SHIFT_IMM_LSR(TARGET_PAGE_BITS)); > tcg_out_dat_reg(s, (a_bits ? COND_EQ : COND_AL), ARITH_CMP, > - 0, TCG_REG_R0, TCG_REG_TMP, > + 0, TCG_REG_R2, TCG_REG_TMP, > SHIFT_IMM_LSL(TARGET_PAGE_BITS)); > } > > if (TARGET_LONG_BITS == 64) { > - tcg_out_dat_reg(s, COND_EQ, ARITH_CMP, 0, TCG_REG_R1, addrhi, 0); > + tcg_out_dat_reg(s, COND_EQ, ARITH_CMP, 0, TCG_REG_R3, addrhi, 0); This is complex and I'm probably misunderstanding something but isn't it possible for TCG_REG_R3 to not be set if use_armv6_instructions is true and TARGET_LONG_BITS is 64? Alistair > } > > - return TCG_REG_R2; > + return TCG_REG_R1; > } > > /* Record the context of a call to the out of line helper code for the slow > -- > 2.17.1 > >
On 5/10/19 2:08 PM, Alistair Francis wrote: >> + if (use_armv6_instructions && TARGET_LONG_BITS == 64) { >> + tcg_out_ldrd_8(s, COND_AL, TCG_REG_R2, TCG_REG_R1, cmp_off); ... > > This is complex and I'm probably misunderstanding something but isn't > it possible for TCG_REG_R3 to not be set if use_armv6_instructions is > true and TARGET_LONG_BITS is 64? No, the LDRD instruction loads a doubleword into a pair of consecutive registers: given R2 as the first destination, it loads data into both R2 and R2+1 = R3 (cf. "ldrd r2, r3, [r1, r0]!" in the commit message). r~
On Sat, May 11, 2019 at 12:13 PM Richard Henderson <richard.henderson@linaro.org> wrote: > > On 5/10/19 2:08 PM, Alistair Francis wrote: > >> + if (use_armv6_instructions && TARGET_LONG_BITS == 64) { > >> + tcg_out_ldrd_8(s, COND_AL, TCG_REG_R2, TCG_REG_R1, cmp_off); > ... > > > > This is complex and I'm probably misunderstanding something but isn't > > it possible for TCG_REG_R3 to not be set if use_armv6_instructions is > > true and TARGET_LONG_BITS is 64? > > No, the LDRD instruction loads data into both R2 and R2+1 = R3. Ah ok. This looks fine to me then but I don't think I fully grasp it enough to Ack it. Alistair > > > r~
diff --git a/tcg/arm/tcg-target.inc.c b/tcg/arm/tcg-target.inc.c index ad32b04e13..ac813abfb8 100644 --- a/tcg/arm/tcg-target.inc.c +++ b/tcg/arm/tcg-target.inc.c @@ -267,6 +267,7 @@ static const char *target_parse_constraint(TCGArgConstraint *ct, tcg_regset_reset_reg(ct->u.regs, TCG_REG_R0); tcg_regset_reset_reg(ct->u.regs, TCG_REG_R1); tcg_regset_reset_reg(ct->u.regs, TCG_REG_R2); + tcg_regset_reset_reg(ct->u.regs, TCG_REG_R3); tcg_regset_reset_reg(ct->u.regs, TCG_REG_R14); #endif break; @@ -1224,6 +1225,10 @@ static TCGReg tcg_out_arg_reg64(TCGContext *s, TCGReg argreg, QEMU_BUILD_BUG_ON(TLB_MASK_TABLE_OFS(0) > 0); QEMU_BUILD_BUG_ON(TLB_MASK_TABLE_OFS(0) < -256); +/* These offsets are built into the LDRD below. */ +QEMU_BUILD_BUG_ON(offsetof(CPUTLBDescFast, mask) != 0); +QEMU_BUILD_BUG_ON(offsetof(CPUTLBDescFast, table) != 4); + /* Load and compare a TLB entry, leaving the flags set. Returns the register containing the addend of the tlb entry. Clobbers R0, R1, R2, TMP. */ @@ -1238,47 +1243,54 @@ static TCGReg tcg_out_tlb_read(TCGContext *s, TCGReg addrlo, TCGReg addrhi, unsigned s_bits = opc & MO_SIZE; unsigned a_bits = get_alignment_bits(opc); - /* Load tlb_mask[mmu_idx] and tlb_table[mmu_idx]. */ - tcg_out_ld(s, TCG_TYPE_I32, TCG_REG_TMP, TCG_AREG0, mask_off); - tcg_out_ld(s, TCG_TYPE_I32, TCG_REG_R2, TCG_AREG0, table_off); - - /* Extract the tlb index from the address into TMP. */ - tcg_out_dat_reg(s, COND_AL, ARITH_AND, TCG_REG_TMP, TCG_REG_TMP, addrlo, - SHIFT_IMM_LSR(TARGET_PAGE_BITS - CPU_TLB_ENTRY_BITS)); - /* - * Add the tlb_table pointer, creating the CPUTLBEntry address in R2. - * Load the tlb comparator into R0/R1 and the fast path addend into R2. + * We don't support inline unaligned acceses, but we can easily + * support overalignment checks. 
*/ - if (cmp_off == 0) { - if (use_armv6_instructions && TARGET_LONG_BITS == 64) { - tcg_out_ldrd_rwb(s, COND_AL, TCG_REG_R0, TCG_REG_R2, TCG_REG_TMP); - } else { - tcg_out_ld32_rwb(s, COND_AL, TCG_REG_R0, TCG_REG_R2, TCG_REG_TMP); - } - } else { - tcg_out_dat_reg(s, COND_AL, ARITH_ADD, - TCG_REG_R2, TCG_REG_R2, TCG_REG_TMP, 0); - if (use_armv6_instructions && TARGET_LONG_BITS == 64) { - tcg_out_ldrd_8(s, COND_AL, TCG_REG_R0, TCG_REG_R2, cmp_off); - } else { - tcg_out_ld32_12(s, COND_AL, TCG_REG_R0, TCG_REG_R2, cmp_off); - } - } - if (!use_armv6_instructions && TARGET_LONG_BITS == 64) { - tcg_out_ld32_12(s, COND_AL, TCG_REG_R1, TCG_REG_R2, cmp_off + 4); - } - - /* Load the tlb addend. */ - tcg_out_ld32_12(s, COND_AL, TCG_REG_R2, TCG_REG_R2, - offsetof(CPUTLBEntry, addend)); - - /* Check alignment. We don't support inline unaligned acceses, - but we can easily support overalignment checks. */ if (a_bits < s_bits) { a_bits = s_bits; } + /* Load env_tlb(env)->f[mmu_idx].{mask,table} into {r0,r1}. */ + if (use_armv6_instructions) { + tcg_out_ldrd_8(s, COND_AL, TCG_REG_R0, TCG_AREG0, fast_off); + } else { + tcg_out_ld(s, TCG_TYPE_I32, TCG_REG_R0, TCG_AREG0, mask_off); + tcg_out_ld(s, TCG_TYPE_I32, TCG_REG_R1, TCG_AREG0, table_off); + } + + /* Extract the tlb index from the address into R0. */ + tcg_out_dat_reg(s, COND_AL, ARITH_AND, TCG_REG_R0, TCG_REG_R0, addrlo, + SHIFT_IMM_LSR(TARGET_PAGE_BITS - CPU_TLB_ENTRY_BITS)); + + /* + * Add the tlb_table pointer, creating the CPUTLBEntry address in R1. + * Load the tlb comparator into R2/R3 and the fast path addend into R1. 
+ */ + if (cmp_off == 0) { + if (use_armv6_instructions && TARGET_LONG_BITS == 64) { + tcg_out_ldrd_rwb(s, COND_AL, TCG_REG_R2, TCG_REG_R1, TCG_REG_R0); + } else { + tcg_out_ld32_rwb(s, COND_AL, TCG_REG_R2, TCG_REG_R1, TCG_REG_R0); + } + } else { + tcg_out_dat_reg(s, COND_AL, ARITH_ADD, + TCG_REG_R1, TCG_REG_R1, TCG_REG_R0, 0); + if (use_armv6_instructions && TARGET_LONG_BITS == 64) { + tcg_out_ldrd_8(s, COND_AL, TCG_REG_R2, TCG_REG_R1, cmp_off); + } else { + tcg_out_ld32_12(s, COND_AL, TCG_REG_R2, TCG_REG_R1, cmp_off); + } + } + if (!use_armv6_instructions && TARGET_LONG_BITS == 64) { + tcg_out_ld32_12(s, COND_AL, TCG_REG_R3, TCG_REG_R1, cmp_off + 4); + } + + /* Load the tlb addend. */ + tcg_out_ld32_12(s, COND_AL, TCG_REG_R1, TCG_REG_R1, + offsetof(CPUTLBEntry, addend)); + + /* Check alignment, check comparators. */ if (use_armv7_instructions) { tcg_target_ulong mask = ~(TARGET_PAGE_MASK | ((1 << a_bits) - 1)); int rot = encode_imm(mask); @@ -1291,22 +1303,24 @@ static TCGReg tcg_out_tlb_read(TCGContext *s, TCGReg addrlo, TCGReg addrhi, tcg_out_dat_reg(s, COND_AL, ARITH_BIC, TCG_REG_TMP, addrlo, TCG_REG_TMP, 0); } - tcg_out_dat_reg(s, COND_AL, ARITH_CMP, 0, TCG_REG_R0, TCG_REG_TMP, 0); + tcg_out_dat_reg(s, COND_AL, ARITH_CMP, 0, TCG_REG_R2, TCG_REG_TMP, 0); } else { if (a_bits) { tcg_out_dat_imm(s, COND_AL, ARITH_TST, 0, addrlo, (1 << a_bits) - 1); } + tcg_out_dat_reg(s, COND_AL, ARITH_MOV, TCG_REG_TMP, 0, addrlo, + SHIFT_IMM_LSR(TARGET_PAGE_BITS)); tcg_out_dat_reg(s, (a_bits ? COND_EQ : COND_AL), ARITH_CMP, - 0, TCG_REG_R0, TCG_REG_TMP, + 0, TCG_REG_R2, TCG_REG_TMP, SHIFT_IMM_LSL(TARGET_PAGE_BITS)); } if (TARGET_LONG_BITS == 64) { - tcg_out_dat_reg(s, COND_EQ, ARITH_CMP, 0, TCG_REG_R1, addrhi, 0); + tcg_out_dat_reg(s, COND_EQ, ARITH_CMP, 0, TCG_REG_R3, addrhi, 0); } - return TCG_REG_R2; + return TCG_REG_R1; } /* Record the context of a call to the out of line helper code for the slow
This changes the code generation for the tlb from e.g. ldr ip, [r6, #-0x10] ldr r2, [r6, #-0xc] and ip, ip, r4, lsr #8 ldrd r0, r1, [r2, ip]! ldr r2, [r2, #0x18] to ldrd r0, r1, [r6, #-0x10] and r0, r0, r4, lsr #8 ldrd r2, r3, [r1, r0]! ldr r1, [r1, #0x18] for armv7 hosts. Rearranging the register allocation in order to avoid overlap between the two ldrd pairs causes the patch to be larger than it ordinarily would be. Signed-off-by: Richard Henderson <richard.henderson@linaro.org> --- v3: Add QEMU_BUILD_BUG_ON for mask/table ordering; comment fixes. --- tcg/arm/tcg-target.inc.c | 92 +++++++++++++++++++++++----------------- 1 file changed, 53 insertions(+), 39 deletions(-) -- 2.17.1