Message ID: 20221004195241.46491-20-richard.henderson@linaro.org
State: New
Series: [PULL,01/20] cpu: cache CPUClass in CPUState for hot code paths
04.10.2022 22:52, Richard Henderson wrote:
> From: Leandro Lupori <leandro.lupori@eldorado.org.br>
>
> PowerPC64 processors handle direct branches better than indirect
> ones, resulting in less stalled cycles and branch misses.
>
> However, PPC's tb_target_set_jmp_target() was only using direct
> branches for 16-bit jumps, while PowerPC64's unconditional branch
> instructions are able to handle displacements of up to 26 bits.
> To take advantage of this, now jumps whose displacements fit in
> between 17 and 26 bits are also converted to direct branches.
>
> Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
> Signed-off-by: Leandro Lupori <leandro.lupori@eldorado.org.br>
> [rth: Expanded some commentary.]
> Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
> ---
>  tcg/ppc/tcg-target.c.inc | 119 +++++++++++++++++++++++++++++----------
>  1 file changed, 88 insertions(+), 31 deletions(-)
>
> diff --git a/tcg/ppc/tcg-target.c.inc b/tcg/ppc/tcg-target.c.inc
> index 1cbd047ab3..e3dba47697 100644
> --- a/tcg/ppc/tcg-target.c.inc
> +++ b/tcg/ppc/tcg-target.c.inc
...
> +    /*
> +     * There's no convenient way to get the compiler to allocate a pair
> +     * of registers at an even index, so copy into r6/r7 and clobber.
> +     */
> +    asm("mr %%r6, %1\n\t"
> +        "mr %%r7, %2\n\t"
> +        "stq %%r6, %0"
> +        : "=Q"(*(__int128 *)rw) : "r"(p[0]), "r"(p[1]) : "r6", "r7");

This is the only place in qemu where __int128 is used (other places name
it __int128_t), and is used *unconditionally*. Is it right?

In particular, this breaks compilation on powerpc:

cc -m32 -Ilibqemu-aarch64-softmmu.fa.p... -c ../../tcg/tcg.c
In file included from ../../tcg/tcg.c:432:
/<<PKGBUILDDIR>>/tcg/ppc/tcg-target.c.inc: In function ‘ppc64_replace4’:
/<<PKGBUILDDIR>>/tcg/ppc/tcg-target.c.inc:1885:18: error: expected expression before ‘__int128’
 1885 |         : "=Q"(*(__int128 *)rw) : "r"(p[0]), "r"(p[1]) : "r6", "r7");
      |                  ^~~~~~~~
/<<PKGBUILDDIR>>/tcg/ppc/tcg-target.c.inc:1885:29: error: expected ‘)’ before ‘rw’
 1885 |         : "=Q"(*(__int128 *)rw) : "r"(p[0]), "r"(p[1]) : "r6", "r7");
      |                             ~   ^~

Thanks,

/mjt
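[Editor's note: for illustration, one way to keep the __int128 token out of a
32-bit build is to guard ppc64_replace4() the same way the patch already
guards ppc64_replace2(). This is a hedged sketch, not the fix that eventually
landed upstream; it reuses the QEMU-internal names from the patch
(tcg_insn_unit, make_pair, flush_idcache_range, qemu_build_not_reached,
HOST_BIG_ENDIAN, TCG_TARGET_REG_BITS) and so is not standalone code.]

static inline void ppc64_replace4(uintptr_t rx, uintptr_t rw,
                                  tcg_insn_unit i0, tcg_insn_unit i1,
                                  tcg_insn_unit i2, tcg_insn_unit i3)
{
#if TCG_TARGET_REG_BITS == 64 && defined(__SIZEOF_INT128__)
    uint64_t p[2];

    p[!HOST_BIG_ENDIAN] = make_pair(i0, i1);
    p[HOST_BIG_ENDIAN] = make_pair(i2, i3);

    /* As in the patch: copy into the even/odd pair r6/r7 for stq. */
    asm("mr %%r6, %1\n\t"
        "mr %%r7, %2\n\t"
        "stq %%r6, %0"
        : "=Q"(*(__int128 *)rw) : "r"(p[0]), "r"(p[1]) : "r6", "r7");
    flush_idcache_range(rx, rw, 16);
#else
    /*
     * A 32-bit host never reaches this path; fail at build time instead
     * of parsing __int128, mirroring ppc64_replace2() above.
     */
    qemu_build_not_reached();
#endif
}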
>> From: Leandro Lupori <leandro.lupori@eldorado.org.br>
And this address bounces for me, FWIW:
eldorado-org-br.mail.protection.outlook.com[104.47.70.110] said:
550 5.4.1 Recipient address rejected: Access denied. AS(201806281)
/mjt
It also has a race condition. Please see
https://lore.kernel.org/qemu-devel/20221206041715.314209-18-richard.henderson@linaro.org/

r~

On Thu, 15 Dec 2022, 13:33 Michael Tokarev, <mjt@tls.msk.ru> wrote:
> 04.10.2022 22:52, Richard Henderson wrote:
> > From: Leandro Lupori <leandro.lupori@eldorado.org.br>
[...]
diff --git a/tcg/ppc/tcg-target.c.inc b/tcg/ppc/tcg-target.c.inc
index 1cbd047ab3..e3dba47697 100644
--- a/tcg/ppc/tcg-target.c.inc
+++ b/tcg/ppc/tcg-target.c.inc
@@ -1847,44 +1847,101 @@ static void tcg_out_mb(TCGContext *s, TCGArg a0)
     tcg_out32(s, insn);
 }
 
+static inline uint64_t make_pair(tcg_insn_unit i1, tcg_insn_unit i2)
+{
+    if (HOST_BIG_ENDIAN) {
+        return (uint64_t)i1 << 32 | i2;
+    }
+    return (uint64_t)i2 << 32 | i1;
+}
+
+static inline void ppc64_replace2(uintptr_t rx, uintptr_t rw,
+                                  tcg_insn_unit i0, tcg_insn_unit i1)
+{
+#if TCG_TARGET_REG_BITS == 64
+    qatomic_set((uint64_t *)rw, make_pair(i0, i1));
+    flush_idcache_range(rx, rw, 8);
+#else
+    qemu_build_not_reached();
+#endif
+}
+
+static inline void ppc64_replace4(uintptr_t rx, uintptr_t rw,
+                                  tcg_insn_unit i0, tcg_insn_unit i1,
+                                  tcg_insn_unit i2, tcg_insn_unit i3)
+{
+    uint64_t p[2];
+
+    p[!HOST_BIG_ENDIAN] = make_pair(i0, i1);
+    p[HOST_BIG_ENDIAN] = make_pair(i2, i3);
+
+    /*
+     * There's no convenient way to get the compiler to allocate a pair
+     * of registers at an even index, so copy into r6/r7 and clobber.
+     */
+    asm("mr %%r6, %1\n\t"
+        "mr %%r7, %2\n\t"
+        "stq %%r6, %0"
+        : "=Q"(*(__int128 *)rw) : "r"(p[0]), "r"(p[1]) : "r6", "r7");
+    flush_idcache_range(rx, rw, 16);
+}
+
 void tb_target_set_jmp_target(uintptr_t tc_ptr, uintptr_t jmp_rx,
                               uintptr_t jmp_rw, uintptr_t addr)
 {
-    if (TCG_TARGET_REG_BITS == 64) {
-        tcg_insn_unit i1, i2;
-        intptr_t tb_diff = addr - tc_ptr;
-        intptr_t br_diff = addr - (jmp_rx + 4);
-        uint64_t pair;
+    tcg_insn_unit i0, i1, i2, i3;
+    intptr_t tb_diff = addr - tc_ptr;
+    intptr_t br_diff = addr - (jmp_rx + 4);
+    intptr_t lo, hi;
 
-        /* This does not exercise the range of the branch, but we do
-           still need to be able to load the new value of TCG_REG_TB.
-           But this does still happen quite often. */
-        if (tb_diff == (int16_t)tb_diff) {
-            i1 = ADDI | TAI(TCG_REG_TB, TCG_REG_TB, tb_diff);
-            i2 = B | (br_diff & 0x3fffffc);
-        } else {
-            intptr_t lo = (int16_t)tb_diff;
-            intptr_t hi = (int32_t)(tb_diff - lo);
-            assert(tb_diff == hi + lo);
-            i1 = ADDIS | TAI(TCG_REG_TB, TCG_REG_TB, hi >> 16);
-            i2 = ADDI | TAI(TCG_REG_TB, TCG_REG_TB, lo);
-        }
-#if HOST_BIG_ENDIAN
-        pair = (uint64_t)i1 << 32 | i2;
-#else
-        pair = (uint64_t)i2 << 32 | i1;
-#endif
-
-        /* As per the enclosing if, this is ppc64.  Avoid the _Static_assert
-           within qatomic_set that would fail to build a ppc32 host. */
-        qatomic_set__nocheck((uint64_t *)jmp_rw, pair);
-        flush_idcache_range(jmp_rx, jmp_rw, 8);
-    } else {
+    if (TCG_TARGET_REG_BITS == 32) {
         intptr_t diff = addr - jmp_rx;
         tcg_debug_assert(in_range_b(diff));
         qatomic_set((uint32_t *)jmp_rw, B | (diff & 0x3fffffc));
         flush_idcache_range(jmp_rx, jmp_rw, 4);
+        return;
     }
+
+    /*
+     * For 16-bit displacements, we can use a single add + branch.
+     * This happens quite often.
+     */
+    if (tb_diff == (int16_t)tb_diff) {
+        i0 = ADDI | TAI(TCG_REG_TB, TCG_REG_TB, tb_diff);
+        i1 = B | (br_diff & 0x3fffffc);
+        ppc64_replace2(jmp_rx, jmp_rw, i0, i1);
+        return;
+    }
+
+    lo = (int16_t)tb_diff;
+    hi = (int32_t)(tb_diff - lo);
+    assert(tb_diff == hi + lo);
+    i0 = ADDIS | TAI(TCG_REG_TB, TCG_REG_TB, hi >> 16);
+    i1 = ADDI | TAI(TCG_REG_TB, TCG_REG_TB, lo);
+
+    /*
+     * Without stq from 2.07, we can only update two insns,
+     * and those must be the ones that load the target address.
+     */
+    if (!have_isa_2_07) {
+        ppc64_replace2(jmp_rx, jmp_rw, i0, i1);
+        return;
+    }
+
+    /*
+     * For 26-bit displacements, we can use a direct branch.
+     * Otherwise we still need the indirect branch, which we
+     * must restore after a potential direct branch write.
+     */
+    br_diff -= 4;
+    if (in_range_b(br_diff)) {
+        i2 = B | (br_diff & 0x3fffffc);
+        i3 = NOP;
+    } else {
+        i2 = MTSPR | RS(TCG_REG_TB) | CTR;
+        i3 = BCCTR | BO_ALWAYS;
+    }
+    ppc64_replace4(jmp_rx, jmp_rw, i0, i1, i2, i3);
 }
 
 static void tcg_out_call_int(TCGContext *s, int lk,
@@ -2574,8 +2631,8 @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc,
         if (s->tb_jmp_insn_offset) {
             /* Direct jump. */
             if (TCG_TARGET_REG_BITS == 64) {
-                /* Ensure the next insns are 8-byte aligned. */
-                if ((uintptr_t)s->code_ptr & 7) {
+                /* Ensure the next insns are 8 or 16-byte aligned. */
+                while ((uintptr_t)s->code_ptr & (have_isa_2_07 ? 15 : 7)) {
                     tcg_out32(s, NOP);
                 }
                 s->tb_jmp_insn_offset[args[0]] = tcg_current_code_size(s);
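[Editor's note: to make the displacement arithmetic in the patch concrete,
here is a small standalone C sketch (not QEMU code) of the two checks it
relies on: whether a displacement fits the signed 26-bit range of a PPC
I-form branch, and the ADDIS/ADDI split of a 32-bit offset into a high part
plus a sign-extended low 16-bit part. The hi = diff - (int16_t)diff step
compensates for the sign extension that addi performs; function names here
are illustrative, not QEMU's.]

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/*
 * A PPC I-form branch ("b") encodes a signed 26-bit, word-aligned
 * displacement.  This mirrors what tcg/ppc's in_range_b() tests, plus
 * an explicit alignment check that QEMU gets for free from insn layout.
 */
static int fits_branch_26(int64_t disp)
{
    return disp >= -(1 << 25) && disp < (1 << 25) && (disp & 3) == 0;
}

/*
 * Split a 32-bit offset for an addis/addi pair.  addi sign-extends its
 * 16-bit immediate, so the high part must absorb the borrow whenever
 * the low half is negative.
 */
static void split_hi_lo(int32_t diff, int32_t *hi, int16_t *lo)
{
    *lo = (int16_t)diff;        /* low 16 bits, as addi will see them */
    *hi = diff - *lo;           /* always a multiple of 0x10000 */
    assert(*hi + *lo == diff);  /* addis (hi >> 16) then addi lo == diff */
}

int main(void)
{
    int32_t samples[] = { 0x18000, 0x7fff, -0x8001, 0x12345678, -0x12345678 };

    for (size_t i = 0; i < sizeof(samples) / sizeof(samples[0]); i++) {
        int32_t hi;
        int16_t lo;

        split_hi_lo(samples[i], &hi, &lo);
        printf("diff=%11d -> addis %6d, addi %6d; 26-bit branch: %s\n",
               samples[i], hi >> 16, lo,
               fits_branch_26(samples[i]) ? "yes" : "no");
    }
    return 0;
}

For example, diff = 0x18000 splits into lo = -0x8000 and hi = 0x20000, so
the emitted pair is addis 2 / addi -32768, which reconstructs exactly
0x18000 after addi's sign extension.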