Message ID: 20221202065200.224537-3-richard.henderson@linaro.org
State:      New
Series:     tcg/s390x: misc patches
On Thu, Dec 01, 2022 at 10:51:49PM -0800, Richard Henderson wrote: > This reverts 829e1376d940 ("tcg/s390: Introduce TCG_REG_TB"), and > several follow-up patches. The primary motivation is to reduce the > less-tested code paths, pre-z10. Secondarily, this allows the > unconditional use of TCG_TARGET_HAS_direct_jump, which might be more > important for performance than any slight increase in code size. > > Signed-off-by: Richard Henderson <richard.henderson@linaro.org> > --- > tcg/s390x/tcg-target.h | 2 +- > tcg/s390x/tcg-target.c.inc | 176 +++++-------------------------------- > 2 files changed, 23 insertions(+), 155 deletions(-) Reviewed-by: Ilya Leoshkevich <iii@linux.ibm.com> I have a few questions/ideas for the future below. > diff --git a/tcg/s390x/tcg-target.h b/tcg/s390x/tcg-target.h > index 22d70d431b..645f522058 100644 > --- a/tcg/s390x/tcg-target.h > +++ b/tcg/s390x/tcg-target.h > @@ -103,7 +103,7 @@ extern uint64_t s390_facilities[3]; > #define TCG_TARGET_HAS_mulsh_i32 0 > #define TCG_TARGET_HAS_extrl_i64_i32 0 > #define TCG_TARGET_HAS_extrh_i64_i32 0 > -#define TCG_TARGET_HAS_direct_jump HAVE_FACILITY(GEN_INST_EXT) > +#define TCG_TARGET_HAS_direct_jump 1 This change doesn't seem to affect that, but what is the minimum supported s390x qemu host? z900? > #define TCG_TARGET_HAS_qemu_st8_i32 0 > > #define TCG_TARGET_HAS_div2_i64 1 > diff --git a/tcg/s390x/tcg-target.c.inc b/tcg/s390x/tcg-target.c.inc > index cb00bb6999..8a4bec0a28 100644 > --- a/tcg/s390x/tcg-target.c.inc > +++ b/tcg/s390x/tcg-target.c.inc > @@ -65,12 +65,6 @@ > /* A scratch register that may be be used throughout the backend. */ > #define TCG_TMP0 TCG_REG_R1 > > -/* A scratch register that holds a pointer to the beginning of the TB. > - We don't need this when we have pc-relative loads with the general > - instructions extension facility. */ > -#define TCG_REG_TB TCG_REG_R12 > -#define USE_REG_TB (!HAVE_FACILITY(GEN_INST_EXT)) > - > #ifndef CONFIG_SOFTMMU > #define TCG_GUEST_BASE_REG TCG_REG_R13 > #endif > @@ -813,8 +807,8 @@ static bool maybe_out_small_movi(TCGContext *s, TCGType type, > } > > /* load a register with an immediate value */ > -static void tcg_out_movi_int(TCGContext *s, TCGType type, TCGReg ret, > - tcg_target_long sval, bool in_prologue) > +static void tcg_out_movi(TCGContext *s, TCGType type, > + TCGReg ret, tcg_target_long sval) > { > tcg_target_ulong uval; > > @@ -853,14 +847,6 @@ static void tcg_out_movi_int(TCGContext *s, TCGType type, TCGReg ret, > tcg_out_insn(s, RIL, LARL, ret, off); > return; > } > - } else if (USE_REG_TB && !in_prologue) { > - ptrdiff_t off = tcg_tbrel_diff(s, (void *)sval); > - if (off == sextract64(off, 0, 20)) { > - /* This is certain to be an address within TB, and therefore > - OFF will be negative; don't try RX_LA. */ > - tcg_out_insn(s, RXY, LAY, ret, TCG_REG_TB, TCG_REG_NONE, off); > - return; > - } > } > > /* A 32-bit unsigned value can be loaded in 2 insns. And given > @@ -876,10 +862,6 @@ static void tcg_out_movi_int(TCGContext *s, TCGType type, TCGReg ret, > if (HAVE_FACILITY(GEN_INST_EXT)) { > tcg_out_insn(s, RIL, LGRL, ret, 0); > new_pool_label(s, sval, R_390_PC32DBL, s->code_ptr - 2, 2); > - } else if (USE_REG_TB && !in_prologue) { > - tcg_out_insn(s, RXY, LG, ret, TCG_REG_TB, TCG_REG_NONE, 0); > - new_pool_label(s, sval, R_390_20, s->code_ptr - 2, > - tcg_tbrel_diff(s, NULL)); > } else { > TCGReg base = ret ? 
ret : TCG_TMP0; > tcg_out_insn(s, RIL, LARL, base, 0); > @@ -888,12 +870,6 @@ static void tcg_out_movi_int(TCGContext *s, TCGType type, TCGReg ret, > } > } I did some benchmarking of various ways to load constants in context of GCC in the past, and it turned out that LLIHF+OILF is more efficient than literal pool [1]. > -static void tcg_out_movi(TCGContext *s, TCGType type, > - TCGReg ret, tcg_target_long sval) > -{ > - tcg_out_movi_int(s, type, ret, sval, false); > -} > - > /* Emit a load/store type instruction. Inputs are: > DATA: The register to be loaded or stored. > BASE+OFS: The effective address. > @@ -1020,35 +996,6 @@ static inline bool tcg_out_sti(TCGContext *s, TCGType type, TCGArg val, > return false; > } > > -/* load data from an absolute host address */ > -static void tcg_out_ld_abs(TCGContext *s, TCGType type, > - TCGReg dest, const void *abs) > -{ > - intptr_t addr = (intptr_t)abs; > - > - if (HAVE_FACILITY(GEN_INST_EXT) && !(addr & 1)) { > - ptrdiff_t disp = tcg_pcrel_diff(s, abs) >> 1; > - if (disp == (int32_t)disp) { > - if (type == TCG_TYPE_I32) { > - tcg_out_insn(s, RIL, LRL, dest, disp); > - } else { > - tcg_out_insn(s, RIL, LGRL, dest, disp); > - } > - return; > - } > - } > - if (USE_REG_TB) { > - ptrdiff_t disp = tcg_tbrel_diff(s, abs); > - if (disp == sextract64(disp, 0, 20)) { > - tcg_out_ld(s, type, dest, TCG_REG_TB, disp); > - return; > - } > - } > - > - tcg_out_movi(s, TCG_TYPE_PTR, dest, addr & ~0xffff); > - tcg_out_ld(s, type, dest, dest, addr & 0xffff); > -} > - > static inline void tcg_out_risbg(TCGContext *s, TCGReg dest, TCGReg src, > int msb, int lsb, int ofs, int z) > { > @@ -1243,17 +1190,7 @@ static void tgen_andi(TCGContext *s, TCGType type, TCGReg dest, uint64_t val) > return; > } > > - /* Use the constant pool if USE_REG_TB, but not for small constants. */ > - if (USE_REG_TB) { > - if (!maybe_out_small_movi(s, type, TCG_TMP0, val)) { > - tcg_out_insn(s, RXY, NG, dest, TCG_REG_TB, TCG_REG_NONE, 0); > - new_pool_label(s, val & valid, R_390_20, s->code_ptr - 2, > - tcg_tbrel_diff(s, NULL)); > - return; > - } > - } else { > - tcg_out_movi(s, type, TCG_TMP0, val); > - } > + tcg_out_movi(s, type, TCG_TMP0, val); > if (type == TCG_TYPE_I32) { > tcg_out_insn(s, RR, NR, dest, TCG_TMP0); > } else { > @@ -1297,24 +1234,11 @@ static void tgen_ori(TCGContext *s, TCGType type, TCGReg dest, uint64_t val) > } > } > > - /* Use the constant pool if USE_REG_TB, but not for small constants. */ > - if (maybe_out_small_movi(s, type, TCG_TMP0, val)) { > - if (type == TCG_TYPE_I32) { > - tcg_out_insn(s, RR, OR, dest, TCG_TMP0); > - } else { > - tcg_out_insn(s, RRE, OGR, dest, TCG_TMP0); > - } > - } else if (USE_REG_TB) { > - tcg_out_insn(s, RXY, OG, dest, TCG_REG_TB, TCG_REG_NONE, 0); > - new_pool_label(s, val, R_390_20, s->code_ptr - 2, > - tcg_tbrel_diff(s, NULL)); > + tcg_out_movi(s, type, TCG_TMP0, val); > + if (type == TCG_TYPE_I32) { > + tcg_out_insn(s, RR, OR, dest, TCG_TMP0); > } else { > - /* Perform the OR via sequential modifications to the high and > - low parts. Do this via recursion to handle 16-bit vs 32-bit > - masks in each half. */ > - tcg_debug_assert(HAVE_FACILITY(EXT_IMM)); > - tgen_ori(s, type, dest, val & 0x00000000ffffffffull); > - tgen_ori(s, type, dest, val & 0xffffffff00000000ull); > + tcg_out_insn(s, RRE, OGR, dest, TCG_TMP0); > } > } > > @@ -1332,26 +1256,11 @@ static void tgen_xori(TCGContext *s, TCGType type, TCGReg dest, uint64_t val) > } > } > > - /* Use the constant pool if USE_REG_TB, but not for small constants. 
*/ > - if (maybe_out_small_movi(s, type, TCG_TMP0, val)) { > - if (type == TCG_TYPE_I32) { > - tcg_out_insn(s, RR, XR, dest, TCG_TMP0); > - } else { > - tcg_out_insn(s, RRE, XGR, dest, TCG_TMP0); > - } > - } else if (USE_REG_TB) { > - tcg_out_insn(s, RXY, XG, dest, TCG_REG_TB, TCG_REG_NONE, 0); > - new_pool_label(s, val, R_390_20, s->code_ptr - 2, > - tcg_tbrel_diff(s, NULL)); > + tcg_out_movi(s, type, TCG_TMP0, val); > + if (type == TCG_TYPE_I32) { > + tcg_out_insn(s, RR, XR, dest, TCG_TMP0); > } else { > - /* Perform the xor by parts. */ > - tcg_debug_assert(HAVE_FACILITY(EXT_IMM)); > - if (val & 0xffffffff) { > - tcg_out_insn(s, RIL, XILF, dest, val); > - } > - if (val > 0xffffffff) { > - tcg_out_insn(s, RIL, XIHF, dest, val >> 32); > - } > + tcg_out_insn(s, RRE, XGR, dest, TCG_TMP0); > } > } Wouldn't it be worth keeping XILF/XIFH here? I don't have any numbers right now, but it looks more compact/efficient than a load + XGR. Same for OGR above; I even wonder if both implementations could be unified. > @@ -1395,19 +1304,6 @@ static int tgen_cmp(TCGContext *s, TCGType type, TCGCond c, TCGReg r1, > if (maybe_out_small_movi(s, type, TCG_TMP0, c2)) { > c2 = TCG_TMP0; > /* fall through to reg-reg */ > - } else if (USE_REG_TB) { > - if (type == TCG_TYPE_I32) { > - op = (is_unsigned ? RXY_CLY : RXY_CY); > - tcg_out_insn_RXY(s, op, r1, TCG_REG_TB, TCG_REG_NONE, 0); > - new_pool_label(s, (uint32_t)c2, R_390_20, s->code_ptr - 2, > - 4 - tcg_tbrel_diff(s, NULL)); > - } else { > - op = (is_unsigned ? RXY_CLG : RXY_CG); > - tcg_out_insn_RXY(s, op, r1, TCG_REG_TB, TCG_REG_NONE, 0); > - new_pool_label(s, c2, R_390_20, s->code_ptr - 2, > - tcg_tbrel_diff(s, NULL)); > - } > - goto exit; > } else { > if (type == TCG_TYPE_I32) { > op = (is_unsigned ? RIL_CLRL : RIL_CRL); > @@ -2101,43 +1997,22 @@ static inline void tcg_out_op(TCGContext *s, TCGOpcode opc, > > case INDEX_op_goto_tb: > a0 = args[0]; > - if (s->tb_jmp_insn_offset) { > - /* > - * branch displacement must be aligned for atomic patching; > - * see if we need to add extra nop before branch > - */ > - if (!QEMU_PTR_IS_ALIGNED(s->code_ptr + 1, 4)) { > - tcg_out16(s, NOP); > - } > - tcg_debug_assert(!USE_REG_TB); > - tcg_out16(s, RIL_BRCL | (S390_CC_ALWAYS << 4)); > - s->tb_jmp_insn_offset[a0] = tcg_current_code_size(s); > - s->code_ptr += 2; > - } else { > - /* load address stored at s->tb_jmp_target_addr + a0 */ > - tcg_out_ld_abs(s, TCG_TYPE_PTR, TCG_REG_TB, > - tcg_splitwx_to_rx(s->tb_jmp_target_addr + a0)); > - /* and go there */ > - tcg_out_insn(s, RR, BCR, S390_CC_ALWAYS, TCG_REG_TB); > + tcg_debug_assert(s->tb_jmp_insn_offset); > + /* > + * branch displacement must be aligned for atomic patching; > + * see if we need to add extra nop before branch > + */ > + if (!QEMU_PTR_IS_ALIGNED(s->code_ptr + 1, 4)) { > + tcg_out16(s, NOP); > } > + tcg_out16(s, RIL_BRCL | (S390_CC_ALWAYS << 4)); > + s->tb_jmp_insn_offset[a0] = tcg_current_code_size(s); > + tcg_out32(s, 0); > set_jmp_reset_offset(s, a0); This seems to work in practice, but I don't think patching branch offsets is allowed by PoP (in a multi-threaded environment). For example, I had to do [2] in order to work around this limitation in ftrace. > - > - /* For the unlinked path of goto_tb, we need to reset > - TCG_REG_TB to the beginning of this TB. */ > - if (USE_REG_TB) { > - int ofs = -tcg_current_code_size(s); > - /* All TB are restricted to 64KiB by unwind info. 
*/ > - tcg_debug_assert(ofs == sextract64(ofs, 0, 20)); > - tcg_out_insn(s, RXY, LAY, TCG_REG_TB, > - TCG_REG_TB, TCG_REG_NONE, ofs); > - } > break; > > case INDEX_op_goto_ptr: > a0 = args[0]; > - if (USE_REG_TB) { > - tcg_out_mov(s, TCG_TYPE_PTR, TCG_REG_TB, a0); > - } > tcg_out_insn(s, RR, BCR, S390_CC_ALWAYS, a0); > break; > > @@ -3405,9 +3280,6 @@ static void tcg_target_init(TCGContext *s) > /* XXX many insns can't be used with R0, so we better avoid it for now */ > tcg_regset_set_reg(s->reserved_regs, TCG_REG_R0); > tcg_regset_set_reg(s->reserved_regs, TCG_REG_CALL_STACK); > - if (USE_REG_TB) { > - tcg_regset_set_reg(s->reserved_regs, TCG_REG_TB); > - } > } A third benefit seems to be that we now have one more register to allocate. > > #define FRAME_SIZE ((int)(TCG_TARGET_CALL_STACK_OFFSET \ > @@ -3428,16 +3300,12 @@ static void tcg_target_qemu_prologue(TCGContext *s) > > #ifndef CONFIG_SOFTMMU > if (guest_base >= 0x80000) { > - tcg_out_movi_int(s, TCG_TYPE_PTR, TCG_GUEST_BASE_REG, guest_base, true); > + tcg_out_movi(s, TCG_TYPE_PTR, TCG_GUEST_BASE_REG, guest_base); > tcg_regset_set_reg(s->reserved_regs, TCG_GUEST_BASE_REG); > } > #endif > > tcg_out_mov(s, TCG_TYPE_PTR, TCG_AREG0, tcg_target_call_iarg_regs[0]); > - if (USE_REG_TB) { > - tcg_out_mov(s, TCG_TYPE_PTR, TCG_REG_TB, > - tcg_target_call_iarg_regs[1]); > - } > > /* br %r3 (go to TB) */ > tcg_out_insn(s, RR, BCR, S390_CC_ALWAYS, tcg_target_call_iarg_regs[1]); > -- > 2.34.1 [1] https://gcc.gnu.org/git/?p=gcc.git;a=commit;h=9776b4653bc4f8b568ea49fea4a7d7460e58b68a [2] https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=de5012b41e5c900a8a3875c7a825394c5f624c05
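For concreteness, the LLIHF+OILF sequence Ilya refers to in [1] would, in the backend's
idiom, replace the literal-pool load of a 64-bit constant with two immediate instructions.
A minimal sketch, assuming the RIL_LLIHF and RIL_OILF encodings are present in the
backend's instruction table (both need the extended-immediate facility), and reusing the
ret/uval variables from the surrounding tcg_out_movi():

    /* Sketch only, not part of this patch: build a 64-bit constant from two
       32-bit immediates instead of loading it from the constant pool. */
    tcg_out_insn(s, RIL, LLIHF, ret, uval >> 32);        /* load the high 32 bits, zero the low 32 */
    tcg_out_insn(s, RIL, OILF, ret, uval & 0xffffffffu); /* OR in the low 32 bits */

Whether this beats an LGRL from the pool is exactly the micro-architectural question
discussed further down in the thread.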
On 12/6/22 13:29, Ilya Leoshkevich wrote: > On Thu, Dec 01, 2022 at 10:51:49PM -0800, Richard Henderson wrote: >> This reverts 829e1376d940 ("tcg/s390: Introduce TCG_REG_TB"), and >> several follow-up patches. The primary motivation is to reduce the >> less-tested code paths, pre-z10. Secondarily, this allows the >> unconditional use of TCG_TARGET_HAS_direct_jump, which might be more >> important for performance than any slight increase in code size. >> >> Signed-off-by: Richard Henderson <richard.henderson@linaro.org> >> --- >> tcg/s390x/tcg-target.h | 2 +- >> tcg/s390x/tcg-target.c.inc | 176 +++++-------------------------------- >> 2 files changed, 23 insertions(+), 155 deletions(-) > > Reviewed-by: Ilya Leoshkevich <iii@linux.ibm.com> > > I have a few questions/ideas for the future below. > >> diff --git a/tcg/s390x/tcg-target.h b/tcg/s390x/tcg-target.h >> index 22d70d431b..645f522058 100644 >> --- a/tcg/s390x/tcg-target.h >> +++ b/tcg/s390x/tcg-target.h >> @@ -103,7 +103,7 @@ extern uint64_t s390_facilities[3]; >> #define TCG_TARGET_HAS_mulsh_i32 0 >> #define TCG_TARGET_HAS_extrl_i64_i32 0 >> #define TCG_TARGET_HAS_extrh_i64_i32 0 >> -#define TCG_TARGET_HAS_direct_jump HAVE_FACILITY(GEN_INST_EXT) >> +#define TCG_TARGET_HAS_direct_jump 1 > > This change doesn't seem to affect that, but what is the minimum > supported s390x qemu host? z900? Possibly z990, if I'm reading the gcc processor_flags_table[] correctly; long-displacement-facility is definitely a minimum. We probably should revisit what the minimum for TCG should be, assert those features at startup, and drop the corresponding runtime tests. > I did some benchmarking of various ways to load constants in context of > GCC in the past, and it turned out that LLIHF+OILF is more efficient > than literal pool [1]. Interesting. If we include extended-immediate-facility (base_GEN9_GA1, z9-109?) in the bare minimum that would definitely simplify a few things. >> - /* Use the constant pool if USE_REG_TB, but not for small constants. */ >> - if (maybe_out_small_movi(s, type, TCG_TMP0, val)) { >> - if (type == TCG_TYPE_I32) { >> - tcg_out_insn(s, RR, XR, dest, TCG_TMP0); >> - } else { >> - tcg_out_insn(s, RRE, XGR, dest, TCG_TMP0); >> - } >> - } else if (USE_REG_TB) { >> - tcg_out_insn(s, RXY, XG, dest, TCG_REG_TB, TCG_REG_NONE, 0); >> - new_pool_label(s, val, R_390_20, s->code_ptr - 2, >> - tcg_tbrel_diff(s, NULL)); >> + tcg_out_movi(s, type, TCG_TMP0, val); >> + if (type == TCG_TYPE_I32) { >> + tcg_out_insn(s, RR, XR, dest, TCG_TMP0); >> } else { >> - /* Perform the xor by parts. */ >> - tcg_debug_assert(HAVE_FACILITY(EXT_IMM)); >> - if (val & 0xffffffff) { >> - tcg_out_insn(s, RIL, XILF, dest, val); >> - } >> - if (val > 0xffffffff) { >> - tcg_out_insn(s, RIL, XIHF, dest, val >> 32); >> - } >> + tcg_out_insn(s, RRE, XGR, dest, TCG_TMP0); >> } >> } > > Wouldn't it be worth keeping XILF/XIFH here? I don't know. It's difficult for me to guess whether a dependency chain like val -> xor -> xor (3 insns with serial dependencies) is better than val --> xor load -/ (3 insns, but only one serial dependency) is better. But there may also be instruction fusion going on at the micro-architectural level, so that there's really only one xor. If you have suggestions, I'm all ears. > I don't have any numbers right now, but it looks more compact/efficient > than a load + XGR. 
If we assume general-instruction-extension-facility (z10?), LGRL + XGR is smaller than XILF + XIFH, ignoring the constant pool entry which might be shared, and modulo the µarch questions above. > Same for OGR above; I even wonder if both implementations could be > unified. Sadly not, because of OILL et al. There are no 16-bit xor immediate insns. >> + /* >> + * branch displacement must be aligned for atomic patching; >> + * see if we need to add extra nop before branch >> + */ >> + if (!QEMU_PTR_IS_ALIGNED(s->code_ptr + 1, 4)) { >> + tcg_out16(s, NOP); >> } >> + tcg_out16(s, RIL_BRCL | (S390_CC_ALWAYS << 4)); >> + s->tb_jmp_insn_offset[a0] = tcg_current_code_size(s); >> + tcg_out32(s, 0); >> set_jmp_reset_offset(s, a0); > > This seems to work in practice, but I don't think patching branch > offsets is allowed by PoP (in a multi-threaded environment). For > example, I had to do [2] in order to work around this limitation in > ftrace. Really? How does the processor distinguish between overwriting opcode/condition vs overwriting immediate operand when invalidating cached instructions? If overwriting operand truly isn't correct, then I think we have to use indirect branch always for goto_tb. > A third benefit seems to be that we now have one more register to > allocate. Yes. It's call clobbered, so it isn't live so often, but sometimes. r~
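To make the comparison above concrete, these are the two candidate sequences for a 64-bit
XOR with a constant that misses the small-immediate fast paths; the first is the code this
patch removes, the second is what the patch emits instead:

    /* Immediate form (removed by this patch; needs the extended-immediate facility): */
    if (val & 0xffffffff) {
        tcg_out_insn(s, RIL, XILF, dest, val);
    }
    if (val > 0xffffffff) {
        tcg_out_insn(s, RIL, XIHF, dest, val >> 32);
    }

    /* Load-and-register form (what the patch emits for TCG_TYPE_I64): */
    tcg_out_movi(s, type, TCG_TMP0, val);
    tcg_out_insn(s, RRE, XGR, dest, TCG_TMP0);

Ilya's later measurements (below) suggest the second form wins, because XILF and XIHF form
a serial dependency chain on dest.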
On 12/6/22 16:22, Richard Henderson wrote:
>> Wouldn't it be worth keeping XILF/XIFH here?
>
> I don't know.  It's difficult for me to guess whether a dependency chain like
>
>    val -> xor -> xor
>
> (3 insns with serial dependencies) is better than
>
>    val --> xor
>    load -/
>
> (3 insns, but only one serial dependency) is better.  But there may also be
> instruction fusion going on at the micro-architectural level, so that there's
> really only one xor.
>
> If you have suggestions, I'm all ears.

Related microarchitectural question:

If a 32-bit insn and a 64-bit insn have a similar size encoding (and perhaps even if they
don't), is it better to produce a 64-bit output so that the hw doesn't have a false
dependency on the upper 32-bits of the register?

Just wondering whether most of the distinction between 32-bit and 64-bit opcodes ought to
be discarded, simplifying code generation.  The only items that seem most likely to have
real execution time differences are multiply and divide.


r~
On 06/12/2022 23.22, Richard Henderson wrote: > On 12/6/22 13:29, Ilya Leoshkevich wrote: >> On Thu, Dec 01, 2022 at 10:51:49PM -0800, Richard Henderson wrote: >>> This reverts 829e1376d940 ("tcg/s390: Introduce TCG_REG_TB"), and >>> several follow-up patches. The primary motivation is to reduce the >>> less-tested code paths, pre-z10. Secondarily, this allows the >>> unconditional use of TCG_TARGET_HAS_direct_jump, which might be more >>> important for performance than any slight increase in code size. >>> >>> Signed-off-by: Richard Henderson <richard.henderson@linaro.org> >>> --- >>> tcg/s390x/tcg-target.h | 2 +- >>> tcg/s390x/tcg-target.c.inc | 176 +++++-------------------------------- >>> 2 files changed, 23 insertions(+), 155 deletions(-) >> >> Reviewed-by: Ilya Leoshkevich <iii@linux.ibm.com> >> >> I have a few questions/ideas for the future below. >>> diff --git a/tcg/s390x/tcg-target.h b/tcg/s390x/tcg-target.h >>> index 22d70d431b..645f522058 100644 >>> --- a/tcg/s390x/tcg-target.h >>> +++ b/tcg/s390x/tcg-target.h >>> @@ -103,7 +103,7 @@ extern uint64_t s390_facilities[3]; >>> #define TCG_TARGET_HAS_mulsh_i32 0 >>> #define TCG_TARGET_HAS_extrl_i64_i32 0 >>> #define TCG_TARGET_HAS_extrh_i64_i32 0 >>> -#define TCG_TARGET_HAS_direct_jump HAVE_FACILITY(GEN_INST_EXT) >>> +#define TCG_TARGET_HAS_direct_jump 1 >> >> This change doesn't seem to affect that, but what is the minimum >> supported s390x qemu host? z900? > > Possibly z990, if I'm reading the gcc processor_flags_table[] correctly; > long-displacement-facility is definitely a minimum. > > We probably should revisit what the minimum for TCG should be, assert those > features at startup, and drop the corresponding runtime tests. If we consider the official IBM support statement: https://www.ibm.com/support/pages/system/files/inline-files/IBM%20Mainframe%20Life%20Cycle%20History%20V2.10%20-%20Sept%2013%202022_1.pdf ... that would mean that the z10 and all older machines are not supported anymore. Thomas
On 12/7/22 01:45, Thomas Huth wrote: > On 06/12/2022 23.22, Richard Henderson wrote: >> On 12/6/22 13:29, Ilya Leoshkevich wrote: >>> This change doesn't seem to affect that, but what is the minimum >>> supported s390x qemu host? z900? >> >> Possibly z990, if I'm reading the gcc processor_flags_table[] correctly; >> long-displacement-facility is definitely a minimum. >> >> We probably should revisit what the minimum for TCG should be, assert those features at >> startup, and drop the corresponding runtime tests. > > If we consider the official IBM support statement: > > https://www.ibm.com/support/pages/system/files/inline-files/IBM%20Mainframe%20Life%20Cycle%20History%20V2.10%20-%20Sept%2013%202022_1.pdf > > ... that would mean that the z10 and all older machines are not supported anymore. Thanks for the pointer. It would appear that z114 exits support at the end of this month, which would leave z12 as minimum supported cpu. Even assuming z196 gets us extended-immediate, general-insn-extension, load-on-condition, and distinct-operands, which are all quite important to TCG, and constitute almost all of the current runtime checks. The other metric would be matching the set of supported cpus from the set of supported os distributions, but I would be ready to believe z196 is below the minimum there too. r~
On Wed, 2022-12-07 at 08:55 -0600, Richard Henderson wrote: > On 12/7/22 01:45, Thomas Huth wrote: > > On 06/12/2022 23.22, Richard Henderson wrote: > > > On 12/6/22 13:29, Ilya Leoshkevich wrote: > > > > This change doesn't seem to affect that, but what is the > > > > minimum > > > > supported s390x qemu host? z900? > > > > > > Possibly z990, if I'm reading the gcc processor_flags_table[] > > > correctly; > > > long-displacement-facility is definitely a minimum. > > > > > > We probably should revisit what the minimum for TCG should be, > > > assert those features at > > > startup, and drop the corresponding runtime tests. > > > > If we consider the official IBM support statement: > > > > https://www.ibm.com/support/pages/system/files/inline-files/IBM%20Mainframe%20Life%20Cycle%20History%20V2.10%20-%20Sept%2013%202022_1.pdf > > > > ... that would mean that the z10 and all older machines are not > > supported anymore. > > Thanks for the pointer. It would appear that z114 exits support at > the end of this month, > which would leave z12 as minimum supported cpu. > > Even assuming z196 gets us extended-immediate, general-insn- > extension, load-on-condition, > and distinct-operands, which are all quite important to TCG, and > constitute almost all of > the current runtime checks. > > The other metric would be matching the set of supported cpus from the > set of supported os > distributions, but I would be ready to believe z196 is below the > minimum there too. > > > r~ I think it should be safe to raise the minimum required hardware for TCG to z196: * The oldest supported RHEL is v7, it requires z196: https://access.redhat.com/product-life-cycles/ https://access.redhat.com/documentation/en-us/red_hat_enterprise_linux/7/html/installation_guide/chap-installation-planning-s390 * The oldest supported SLES is v12, it requires z196: https://www.suse.com/de-de/lifecycle/ https://documentation.suse.com/sles/12-SP5/html/SLES-all/cha-zseries.html * The oldest supported Ubuntu is v16.04, it requires zEC12+: https://wiki.ubuntu.com/S390X Best regards, Ilya
Am 07.12.22 um 21:40 schrieb Ilya Leoshkevich: > On Wed, 2022-12-07 at 08:55 -0600, Richard Henderson wrote: >> On 12/7/22 01:45, Thomas Huth wrote: >>> On 06/12/2022 23.22, Richard Henderson wrote: >>>> On 12/6/22 13:29, Ilya Leoshkevich wrote: >>>>> This change doesn't seem to affect that, but what is the >>>>> minimum >>>>> supported s390x qemu host? z900? >>>> >>>> Possibly z990, if I'm reading the gcc processor_flags_table[] >>>> correctly; >>>> long-displacement-facility is definitely a minimum. >>>> >>>> We probably should revisit what the minimum for TCG should be, >>>> assert those features at >>>> startup, and drop the corresponding runtime tests. >>> >>> If we consider the official IBM support statement: >>> >>> https://www.ibm.com/support/pages/system/files/inline-files/IBM%20Mainframe%20Life%20Cycle%20History%20V2.10%20-%20Sept%2013%202022_1.pdf >>> >>> ... that would mean that the z10 and all older machines are not >>> supported anymore. >> >> Thanks for the pointer. It would appear that z114 exits support at >> the end of this month, >> which would leave z12 as minimum supported cpu. >> >> Even assuming z196 gets us extended-immediate, general-insn- >> extension, load-on-condition, >> and distinct-operands, which are all quite important to TCG, and >> constitute almost all of >> the current runtime checks. >> >> The other metric would be matching the set of supported cpus from the >> set of supported os >> distributions, but I would be ready to believe z196 is below the >> minimum there too. >> >> >> r~ > > I think it should be safe to raise the minimum required hardware for > TCG to z196: We recently raised the minimum hardware for the Linux kernel to be z10, so that would be super-safe, but z196 is certainly a sane minimum. > > * The oldest supported RHEL is v7, it requires z196: > > https://access.redhat.com/product-life-cycles/ > https://access.redhat.com/documentation/en-us/red_hat_enterprise_linux/7/html/installation_guide/chap-installation-planning-s390 > > * The oldest supported SLES is v12, it requires z196: > > https://www.suse.com/de-de/lifecycle/ > https://documentation.suse.com/sles/12-SP5/html/SLES-all/cha-zseries.html > > * The oldest supported Ubuntu is v16.04, it requires zEC12+: > https://wiki.ubuntu.com/S390X > > Best regards, > Ilya
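If the baseline is raised along these lines, the "assert those features at startup" idea
mentioned above could look roughly like the sketch below.  It is illustrative only:
HAVE_FACILITY(), EXT_IMM and GEN_INST_EXT already exist in the backend, while the function
name and the LOAD_ON_COND/DISTINCT_OPS facility names are assumptions here, not something
this patch adds.

    /* Sketch: refuse to run TCG on hosts below the chosen baseline, so the
       corresponding runtime HAVE_FACILITY() tests can be dropped elsewhere.
       error_report() comes from qemu/error-report.h. */
    static void tcg_target_check_facilities(void)
    {
        if (!HAVE_FACILITY(EXT_IMM) || !HAVE_FACILITY(GEN_INST_EXT) ||
            !HAVE_FACILITY(LOAD_ON_COND) || !HAVE_FACILITY(DISTINCT_OPS)) {
            error_report("TCG requires at least a z196-class s390x host CPU");
            exit(EXIT_FAILURE);
        }
    }

Calling something like this from tcg_target_init() would make the minimum explicit instead
of failing later with an operation exception on an unsupported instruction.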
On Tue, 2022-12-06 at 16:22 -0600, Richard Henderson wrote: > On 12/6/22 13:29, Ilya Leoshkevich wrote: > > On Thu, Dec 01, 2022 at 10:51:49PM -0800, Richard Henderson wrote: > > > This reverts 829e1376d940 ("tcg/s390: Introduce TCG_REG_TB"), and > > > several follow-up patches. The primary motivation is to reduce > > > the > > > less-tested code paths, pre-z10. Secondarily, this allows the > > > unconditional use of TCG_TARGET_HAS_direct_jump, which might be > > > more > > > important for performance than any slight increase in code size. > > > > > > Signed-off-by: Richard Henderson <richard.henderson@linaro.org> > > > --- > > > tcg/s390x/tcg-target.h | 2 +- > > > tcg/s390x/tcg-target.c.inc | 176 +++++------------------------- > > > ------- > > > 2 files changed, 23 insertions(+), 155 deletions(-) > > > > Reviewed-by: Ilya Leoshkevich <iii@linux.ibm.com> > > > > I have a few questions/ideas for the future below. > > > ... > > > > > - /* Use the constant pool if USE_REG_TB, but not for small > > > constants. */ > > > - if (maybe_out_small_movi(s, type, TCG_TMP0, val)) { > > > - if (type == TCG_TYPE_I32) { > > > - tcg_out_insn(s, RR, XR, dest, TCG_TMP0); > > > - } else { > > > - tcg_out_insn(s, RRE, XGR, dest, TCG_TMP0); > > > - } > > > - } else if (USE_REG_TB) { > > > - tcg_out_insn(s, RXY, XG, dest, TCG_REG_TB, TCG_REG_NONE, > > > 0); > > > - new_pool_label(s, val, R_390_20, s->code_ptr - 2, > > > - tcg_tbrel_diff(s, NULL)); > > > + tcg_out_movi(s, type, TCG_TMP0, val); > > > + if (type == TCG_TYPE_I32) { > > > + tcg_out_insn(s, RR, XR, dest, TCG_TMP0); > > > } else { > > > - /* Perform the xor by parts. */ > > > - tcg_debug_assert(HAVE_FACILITY(EXT_IMM)); > > > - if (val & 0xffffffff) { > > > - tcg_out_insn(s, RIL, XILF, dest, val); > > > - } > > > - if (val > 0xffffffff) { > > > - tcg_out_insn(s, RIL, XIHF, dest, val >> 32); > > > - } > > > + tcg_out_insn(s, RRE, XGR, dest, TCG_TMP0); > > > } > > > } > > > > Wouldn't it be worth keeping XILF/XIFH here? > > I don't know. It's difficult for me to guess whether a dependency > chain like > > val -> xor -> xor > > (3 insns with serial dependencies) is better than > > val --> xor > load -/ > > (3 insns, but only one serial dependency) is better. But there may > also be instruction > fusion going on at the micro-architectural level, so that there's > really only one xor. > > If you have suggestions, I'm all ears. I ran some experiments, and it turned out you were right: xilf+xihf is slower exactly because of dependency chains. So no need to change anything here and sorry for the noise. > > I don't have any numbers right now, but it looks more > > compact/efficient > > than a load + XGR. > > If we assume general-instruction-extension-facility (z10?), LGRL + > XGR is smaller than > XILF + XIFH, ignoring the constant pool entry which might be shared, > and modulo the µarch > questions above. > > > > Same for OGR above; I even wonder if both implementations could be > > unified. > > Sadly not, because of OILL et al. There are no 16-bit xor immediate > insns. 
> > > > + /* > > > + * branch displacement must be aligned for atomic > > > patching; > > > + * see if we need to add extra nop before branch > > > + */ > > > + if (!QEMU_PTR_IS_ALIGNED(s->code_ptr + 1, 4)) { > > > + tcg_out16(s, NOP); > > > } > > > + tcg_out16(s, RIL_BRCL | (S390_CC_ALWAYS << 4)); > > > + s->tb_jmp_insn_offset[a0] = tcg_current_code_size(s); > > > + tcg_out32(s, 0); > > > set_jmp_reset_offset(s, a0); > > > > This seems to work in practice, but I don't think patching branch > > offsets is allowed by PoP (in a multi-threaded environment). For > > example, I had to do [2] in order to work around this limitation in > > ftrace. > > Really? How does the processor distinguish between overwriting > opcode/condition vs > overwriting immediate operand when invalidating cached instructions? The problem is that nothing in PoP prevents a CPU from fetching instruction bytes one by one, in random order and more than one time. It's not that this is necessarily going to happen, rather, it's just that this has never been verified for all instructions and formally stated. The observation that I use in the ftrace patch is not formalized either, but it's specific to a single instruction and should hold in practice for the existing hardware to the best of my knowledge. > If overwriting operand truly isn't correct, then I think we have to > use indirect branch > always for goto_tb. > > > A third benefit seems to be that we now have one more register to > > allocate. > > Yes. It's call clobbered, so it isn't live so often, but sometimes. > > > r~
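For reference, the patching being debated happens outside this hunk, in the backend's
tb_target_set_jmp_target() helper.  The sketch below shows the general shape of such a
helper (an atomic 32-bit store into the BRCL displacement field, which is why the
displacement must be 4-byte aligned); treat it as an illustration of the mechanism, not a
quotation of the current source:

    /* Sketch: retarget a previously emitted BRCL by overwriting only its 32-bit
       displacement.  jmp_rx/jmp_rw point at the displacement field; the BRCL
       opcode itself starts 2 bytes earlier. */
    static inline void tb_target_set_jmp_target(uintptr_t tc_ptr, uintptr_t jmp_rx,
                                                uintptr_t jmp_rw, uintptr_t addr)
    {
        intptr_t disp = addr - (jmp_rx - 2);       /* relative to the insn start */
        qatomic_set((int32_t *)jmp_rw, disp / 2);  /* s390x displacements count halfwords */
    }

Ilya's concern is that the PoP does not formally guarantee that a CPU concurrently
executing the BRCL observes this store as a single unit.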
On Tue, 2022-12-06 at 18:42 -0600, Richard Henderson wrote: > On 12/6/22 16:22, Richard Henderson wrote: > > > Wouldn't it be worth keeping XILF/XIFH here? > > > > I don't know. It's difficult for me to guess whether a dependency > > chain like > > > > val -> xor -> xor > > > > (3 insns with serial dependencies) is better than > > > > val --> xor > > load -/ > > > > (3 insns, but only one serial dependency) is better. But there may > > also be instruction > > fusion going on at the micro-architectural level, so that there's > > really only one xor. > > > > If you have suggestions, I'm all ears. > > Related microarchitectural question: > > If a 32-bit insn and a 64-bit insn have a similar size encoding (and > perhaps even if they > don't), is it better to produce a 64-bit output so that the hw > doesn't have a false > dependency on the upper 32-bits of the register? > > Just wondering whether most of the distinction between 32-bit and 64- > bit opcodes ought to > be discarded, simplifying code generation. The only items that seem > most likely to have > real execution time differences are multiply and divide. I think this will definitely lead to false dependencies if one produces 32 bits in one place, and then consumes 64 in the other. But if this idea is applied consistently, then there should be hopefully not so many such instances? Two thing come to mind here: memory and CC generation. The first is probably not very important: we can implement 32-bit loads with lgf, which sign-extends to 64 bits. CC generation can be tricker: for conditional jumps it's still going to be okay, since the code would produce 64-bit values and consume 32-bit ones, but if the back-end needs a CC from a 32-bit addition, then we would probably need to sign-extend the result in order to get rid of a false dependency later on. However, based on a quick inspection I could not find any such instances. So using 64-bit operations instead of 32-bit ones would be an interesting experiment. Best regards, Ilya
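As a small illustration of the lgf point: the difference between a 32-bit load, which
leaves the upper half of the destination untouched (a partial-register write), and a
sign-extending 64-bit load is a single opcode.  Both RXY forms are used elsewhere in the
backend; dest/base/ofs are placeholders here, and whether to switch is exactly the open
question above.

    /* 32-bit load: only bits 31..0 of dest are written (partial update). */
    tcg_out_insn(s, RXY, LY, dest, base, TCG_REG_NONE, ofs);

    /* Sign-extending load: writes all 64 bits of dest, so a later 64-bit
       consumer has no dependency on the register's previous contents. */
    tcg_out_insn(s, RXY, LGF, dest, base, TCG_REG_NONE, ofs);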
diff --git a/tcg/s390x/tcg-target.h b/tcg/s390x/tcg-target.h index 22d70d431b..645f522058 100644 --- a/tcg/s390x/tcg-target.h +++ b/tcg/s390x/tcg-target.h @@ -103,7 +103,7 @@ extern uint64_t s390_facilities[3]; #define TCG_TARGET_HAS_mulsh_i32 0 #define TCG_TARGET_HAS_extrl_i64_i32 0 #define TCG_TARGET_HAS_extrh_i64_i32 0 -#define TCG_TARGET_HAS_direct_jump HAVE_FACILITY(GEN_INST_EXT) +#define TCG_TARGET_HAS_direct_jump 1 #define TCG_TARGET_HAS_qemu_st8_i32 0 #define TCG_TARGET_HAS_div2_i64 1 diff --git a/tcg/s390x/tcg-target.c.inc b/tcg/s390x/tcg-target.c.inc index cb00bb6999..8a4bec0a28 100644 --- a/tcg/s390x/tcg-target.c.inc +++ b/tcg/s390x/tcg-target.c.inc @@ -65,12 +65,6 @@ /* A scratch register that may be be used throughout the backend. */ #define TCG_TMP0 TCG_REG_R1 -/* A scratch register that holds a pointer to the beginning of the TB. - We don't need this when we have pc-relative loads with the general - instructions extension facility. */ -#define TCG_REG_TB TCG_REG_R12 -#define USE_REG_TB (!HAVE_FACILITY(GEN_INST_EXT)) - #ifndef CONFIG_SOFTMMU #define TCG_GUEST_BASE_REG TCG_REG_R13 #endif @@ -813,8 +807,8 @@ static bool maybe_out_small_movi(TCGContext *s, TCGType type, } /* load a register with an immediate value */ -static void tcg_out_movi_int(TCGContext *s, TCGType type, TCGReg ret, - tcg_target_long sval, bool in_prologue) +static void tcg_out_movi(TCGContext *s, TCGType type, + TCGReg ret, tcg_target_long sval) { tcg_target_ulong uval; @@ -853,14 +847,6 @@ static void tcg_out_movi_int(TCGContext *s, TCGType type, TCGReg ret, tcg_out_insn(s, RIL, LARL, ret, off); return; } - } else if (USE_REG_TB && !in_prologue) { - ptrdiff_t off = tcg_tbrel_diff(s, (void *)sval); - if (off == sextract64(off, 0, 20)) { - /* This is certain to be an address within TB, and therefore - OFF will be negative; don't try RX_LA. */ - tcg_out_insn(s, RXY, LAY, ret, TCG_REG_TB, TCG_REG_NONE, off); - return; - } } /* A 32-bit unsigned value can be loaded in 2 insns. And given @@ -876,10 +862,6 @@ static void tcg_out_movi_int(TCGContext *s, TCGType type, TCGReg ret, if (HAVE_FACILITY(GEN_INST_EXT)) { tcg_out_insn(s, RIL, LGRL, ret, 0); new_pool_label(s, sval, R_390_PC32DBL, s->code_ptr - 2, 2); - } else if (USE_REG_TB && !in_prologue) { - tcg_out_insn(s, RXY, LG, ret, TCG_REG_TB, TCG_REG_NONE, 0); - new_pool_label(s, sval, R_390_20, s->code_ptr - 2, - tcg_tbrel_diff(s, NULL)); } else { TCGReg base = ret ? ret : TCG_TMP0; tcg_out_insn(s, RIL, LARL, base, 0); @@ -888,12 +870,6 @@ static void tcg_out_movi_int(TCGContext *s, TCGType type, TCGReg ret, } } -static void tcg_out_movi(TCGContext *s, TCGType type, - TCGReg ret, tcg_target_long sval) -{ - tcg_out_movi_int(s, type, ret, sval, false); -} - /* Emit a load/store type instruction. Inputs are: DATA: The register to be loaded or stored. BASE+OFS: The effective address. 
@@ -1020,35 +996,6 @@ static inline bool tcg_out_sti(TCGContext *s, TCGType type, TCGArg val, return false; } -/* load data from an absolute host address */ -static void tcg_out_ld_abs(TCGContext *s, TCGType type, - TCGReg dest, const void *abs) -{ - intptr_t addr = (intptr_t)abs; - - if (HAVE_FACILITY(GEN_INST_EXT) && !(addr & 1)) { - ptrdiff_t disp = tcg_pcrel_diff(s, abs) >> 1; - if (disp == (int32_t)disp) { - if (type == TCG_TYPE_I32) { - tcg_out_insn(s, RIL, LRL, dest, disp); - } else { - tcg_out_insn(s, RIL, LGRL, dest, disp); - } - return; - } - } - if (USE_REG_TB) { - ptrdiff_t disp = tcg_tbrel_diff(s, abs); - if (disp == sextract64(disp, 0, 20)) { - tcg_out_ld(s, type, dest, TCG_REG_TB, disp); - return; - } - } - - tcg_out_movi(s, TCG_TYPE_PTR, dest, addr & ~0xffff); - tcg_out_ld(s, type, dest, dest, addr & 0xffff); -} - static inline void tcg_out_risbg(TCGContext *s, TCGReg dest, TCGReg src, int msb, int lsb, int ofs, int z) { @@ -1243,17 +1190,7 @@ static void tgen_andi(TCGContext *s, TCGType type, TCGReg dest, uint64_t val) return; } - /* Use the constant pool if USE_REG_TB, but not for small constants. */ - if (USE_REG_TB) { - if (!maybe_out_small_movi(s, type, TCG_TMP0, val)) { - tcg_out_insn(s, RXY, NG, dest, TCG_REG_TB, TCG_REG_NONE, 0); - new_pool_label(s, val & valid, R_390_20, s->code_ptr - 2, - tcg_tbrel_diff(s, NULL)); - return; - } - } else { - tcg_out_movi(s, type, TCG_TMP0, val); - } + tcg_out_movi(s, type, TCG_TMP0, val); if (type == TCG_TYPE_I32) { tcg_out_insn(s, RR, NR, dest, TCG_TMP0); } else { @@ -1297,24 +1234,11 @@ static void tgen_ori(TCGContext *s, TCGType type, TCGReg dest, uint64_t val) } } - /* Use the constant pool if USE_REG_TB, but not for small constants. */ - if (maybe_out_small_movi(s, type, TCG_TMP0, val)) { - if (type == TCG_TYPE_I32) { - tcg_out_insn(s, RR, OR, dest, TCG_TMP0); - } else { - tcg_out_insn(s, RRE, OGR, dest, TCG_TMP0); - } - } else if (USE_REG_TB) { - tcg_out_insn(s, RXY, OG, dest, TCG_REG_TB, TCG_REG_NONE, 0); - new_pool_label(s, val, R_390_20, s->code_ptr - 2, - tcg_tbrel_diff(s, NULL)); + tcg_out_movi(s, type, TCG_TMP0, val); + if (type == TCG_TYPE_I32) { + tcg_out_insn(s, RR, OR, dest, TCG_TMP0); } else { - /* Perform the OR via sequential modifications to the high and - low parts. Do this via recursion to handle 16-bit vs 32-bit - masks in each half. */ - tcg_debug_assert(HAVE_FACILITY(EXT_IMM)); - tgen_ori(s, type, dest, val & 0x00000000ffffffffull); - tgen_ori(s, type, dest, val & 0xffffffff00000000ull); + tcg_out_insn(s, RRE, OGR, dest, TCG_TMP0); } } @@ -1332,26 +1256,11 @@ static void tgen_xori(TCGContext *s, TCGType type, TCGReg dest, uint64_t val) } } - /* Use the constant pool if USE_REG_TB, but not for small constants. */ - if (maybe_out_small_movi(s, type, TCG_TMP0, val)) { - if (type == TCG_TYPE_I32) { - tcg_out_insn(s, RR, XR, dest, TCG_TMP0); - } else { - tcg_out_insn(s, RRE, XGR, dest, TCG_TMP0); - } - } else if (USE_REG_TB) { - tcg_out_insn(s, RXY, XG, dest, TCG_REG_TB, TCG_REG_NONE, 0); - new_pool_label(s, val, R_390_20, s->code_ptr - 2, - tcg_tbrel_diff(s, NULL)); + tcg_out_movi(s, type, TCG_TMP0, val); + if (type == TCG_TYPE_I32) { + tcg_out_insn(s, RR, XR, dest, TCG_TMP0); } else { - /* Perform the xor by parts. 
*/ - tcg_debug_assert(HAVE_FACILITY(EXT_IMM)); - if (val & 0xffffffff) { - tcg_out_insn(s, RIL, XILF, dest, val); - } - if (val > 0xffffffff) { - tcg_out_insn(s, RIL, XIHF, dest, val >> 32); - } + tcg_out_insn(s, RRE, XGR, dest, TCG_TMP0); } } @@ -1395,19 +1304,6 @@ static int tgen_cmp(TCGContext *s, TCGType type, TCGCond c, TCGReg r1, if (maybe_out_small_movi(s, type, TCG_TMP0, c2)) { c2 = TCG_TMP0; /* fall through to reg-reg */ - } else if (USE_REG_TB) { - if (type == TCG_TYPE_I32) { - op = (is_unsigned ? RXY_CLY : RXY_CY); - tcg_out_insn_RXY(s, op, r1, TCG_REG_TB, TCG_REG_NONE, 0); - new_pool_label(s, (uint32_t)c2, R_390_20, s->code_ptr - 2, - 4 - tcg_tbrel_diff(s, NULL)); - } else { - op = (is_unsigned ? RXY_CLG : RXY_CG); - tcg_out_insn_RXY(s, op, r1, TCG_REG_TB, TCG_REG_NONE, 0); - new_pool_label(s, c2, R_390_20, s->code_ptr - 2, - tcg_tbrel_diff(s, NULL)); - } - goto exit; } else { if (type == TCG_TYPE_I32) { op = (is_unsigned ? RIL_CLRL : RIL_CRL); @@ -2101,43 +1997,22 @@ static inline void tcg_out_op(TCGContext *s, TCGOpcode opc, case INDEX_op_goto_tb: a0 = args[0]; - if (s->tb_jmp_insn_offset) { - /* - * branch displacement must be aligned for atomic patching; - * see if we need to add extra nop before branch - */ - if (!QEMU_PTR_IS_ALIGNED(s->code_ptr + 1, 4)) { - tcg_out16(s, NOP); - } - tcg_debug_assert(!USE_REG_TB); - tcg_out16(s, RIL_BRCL | (S390_CC_ALWAYS << 4)); - s->tb_jmp_insn_offset[a0] = tcg_current_code_size(s); - s->code_ptr += 2; - } else { - /* load address stored at s->tb_jmp_target_addr + a0 */ - tcg_out_ld_abs(s, TCG_TYPE_PTR, TCG_REG_TB, - tcg_splitwx_to_rx(s->tb_jmp_target_addr + a0)); - /* and go there */ - tcg_out_insn(s, RR, BCR, S390_CC_ALWAYS, TCG_REG_TB); + tcg_debug_assert(s->tb_jmp_insn_offset); + /* + * branch displacement must be aligned for atomic patching; + * see if we need to add extra nop before branch + */ + if (!QEMU_PTR_IS_ALIGNED(s->code_ptr + 1, 4)) { + tcg_out16(s, NOP); } + tcg_out16(s, RIL_BRCL | (S390_CC_ALWAYS << 4)); + s->tb_jmp_insn_offset[a0] = tcg_current_code_size(s); + tcg_out32(s, 0); set_jmp_reset_offset(s, a0); - - /* For the unlinked path of goto_tb, we need to reset - TCG_REG_TB to the beginning of this TB. */ - if (USE_REG_TB) { - int ofs = -tcg_current_code_size(s); - /* All TB are restricted to 64KiB by unwind info. 
*/ - tcg_debug_assert(ofs == sextract64(ofs, 0, 20)); - tcg_out_insn(s, RXY, LAY, TCG_REG_TB, - TCG_REG_TB, TCG_REG_NONE, ofs); - } break; case INDEX_op_goto_ptr: a0 = args[0]; - if (USE_REG_TB) { - tcg_out_mov(s, TCG_TYPE_PTR, TCG_REG_TB, a0); - } tcg_out_insn(s, RR, BCR, S390_CC_ALWAYS, a0); break; @@ -3405,9 +3280,6 @@ static void tcg_target_init(TCGContext *s) /* XXX many insns can't be used with R0, so we better avoid it for now */ tcg_regset_set_reg(s->reserved_regs, TCG_REG_R0); tcg_regset_set_reg(s->reserved_regs, TCG_REG_CALL_STACK); - if (USE_REG_TB) { - tcg_regset_set_reg(s->reserved_regs, TCG_REG_TB); - } } #define FRAME_SIZE ((int)(TCG_TARGET_CALL_STACK_OFFSET \ @@ -3428,16 +3300,12 @@ static void tcg_target_qemu_prologue(TCGContext *s) #ifndef CONFIG_SOFTMMU if (guest_base >= 0x80000) { - tcg_out_movi_int(s, TCG_TYPE_PTR, TCG_GUEST_BASE_REG, guest_base, true); + tcg_out_movi(s, TCG_TYPE_PTR, TCG_GUEST_BASE_REG, guest_base); tcg_regset_set_reg(s->reserved_regs, TCG_GUEST_BASE_REG); } #endif tcg_out_mov(s, TCG_TYPE_PTR, TCG_AREG0, tcg_target_call_iarg_regs[0]); - if (USE_REG_TB) { - tcg_out_mov(s, TCG_TYPE_PTR, TCG_REG_TB, - tcg_target_call_iarg_regs[1]); - } /* br %r3 (go to TB) */ tcg_out_insn(s, RR, BCR, S390_CC_ALWAYS, tcg_target_call_iarg_regs[1]);
This reverts 829e1376d940 ("tcg/s390: Introduce TCG_REG_TB"), and
several follow-up patches.  The primary motivation is to reduce the
less-tested code paths, pre-z10.  Secondarily, this allows the
unconditional use of TCG_TARGET_HAS_direct_jump, which might be more
important for performance than any slight increase in code size.

Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/s390x/tcg-target.h     |   2 +-
 tcg/s390x/tcg-target.c.inc | 176 +++++--------------------------------
 2 files changed, 23 insertions(+), 155 deletions(-)