Message ID | 20191015012936.16275-1-richard.henderson@linaro.org |
---|---|
State | New |
Headers | show |
Series | tcg/arm: Expand epilogue inline | expand |
Hi Richard, On 10/15/19 3:29 AM, Richard Henderson wrote: > It is, after all, just two instructions. > > Profiling on a cortex-a15, using -d nochain to increase the number > of exit_tb that are executed, shows a minor improvement of 0.5%. > > Signed-off-by: Richard Henderson <richard.henderson@linaro.org> > --- > tcg/arm/tcg-target.inc.c | 32 +++++++++++++------------------- > 1 file changed, 13 insertions(+), 19 deletions(-) > > diff --git a/tcg/arm/tcg-target.inc.c b/tcg/arm/tcg-target.inc.c > index 94d80d79d1..2a9ebfe25a 100644 > --- a/tcg/arm/tcg-target.inc.c > +++ b/tcg/arm/tcg-target.inc.c > @@ -1745,24 +1745,18 @@ static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args, bool is64) > #endif > } > > -static tcg_insn_unit *tb_ret_addr; > +static void tcg_out_epilogue(TCGContext *s); > > -static inline void tcg_out_op(TCGContext *s, TCGOpcode opc, > - const TCGArg *args, const int *const_args) > +static void tcg_out_op(TCGContext *s, TCGOpcode opc, > + const TCGArg *args, const int *const_args) > { > TCGArg a0, a1, a2, a3, a4, a5; > int c; > > switch (opc) { > case INDEX_op_exit_tb: > - /* Reuse the zeroing that exists for goto_ptr. */ > - a0 = args[0]; > - if (a0 == 0) { > - tcg_out_goto(s, COND_AL, s->code_gen_epilogue); > - } else { > - tcg_out_movi32(s, COND_AL, TCG_REG_R0, args[0]); > - tcg_out_goto(s, COND_AL, tb_ret_addr); > - } > + tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_R0, args[0]); > + tcg_out_epilogue(s); > break; > case INDEX_op_goto_tb: > { > @@ -2284,19 +2278,17 @@ static void tcg_out_nop_fill(tcg_insn_unit *p, int count) > + TCG_TARGET_STACK_ALIGN - 1) \ > & -TCG_TARGET_STACK_ALIGN) > > +#define STACK_ADDEND (FRAME_SIZE - PUSH_SIZE) > + > static void tcg_target_qemu_prologue(TCGContext *s) > { > - int stack_addend; > - > /* Calling convention requires us to save r4-r11 and lr. */ > /* stmdb sp!, { r4 - r11, lr } */ > tcg_out32(s, (COND_AL << 28) | 0x092d4ff0); > > /* Reserve callee argument and tcg temp space. 
*/ > - stack_addend = FRAME_SIZE - PUSH_SIZE; > - > tcg_out_dat_rI(s, COND_AL, ARITH_SUB, TCG_REG_CALL_STACK, > - TCG_REG_CALL_STACK, stack_addend, 1); > + TCG_REG_CALL_STACK, STACK_ADDEND, 1); > tcg_set_frame(s, TCG_REG_CALL_STACK, TCG_STATIC_CALL_ARGS_SIZE, > CPU_TEMP_BUF_NLONGS * sizeof(long)); > > @@ -2310,11 +2302,13 @@ static void tcg_target_qemu_prologue(TCGContext *s) > */ > s->code_gen_epilogue = s->code_ptr; > tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_R0, 0); > + tcg_out_epilogue(s); > +} > > - /* TB epilogue */ > - tb_ret_addr = s->code_ptr; > +static void tcg_out_epilogue(TCGContext *s)

Do you mind splitting this patch in two? First use tcg_out_epilogue(), then optimize tcg_out_op().

> +{ > tcg_out_dat_rI(s, COND_AL, ARITH_ADD, TCG_REG_CALL_STACK, > - TCG_REG_CALL_STACK, stack_addend, 1); > + TCG_REG_CALL_STACK, STACK_ADDEND, 1); > > /* ldmia sp!, { r4 - r11, pc } */ > tcg_out32(s, (COND_AL << 28) | 0x08bd8ff0); >
diff --git a/tcg/arm/tcg-target.inc.c b/tcg/arm/tcg-target.inc.c index 94d80d79d1..2a9ebfe25a 100644 --- a/tcg/arm/tcg-target.inc.c +++ b/tcg/arm/tcg-target.inc.c @@ -1745,24 +1745,18 @@ static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args, bool is64) #endif } -static tcg_insn_unit *tb_ret_addr; +static void tcg_out_epilogue(TCGContext *s); -static inline void tcg_out_op(TCGContext *s, TCGOpcode opc, - const TCGArg *args, const int *const_args) +static void tcg_out_op(TCGContext *s, TCGOpcode opc, + const TCGArg *args, const int *const_args) { TCGArg a0, a1, a2, a3, a4, a5; int c; switch (opc) { case INDEX_op_exit_tb: - /* Reuse the zeroing that exists for goto_ptr. */ - a0 = args[0]; - if (a0 == 0) { - tcg_out_goto(s, COND_AL, s->code_gen_epilogue); - } else { - tcg_out_movi32(s, COND_AL, TCG_REG_R0, args[0]); - tcg_out_goto(s, COND_AL, tb_ret_addr); - } + tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_R0, args[0]); + tcg_out_epilogue(s); break; case INDEX_op_goto_tb: { @@ -2284,19 +2278,17 @@ static void tcg_out_nop_fill(tcg_insn_unit *p, int count) + TCG_TARGET_STACK_ALIGN - 1) \ & -TCG_TARGET_STACK_ALIGN) +#define STACK_ADDEND (FRAME_SIZE - PUSH_SIZE) + static void tcg_target_qemu_prologue(TCGContext *s) { - int stack_addend; - /* Calling convention requires us to save r4-r11 and lr. */ /* stmdb sp!, { r4 - r11, lr } */ tcg_out32(s, (COND_AL << 28) | 0x092d4ff0); /* Reserve callee argument and tcg temp space. 
*/ - stack_addend = FRAME_SIZE - PUSH_SIZE; - tcg_out_dat_rI(s, COND_AL, ARITH_SUB, TCG_REG_CALL_STACK, - TCG_REG_CALL_STACK, stack_addend, 1); + TCG_REG_CALL_STACK, STACK_ADDEND, 1); tcg_set_frame(s, TCG_REG_CALL_STACK, TCG_STATIC_CALL_ARGS_SIZE, CPU_TEMP_BUF_NLONGS * sizeof(long)); @@ -2310,11 +2302,13 @@ static void tcg_target_qemu_prologue(TCGContext *s) */ s->code_gen_epilogue = s->code_ptr; tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_R0, 0); + tcg_out_epilogue(s); +} - /* TB epilogue */ - tb_ret_addr = s->code_ptr; +static void tcg_out_epilogue(TCGContext *s) +{ tcg_out_dat_rI(s, COND_AL, ARITH_ADD, TCG_REG_CALL_STACK, - TCG_REG_CALL_STACK, stack_addend, 1); + TCG_REG_CALL_STACK, STACK_ADDEND, 1); /* ldmia sp!, { r4 - r11, pc } */ tcg_out32(s, (COND_AL << 28) | 0x08bd8ff0);
It is, after all, just two instructions.

Profiling on a cortex-a15, using -d nochain to increase the number of exit_tb that are executed, shows a minor improvement of 0.5%.

Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
tcg/arm/tcg-target.inc.c | 32 +++++++++++++-------------------
1 file changed, 13 insertions(+), 19 deletions(-)
--
2.17.1