Message ID | 1393339545-22111-1-git-send-email-peter.maydell@linaro.org |
---|---|
State | Superseded |
Headers | show |
On Tue, Feb 25, 2014 at 8:45 AM, Peter Maydell <peter.maydell@linaro.org> wrote: > Implement WFE to yield our timeslice to the next CPU. > This avoids slowdowns in multicore configurations caused > by one core busy-waiting on a spinlock which can't possibly > be unlocked until the other core has an opportunity to run. > This speeds up my test case A15 dual-core boot by a factor > of three (though it is still four or five times slower than > a single-core boot). > > Signed-off-by: Peter Maydell <peter.maydell@linaro.org> > --- > RTH pointed out on IRC that we could also implement this by > using EXCP_HALTED without setting env->halted; however I think > having a separate EXCP_ constant for the yield semantics is > less confusing. > > A full implementation of WFE would be more complicated (there > are a lot of conditions which must cause WFE wakeup including > SEV from different CPUs and timer event streams), so making > it do a yield seems like a useful intermediate fix. > > This seems to be hugely beneficial on my test case (whereas > the recent GIC fix to not accidentally deassert PPIs doesn't > have any effect at all). Rob, do you still see things getting > worse on your test case with this patch if you have the GIC > fix applied too? No, At least with the mptimers, it improves things reducing the sched delay messages. Tested-by: Rob Herring <rob.herring@linaro.org> > > include/exec/cpu-defs.h | 1 + > target-arm/helper.h | 1 + > target-arm/op_helper.c | 9 +++++++++ > target-arm/translate.c | 6 ++++++ > target-arm/translate.h | 2 ++ > 5 files changed, 19 insertions(+) > > diff --git a/include/exec/cpu-defs.h b/include/exec/cpu-defs.h > index 01cd8c7..66a3d46 100644 > --- a/include/exec/cpu-defs.h > +++ b/include/exec/cpu-defs.h > @@ -59,6 +59,7 @@ typedef uint64_t target_ulong; > #define EXCP_HLT 0x10001 /* hlt instruction reached */ > #define EXCP_DEBUG 0x10002 /* cpu stopped after a breakpoint or singlestep */ > #define EXCP_HALTED 0x10003 /* cpu is halted (waiting for external event) */ > +#define EXCP_YIELD 0x10004 /* cpu wants to yield timeslice to another */ > > #define TB_JMP_CACHE_BITS 12 > #define TB_JMP_CACHE_SIZE (1 << TB_JMP_CACHE_BITS) > diff --git a/target-arm/helper.h b/target-arm/helper.h > index 19bd620..118b425 100644 > --- a/target-arm/helper.h > +++ b/target-arm/helper.h > @@ -50,6 +50,7 @@ DEF_HELPER_FLAGS_3(sel_flags, TCG_CALL_NO_RWG_SE, > i32, i32, i32, i32) > DEF_HELPER_2(exception, void, env, i32) > DEF_HELPER_1(wfi, void, env) > +DEF_HELPER_1(wfe, void, env) > > DEF_HELPER_3(cpsr_write, void, env, i32, i32) > DEF_HELPER_1(cpsr_read, i32, env) > diff --git a/target-arm/op_helper.c b/target-arm/op_helper.c > index eb0fccd..f0e2889 100644 > --- a/target-arm/op_helper.c > +++ b/target-arm/op_helper.c > @@ -225,6 +225,15 @@ void HELPER(wfi)(CPUARMState *env) > cpu_loop_exit(env); > } > > +void HELPER(wfe)(CPUARMState *env) > +{ > + /* Don't actually halt the CPU, just yield back to top > + * level loop > + */ > + env->exception_index = EXCP_YIELD; > + cpu_loop_exit(env); > +} > + > void HELPER(exception)(CPUARMState *env, uint32_t excp) > { > env->exception_index = excp; > diff --git a/target-arm/translate.c b/target-arm/translate.c > index 6ccf0ba..0c30e5c 100644 > --- a/target-arm/translate.c > +++ b/target-arm/translate.c > @@ -3939,6 +3939,9 @@ static void gen_nop_hint(DisasContext *s, int val) > s->is_jmp = DISAS_WFI; > break; > case 2: /* wfe */ > + gen_set_pc_im(s, s->pc); > + s->is_jmp = DISAS_WFE; > + break; > case 4: /* sev */ > case 5: /* sevl */ > /* TODO: Implement SEV, SEVL and WFE. May help SMP performance. */ > @@ -10801,6 +10804,9 @@ static inline void gen_intermediate_code_internal(ARMCPU *cpu, > case DISAS_WFI: > gen_helper_wfi(cpu_env); > break; > + case DISAS_WFE: > + gen_helper_wfe(cpu_env); > + break; > case DISAS_SWI: > gen_exception(EXCP_SWI); > break; > diff --git a/target-arm/translate.h b/target-arm/translate.h > index 67da699..2f491f9 100644 > --- a/target-arm/translate.h > +++ b/target-arm/translate.h > @@ -44,6 +44,8 @@ extern TCGv_ptr cpu_env; > * emitting unreachable code at the end of the TB in the A64 decoder > */ > #define DISAS_EXC 6 > +/* WFE */ > +#define DISAS_WFE 7 > > #ifdef TARGET_AARCH64 > void a64_translate_init(void); > -- > 1.9.0 >
diff --git a/include/exec/cpu-defs.h b/include/exec/cpu-defs.h index 01cd8c7..66a3d46 100644 --- a/include/exec/cpu-defs.h +++ b/include/exec/cpu-defs.h @@ -59,6 +59,7 @@ typedef uint64_t target_ulong; #define EXCP_HLT 0x10001 /* hlt instruction reached */ #define EXCP_DEBUG 0x10002 /* cpu stopped after a breakpoint or singlestep */ #define EXCP_HALTED 0x10003 /* cpu is halted (waiting for external event) */ +#define EXCP_YIELD 0x10004 /* cpu wants to yield timeslice to another */ #define TB_JMP_CACHE_BITS 12 #define TB_JMP_CACHE_SIZE (1 << TB_JMP_CACHE_BITS) diff --git a/target-arm/helper.h b/target-arm/helper.h index 19bd620..118b425 100644 --- a/target-arm/helper.h +++ b/target-arm/helper.h @@ -50,6 +50,7 @@ DEF_HELPER_FLAGS_3(sel_flags, TCG_CALL_NO_RWG_SE, i32, i32, i32, i32) DEF_HELPER_2(exception, void, env, i32) DEF_HELPER_1(wfi, void, env) +DEF_HELPER_1(wfe, void, env) DEF_HELPER_3(cpsr_write, void, env, i32, i32) DEF_HELPER_1(cpsr_read, i32, env) diff --git a/target-arm/op_helper.c b/target-arm/op_helper.c index eb0fccd..f0e2889 100644 --- a/target-arm/op_helper.c +++ b/target-arm/op_helper.c @@ -225,6 +225,15 @@ void HELPER(wfi)(CPUARMState *env) cpu_loop_exit(env); } +void HELPER(wfe)(CPUARMState *env) +{ + /* Don't actually halt the CPU, just yield back to top + * level loop + */ + env->exception_index = EXCP_YIELD; + cpu_loop_exit(env); +} + void HELPER(exception)(CPUARMState *env, uint32_t excp) { env->exception_index = excp; diff --git a/target-arm/translate.c b/target-arm/translate.c index 6ccf0ba..0c30e5c 100644 --- a/target-arm/translate.c +++ b/target-arm/translate.c @@ -3939,6 +3939,9 @@ static void gen_nop_hint(DisasContext *s, int val) s->is_jmp = DISAS_WFI; break; case 2: /* wfe */ + gen_set_pc_im(s, s->pc); + s->is_jmp = DISAS_WFE; + break; case 4: /* sev */ case 5: /* sevl */ /* TODO: Implement SEV, SEVL and WFE. May help SMP performance. */ @@ -10801,6 +10804,9 @@ static inline void gen_intermediate_code_internal(ARMCPU *cpu, case DISAS_WFI: gen_helper_wfi(cpu_env); break; + case DISAS_WFE: + gen_helper_wfe(cpu_env); + break; case DISAS_SWI: gen_exception(EXCP_SWI); break; diff --git a/target-arm/translate.h b/target-arm/translate.h index 67da699..2f491f9 100644 --- a/target-arm/translate.h +++ b/target-arm/translate.h @@ -44,6 +44,8 @@ extern TCGv_ptr cpu_env; * emitting unreachable code at the end of the TB in the A64 decoder */ #define DISAS_EXC 6 +/* WFE */ +#define DISAS_WFE 7 #ifdef TARGET_AARCH64 void a64_translate_init(void);
Implement WFE to yield our timeslice to the next CPU. This avoids slowdowns in multicore configurations caused by one core busy-waiting on a spinlock which can't possibly be unlocked until the other core has an opportunity to run. This speeds up my test case A15 dual-core boot by a factor of three (though it is still four or five times slower than a single-core boot). Signed-off-by: Peter Maydell <peter.maydell@linaro.org> --- RTH pointed out on IRC that we could also implement this by using EXCP_HALTED without setting env->halted; however I think having a separate EXCP_ constant for the yield semantics is less confusing. A full implementation of WFE would be more complicated (there are a lot of conditions which must cause WFE wakeup including SEV from different CPUs and timer event streams), so making it do a yield seems like a useful intermediate fix. This seems to be hugely beneficial on my test case (whereas the recent GIC fix to not accidentally deassert PPIs doesn't have any effect at all). Rob, do you still see things getting worse on your test case with this patch if you have the GIC fix applied too? include/exec/cpu-defs.h | 1 + target-arm/helper.h | 1 + target-arm/op_helper.c | 9 +++++++++ target-arm/translate.c | 6 ++++++ target-arm/translate.h | 2 ++ 5 files changed, 19 insertions(+)