diff mbox

target-arm: Implement WFE as a yield operation

Message ID 1393339545-22111-1-git-send-email-peter.maydell@linaro.org
State Superseded
Headers show

Commit Message

Peter Maydell Feb. 25, 2014, 2:45 p.m. UTC
Implement WFE to yield our timeslice to the next CPU.
This avoids slowdowns in multicore configurations caused
by one core busy-waiting on a spinlock which can't possibly
be unlocked until the other core has an opportunity to run.
This speeds up my test case A15 dual-core boot by a factor
of three (though it is still four or five times slower than
a single-core boot).

Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
---
RTH pointed out on IRC that we could also implement this by
using EXCP_HALTED without setting env->halted; however I think
having a separate EXCP_ constant for the yield semantics is
less confusing.

A full implementation of WFE would be more complicated (there
are a lot of conditions which must cause WFE wakeup including
SEV from different CPUs and timer event streams), so making
it do a yield seems like a useful intermediate fix.

This seems to be hugely beneficial on my test case (whereas
the recent GIC fix to not accidentally deassert PPIs doesn't
have any effect at all). Rob, do you still see things getting
worse on your test case with this patch if you have the GIC
fix applied too?

 include/exec/cpu-defs.h | 1 +
 target-arm/helper.h     | 1 +
 target-arm/op_helper.c  | 9 +++++++++
 target-arm/translate.c  | 6 ++++++
 target-arm/translate.h  | 2 ++
 5 files changed, 19 insertions(+)

Comments

Rob Herring Feb. 25, 2014, 4:06 p.m. UTC | #1
On Tue, Feb 25, 2014 at 8:45 AM, Peter Maydell <peter.maydell@linaro.org> wrote:
> Implement WFE to yield our timeslice to the next CPU.
> This avoids slowdowns in multicore configurations caused
> by one core busy-waiting on a spinlock which can't possibly
> be unlocked until the other core has an opportunity to run.
> This speeds up my test case A15 dual-core boot by a factor
> of three (though it is still four or five times slower than
> a single-core boot).
>
> Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
> ---
> RTH pointed out on IRC that we could also implement this by
> using EXCP_HALTED without setting env->halted; however I think
> having a separate EXCP_ constant for the yield semantics is
> less confusing.
>
> A full implementation of WFE would be more complicated (there
> are a lot of conditions which must cause WFE wakeup including
> SEV from different CPUs and timer event streams), so making
> it do a yield seems like a useful intermediate fix.
>
> This seems to be hugely beneficial on my test case (whereas
> the recent GIC fix to not accidentally deassert PPIs doesn't
> have any effect at all). Rob, do you still see things getting
> worse on your test case with this patch if you have the GIC
> fix applied too?

No, At least with the mptimers, it improves things reducing the sched
delay messages.

Tested-by: Rob Herring <rob.herring@linaro.org>

>
>  include/exec/cpu-defs.h | 1 +
>  target-arm/helper.h     | 1 +
>  target-arm/op_helper.c  | 9 +++++++++
>  target-arm/translate.c  | 6 ++++++
>  target-arm/translate.h  | 2 ++
>  5 files changed, 19 insertions(+)
>
> diff --git a/include/exec/cpu-defs.h b/include/exec/cpu-defs.h
> index 01cd8c7..66a3d46 100644
> --- a/include/exec/cpu-defs.h
> +++ b/include/exec/cpu-defs.h
> @@ -59,6 +59,7 @@ typedef uint64_t target_ulong;
>  #define EXCP_HLT        0x10001 /* hlt instruction reached */
>  #define EXCP_DEBUG      0x10002 /* cpu stopped after a breakpoint or singlestep */
>  #define EXCP_HALTED     0x10003 /* cpu is halted (waiting for external event) */
> +#define EXCP_YIELD      0x10004 /* cpu wants to yield timeslice to another */
>
>  #define TB_JMP_CACHE_BITS 12
>  #define TB_JMP_CACHE_SIZE (1 << TB_JMP_CACHE_BITS)
> diff --git a/target-arm/helper.h b/target-arm/helper.h
> index 19bd620..118b425 100644
> --- a/target-arm/helper.h
> +++ b/target-arm/helper.h
> @@ -50,6 +50,7 @@ DEF_HELPER_FLAGS_3(sel_flags, TCG_CALL_NO_RWG_SE,
>                     i32, i32, i32, i32)
>  DEF_HELPER_2(exception, void, env, i32)
>  DEF_HELPER_1(wfi, void, env)
> +DEF_HELPER_1(wfe, void, env)
>
>  DEF_HELPER_3(cpsr_write, void, env, i32, i32)
>  DEF_HELPER_1(cpsr_read, i32, env)
> diff --git a/target-arm/op_helper.c b/target-arm/op_helper.c
> index eb0fccd..f0e2889 100644
> --- a/target-arm/op_helper.c
> +++ b/target-arm/op_helper.c
> @@ -225,6 +225,15 @@ void HELPER(wfi)(CPUARMState *env)
>      cpu_loop_exit(env);
>  }
>
> +void HELPER(wfe)(CPUARMState *env)
> +{
> +    /* Don't actually halt the CPU, just yield back to top
> +     * level loop
> +     */
> +    env->exception_index = EXCP_YIELD;
> +    cpu_loop_exit(env);
> +}
> +
>  void HELPER(exception)(CPUARMState *env, uint32_t excp)
>  {
>      env->exception_index = excp;
> diff --git a/target-arm/translate.c b/target-arm/translate.c
> index 6ccf0ba..0c30e5c 100644
> --- a/target-arm/translate.c
> +++ b/target-arm/translate.c
> @@ -3939,6 +3939,9 @@ static void gen_nop_hint(DisasContext *s, int val)
>          s->is_jmp = DISAS_WFI;
>          break;
>      case 2: /* wfe */
> +        gen_set_pc_im(s, s->pc);
> +        s->is_jmp = DISAS_WFE;
> +        break;
>      case 4: /* sev */
>      case 5: /* sevl */
>          /* TODO: Implement SEV, SEVL and WFE.  May help SMP performance.  */
> @@ -10801,6 +10804,9 @@ static inline void gen_intermediate_code_internal(ARMCPU *cpu,
>          case DISAS_WFI:
>              gen_helper_wfi(cpu_env);
>              break;
> +        case DISAS_WFE:
> +            gen_helper_wfe(cpu_env);
> +            break;
>          case DISAS_SWI:
>              gen_exception(EXCP_SWI);
>              break;
> diff --git a/target-arm/translate.h b/target-arm/translate.h
> index 67da699..2f491f9 100644
> --- a/target-arm/translate.h
> +++ b/target-arm/translate.h
> @@ -44,6 +44,8 @@ extern TCGv_ptr cpu_env;
>   * emitting unreachable code at the end of the TB in the A64 decoder
>   */
>  #define DISAS_EXC 6
> +/* WFE */
> +#define DISAS_WFE 7
>
>  #ifdef TARGET_AARCH64
>  void a64_translate_init(void);
> --
> 1.9.0
>
diff mbox

Patch

diff --git a/include/exec/cpu-defs.h b/include/exec/cpu-defs.h
index 01cd8c7..66a3d46 100644
--- a/include/exec/cpu-defs.h
+++ b/include/exec/cpu-defs.h
@@ -59,6 +59,7 @@  typedef uint64_t target_ulong;
 #define EXCP_HLT        0x10001 /* hlt instruction reached */
 #define EXCP_DEBUG      0x10002 /* cpu stopped after a breakpoint or singlestep */
 #define EXCP_HALTED     0x10003 /* cpu is halted (waiting for external event) */
+#define EXCP_YIELD      0x10004 /* cpu wants to yield timeslice to another */
 
 #define TB_JMP_CACHE_BITS 12
 #define TB_JMP_CACHE_SIZE (1 << TB_JMP_CACHE_BITS)
diff --git a/target-arm/helper.h b/target-arm/helper.h
index 19bd620..118b425 100644
--- a/target-arm/helper.h
+++ b/target-arm/helper.h
@@ -50,6 +50,7 @@  DEF_HELPER_FLAGS_3(sel_flags, TCG_CALL_NO_RWG_SE,
                    i32, i32, i32, i32)
 DEF_HELPER_2(exception, void, env, i32)
 DEF_HELPER_1(wfi, void, env)
+DEF_HELPER_1(wfe, void, env)
 
 DEF_HELPER_3(cpsr_write, void, env, i32, i32)
 DEF_HELPER_1(cpsr_read, i32, env)
diff --git a/target-arm/op_helper.c b/target-arm/op_helper.c
index eb0fccd..f0e2889 100644
--- a/target-arm/op_helper.c
+++ b/target-arm/op_helper.c
@@ -225,6 +225,15 @@  void HELPER(wfi)(CPUARMState *env)
     cpu_loop_exit(env);
 }
 
+void HELPER(wfe)(CPUARMState *env)
+{
+    /* Don't actually halt the CPU, just yield back to top
+     * level loop
+     */
+    env->exception_index = EXCP_YIELD;
+    cpu_loop_exit(env);
+}
+
 void HELPER(exception)(CPUARMState *env, uint32_t excp)
 {
     env->exception_index = excp;
diff --git a/target-arm/translate.c b/target-arm/translate.c
index 6ccf0ba..0c30e5c 100644
--- a/target-arm/translate.c
+++ b/target-arm/translate.c
@@ -3939,6 +3939,9 @@  static void gen_nop_hint(DisasContext *s, int val)
         s->is_jmp = DISAS_WFI;
         break;
     case 2: /* wfe */
+        gen_set_pc_im(s, s->pc);
+        s->is_jmp = DISAS_WFE;
+        break;
     case 4: /* sev */
     case 5: /* sevl */
         /* TODO: Implement SEV, SEVL and WFE.  May help SMP performance.  */
@@ -10801,6 +10804,9 @@  static inline void gen_intermediate_code_internal(ARMCPU *cpu,
         case DISAS_WFI:
             gen_helper_wfi(cpu_env);
             break;
+        case DISAS_WFE:
+            gen_helper_wfe(cpu_env);
+            break;
         case DISAS_SWI:
             gen_exception(EXCP_SWI);
             break;
diff --git a/target-arm/translate.h b/target-arm/translate.h
index 67da699..2f491f9 100644
--- a/target-arm/translate.h
+++ b/target-arm/translate.h
@@ -44,6 +44,8 @@  extern TCGv_ptr cpu_env;
  * emitting unreachable code at the end of the TB in the A64 decoder
  */
 #define DISAS_EXC 6
+/* WFE */
+#define DISAS_WFE 7
 
 #ifdef TARGET_AARCH64
 void a64_translate_init(void);