plugins: optimize cpu_index code generation

Message ID 20241126190203.3094635-1-pierrick.bouvier@linaro.org
State New
Series plugins: optimize cpu_index code generation

Commit Message

Pierrick Bouvier Nov. 26, 2024, 7:02 p.m. UTC
When running with a single vcpu, we can return a constant instead of a
load when accessing cpu_index.
A side effect is that all tcg operations using it are optimized, most
notably scoreboard access.
When running a simple loop in user-mode, the speedup is around 20%.

Signed-off-by: Pierrick Bouvier <pierrick.bouvier@linaro.org>
---
 accel/tcg/plugin-gen.c |  7 +++++++
 plugins/core.c         | 13 +++++++++++++
 2 files changed, 20 insertions(+)
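
For context, the scoreboard fast path this series targets is exercised by any
plugin that uses per-vcpu inline counters. Below is a minimal sketch of such a
plugin, written against the public API in qemu-plugin.h; it is illustrative
only and not part of this series:

/*
 * Minimal sketch: count TB executions with a per-vcpu scoreboard.
 * Illustrative only; plugin structure and names are not from the patch.
 */
#include <inttypes.h>
#include <stdio.h>
#include <qemu-plugin.h>

QEMU_PLUGIN_EXPORT int qemu_plugin_version = QEMU_PLUGIN_VERSION;

static struct qemu_plugin_scoreboard *score;
static qemu_plugin_u64 tb_count;

static void vcpu_tb_trans(qemu_plugin_id_t id, struct qemu_plugin_tb *tb)
{
    /*
     * One inline add per TB. The scoreboard entry address is computed
     * from cpu_index at translation time, so with a single vcpu the whole
     * index computation can fold to a constant.
     */
    qemu_plugin_register_vcpu_tb_exec_inline_per_vcpu(
        tb, QEMU_PLUGIN_INLINE_ADD_U64, tb_count, 1);
}

static void plugin_exit(qemu_plugin_id_t id, void *udata)
{
    char buf[64];
    snprintf(buf, sizeof(buf), "tb executions: %" PRIu64 "\n",
             qemu_plugin_u64_sum(tb_count));
    qemu_plugin_outs(buf);
    qemu_plugin_scoreboard_free(score);
}

QEMU_PLUGIN_EXPORT int qemu_plugin_install(qemu_plugin_id_t id,
                                           const qemu_info_t *info,
                                           int argc, char **argv)
{
    score = qemu_plugin_scoreboard_new(sizeof(uint64_t));
    tb_count = qemu_plugin_scoreboard_u64(score);
    qemu_plugin_register_vcpu_tb_trans_cb(id, vcpu_tb_trans);
    qemu_plugin_register_atexit_cb(id, plugin_exit, NULL);
    return 0;
}

With num_vcpus == 1, the per-TB load of cpu_index and the index arithmetic
feeding tb_count disappear from the generated code, which is where the ~20%
speedup on a tight user-mode loop comes from.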

Comments

Pierrick Bouvier Nov. 26, 2024, 7:02 p.m. UTC | #1
On 11/26/24 11:02, Pierrick Bouvier wrote:
> When running with a single vcpu, we can return a constant instead of a
> load when accessing cpu_index.
> [...]

This patch is for 10.0.
Richard Henderson Nov. 27, 2024, 5:53 p.m. UTC | #2
On 11/26/24 13:02, Pierrick Bouvier wrote:
> @@ -266,6 +266,19 @@ static void qemu_plugin_vcpu_init__async(CPUState *cpu, run_on_cpu_data unused)
>   
>       assert(cpu->cpu_index != UNASSIGNED_CPU_INDEX);
>       qemu_rec_mutex_lock(&plugin.lock);
> +
> +    /*
> +     * We want to flush TBs when a second cpu appears.
> +     * When generating plugin code, we optimize cpu_index for num_vcpus == 1.
> +     */
> +    if (plugin.num_vcpus == 1) {
> +        qemu_rec_mutex_unlock(&plugin.lock);
> +        start_exclusive();
> +        qemu_rec_mutex_lock(&plugin.lock);
> +        tb_flush(cpu);
> +        end_exclusive();
> +    }

We already did this when creating the new thread.
Though we're using slightly different tests:

         /*
          * If this is our first additional thread, we need to ensure we
          * generate code for parallel execution and flush old translations.
          * Do this now so that the copy gets CF_PARALLEL too.
          */
         if (!tcg_cflags_has(cpu, CF_PARALLEL)) {
             tcg_cflags_set(cpu, CF_PARALLEL);
             tb_flush(cpu);
         }


r~
Pierrick Bouvier Nov. 27, 2024, 5:57 p.m. UTC | #3
Hi Richard,

On 11/27/24 09:53, Richard Henderson wrote:
> On 11/26/24 13:02, Pierrick Bouvier wrote:
>> @@ -266,6 +266,19 @@ static void qemu_plugin_vcpu_init__async(CPUState *cpu, run_on_cpu_data unused)
>>    
>>        assert(cpu->cpu_index != UNASSIGNED_CPU_INDEX);
>>        qemu_rec_mutex_lock(&plugin.lock);
>> +
>> +    /*
>> +     * We want to flush TBs when a second cpu appears.
>> +     * When generating plugin code, we optimize cpu_index for num_vcpus == 1.
>> +     */
>> +    if (plugin.num_vcpus == 1) {
>> +        qemu_rec_mutex_unlock(&plugin.lock);
>> +        start_exclusive();
>> +        qemu_rec_mutex_lock(&plugin.lock);
>> +        tb_flush(cpu);
>> +        end_exclusive();
>> +    }
> 
> We already did this when creating the new thread.
> Though we're using slightly different tests:
> 
>           /*
>            * If this is our first additional thread, we need to ensure we
>            * generate code for parallel execution and flush old translations.
>            * Do this now so that the copy gets CF_PARALLEL too.
>            */
>           if (!tcg_cflags_has(cpu, CF_PARALLEL)) {
>               tcg_cflags_set(cpu, CF_PARALLEL);
>               tb_flush(cpu);
>           }
> 
> 
> r~

I noticed that it was redundant (for user-mode at least), but it seemed too
implicit to rely on.
Also, I didn't observe such a flush in system-mode; does it work the same as
user-mode (regarding the CF_PARALLEL flag)?
Richard Henderson Nov. 27, 2024, 6:27 p.m. UTC | #4
On 11/27/24 11:57, Pierrick Bouvier wrote:
> I noticed that it was redundant (for user-mode at least), but it seemed too
> implicit to rely on.
> Also, I didn't observe such a flush in system-mode; does it work the same as
> user-mode (regarding the CF_PARALLEL flag)?

Yes, we set CF_PARALLEL for system mode too.  We do it early, because we can tell the 1 vs 
many cpu count from the command-line.  Thus no flush required.


r~
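
For reference, the early system-mode setup described above looks roughly like
the sketch below, modeled on accel/tcg/tcg-accel-ops*.c. The internal helper
tcg_cpu_init_cflags() is real; the surrounding function is paraphrased and
exact details may differ by QEMU version:

/*
 * Paraphrased sketch (not part of this series): system-mode MTTCG decides
 * CF_PARALLEL from the -smp configuration when each vcpu thread starts,
 * before any translation happens, so no later tb_flush() is needed.
 */
static void sketch_start_vcpu_thread(CPUState *cpu, MachineState *ms)
{
    /* parallel iff more than one vcpu can ever run */
    tcg_cpu_init_cflags(cpu, ms->smp.max_cpus > 1);
    /* ... create and start the per-vcpu thread ... */
}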
Pierrick Bouvier Nov. 27, 2024, 6:50 p.m. UTC | #5
On 11/27/24 10:27, Richard Henderson wrote:
> On 11/27/24 11:57, Pierrick Bouvier wrote:
>> I noticed that it was redundant (for user-mode at least), but it seemed too
>> implicit to rely on.
>> Also, I didn't observe such a flush in system-mode; does it work the same as
>> user-mode (regarding the CF_PARALLEL flag)?
> 
> Yes, we set CF_PARALLEL for system mode too.  We do it early, because we can tell the 1 vs
> many cpu count from the command-line.  Thus no flush required.
> 
> 
> r~

Yes, I see that now: we boot directly with the given number of vcpus, and
they are all initialized before any code is generated.

I'll remove the flush part in v2 then.
Pierrick Bouvier Nov. 28, 2024, 8:43 p.m. UTC | #6
On 11/27/24 09:53, Richard Henderson wrote:
> On 11/26/24 13:02, Pierrick Bouvier wrote:
>> @@ -266,6 +266,19 @@ static void qemu_plugin_vcpu_init__async(CPUState *cpu, run_on_cpu_data unused)
>>    
>>        assert(cpu->cpu_index != UNASSIGNED_CPU_INDEX);
>>        qemu_rec_mutex_lock(&plugin.lock);
>> +
>> +    /*
>> +     * We want to flush TBs when a second cpu appears.
>> +     * When generating plugin code, we optimize cpu_index for num_vcpus == 1.
>> +     */
>> +    if (plugin.num_vcpus == 1) {
>> +        qemu_rec_mutex_unlock(&plugin.lock);
>> +        start_exclusive();
>> +        qemu_rec_mutex_lock(&plugin.lock);
>> +        tb_flush(cpu);
>> +        end_exclusive();
>> +    }
> 
> We already did this when creating the new thread.
> Though we're using slightly different tests:
> 
>           /*
>            * If this is our first additional thread, we need to ensure we
>            * generate code for parallel execution and flush old translations.
>            * Do this now so that the copy gets CF_PARALLEL too.
>            */
>           if (!tcg_cflags_has(cpu, CF_PARALLEL)) {
>               tcg_cflags_set(cpu, CF_PARALLEL);
>               tb_flush(cpu);
>           }
> 

After removing the explicit flush and relying instead on the flush done when
CF_PARALLEL is set, I ran into random errors in the values expected by the
'inline' plugin, when running a program that spawns multiple threads.

It seems that, when several threads are spawned at once, we may still execute
old code before the flush takes effect.

> 
> r~
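
To make that failure mode concrete: the 'inline' plugin effectively compares a
counter bumped by an inline op, whose scoreboard slot is chosen from cpu_index
at translation time, against a counter bumped from a regular exec callback,
which sees the real vcpu_index at run time. A loose sketch of that check
(illustrative fragment; registration and setup omitted, and not the actual
contrib/plugins/inline.c code):

#include <glib.h>
#include <qemu-plugin.h>

static qemu_plugin_u64 count_inline; /* bumped by QEMU_PLUGIN_INLINE_ADD_U64 */
static qemu_plugin_u64 count_cb;     /* bumped from the exec callback below */

static void tb_exec_cb(unsigned int vcpu_index, void *udata)
{
    /* run-time path: always uses the real vcpu_index */
    qemu_plugin_u64_add(count_cb, vcpu_index, 1);
}

static void plugin_exit(qemu_plugin_id_t id, void *udata)
{
    for (int i = 0; i < qemu_plugin_num_vcpus(); i++) {
        /*
         * A mismatch here means some stale TB, translated while
         * num_vcpus == 1 with cpu_index folded to 0, ran on another
         * vcpu before the flush landed.
         */
        g_assert(qemu_plugin_u64_get(count_inline, i) ==
                 qemu_plugin_u64_get(count_cb, i));
    }
}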

Patch

diff --git a/accel/tcg/plugin-gen.c b/accel/tcg/plugin-gen.c
index 0f47bfbb489..2eabeecbdcf 100644
--- a/accel/tcg/plugin-gen.c
+++ b/accel/tcg/plugin-gen.c
@@ -102,6 +102,13 @@  static void gen_disable_mem_helper(void)
 
 static TCGv_i32 gen_cpu_index(void)
 {
+    /*
+     * Optimize when we run with a single vcpu. All values derived from
+     * cpu_index, including the scoreboard index, will be constant folded.
+     */
+    if (qemu_plugin_num_vcpus() == 1) {
+        return tcg_constant_i32(0);
+    }
     TCGv_i32 cpu_index = tcg_temp_ebb_new_i32();
     tcg_gen_ld_i32(cpu_index, tcg_env,
                    -offsetof(ArchCPU, env) + offsetof(CPUState, cpu_index));
diff --git a/plugins/core.c b/plugins/core.c
index bb105e8e688..8e32ca5ee08 100644
--- a/plugins/core.c
+++ b/plugins/core.c
@@ -266,6 +266,19 @@  static void qemu_plugin_vcpu_init__async(CPUState *cpu, run_on_cpu_data unused)
 
     assert(cpu->cpu_index != UNASSIGNED_CPU_INDEX);
     qemu_rec_mutex_lock(&plugin.lock);
+
+    /*
+     * We want to flush TBs when a second cpu appears.
+     * When generating plugin code, we optimize cpu_index for num_vcpus == 1.
+     */
+    if (plugin.num_vcpus == 1) {
+        qemu_rec_mutex_unlock(&plugin.lock);
+        start_exclusive();
+        qemu_rec_mutex_lock(&plugin.lock);
+        tb_flush(cpu);
+        end_exclusive();
+    }
+
     plugin.num_vcpus = MAX(plugin.num_vcpus, cpu->cpu_index + 1);
     plugin_cpu_update__locked(&cpu->cpu_index, NULL, NULL);
     success = g_hash_table_insert(plugin.cpu_ht, &cpu->cpu_index,
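
As a closing illustration of why returning tcg_constant_i32(0) pays off, here
is a plain-C analogue of a per-vcpu scoreboard access (illustrative only, not
TCG code): once cpu_index is a known constant 0, the multiply and add fold
away, exactly as a C compiler would fold them here.

#include <stddef.h>
#include <stdint.h>

/*
 * Plain-C analogue of a per-vcpu scoreboard access. With cpu_index known
 * to be the constant 0, the multiply and add fold away and the slot
 * address reduces to `base` at compile time, mirroring what the TCG
 * optimizer now does to the generated code.
 */
static inline uint64_t *scoreboard_slot(uint64_t *base, size_t elem_size,
                                        uint32_t cpu_index)
{
    return (uint64_t *)((char *)base + (size_t)cpu_index * elem_size);
}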