
tcg: Special case split barriers before/after load

Message ID: 20220430234534.446733-1-richard.henderson@linaro.org
State: New
Series: tcg: Special case split barriers before/after load

Commit Message

Richard Henderson April 30, 2022, 11:45 p.m. UTC
When st:ld ordering is not required by the guest but ld:st is, we
can place ld:ld+ld:st barriers after loads and st:st barriers before
stores; together these enforce all required orderings.

The st:st barrier is often special-cased by hosts, and is expected
to be more efficient than a full barrier.
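
As a minimal sketch of the resulting placement (illustrative only:
sketch_load_then_store is a hypothetical helper, and this assumes an
i386-like guest on a weakly ordered host whose TCG_TARGET_DEFAULT_MO
does not already subsume these orderings):

  /* Hypothetical helper, not part of the patch. */
  static void sketch_load_then_store(TCGv_i32 val, TCGv addr, TCGArg idx)
  {
      tcg_gen_qemu_ld_i32(val, addr, idx, MO_LEUL);
      /* now emits: qemu_ld; mb(TCG_MO_LD_LD | TCG_MO_LD_ST | TCG_BAR_SC) */

      tcg_gen_qemu_st_i32(val, addr, idx, MO_LEUL);
      /* now emits: mb(TCG_MO_ST_ST | TCG_BAR_SC); qemu_st */

      /* No st:ld barrier is emitted anywhere; TSO guests do not require it. */
  }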

Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---

Redha, I expect this to produce exactly the same barriers as you
did with your 'fix guest memory ordering enforcement' patch.

While this compiles, it does not fix the failures that I see
occasionally on our private GitLab runner.  The standalone
reproducer for this failure is:

  export QTEST_QEMU_BINARY=./qemu-system-i386
  for i in `seq 1 100`; do
    ./tests/qtest/ahci-test > /dev/null &
  done
  wait

About 10 to 15% of the runs will fail with:

ERROR:../src/tests/qtest/ahci-test.c:92:verify_state: assertion failed (ahci_fingerprint == ahci->fingerprint): (0xe0000000 == 0x29228086)

Note that this test never seems to fail unless the system is under
load, hence starting 100 tests at once on my 80-core Neoverse-N1 system.


r~


---
 tcg/tcg-op.c | 55 +++++++++++++++++++++++++++++++++++++++++++++-------
 1 file changed, 48 insertions(+), 7 deletions(-)

Comments

Philippe Mathieu-Daudé May 30, 2022, 3:10 p.m. UTC | #1
Hi Richard,

On 1/5/22 01:45, Richard Henderson wrote:
> [...]
> +typedef enum {
> +    BAR_LD_BEFORE,
> +    BAR_LD_AFTER,
> +    BAR_ST_BEFORE,
> +} ChooseBarrier;
> +
> +static TCGBar choose_barrier(ChooseBarrier which)
> +{
> +#ifdef TCG_GUEST_DEFAULT_MO
> +    const TCGBar guest_mo = TCG_GUEST_DEFAULT_MO;
> +#else
> +    const TCGBar guest_mo = TCG_MO_ALL;
> +#endif
> +    TCGBar ret[3];
> +
> +    if (guest_mo == 0) {
> +        return 0;
> +    }

This part ...:

> +    /*
> +     * Special case for i386 and s390x.  Because store-load is not
> +     * required by the guest, we can split the barriers such that we
> +     * wind up with a store-store barrier, which is expected to be
> +     * quicker on some hosts.
> +     */
> +    if (guest_mo == (TCG_MO_ALL & ~TCG_MO_ST_LD)) {
> +        ret[BAR_LD_BEFORE] = 0;
> +        ret[BAR_LD_AFTER]  = TCG_MO_LD_LD | TCG_MO_LD_ST;
> +        ret[BAR_ST_BEFORE] = TCG_MO_ST_ST;
> +    } else {

... could deserve another patch.

> +        ret[BAR_LD_BEFORE] = (TCG_MO_LD_LD | TCG_MO_ST_LD) & guest_mo;
> +        ret[BAR_ST_BEFORE] = (TCG_MO_LD_ST | TCG_MO_ST_ST) & guest_mo;
> +        ret[BAR_LD_AFTER]  = 0;
> +    }
> +    return ret[which];
> +}
> [...]

Redha, could you test this patch?

Regards,

Phil.
Redha June 7, 2022, 2:01 p.m. UTC | #2
Hi Richard and Philippe,

I was finally able to test the patch.

As expected, the fences generated by TCG with your patch are the same
as those generated with mine.

However, I was not able to reproduce the failure with the ahci-test on
my Arm system (2x 28-core ThunderX2, 4 threads per core).  I ran 500
tests concurrently to load the system, but did not get the assertion
failure.  I did, however, get a different QEMU error 229 times:

  qemu-system-x86_64: Failed to get "write" lock
  Is another process using the image [/tmp/qtest.IksKVs]?

Each time with a different temporary path.  I am not sure whether
that is directly related to the actual test failure.

Also, are we sure the test failure on the GitLab runner is due to
memory ordering issues with regular memory accesses?  Is there a way
to inspect the code generated by TCG for this test, similar to
`-d op,out_asm`?


Redha




Patch

diff --git a/tcg/tcg-op.c b/tcg/tcg-op.c
index 5d48537927..4c568a2592 100644
--- a/tcg/tcg-op.c
+++ b/tcg/tcg-op.c
@@ -2834,9 +2834,6 @@ static void gen_ldst_i64(TCGOpcode opc, TCGv_i64 val, TCGv addr,
 
 static void tcg_gen_req_mo(TCGBar type)
 {
-#ifdef TCG_GUEST_DEFAULT_MO
-    type &= TCG_GUEST_DEFAULT_MO;
-#endif
     type &= ~TCG_TARGET_DEFAULT_MO;
     if (type) {
         tcg_gen_mb(type | TCG_BAR_SC);
@@ -2868,12 +2865,49 @@ static void plugin_gen_mem_callbacks(TCGv vaddr, MemOpIdx oi,
 #endif
 }
 
+typedef enum {
+    BAR_LD_BEFORE,
+    BAR_LD_AFTER,
+    BAR_ST_BEFORE,
+} ChooseBarrier;
+
+static TCGBar choose_barrier(ChooseBarrier which)
+{
+#ifdef TCG_GUEST_DEFAULT_MO
+    const TCGBar guest_mo = TCG_GUEST_DEFAULT_MO;
+#else
+    const TCGBar guest_mo = TCG_MO_ALL;
+#endif
+    TCGBar ret[3];
+
+    if (guest_mo == 0) {
+        return 0;
+    }
+    /*
+     * Special case for i386 and s390x.  Because store-load is not
+     * required by the guest, we can split the barriers such that we
+     * wind up with a store-store barrier, which is expected to be
+     * quicker on some hosts.
+     */
+    if (guest_mo == (TCG_MO_ALL & ~TCG_MO_ST_LD)) {
+        ret[BAR_LD_BEFORE] = 0;
+        ret[BAR_LD_AFTER]  = TCG_MO_LD_LD | TCG_MO_LD_ST;
+        ret[BAR_ST_BEFORE] = TCG_MO_ST_ST;
+    } else {
+        ret[BAR_LD_BEFORE] = (TCG_MO_LD_LD | TCG_MO_ST_LD) & guest_mo;
+        ret[BAR_ST_BEFORE] = (TCG_MO_LD_ST | TCG_MO_ST_ST) & guest_mo;
+        ret[BAR_LD_AFTER]  = 0;
+    }
+    return ret[which];
+}
+
 void tcg_gen_qemu_ld_i32(TCGv_i32 val, TCGv addr, TCGArg idx, MemOp memop)
 {
     MemOp orig_memop;
     MemOpIdx oi;
 
-    tcg_gen_req_mo(TCG_MO_LD_LD | TCG_MO_ST_LD);
+    tcg_gen_req_mo(choose_barrier(BAR_LD_BEFORE));
+
     memop = tcg_canonicalize_memop(memop, 0, 0);
     oi = make_memop_idx(memop, idx);
 
@@ -2904,6 +2938,8 @@ void tcg_gen_qemu_ld_i32(TCGv_i32 val, TCGv addr, TCGArg idx, MemOp memop)
             g_assert_not_reached();
         }
     }
+
+    tcg_gen_req_mo(choose_barrier(BAR_LD_AFTER));
 }
 
 void tcg_gen_qemu_st_i32(TCGv_i32 val, TCGv addr, TCGArg idx, MemOp memop)
@@ -2911,7 +2947,8 @@ void tcg_gen_qemu_st_i32(TCGv_i32 val, TCGv addr, TCGArg idx, MemOp memop)
     TCGv_i32 swap = NULL;
     MemOpIdx oi;
 
-    tcg_gen_req_mo(TCG_MO_LD_ST | TCG_MO_ST_ST);
+    tcg_gen_req_mo(choose_barrier(BAR_ST_BEFORE));
+
     memop = tcg_canonicalize_memop(memop, 0, 1);
     oi = make_memop_idx(memop, idx);
 
@@ -2959,7 +2996,8 @@ void tcg_gen_qemu_ld_i64(TCGv_i64 val, TCGv addr, TCGArg idx, MemOp memop)
         return;
     }
 
-    tcg_gen_req_mo(TCG_MO_LD_LD | TCG_MO_ST_LD);
+    tcg_gen_req_mo(choose_barrier(BAR_LD_BEFORE));
+
     memop = tcg_canonicalize_memop(memop, 1, 0);
     oi = make_memop_idx(memop, idx);
 
@@ -2994,6 +3032,8 @@ void tcg_gen_qemu_ld_i64(TCGv_i64 val, TCGv addr, TCGArg idx, MemOp memop)
             g_assert_not_reached();
         }
     }
+
+    tcg_gen_req_mo(choose_barrier(BAR_LD_AFTER));
 }
 
 void tcg_gen_qemu_st_i64(TCGv_i64 val, TCGv addr, TCGArg idx, MemOp memop)
@@ -3006,7 +3046,8 @@ void tcg_gen_qemu_st_i64(TCGv_i64 val, TCGv addr, TCGArg idx, MemOp memop)
         return;
     }
 
-    tcg_gen_req_mo(TCG_MO_LD_ST | TCG_MO_ST_ST);
+    tcg_gen_req_mo(choose_barrier(BAR_ST_BEFORE));
+
     memop = tcg_canonicalize_memop(memop, 1, 1);
     oi = make_memop_idx(memop, idx);
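
For reference, the barrier selection above can be modeled as a small
standalone program.  This is a sketch with stand-in flag values, not
QEMU's actual TCG_MO_* definitions; it checks only the selection
table, not the emitted code:

  #include <assert.h>

  /* Stand-ins for QEMU's TCG_MO_* flags; the values are illustrative. */
  enum {
      MO_LD_LD = 1, MO_ST_LD = 2, MO_LD_ST = 4, MO_ST_ST = 8,
      MO_ALL   = MO_LD_LD | MO_ST_LD | MO_LD_ST | MO_ST_ST,
  };
  enum { LD_BEFORE, LD_AFTER, ST_BEFORE };

  /* Mirrors choose_barrier() from the patch. */
  static int barrier_for(int guest_mo, int which)
  {
      int ret[3];

      if (guest_mo == 0) {
          return 0;
      }
      if (guest_mo == (MO_ALL & ~MO_ST_LD)) {
          /* TSO-like guest: loads carry ld:ld+ld:st, stores only st:st. */
          ret[LD_BEFORE] = 0;
          ret[LD_AFTER]  = MO_LD_LD | MO_LD_ST;
          ret[ST_BEFORE] = MO_ST_ST;
      } else {
          ret[LD_BEFORE] = (MO_LD_LD | MO_ST_LD) & guest_mo;
          ret[ST_BEFORE] = (MO_LD_ST | MO_ST_ST) & guest_mo;
          ret[LD_AFTER]  = 0;
      }
      return ret[which];
  }

  int main(void)
  {
      int tso = MO_ALL & ~MO_ST_LD;   /* e.g. i386, s390x */

      assert(barrier_for(tso, LD_BEFORE) == 0);
      assert(barrier_for(tso, LD_AFTER)  == (MO_LD_LD | MO_LD_ST));
      assert(barrier_for(tso, ST_BEFORE) == MO_ST_ST);

      /* Sequentially consistent guest: full barriers before each access. */
      assert(barrier_for(MO_ALL, LD_BEFORE) == (MO_LD_LD | MO_ST_LD));
      assert(barrier_for(MO_ALL, ST_BEFORE) == (MO_LD_ST | MO_ST_ST));
      assert(barrier_for(MO_ALL, LD_AFTER)  == 0);
      return 0;
  }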