
[27/46] tcg/optimize: Use fold_masks_zs in fold_qemu_ld

Message ID 20241210152401.1823648-28-richard.henderson@linaro.org
State Superseded
Series tcg: Remove in-flight mask data from OptContext

Commit Message

Richard Henderson Dec. 10, 2024, 3:23 p.m. UTC
Be careful not to call fold_masks_zs when the memory operation
is wide enough to require multiple outputs, so split into two
functions: fold_qemu_ld_1reg and fold_qemu_ld_2reg.

Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/optimize.c | 28 ++++++++++++++++++++++------
 1 file changed, 22 insertions(+), 6 deletions(-)
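
As a quick illustration of the mask values the new fold_qemu_ld_1reg computes
(not part of the patch; the MAKE_64BIT_MASK below is assumed to match QEMU's
macro from include/qemu/bitops.h), here is a standalone program checking the
16-bit load cases:

    #include <assert.h>
    #include <stdint.h>

    /* Assumed equivalent of QEMU's MAKE_64BIT_MASK (include/qemu/bitops.h). */
    #define MAKE_64BIT_MASK(shift, length) \
        (((~0ULL) >> (64 - (length))) << (shift))

    int main(void)
    {
        /* 16-bit zero-extending load (MO_UW): only the low 16 bits may
           be nonzero, and the unsigned path widens s_mask by one bit. */
        uint64_t z_mask = MAKE_64BIT_MASK(0, 16);
        uint64_t s_mask = MAKE_64BIT_MASK(16, 48) << 1;
        assert(z_mask == 0xffffull);
        assert(s_mask == 0xfffffffffffe0000ull);

        /* 16-bit sign-extending load (MO_SW): bits 16..63 replicate
           bit 15, so s_mask covers them and z_mask stays all-ones. */
        z_mask = -1ull;
        s_mask = MAKE_64BIT_MASK(16, 48);
        assert(s_mask == 0xffffffffffff0000ull);

        return 0;
    }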

Comments

Pierrick Bouvier Dec. 17, 2024, 8:35 p.m. UTC | #1
On 12/10/24 07:23, Richard Henderson wrote:
> Be careful not to call fold_masks_zs when the memory operation
> is wide enough to require multiple outputs, so split into two
> functions: fold_qemu_ld_1reg and fold_qemu_ld_2reg.
> 
> Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
> ---
>   tcg/optimize.c | 28 ++++++++++++++++++++++------
>   1 file changed, 22 insertions(+), 6 deletions(-)
> 
> diff --git a/tcg/optimize.c b/tcg/optimize.c
> index 76ad02d73b..6f41ef5adb 100644
> --- a/tcg/optimize.c
> +++ b/tcg/optimize.c
> @@ -2092,24 +2092,33 @@ static bool fold_orc(OptContext *ctx, TCGOp *op)
>       return fold_masks_zs(ctx, op, -1, s_mask);
>   }
>   
> -static bool fold_qemu_ld(OptContext *ctx, TCGOp *op)
> +static bool fold_qemu_ld_1reg(OptContext *ctx, TCGOp *op)
>   {
>       const TCGOpDef *def = &tcg_op_defs[op->opc];
>       MemOpIdx oi = op->args[def->nb_oargs + def->nb_iargs];
>       MemOp mop = get_memop(oi);
>       int width = 8 * memop_size(mop);
> +    uint64_t z_mask = -1, s_mask = 0;
>   
>       if (width < 64) {
> -        ctx->s_mask = MAKE_64BIT_MASK(width, 64 - width);
> +        s_mask = MAKE_64BIT_MASK(width, 64 - width);
>           if (!(mop & MO_SIGN)) {
> -            ctx->z_mask = MAKE_64BIT_MASK(0, width);
> -            ctx->s_mask <<= 1;
> +            z_mask = MAKE_64BIT_MASK(0, width);
> +            s_mask <<= 1;
>           }
>       }
>   
>       /* Opcodes that touch guest memory stop the mb optimization.  */
>       ctx->prev_mb = NULL;
> -    return false;
> +
> +    return fold_masks_zs(ctx, op, z_mask, s_mask);
> +}
> +
> +static bool fold_qemu_ld_2reg(OptContext *ctx, TCGOp *op)
> +{
> +    /* Opcodes that touch guest memory stop the mb optimization.  */
> +    ctx->prev_mb = NULL;
> +    return finish_folding(ctx, op);
>   }
>   
>   static bool fold_qemu_st(OptContext *ctx, TCGOp *op)
> @@ -3001,11 +3010,18 @@ void tcg_optimize(TCGContext *s)
>               break;
>           case INDEX_op_qemu_ld_a32_i32:
>           case INDEX_op_qemu_ld_a64_i32:
> +            done = fold_qemu_ld_1reg(&ctx, op);
> +            break;
>           case INDEX_op_qemu_ld_a32_i64:
>           case INDEX_op_qemu_ld_a64_i64:
> +            if (TCG_TARGET_REG_BITS == 64) {
> +                done = fold_qemu_ld_1reg(&ctx, op);
> +                break;
> +            }
> +            QEMU_FALLTHROUGH;
>           case INDEX_op_qemu_ld_a32_i128:
>           case INDEX_op_qemu_ld_a64_i128:
> -            done = fold_qemu_ld(&ctx, op);
> +            done = fold_qemu_ld_2reg(&ctx, op);
>               break;
>           case INDEX_op_qemu_st8_a32_i32:
>           case INDEX_op_qemu_st8_a64_i32:

Couldn't we handle this case in fold_masks instead (at least the 64-bit load
on a 32-bit host case)?
Richard Henderson Dec. 18, 2024, 3:26 a.m. UTC | #2
On 12/17/24 14:35, Pierrick Bouvier wrote:
>> @@ -3001,11 +3010,18 @@ void tcg_optimize(TCGContext *s)
>>               break;
>>           case INDEX_op_qemu_ld_a32_i32:
>>           case INDEX_op_qemu_ld_a64_i32:
>> +            done = fold_qemu_ld_1reg(&ctx, op);
>> +            break;
>>           case INDEX_op_qemu_ld_a32_i64:
>>           case INDEX_op_qemu_ld_a64_i64:
>> +            if (TCG_TARGET_REG_BITS == 64) {
>> +                done = fold_qemu_ld_1reg(&ctx, op);
>> +                break;
>> +            }
>> +            QEMU_FALLTHROUGH;
>>           case INDEX_op_qemu_ld_a32_i128:
>>           case INDEX_op_qemu_ld_a64_i128:
>> -            done = fold_qemu_ld(&ctx, op);
>> +            done = fold_qemu_ld_2reg(&ctx, op);
>>               break;
>>           case INDEX_op_qemu_st8_a32_i32:
>>           case INDEX_op_qemu_st8_a64_i32:
> 
> Couldn't we handle this case in fold_masks instead (at least the 64-bit load on a 32-bit 
> host case)?

No, not with the assertion that the TCGOp passed to fold_masks has a single output.


r~
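
To illustrate the constraint Richard describes, here is a minimal compilable
sketch (not the actual QEMU code; the struct definitions below are stand-ins
for the real types in tcg/tcg.h):

    #include <assert.h>

    /* Hypothetical stand-ins for the QEMU types, only so the sketch is
       self-contained. */
    typedef struct { int nb_oargs; } TCGOpDef;
    typedef struct { const TCGOpDef *def; } TCGOp;

    /* The guard Richard refers to, paraphrased: the fold_masks helpers
       can record z/s mask info for exactly one output temp, so any op
       with two output registers (i64 on a 32-bit host, or i128) must
       not reach them. */
    static void fold_masks_single_output_guard(const TCGOp *op)
    {
        assert(op->def->nb_oargs == 1);
    }

    int main(void)
    {
        const TCGOpDef one_output = { .nb_oargs = 1 };
        TCGOp ld = { .def = &one_output };
        fold_masks_single_output_guard(&ld);  /* OK: single-output load */
        /* An op with nb_oargs == 2 would trip the assertion, which is
           why fold_qemu_ld_2reg only resets prev_mb and then calls
           finish_folding. */
        return 0;
    }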

Patch

diff --git a/tcg/optimize.c b/tcg/optimize.c
index 76ad02d73b..6f41ef5adb 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -2092,24 +2092,33 @@  static bool fold_orc(OptContext *ctx, TCGOp *op)
     return fold_masks_zs(ctx, op, -1, s_mask);
 }
 
-static bool fold_qemu_ld(OptContext *ctx, TCGOp *op)
+static bool fold_qemu_ld_1reg(OptContext *ctx, TCGOp *op)
 {
     const TCGOpDef *def = &tcg_op_defs[op->opc];
     MemOpIdx oi = op->args[def->nb_oargs + def->nb_iargs];
     MemOp mop = get_memop(oi);
     int width = 8 * memop_size(mop);
+    uint64_t z_mask = -1, s_mask = 0;
 
     if (width < 64) {
-        ctx->s_mask = MAKE_64BIT_MASK(width, 64 - width);
+        s_mask = MAKE_64BIT_MASK(width, 64 - width);
         if (!(mop & MO_SIGN)) {
-            ctx->z_mask = MAKE_64BIT_MASK(0, width);
-            ctx->s_mask <<= 1;
+            z_mask = MAKE_64BIT_MASK(0, width);
+            s_mask <<= 1;
         }
     }
 
     /* Opcodes that touch guest memory stop the mb optimization.  */
     ctx->prev_mb = NULL;
-    return false;
+
+    return fold_masks_zs(ctx, op, z_mask, s_mask);
+}
+
+static bool fold_qemu_ld_2reg(OptContext *ctx, TCGOp *op)
+{
+    /* Opcodes that touch guest memory stop the mb optimization.  */
+    ctx->prev_mb = NULL;
+    return finish_folding(ctx, op);
 }
 
 static bool fold_qemu_st(OptContext *ctx, TCGOp *op)
@@ -3001,11 +3010,18 @@  void tcg_optimize(TCGContext *s)
             break;
         case INDEX_op_qemu_ld_a32_i32:
         case INDEX_op_qemu_ld_a64_i32:
+            done = fold_qemu_ld_1reg(&ctx, op);
+            break;
         case INDEX_op_qemu_ld_a32_i64:
         case INDEX_op_qemu_ld_a64_i64:
+            if (TCG_TARGET_REG_BITS == 64) {
+                done = fold_qemu_ld_1reg(&ctx, op);
+                break;
+            }
+            QEMU_FALLTHROUGH;
         case INDEX_op_qemu_ld_a32_i128:
         case INDEX_op_qemu_ld_a64_i128:
-            done = fold_qemu_ld(&ctx, op);
+            done = fold_qemu_ld_2reg(&ctx, op);
             break;
         case INDEX_op_qemu_st8_a32_i32:
         case INDEX_op_qemu_st8_a64_i32: