Message ID | 20241210152401.1823648-28-richard.henderson@linaro.org |
---|---|
State | Superseded |
Headers | show |
Series | tcg: Remove in-flight mask data from OptContext | expand |
On 12/10/24 07:23, Richard Henderson wrote: > Be careful not to call fold_masks_zs when the memory operation > is wide enough to require multiple outputs, so split into two > functions: fold_qemu_ld_1reg and fold_qemu_ld_2reg. > > Signed-off-by: Richard Henderson <richard.henderson@linaro.org> > --- > tcg/optimize.c | 28 ++++++++++++++++++++++------ > 1 file changed, 22 insertions(+), 6 deletions(-) > > diff --git a/tcg/optimize.c b/tcg/optimize.c > index 76ad02d73b..6f41ef5adb 100644 > --- a/tcg/optimize.c > +++ b/tcg/optimize.c > @@ -2092,24 +2092,33 @@ static bool fold_orc(OptContext *ctx, TCGOp *op) > return fold_masks_zs(ctx, op, -1, s_mask); > } > > -static bool fold_qemu_ld(OptContext *ctx, TCGOp *op) > +static bool fold_qemu_ld_1reg(OptContext *ctx, TCGOp *op) > { > const TCGOpDef *def = &tcg_op_defs[op->opc]; > MemOpIdx oi = op->args[def->nb_oargs + def->nb_iargs]; > MemOp mop = get_memop(oi); > int width = 8 * memop_size(mop); > + uint64_t z_mask = -1, s_mask = 0; > > if (width < 64) { > - ctx->s_mask = MAKE_64BIT_MASK(width, 64 - width); > + s_mask = MAKE_64BIT_MASK(width, 64 - width); > if (!(mop & MO_SIGN)) { > - ctx->z_mask = MAKE_64BIT_MASK(0, width); > - ctx->s_mask <<= 1; > + z_mask = MAKE_64BIT_MASK(0, width); > + s_mask <<= 1; > } > } > > /* Opcodes that touch guest memory stop the mb optimization. */ > ctx->prev_mb = NULL; > - return false; > + > + return fold_masks_zs(ctx, op, z_mask, s_mask); > +} > + > +static bool fold_qemu_ld_2reg(OptContext *ctx, TCGOp *op) > +{ > + /* Opcodes that touch guest memory stop the mb optimization. 
*/ > + ctx->prev_mb = NULL; > + return finish_folding(ctx, op); > } > > static bool fold_qemu_st(OptContext *ctx, TCGOp *op) > @@ -3001,11 +3010,18 @@ void tcg_optimize(TCGContext *s) > break; > case INDEX_op_qemu_ld_a32_i32: > case INDEX_op_qemu_ld_a64_i32: > + done = fold_qemu_ld_1reg(&ctx, op); > + break; > case INDEX_op_qemu_ld_a32_i64: > case INDEX_op_qemu_ld_a64_i64: > + if (TCG_TARGET_REG_BITS == 64) { > + done = fold_qemu_ld_1reg(&ctx, op); > + break; > + } > + QEMU_FALLTHROUGH; > case INDEX_op_qemu_ld_a32_i128: > case INDEX_op_qemu_ld_a64_i128: > - done = fold_qemu_ld(&ctx, op); > + done = fold_qemu_ld_2reg(&ctx, op); > break; > case INDEX_op_qemu_st8_a32_i32: > case INDEX_op_qemu_st8_a64_i32: Couldn't we handle this case in fold_masks instead (at least the 64 bits store on 32 bits guest case)?
On 12/17/24 14:35, Pierrick Bouvier wrote:
>> @@ -3001,11 +3010,18 @@ void tcg_optimize(TCGContext *s)
>>              break;
>>          case INDEX_op_qemu_ld_a32_i32:
>>          case INDEX_op_qemu_ld_a64_i32:
>> +            done = fold_qemu_ld_1reg(&ctx, op);
>> +            break;
>>          case INDEX_op_qemu_ld_a32_i64:
>>          case INDEX_op_qemu_ld_a64_i64:
>> +            if (TCG_TARGET_REG_BITS == 64) {
>> +                done = fold_qemu_ld_1reg(&ctx, op);
>> +                break;
>> +            }
>> +            QEMU_FALLTHROUGH;
>>          case INDEX_op_qemu_ld_a32_i128:
>>          case INDEX_op_qemu_ld_a64_i128:
>> -            done = fold_qemu_ld(&ctx, op);
>> +            done = fold_qemu_ld_2reg(&ctx, op);
>>              break;
>>          case INDEX_op_qemu_st8_a32_i32:
>>          case INDEX_op_qemu_st8_a64_i32:
>
> Couldn't we handle this case in fold_masks instead (at least the 64 bits store on 32 bits
> guest case)?

No, not with the assertion that the TCGOp passed to fold_masks has a single output.

r~
diff --git a/tcg/optimize.c b/tcg/optimize.c index 76ad02d73b..6f41ef5adb 100644 --- a/tcg/optimize.c +++ b/tcg/optimize.c @@ -2092,24 +2092,33 @@ static bool fold_orc(OptContext *ctx, TCGOp *op) return fold_masks_zs(ctx, op, -1, s_mask); } -static bool fold_qemu_ld(OptContext *ctx, TCGOp *op) +static bool fold_qemu_ld_1reg(OptContext *ctx, TCGOp *op) { const TCGOpDef *def = &tcg_op_defs[op->opc]; MemOpIdx oi = op->args[def->nb_oargs + def->nb_iargs]; MemOp mop = get_memop(oi); int width = 8 * memop_size(mop); + uint64_t z_mask = -1, s_mask = 0; if (width < 64) { - ctx->s_mask = MAKE_64BIT_MASK(width, 64 - width); + s_mask = MAKE_64BIT_MASK(width, 64 - width); if (!(mop & MO_SIGN)) { - ctx->z_mask = MAKE_64BIT_MASK(0, width); - ctx->s_mask <<= 1; + z_mask = MAKE_64BIT_MASK(0, width); + s_mask <<= 1; } } /* Opcodes that touch guest memory stop the mb optimization. */ ctx->prev_mb = NULL; - return false; + + return fold_masks_zs(ctx, op, z_mask, s_mask); +} + +static bool fold_qemu_ld_2reg(OptContext *ctx, TCGOp *op) +{ + /* Opcodes that touch guest memory stop the mb optimization. */ + ctx->prev_mb = NULL; + return finish_folding(ctx, op); } static bool fold_qemu_st(OptContext *ctx, TCGOp *op) @@ -3001,11 +3010,18 @@ void tcg_optimize(TCGContext *s) break; case INDEX_op_qemu_ld_a32_i32: case INDEX_op_qemu_ld_a64_i32: + done = fold_qemu_ld_1reg(&ctx, op); + break; case INDEX_op_qemu_ld_a32_i64: case INDEX_op_qemu_ld_a64_i64: + if (TCG_TARGET_REG_BITS == 64) { + done = fold_qemu_ld_1reg(&ctx, op); + break; + } + QEMU_FALLTHROUGH; case INDEX_op_qemu_ld_a32_i128: case INDEX_op_qemu_ld_a64_i128: - done = fold_qemu_ld(&ctx, op); + done = fold_qemu_ld_2reg(&ctx, op); break; case INDEX_op_qemu_st8_a32_i32: case INDEX_op_qemu_st8_a64_i32:
Be careful not to call fold_masks_zs when the memory operation
is wide enough to require multiple outputs, so split into two
functions: fold_qemu_ld_1reg and fold_qemu_ld_2reg.

Signed-off-by: Richard Henderson <richard.henderson@linaro.org>

---
 tcg/optimize.c | 28 ++++++++++++++++++++++------
 1 file changed, 22 insertions(+), 6 deletions(-)