Message ID | 20250216231012.2808572-123-richard.henderson@linaro.org |
---|---|
State | New |
Headers | show |
Series | tcg: Convert to TCGOutOp structures | expand |
Ok, this one definitely caught my eye. :) On 2/17/25 00:09, Richard Henderson wrote: > + tcg_gen_add_i32(t0, a, b); > + tcg_gen_setcond_i32(TCG_COND_LTU, t1, t0, a); Compare against b instead? If there's an immediate (which could even be zero) it is in b. > + tcg_gen_add_i32(r, t0, ci); > + tcg_gen_setcond_i32(TCG_COND_LTU, t0, r, t0); > + tcg_gen_or_i32(co, t0, t1); setcond + or can become a movcond: /* co = t1 | (r < t0) */ tcg_gen_movcond_i32(TCG_COND_LTU, co, r, t0, tcg_constant_i32(1), t1); > + TCGv_i64 t0 = tcg_temp_ebb_new_i64(); > + TCGv_i64 t1 = tcg_temp_ebb_new_i64(); > + > + tcg_gen_add_i64(t0, a, b); > + tcg_gen_setcond_i64(TCG_COND_LTU, t1, t0, a); > + tcg_gen_add_i64(r, t0, ci); > + tcg_gen_setcond_i64(TCG_COND_LTU, t0, r, t0); > + tcg_gen_or_i64(co, t0, t1); The same two observations as above apply here, of course. > + tcg_temp_free_i64(t0); > + tcg_temp_free_i64(t1); > + } > + } else { > + if (tcg_op_supported(INDEX_op_addci, TCG_TYPE_I32, 0)) { > + TCGv_i32 discard = tcg_temp_ebb_new_i32(); > + TCGv_i32 zero = tcg_constant_i32(0); > + TCGv_i32 mone = tcg_constant_i32(-1); > + > + tcg_gen_op3_i32(INDEX_op_addco, discard, TCGV_LOW(ci), mone); > + tcg_gen_op3_i32(INDEX_op_addcio, discard, TCGV_HIGH(ci), mone); This addcio is unnecessary/incorrect. I think you assume that TCGV_HIGH(ci) == 0, since that's what you set it to below, and in that case you can remove it. But if you really wanted to do it, it should go... > + tcg_gen_op3_i32(INDEX_op_addcio, TCGV_LOW(r), > + TCGV_LOW(a), TCGV_LOW(b)); ... here, after the low word is all set, to "OR" the input high-carry into the output low-carry. But again, I think it's not necessary. 
> + tcg_gen_op3_i32(INDEX_op_addcio, TCGV_HIGH(r), > + TCGV_HIGH(a), TCGV_HIGH(b)); > + tcg_gen_op3_i32(INDEX_op_addci, TCGV_LOW(co), zero, zero); > + tcg_temp_free_i32(discard); > + } else { > + TCGv_i32 t0 = tcg_temp_ebb_new_i32(); > + TCGv_i32 c0 = tcg_temp_ebb_new_i32(); > + TCGv_i32 c1 = tcg_temp_ebb_new_i32(); > + > + tcg_gen_or_i32(c1, TCGV_LOW(ci), TCGV_HIGH(ci)); > + tcg_gen_setcondi_i32(TCG_COND_NE, c1, c1, 0); Likewise, this shouldn't be needed and you can just add TCGV_LOW(ci) below. > + tcg_gen_add_i32(t0, TCGV_LOW(a), TCGV_LOW(b)); > + tcg_gen_setcond_i32(TCG_COND_LTU, c0, t0, TCGV_LOW(a)); Here you can also change a to b. > + tcg_gen_add_i32(TCGV_LOW(r), t0, c1); > + tcg_gen_setcond_i32(TCG_COND_LTU, c1, TCGV_LOW(r), c1); > + tcg_gen_or_i32(c1, c1, c0); /* c1 = c0 | (r < ci) */ tcg_gen_movcond_i32(TCG_COND_LTU, c1, TCGV_LOW(r), TCGV_LOW(ci), tcg_constant_i32(1), c0); > + tcg_gen_add_i32(t0, TCGV_HIGH(a), TCGV_HIGH(b)); > + tcg_gen_setcond_i32(TCG_COND_LTU, c0, t0, TCGV_HIGH(a)); Change a to b. > + tcg_gen_add_i32(TCGV_HIGH(r), t0, c1); > + tcg_gen_setcond_i32(TCG_COND_LTU, c1, TCGV_HIGH(r), c1); > + tcg_gen_or_i32(TCGV_LOW(co), c0, c1); + /* c1 = c0 | (r < c1) */ + tcg_gen_movcond_i32(TCG_COND_LTU, c1, TCGV_HIGH(r), c1, tcg_constant_i32(1), c0); > + tcg_temp_free_i32(t0); > + tcg_temp_free_i32(c0); > + tcg_temp_free_i32(c1); > + } > + tcg_gen_movi_i32(TCGV_HIGH(co), 0); > + } > +} > + > void tcg_gen_sub2_i64(TCGv_i64 rl, TCGv_i64 rh, TCGv_i64 al, > TCGv_i64 ah, TCGv_i64 bl, TCGv_i64 bh) > {
On 2/21/25 07:41, Paolo Bonzini wrote: > Ok, this one definitely caught my eye. :) > > On 2/17/25 00:09, Richard Henderson wrote: >> + tcg_gen_add_i32(t0, a, b); >> + tcg_gen_setcond_i32(TCG_COND_LTU, t1, t0, a); > > Compare against b instead? If there's an immediate (which could even be > zero) it is there. Sure. > >> + tcg_gen_add_i32(r, t0, ci); >> + tcg_gen_setcond_i32(TCG_COND_LTU, t0, r, t0); >> + tcg_gen_or_i32(co, t0, t1); > > setcond + or can become a movcond: For the set of hosts that require this support (mips, riscv, loongarch), this isn't really an improvement -- we'll end up emitting a setcond LTU anyway and using that to feed movcond NE instruction. >> + if (tcg_op_supported(INDEX_op_addci, TCG_TYPE_I32, 0)) { >> + TCGv_i32 discard = tcg_temp_ebb_new_i32(); >> + TCGv_i32 zero = tcg_constant_i32(0); >> + TCGv_i32 mone = tcg_constant_i32(-1); >> + >> + tcg_gen_op3_i32(INDEX_op_addco, discard, TCGV_LOW(ci), mone); >> + tcg_gen_op3_i32(INDEX_op_addcio, discard, TCGV_HIGH(ci), mone); > > This addcio is unnecessary/incorrect. I think you assume that TCGV_HIGH(ci) = 0, > since that's what you set it below, and then this you can remove it. I am *not* assuming TCGV_HIGH(ci) == 0. I briefly thought about ignoring the high part, because "of course" carry-in is just the low bit. However, the code emitted for a 64-bit host will not ignore the upper 32 bits, so I thought it better to not intentionally change semantics. >> + } else { >> + TCGv_i32 t0 = tcg_temp_ebb_new_i32(); >> + TCGv_i32 c0 = tcg_temp_ebb_new_i32(); >> + TCGv_i32 c1 = tcg_temp_ebb_new_i32(); >> + >> + tcg_gen_or_i32(c1, TCGV_LOW(ci), TCGV_HIGH(ci)); >> + tcg_gen_setcondi_i32(TCG_COND_NE, c1, c1, 0); > > Likewise, this shouldn't be needed and you can just add TCGV_LOW(ci) > below. Again, this is about keeping the semantics the same as x86_64, where a non-zero ci is treated as a single carry-in bit. r~
diff --git a/include/tcg/tcg-op-common.h b/include/tcg/tcg-op-common.h index 009e2778c5..b439bdb385 100644 --- a/include/tcg/tcg-op-common.h +++ b/include/tcg/tcg-op-common.h @@ -135,6 +135,8 @@ void tcg_gen_add2_i32(TCGv_i32 rl, TCGv_i32 rh, TCGv_i32 al, TCGv_i32 ah, TCGv_i32 bl, TCGv_i32 bh); void tcg_gen_sub2_i32(TCGv_i32 rl, TCGv_i32 rh, TCGv_i32 al, TCGv_i32 ah, TCGv_i32 bl, TCGv_i32 bh); +void tcg_gen_addcio_i32(TCGv_i32 r, TCGv_i32 co, + TCGv_i32 a, TCGv_i32 b, TCGv_i32 ci); void tcg_gen_mulu2_i32(TCGv_i32 rl, TCGv_i32 rh, TCGv_i32 arg1, TCGv_i32 arg2); void tcg_gen_muls2_i32(TCGv_i32 rl, TCGv_i32 rh, TCGv_i32 arg1, TCGv_i32 arg2); void tcg_gen_mulsu2_i32(TCGv_i32 rl, TCGv_i32 rh, TCGv_i32 arg1, TCGv_i32 arg2); @@ -238,6 +240,8 @@ void tcg_gen_add2_i64(TCGv_i64 rl, TCGv_i64 rh, TCGv_i64 al, TCGv_i64 ah, TCGv_i64 bl, TCGv_i64 bh); void tcg_gen_sub2_i64(TCGv_i64 rl, TCGv_i64 rh, TCGv_i64 al, TCGv_i64 ah, TCGv_i64 bl, TCGv_i64 bh); +void tcg_gen_addcio_i64(TCGv_i64 r, TCGv_i64 co, + TCGv_i64 a, TCGv_i64 b, TCGv_i64 ci); void tcg_gen_mulu2_i64(TCGv_i64 rl, TCGv_i64 rh, TCGv_i64 arg1, TCGv_i64 arg2); void tcg_gen_muls2_i64(TCGv_i64 rl, TCGv_i64 rh, TCGv_i64 arg1, TCGv_i64 arg2); void tcg_gen_mulsu2_i64(TCGv_i64 rl, TCGv_i64 rh, TCGv_i64 arg1, TCGv_i64 arg2); diff --git a/include/tcg/tcg-op.h b/include/tcg/tcg-op.h index a02850583b..44914e9326 100644 --- a/include/tcg/tcg-op.h +++ b/include/tcg/tcg-op.h @@ -252,6 +252,7 @@ DEF_ATOMIC2(tcg_gen_atomic_umax_fetch, i64) #define tcg_gen_movcond_tl tcg_gen_movcond_i64 #define tcg_gen_add2_tl tcg_gen_add2_i64 #define tcg_gen_sub2_tl tcg_gen_sub2_i64 +#define tcg_gen_addcio_tl tcg_gen_addcio_i64 #define tcg_gen_mulu2_tl tcg_gen_mulu2_i64 #define tcg_gen_muls2_tl tcg_gen_muls2_i64 #define tcg_gen_mulsu2_tl tcg_gen_mulsu2_i64 @@ -370,6 +371,7 @@ DEF_ATOMIC2(tcg_gen_atomic_umax_fetch, i64) #define tcg_gen_movcond_tl tcg_gen_movcond_i32 #define tcg_gen_add2_tl tcg_gen_add2_i32 #define tcg_gen_sub2_tl tcg_gen_sub2_i32 +#define 
tcg_gen_addcio_tl tcg_gen_addcio_i32 #define tcg_gen_mulu2_tl tcg_gen_mulu2_i32 #define tcg_gen_muls2_tl tcg_gen_muls2_i32 #define tcg_gen_mulsu2_tl tcg_gen_mulsu2_i32 diff --git a/tcg/tcg-op.c b/tcg/tcg-op.c index 447b0ebacd..b0a29278ab 100644 --- a/tcg/tcg-op.c +++ b/tcg/tcg-op.c @@ -1123,6 +1123,33 @@ void tcg_gen_add2_i32(TCGv_i32 rl, TCGv_i32 rh, TCGv_i32 al, } } +void tcg_gen_addcio_i32(TCGv_i32 r, TCGv_i32 co, + TCGv_i32 a, TCGv_i32 b, TCGv_i32 ci) +{ + if (tcg_op_supported(INDEX_op_addci, TCG_TYPE_I32, 0)) { + TCGv_i32 t0 = tcg_temp_ebb_new_i32(); + TCGv_i32 zero = tcg_constant_i32(0); + TCGv_i32 mone = tcg_constant_i32(-1); + + tcg_gen_op3_i32(INDEX_op_addco, t0, ci, mone); + tcg_gen_op3_i32(INDEX_op_addcio, r, a, b); + tcg_gen_op3_i32(INDEX_op_addci, co, zero, zero); + tcg_temp_free_i32(t0); + } else { + TCGv_i32 t0 = tcg_temp_ebb_new_i32(); + TCGv_i32 t1 = tcg_temp_ebb_new_i32(); + + tcg_gen_add_i32(t0, a, b); + tcg_gen_setcond_i32(TCG_COND_LTU, t1, t0, a); + tcg_gen_add_i32(r, t0, ci); + tcg_gen_setcond_i32(TCG_COND_LTU, t0, r, t0); + tcg_gen_or_i32(co, t0, t1); + + tcg_temp_free_i32(t0); + tcg_temp_free_i32(t1); + } +} + void tcg_gen_sub2_i32(TCGv_i32 rl, TCGv_i32 rh, TCGv_i32 al, TCGv_i32 ah, TCGv_i32 bl, TCGv_i32 bh) { @@ -2868,6 +2895,74 @@ void tcg_gen_add2_i64(TCGv_i64 rl, TCGv_i64 rh, TCGv_i64 al, } } +void tcg_gen_addcio_i64(TCGv_i64 r, TCGv_i64 co, + TCGv_i64 a, TCGv_i64 b, TCGv_i64 ci) +{ + if (TCG_TARGET_REG_BITS == 64) { + if (tcg_op_supported(INDEX_op_addci, TCG_TYPE_I64, 0)) { + TCGv_i64 discard = tcg_temp_ebb_new_i64(); + TCGv_i64 zero = tcg_constant_i64(0); + TCGv_i64 mone = tcg_constant_i64(-1); + + tcg_gen_op3_i64(INDEX_op_addco, discard, ci, mone); + tcg_gen_op3_i64(INDEX_op_addcio, r, a, b); + tcg_gen_op3_i64(INDEX_op_addci, co, zero, zero); + tcg_temp_free_i64(discard); + } else { + TCGv_i64 t0 = tcg_temp_ebb_new_i64(); + TCGv_i64 t1 = tcg_temp_ebb_new_i64(); + + tcg_gen_add_i64(t0, a, b); + tcg_gen_setcond_i64(TCG_COND_LTU, t1, t0, 
a); + tcg_gen_add_i64(r, t0, ci); + tcg_gen_setcond_i64(TCG_COND_LTU, t0, r, t0); + tcg_gen_or_i64(co, t0, t1); + + tcg_temp_free_i64(t0); + tcg_temp_free_i64(t1); + } + } else { + if (tcg_op_supported(INDEX_op_addci, TCG_TYPE_I32, 0)) { + TCGv_i32 discard = tcg_temp_ebb_new_i32(); + TCGv_i32 zero = tcg_constant_i32(0); + TCGv_i32 mone = tcg_constant_i32(-1); + + tcg_gen_op3_i32(INDEX_op_addco, discard, TCGV_LOW(ci), mone); + tcg_gen_op3_i32(INDEX_op_addcio, discard, TCGV_HIGH(ci), mone); + tcg_gen_op3_i32(INDEX_op_addcio, TCGV_LOW(r), + TCGV_LOW(a), TCGV_LOW(b)); + tcg_gen_op3_i32(INDEX_op_addcio, TCGV_HIGH(r), + TCGV_HIGH(a), TCGV_HIGH(b)); + tcg_gen_op3_i32(INDEX_op_addci, TCGV_LOW(co), zero, zero); + tcg_temp_free_i32(discard); + } else { + TCGv_i32 t0 = tcg_temp_ebb_new_i32(); + TCGv_i32 c0 = tcg_temp_ebb_new_i32(); + TCGv_i32 c1 = tcg_temp_ebb_new_i32(); + + tcg_gen_or_i32(c1, TCGV_LOW(ci), TCGV_HIGH(ci)); + tcg_gen_setcondi_i32(TCG_COND_NE, c1, c1, 0); + + tcg_gen_add_i32(t0, TCGV_LOW(a), TCGV_LOW(b)); + tcg_gen_setcond_i32(TCG_COND_LTU, c0, t0, TCGV_LOW(a)); + tcg_gen_add_i32(TCGV_LOW(r), t0, c1); + tcg_gen_setcond_i32(TCG_COND_LTU, c1, TCGV_LOW(r), c1); + tcg_gen_or_i32(c1, c1, c0); + + tcg_gen_add_i32(t0, TCGV_HIGH(a), TCGV_HIGH(b)); + tcg_gen_setcond_i32(TCG_COND_LTU, c0, t0, TCGV_HIGH(a)); + tcg_gen_add_i32(TCGV_HIGH(r), t0, c1); + tcg_gen_setcond_i32(TCG_COND_LTU, c1, TCGV_HIGH(r), c1); + tcg_gen_or_i32(TCGV_LOW(co), c0, c1); + + tcg_temp_free_i32(t0); + tcg_temp_free_i32(c0); + tcg_temp_free_i32(c1); + } + tcg_gen_movi_i32(TCGV_HIGH(co), 0); + } +} + void tcg_gen_sub2_i64(TCGv_i64 rl, TCGv_i64 rh, TCGv_i64 al, TCGv_i64 ah, TCGv_i64 bl, TCGv_i64 bh) {
Create a function for performing an add with carry-in and producing carry out. The carry-out result is boolean. Signed-off-by: Richard Henderson <richard.henderson@linaro.org> --- include/tcg/tcg-op-common.h | 4 ++ include/tcg/tcg-op.h | 2 + tcg/tcg-op.c | 95 +++++++++++++++++++++++++++++++++++++ 3 files changed, 101 insertions(+)