
[v3,122/162] tcg: Add tcg_gen_addcio_{i32,i64,tl}

Message ID 20250216231012.2808572-123-richard.henderson@linaro.org
State New
Series tcg: Convert to TCGOutOp structures

Commit Message

Richard Henderson Feb. 16, 2025, 11:09 p.m. UTC
Create a function for performing an add with carry-in
and producing a carry-out.  The carry-out result is boolean.

Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 include/tcg/tcg-op-common.h |  4 ++
 include/tcg/tcg-op.h        |  2 +
 tcg/tcg-op.c                | 95 +++++++++++++++++++++++++++++++++++++
 3 files changed, 101 insertions(+)
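
For context, a minimal sketch of how a frontend might use the new helper to
build a wider add out of two limbs.  The function gen_add128 and its argument
names are illustrative only and are not part of this patch:

    /* Sketch: 128-bit add from two 64-bit limbs via tcg_gen_addcio_i64.
     * Assumes the usual tcg/tcg-op.h environment; names are illustrative. */
    static void gen_add128(TCGv_i64 rl, TCGv_i64 rh,
                           TCGv_i64 al, TCGv_i64 ah,
                           TCGv_i64 bl, TCGv_i64 bh)
    {
        TCGv_i64 cin = tcg_constant_i64(0);
        TCGv_i64 cout = tcg_temp_new_i64();
        TCGv_i64 discard = tcg_temp_new_i64();

        /* Low limb: carry-in 0, boolean carry-out into cout. */
        tcg_gen_addcio_i64(rl, cout, al, bl, cin);
        /* High limb: consume the carry; discard the final carry-out. */
        tcg_gen_addcio_i64(rh, discard, ah, bh, cout);

        tcg_temp_free_i64(cout);
        tcg_temp_free_i64(discard);
    }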

Comments

Paolo Bonzini Feb. 21, 2025, 3:41 p.m. UTC | #1
Ok, this one definitely caught my eye. :)

On 2/17/25 00:09, Richard Henderson wrote:
> +        tcg_gen_add_i32(t0, a, b);
> +        tcg_gen_setcond_i32(TCG_COND_LTU, t1, t0, a);

Compare against b instead?  If there's an immediate (which could even be
zero) it is there.
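
(For what it's worth, the two checks agree for unsigned wrap-around: with
t0 = a + b modulo 2^32, the sum wraps exactly when t0 < a, and equally when
t0 < b, so either operand works as the reference.  A standalone illustration,
not part of the patch:)

    #include <assert.h>
    #include <stdint.h>

    /* Illustrative only: unsigned overflow of a + b is detected equally
     * well by comparing the sum against either operand. */
    static void check(uint32_t a, uint32_t b)
    {
        uint32_t t0 = a + b;
        assert((t0 < a) == (t0 < b));
    }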

> +        tcg_gen_add_i32(r, t0, ci);
> +        tcg_gen_setcond_i32(TCG_COND_LTU, t0, r, t0);
> +        tcg_gen_or_i32(co, t0, t1);

setcond + or can become a movcond:

         /* co = t1 | (r < t0) */
         tcg_gen_movcond_i32(TCG_COND_LTU, co, r, t0, tcg_constant_i32(1), t1);

> +            TCGv_i64 t0 = tcg_temp_ebb_new_i64();
> +            TCGv_i64 t1 = tcg_temp_ebb_new_i64();
> +
> +            tcg_gen_add_i64(t0, a, b);
> +            tcg_gen_setcond_i64(TCG_COND_LTU, t1, t0, a);
> +            tcg_gen_add_i64(r, t0, ci);
> +            tcg_gen_setcond_i64(TCG_COND_LTU, t0, r, t0);
> +            tcg_gen_or_i64(co, t0, t1);

The same two observations as above apply here, of course.

> +            tcg_temp_free_i64(t0);
> +            tcg_temp_free_i64(t1);
> +        }
> +    } else {
> +        if (tcg_op_supported(INDEX_op_addci, TCG_TYPE_I32, 0)) {
> +            TCGv_i32 discard = tcg_temp_ebb_new_i32();
> +            TCGv_i32 zero = tcg_constant_i32(0);
> +            TCGv_i32 mone = tcg_constant_i32(-1);
> +
> +            tcg_gen_op3_i32(INDEX_op_addco, discard, TCGV_LOW(ci), mone);
> +            tcg_gen_op3_i32(INDEX_op_addcio, discard, TCGV_HIGH(ci), mone);

This addcio is unnecessary/incorrect.  I think you assume that TCGV_HIGH(ci) = 0,
since that's what you set it to below, in which case you can remove this.

But if you really wanted to do it, it should go...

> +            tcg_gen_op3_i32(INDEX_op_addcio, TCGV_LOW(r),
> +                            TCGV_LOW(a), TCGV_LOW(b));

... here, after the low word is all set, to "OR" the input high-carry into the
output low-carry.  But again, I think it's not necessary.

> +            tcg_gen_op3_i32(INDEX_op_addcio, TCGV_HIGH(r),
> +                            TCGV_HIGH(a), TCGV_HIGH(b));
> +            tcg_gen_op3_i32(INDEX_op_addci, TCGV_LOW(co), zero, zero);
> +            tcg_temp_free_i32(discard);
> +        } else {
> +            TCGv_i32 t0 = tcg_temp_ebb_new_i32();
> +            TCGv_i32 c0 = tcg_temp_ebb_new_i32();
> +            TCGv_i32 c1 = tcg_temp_ebb_new_i32();
> +
> +            tcg_gen_or_i32(c1, TCGV_LOW(ci), TCGV_HIGH(ci));
> +            tcg_gen_setcondi_i32(TCG_COND_NE, c1, c1, 0);

Likewise, this shouldn't be needed and you can just add TCGV_LOW(ci)
below.

> +            tcg_gen_add_i32(t0, TCGV_LOW(a), TCGV_LOW(b));
> +            tcg_gen_setcond_i32(TCG_COND_LTU, c0, t0, TCGV_LOW(a));

Here you can also change a to b.

> +            tcg_gen_add_i32(TCGV_LOW(r), t0, c1);
> +            tcg_gen_setcond_i32(TCG_COND_LTU, c1, TCGV_LOW(r), c1);
> +            tcg_gen_or_i32(c1, c1, c0);

             /* c1 = c0 | (r < ci) */
             tcg_gen_movcond_i32(TCG_COND_LTU, c1, TCGV_LOW(r), TCGV_LOW(ci), tcg_constant_i32(1), c0);

> +            tcg_gen_add_i32(t0, TCGV_HIGH(a), TCGV_HIGH(b));
> +            tcg_gen_setcond_i32(TCG_COND_LTU, c0, t0, TCGV_HIGH(a));

Change a to b.

> +            tcg_gen_add_i32(TCGV_HIGH(r), t0, c1);
> +            tcg_gen_setcond_i32(TCG_COND_LTU, c1, TCGV_HIGH(r), c1);
> +            tcg_gen_or_i32(TCGV_LOW(co), c0, c1);

+            /* c1 = c0 | (r < c1) */
+            tcg_gen_movcond_i32(TCG_COND_LTU, c1, TCGV_HIGH(r), c1, tcg_constant_i32(1), c0);

> +            tcg_temp_free_i32(t0);
> +            tcg_temp_free_i32(c0);
> +            tcg_temp_free_i32(c1);
> +        }
> +        tcg_gen_movi_i32(TCGV_HIGH(co), 0);
> +    }
> +}
> +
>   void tcg_gen_sub2_i64(TCGv_i64 rl, TCGv_i64 rh, TCGv_i64 al,
>                         TCGv_i64 ah, TCGv_i64 bl, TCGv_i64 bh)
>   {
Richard Henderson Feb. 22, 2025, 6:08 p.m. UTC | #2
On 2/21/25 07:41, Paolo Bonzini wrote:
> Ok, this one definitely caught my eye. :)
> 
> On 2/17/25 00:09, Richard Henderson wrote:
>> +        tcg_gen_add_i32(t0, a, b);
>> +        tcg_gen_setcond_i32(TCG_COND_LTU, t1, t0, a);
> 
> Compare against b instead?  If there's an immediate (which could even be
> zero) it is there.

Sure.

> 
>> +        tcg_gen_add_i32(r, t0, ci);
>> +        tcg_gen_setcond_i32(TCG_COND_LTU, t0, r, t0);
>> +        tcg_gen_or_i32(co, t0, t1);
> 
> setcond + or can become a movcond:

For the set of hosts that require this support (mips, riscv, loongarch), this isn't really 
an improvement -- we'll end up emitting a setcond LTU anyway and using that to feed a 
movcond NE instruction.


>> +        if (tcg_op_supported(INDEX_op_addci, TCG_TYPE_I32, 0)) {
>> +            TCGv_i32 discard = tcg_temp_ebb_new_i32();
>> +            TCGv_i32 zero = tcg_constant_i32(0);
>> +            TCGv_i32 mone = tcg_constant_i32(-1);
>> +
>> +            tcg_gen_op3_i32(INDEX_op_addco, discard, TCGV_LOW(ci), mone);
>> +            tcg_gen_op3_i32(INDEX_op_addcio, discard, TCGV_HIGH(ci), mone);
> 
> This addcio is unnecessary/incorrect.  I think you assume that TCGV_HIGH(ci) = 0,
> since that's what you set it to below, in which case you can remove this.

I am *not* assuming TCGV_HIGH(ci) == 0.

I briefly thought about ignoring the high part, because "of course" carry-in is just the 
low bit.  However, the code emitted for a 64-bit host will not ignore the upper 32 bits, 
so I thought it better to not intentionally change semantics.
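
(The reason the full value matters is how the carry is seeded: adding -1, i.e.
all ones, to ci produces a carry-out exactly when ci is non-zero, with every
bit taken into account.  A standalone illustration of that identity, not part
of the patch:)

    #include <assert.h>
    #include <stdint.h>

    /* Illustrative only: ci + 0xffff...ffff carries out of 64 bits
     * iff ci != 0, which is what "addco discard, ci, -1" relies on. */
    static void check_carry_seed(uint64_t ci)
    {
        uint64_t s = ci + UINT64_MAX;     /* wraps to ci - 1 mod 2^64 */
        int carry = s < ci;               /* unsigned overflow occurred */
        assert(carry == (ci != 0));
    }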


>> +        } else {
>> +            TCGv_i32 t0 = tcg_temp_ebb_new_i32();
>> +            TCGv_i32 c0 = tcg_temp_ebb_new_i32();
>> +            TCGv_i32 c1 = tcg_temp_ebb_new_i32();
>> +
>> +            tcg_gen_or_i32(c1, TCGV_LOW(ci), TCGV_HIGH(ci));
>> +            tcg_gen_setcondi_i32(TCG_COND_NE, c1, c1, 0);
> 
> Likewise, this shouldn't be needed and you can just add TCGV_LOW(ci)
> below.

Again, this is about keeping the semantics the same as x86_64, where a non-zero ci is 
treated as a single carry-in bit.


r~

Patch

diff --git a/include/tcg/tcg-op-common.h b/include/tcg/tcg-op-common.h
index 009e2778c5..b439bdb385 100644
--- a/include/tcg/tcg-op-common.h
+++ b/include/tcg/tcg-op-common.h
@@ -135,6 +135,8 @@  void tcg_gen_add2_i32(TCGv_i32 rl, TCGv_i32 rh, TCGv_i32 al,
                       TCGv_i32 ah, TCGv_i32 bl, TCGv_i32 bh);
 void tcg_gen_sub2_i32(TCGv_i32 rl, TCGv_i32 rh, TCGv_i32 al,
                       TCGv_i32 ah, TCGv_i32 bl, TCGv_i32 bh);
+void tcg_gen_addcio_i32(TCGv_i32 r, TCGv_i32 co,
+                        TCGv_i32 a, TCGv_i32 b, TCGv_i32 ci);
 void tcg_gen_mulu2_i32(TCGv_i32 rl, TCGv_i32 rh, TCGv_i32 arg1, TCGv_i32 arg2);
 void tcg_gen_muls2_i32(TCGv_i32 rl, TCGv_i32 rh, TCGv_i32 arg1, TCGv_i32 arg2);
 void tcg_gen_mulsu2_i32(TCGv_i32 rl, TCGv_i32 rh, TCGv_i32 arg1, TCGv_i32 arg2);
@@ -238,6 +240,8 @@  void tcg_gen_add2_i64(TCGv_i64 rl, TCGv_i64 rh, TCGv_i64 al,
                       TCGv_i64 ah, TCGv_i64 bl, TCGv_i64 bh);
 void tcg_gen_sub2_i64(TCGv_i64 rl, TCGv_i64 rh, TCGv_i64 al,
                       TCGv_i64 ah, TCGv_i64 bl, TCGv_i64 bh);
+void tcg_gen_addcio_i64(TCGv_i64 r, TCGv_i64 co,
+                        TCGv_i64 a, TCGv_i64 b, TCGv_i64 ci);
 void tcg_gen_mulu2_i64(TCGv_i64 rl, TCGv_i64 rh, TCGv_i64 arg1, TCGv_i64 arg2);
 void tcg_gen_muls2_i64(TCGv_i64 rl, TCGv_i64 rh, TCGv_i64 arg1, TCGv_i64 arg2);
 void tcg_gen_mulsu2_i64(TCGv_i64 rl, TCGv_i64 rh, TCGv_i64 arg1, TCGv_i64 arg2);
diff --git a/include/tcg/tcg-op.h b/include/tcg/tcg-op.h
index a02850583b..44914e9326 100644
--- a/include/tcg/tcg-op.h
+++ b/include/tcg/tcg-op.h
@@ -252,6 +252,7 @@  DEF_ATOMIC2(tcg_gen_atomic_umax_fetch, i64)
 #define tcg_gen_movcond_tl tcg_gen_movcond_i64
 #define tcg_gen_add2_tl tcg_gen_add2_i64
 #define tcg_gen_sub2_tl tcg_gen_sub2_i64
+#define tcg_gen_addcio_tl tcg_gen_addcio_i64
 #define tcg_gen_mulu2_tl tcg_gen_mulu2_i64
 #define tcg_gen_muls2_tl tcg_gen_muls2_i64
 #define tcg_gen_mulsu2_tl tcg_gen_mulsu2_i64
@@ -370,6 +371,7 @@  DEF_ATOMIC2(tcg_gen_atomic_umax_fetch, i64)
 #define tcg_gen_movcond_tl tcg_gen_movcond_i32
 #define tcg_gen_add2_tl tcg_gen_add2_i32
 #define tcg_gen_sub2_tl tcg_gen_sub2_i32
+#define tcg_gen_addcio_tl tcg_gen_addcio_i32
 #define tcg_gen_mulu2_tl tcg_gen_mulu2_i32
 #define tcg_gen_muls2_tl tcg_gen_muls2_i32
 #define tcg_gen_mulsu2_tl tcg_gen_mulsu2_i32
diff --git a/tcg/tcg-op.c b/tcg/tcg-op.c
index 447b0ebacd..b0a29278ab 100644
--- a/tcg/tcg-op.c
+++ b/tcg/tcg-op.c
@@ -1123,6 +1123,33 @@  void tcg_gen_add2_i32(TCGv_i32 rl, TCGv_i32 rh, TCGv_i32 al,
     }
 }
 
+void tcg_gen_addcio_i32(TCGv_i32 r, TCGv_i32 co,
+                        TCGv_i32 a, TCGv_i32 b, TCGv_i32 ci)
+{
+    if (tcg_op_supported(INDEX_op_addci, TCG_TYPE_I32, 0)) {
+        TCGv_i32 t0 = tcg_temp_ebb_new_i32();
+        TCGv_i32 zero = tcg_constant_i32(0);
+        TCGv_i32 mone = tcg_constant_i32(-1);
+
+        tcg_gen_op3_i32(INDEX_op_addco, t0, ci, mone);
+        tcg_gen_op3_i32(INDEX_op_addcio, r, a, b);
+        tcg_gen_op3_i32(INDEX_op_addci, co, zero, zero);
+        tcg_temp_free_i32(t0);
+    } else {
+        TCGv_i32 t0 = tcg_temp_ebb_new_i32();
+        TCGv_i32 t1 = tcg_temp_ebb_new_i32();
+
+        tcg_gen_add_i32(t0, a, b);
+        tcg_gen_setcond_i32(TCG_COND_LTU, t1, t0, a);
+        tcg_gen_add_i32(r, t0, ci);
+        tcg_gen_setcond_i32(TCG_COND_LTU, t0, r, t0);
+        tcg_gen_or_i32(co, t0, t1);
+
+        tcg_temp_free_i32(t0);
+        tcg_temp_free_i32(t1);
+    }
+}
+
 void tcg_gen_sub2_i32(TCGv_i32 rl, TCGv_i32 rh, TCGv_i32 al,
                       TCGv_i32 ah, TCGv_i32 bl, TCGv_i32 bh)
 {
@@ -2868,6 +2895,74 @@  void tcg_gen_add2_i64(TCGv_i64 rl, TCGv_i64 rh, TCGv_i64 al,
     }
 }
 
+void tcg_gen_addcio_i64(TCGv_i64 r, TCGv_i64 co,
+                        TCGv_i64 a, TCGv_i64 b, TCGv_i64 ci)
+{
+    if (TCG_TARGET_REG_BITS == 64) {
+        if (tcg_op_supported(INDEX_op_addci, TCG_TYPE_I64, 0)) {
+            TCGv_i64 discard = tcg_temp_ebb_new_i64();
+            TCGv_i64 zero = tcg_constant_i64(0);
+            TCGv_i64 mone = tcg_constant_i64(-1);
+
+            tcg_gen_op3_i64(INDEX_op_addco, discard, ci, mone);
+            tcg_gen_op3_i64(INDEX_op_addcio, r, a, b);
+            tcg_gen_op3_i64(INDEX_op_addci, co, zero, zero);
+            tcg_temp_free_i64(discard);
+        } else {
+            TCGv_i64 t0 = tcg_temp_ebb_new_i64();
+            TCGv_i64 t1 = tcg_temp_ebb_new_i64();
+
+            tcg_gen_add_i64(t0, a, b);
+            tcg_gen_setcond_i64(TCG_COND_LTU, t1, t0, a);
+            tcg_gen_add_i64(r, t0, ci);
+            tcg_gen_setcond_i64(TCG_COND_LTU, t0, r, t0);
+            tcg_gen_or_i64(co, t0, t1);
+
+            tcg_temp_free_i64(t0);
+            tcg_temp_free_i64(t1);
+        }
+    } else {
+        if (tcg_op_supported(INDEX_op_addci, TCG_TYPE_I32, 0)) {
+            TCGv_i32 discard = tcg_temp_ebb_new_i32();
+            TCGv_i32 zero = tcg_constant_i32(0);
+            TCGv_i32 mone = tcg_constant_i32(-1);
+
+            tcg_gen_op3_i32(INDEX_op_addco, discard, TCGV_LOW(ci), mone);
+            tcg_gen_op3_i32(INDEX_op_addcio, discard, TCGV_HIGH(ci), mone);
+            tcg_gen_op3_i32(INDEX_op_addcio, TCGV_LOW(r),
+                            TCGV_LOW(a), TCGV_LOW(b));
+            tcg_gen_op3_i32(INDEX_op_addcio, TCGV_HIGH(r),
+                            TCGV_HIGH(a), TCGV_HIGH(b));
+            tcg_gen_op3_i32(INDEX_op_addci, TCGV_LOW(co), zero, zero);
+            tcg_temp_free_i32(discard);
+        } else {
+            TCGv_i32 t0 = tcg_temp_ebb_new_i32();
+            TCGv_i32 c0 = tcg_temp_ebb_new_i32();
+            TCGv_i32 c1 = tcg_temp_ebb_new_i32();
+
+            tcg_gen_or_i32(c1, TCGV_LOW(ci), TCGV_HIGH(ci));
+            tcg_gen_setcondi_i32(TCG_COND_NE, c1, c1, 0);
+
+            tcg_gen_add_i32(t0, TCGV_LOW(a), TCGV_LOW(b));
+            tcg_gen_setcond_i32(TCG_COND_LTU, c0, t0, TCGV_LOW(a));
+            tcg_gen_add_i32(TCGV_LOW(r), t0, c1);
+            tcg_gen_setcond_i32(TCG_COND_LTU, c1, TCGV_LOW(r), c1);
+            tcg_gen_or_i32(c1, c1, c0);
+
+            tcg_gen_add_i32(t0, TCGV_HIGH(a), TCGV_HIGH(b));
+            tcg_gen_setcond_i32(TCG_COND_LTU, c0, t0, TCGV_HIGH(a));
+            tcg_gen_add_i32(TCGV_HIGH(r), t0, c1);
+            tcg_gen_setcond_i32(TCG_COND_LTU, c1, TCGV_HIGH(r), c1);
+            tcg_gen_or_i32(TCGV_LOW(co), c0, c1);
+
+            tcg_temp_free_i32(t0);
+            tcg_temp_free_i32(c0);
+            tcg_temp_free_i32(c1);
+        }
+        tcg_gen_movi_i32(TCGV_HIGH(co), 0);
+    }
+}
+
 void tcg_gen_sub2_i64(TCGv_i64 rl, TCGv_i64 rh, TCGv_i64 al,
                       TCGv_i64 ah, TCGv_i64 bl, TCGv_i64 bh)
 {