| Message ID | 20230307183503.2512684-21-richard.henderson@linaro.org |
|---|---|
| State | Superseded |
| Series | tcg: Remove tcg_const_* |
On 3/7/23 15:34, Richard Henderson wrote:
> Compute all carry bits in parallel instead of a loop.
>
> Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
> ---

Hmmmm the function was added by 6addef4d27268 9 months ago. All tcg ops you used
here were available back then.

I guess this existing implementation was an oversight on our end.

Reviewed-by: Daniel Henrique Barboza <danielhb413@gmail.com>
On 3/7/23 13:51, Daniel Henrique Barboza wrote:
>
> On 3/7/23 15:34, Richard Henderson wrote:
>> Compute all carry bits in parallel instead of a loop.
>>
>> Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
>> ---
>
> Hmmmm the function was added by 6addef4d27268 9 months ago. All tcg ops you used
> here were available back then.
>
> I guess this existing implementation was an oversight on our end.

I seem to have missed the whole patch set, which is a shame -- I could have
pointed you to a similar function for hppa.

r~
```diff
diff --git a/target/ppc/translate/fixedpoint-impl.c.inc b/target/ppc/translate/fixedpoint-impl.c.inc
index 20ea484c3d..02d86b77a8 100644
--- a/target/ppc/translate/fixedpoint-impl.c.inc
+++ b/target/ppc/translate/fixedpoint-impl.c.inc
@@ -484,33 +484,35 @@ static bool trans_PEXTD(DisasContext *ctx, arg_X *a)
 
 static bool trans_ADDG6S(DisasContext *ctx, arg_X *a)
 {
-    const uint64_t carry_bits = 0x1111111111111111ULL;
-    TCGv t0, t1, carry, zero = tcg_constant_tl(0);
+    const target_ulong carry_bits = (target_ulong)-1 / 0xf;
+    TCGv in1, in2, carryl, carryh, tmp;
+    TCGv zero = tcg_constant_tl(0);
 
     REQUIRE_INSNS_FLAGS2(ctx, BCDA_ISA206);
 
-    t0 = tcg_temp_new();
-    t1 = tcg_const_tl(0);
-    carry = tcg_const_tl(0);
+    in1 = cpu_gpr[a->ra];
+    in2 = cpu_gpr[a->rb];
+    tmp = tcg_temp_new();
+    carryl = tcg_temp_new();
+    carryh = tcg_temp_new();
 
-    for (int i = 0; i < 16; i++) {
-        tcg_gen_shri_tl(t0, cpu_gpr[a->ra], i * 4);
-        tcg_gen_andi_tl(t0, t0, 0xf);
-        tcg_gen_add_tl(t1, t1, t0);
+    /* Addition with carry. */
+    tcg_gen_add2_tl(carryl, carryh, in1, zero, in2, zero);
+    /* Addition without carry. */
+    tcg_gen_xor_tl(tmp, in1, in2);
+    /* Difference between the two is carry in to each bit. */
+    tcg_gen_xor_tl(carryl, carryl, tmp);
 
-        tcg_gen_shri_tl(t0, cpu_gpr[a->rb], i * 4);
-        tcg_gen_andi_tl(t0, t0, 0xf);
-        tcg_gen_add_tl(t1, t1, t0);
+    /*
+     * The carry-out that we're looking for is the carry-in to
+     * the next nibble.  Shift the double-word down one nibble,
+     * which puts all of the bits back into one word.
+     */
+    tcg_gen_extract2_tl(carryl, carryl, carryh, 4);
 
-        tcg_gen_andi_tl(t1, t1, 0x10);
-        tcg_gen_setcond_tl(TCG_COND_NE, t1, t1, zero);
-
-        tcg_gen_shli_tl(t0, t1, i * 4);
-        tcg_gen_or_tl(carry, carry, t0);
-    }
-
-    tcg_gen_xori_tl(carry, carry, (target_long)carry_bits);
-    tcg_gen_muli_tl(cpu_gpr[a->rt], carry, 6);
+    /* Invert, isolate the carry bits, and produce 6's. */
+    tcg_gen_andc_tl(carryl, tcg_constant_tl(carry_bits), carryl);
+    tcg_gen_muli_tl(cpu_gpr[a->rt], carryl, 6);
     return true;
 }
```
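The new sequence relies on the half-adder identity: each bit of a sum is `a ^ b ^ carry_in`, so `(a + b) ^ a ^ b` yields the carry-in to every bit position at once; the carry out of nibble i is just the carry in to bit 4*(i+1), which the `tcg_gen_extract2_tl` funnel shift moves back down into place. As an illustration only (not part of the patch, and with a made-up helper name), the same computation on plain 64-bit host integers looks roughly like this:

```c
#include <stdint.h>

/* Sketch of the parallel ADDG6S computation on host integers. */
uint64_t addg6s_parallel(uint64_t a, uint64_t b)
{
    const uint64_t nibble_lsbs = UINT64_MAX / 0xf;   /* 0x1111...1111 */

    uint64_t sum = a + b;                  /* addition with carry (tcg_gen_add2_tl) */
    uint64_t carry_out = sum < a;          /* carry out of bit 63 */
    uint64_t no_carry = a ^ b;             /* addition without carry */
    uint64_t carry_in = sum ^ no_carry;    /* carry-in to each bit */

    /*
     * The carry out of nibble i is the carry in to bit 4*(i+1).
     * Shift the 65-bit carry vector down one nibble so that bit 4*i
     * holds the carry out of nibble i (tcg_gen_extract2_tl).
     */
    uint64_t nibble_carry = (carry_in >> 4) | (carry_out << 60);

    /* Invert, isolate the carry bits, and produce 6's. */
    return (nibble_lsbs & ~nibble_carry) * 6;
}
```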
Compute all carry bits in parallel instead of a loop.

Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
Cc: Daniel Henrique Barboza <danielhb413@gmail.com>
Cc: Cédric Le Goater <clg@kaod.org>
Cc: David Gibson <david@gibson.dropbear.id.au>
Cc: Greg Kurz <groug@kaod.org>
Cc: qemu-ppc@nongnu.org
---
 target/ppc/translate/fixedpoint-impl.c.inc | 44 +++++++++++-----------
 1 file changed, 23 insertions(+), 21 deletions(-)
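To make the "instead of a loop" claim concrete, here is a nibble-at-a-time reference in the spirit of the removed implementation, cross-checked against the parallel formulation sketched above. Again, this is only an illustrative test harness with invented names, not QEMU code; it assumes the earlier snippet is compiled into the same program.

```c
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

uint64_t addg6s_parallel(uint64_t a, uint64_t b);   /* from the sketch above */

/* Nibble-at-a-time reference, mirroring the removed loop. */
static uint64_t addg6s_loop(uint64_t a, uint64_t b)
{
    uint64_t carries = 0;
    unsigned carry = 0;

    for (int i = 0; i < 16; i++) {
        unsigned da = (a >> (i * 4)) & 0xf;
        unsigned db = (b >> (i * 4)) & 0xf;

        carry = (da + db + carry) >> 4;              /* carry out of nibble i */
        carries |= (uint64_t)carry << (i * 4);
    }

    /* A 6 in every nibble that did not produce a carry, 0 elsewhere. */
    return (0x1111111111111111ull & ~carries) * 6;
}

int main(void)
{
    uint64_t samples[] = {
        0, 1, 0xf, 0x9999999999999999ull, 0x123456789abcdef0ull, UINT64_MAX,
    };
    size_t n = sizeof(samples) / sizeof(samples[0]);

    /* A couple of easily hand-checked values... */
    assert(addg6s_loop(0, 0) == 0x6666666666666666ull);
    assert(addg6s_loop(0xf, 1) == 0x6666666666666660ull);

    /* ...and agreement between the loop and parallel formulations. */
    for (size_t i = 0; i < n; i++) {
        for (size_t j = 0; j < n; j++) {
            assert(addg6s_loop(samples[i], samples[j]) ==
                   addg6s_parallel(samples[i], samples[j]));
        }
    }
    printf("loop and parallel versions agree\n");
    return 0;
}
```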