[6/7] target/arm: Simplify SMMLA, SMMLAR, SMMLS, SMMLSR

Message ID	20190808202616.13782-7-richard.henderson@linaro.org
State	Superseded
Headers	show Delivered-To: patch@linaro.org Received-SPF: pass (google.com: domain of qemu-devel-bounces+patch=linaro.org@nongnu.org designates 209.51.188.17 as permitted sender) client-ip=209.51.188.17; From: Richard Henderson <richard.henderson@linaro.org> To: qemu-devel@nongnu.org Date: Thu, 8 Aug 2019 13:26:15 -0700 Message-Id: <20190808202616.13782-7-richard.henderson@linaro.org> In-Reply-To: <20190808202616.13782-1-richard.henderson@linaro.org> References: <20190808202616.13782-1-richard.henderson@linaro.org> Subject: [Qemu-devel] [PATCH 6/7] target/arm: Simplify SMMLA, SMMLAR, SMMLS, SMMLSR Precedence: list Cc: peter.maydell@linaro.org, qemu-arm@nongnu.org Errors-To: qemu-devel-bounces+patch=linaro.org@nongnu.org Sender: "Qemu-devel" <qemu-devel-bounces+patch=linaro.org@nongnu.org>
Series	target/arm: Misc cleanups | expand [0/7] target/arm: Misc cleanups [1/7] target/arm: Use tcg_gen_extract_i32 for shifter_out_im [2/7] target/arm: Use tcg_gen_deposit_i32 for PKHBT, PKHTB [3/7] target/arm: Remove redundant shift tests [4/7] target/arm: Use ror32 instead of open-coding the operation [5/7] target/arm: Use tcg_gen_rotri_i32 for gen_swap_half [6/7] target/arm: Simplify SMMLA, SMMLAR, SMMLS, SMMLSR [7/7] target/arm: Use tcg_gen_extrh_i64_i32 to extract the high word

Message ID

20190808202616.13782-7-richard.henderson@linaro.org

State

Superseded

Headers

Received-SPF: pass (google.com: domain of
	qemu-devel-bounces+patch=linaro.org@nongnu.org designates
	209.51.188.17 as permitted sender) client-ip=209.51.188.17; 
From: Richard Henderson <richard.henderson@linaro.org>
To: qemu-devel@nongnu.org
Date: Thu,  8 Aug 2019 13:26:15 -0700
Message-Id: <20190808202616.13782-7-richard.henderson@linaro.org>
In-Reply-To: <20190808202616.13782-1-richard.henderson@linaro.org>
References: <20190808202616.13782-1-richard.henderson@linaro.org>
Subject: [Qemu-devel] [PATCH 6/7] target/arm: Simplify SMMLA, SMMLAR, SMMLS, 
	SMMLSR
Precedence: list
Cc: peter.maydell@linaro.org, qemu-arm@nongnu.org
Errors-To: qemu-devel-bounces+patch=linaro.org@nongnu.org
Sender: "Qemu-devel" <qemu-devel-bounces+patch=linaro.org@nongnu.org>

Series

target/arm: Misc cleanups | expand

Commit Message

Richard Henderson Aug. 8, 2019, 8:26 p.m. UTC

All of the inputs to these instructions are 32 bits wide.  Rather than
extend each input to 64 bits and then extract the high 32 bits of
the output, use tcg_gen_muls2_i32 and other 32-bit generator functions.

Signed-off-by: Richard Henderson <richard.henderson@linaro.org>

---
 target/arm/translate.c | 72 +++++++++++++++---------------------------
 1 file changed, 26 insertions(+), 46 deletions(-)

-- 
2.17.1

Comments

Laurent Desnogues Aug. 28, 2019, 7:22 a.m. UTC | #1

Hi Richard,

On Thu, Aug 8, 2019 at 10:28 PM Richard Henderson
<richard.henderson@linaro.org> wrote:
>

> All of the inputs to these instructions are 32-bits.  Rather than

> extend each input to 64-bits and then extract the high 32-bits of

> the output, use tcg_gen_muls2_i32 and other 32-bit generator functions.

>

> Signed-off-by: Richard Henderson <richard.henderson@linaro.org>

> ---

>  target/arm/translate.c | 72 +++++++++++++++---------------------------

>  1 file changed, 26 insertions(+), 46 deletions(-)

>

> diff --git a/target/arm/translate.c b/target/arm/translate.c

> index ddc54e77e4..77154be743 100644

> --- a/target/arm/translate.c

> +++ b/target/arm/translate.c

> @@ -391,34 +391,6 @@ static void gen_revsh(TCGv_i32 var)

>      tcg_gen_ext16s_i32(var, var);

>  }

>

> -/* Return (b << 32) + a. Mark inputs as dead */

> -static TCGv_i64 gen_addq_msw(TCGv_i64 a, TCGv_i32 b)

> -{

> -    TCGv_i64 tmp64 = tcg_temp_new_i64();

> -

> -    tcg_gen_extu_i32_i64(tmp64, b);

> -    tcg_temp_free_i32(b);

> -    tcg_gen_shli_i64(tmp64, tmp64, 32);

> -    tcg_gen_add_i64(a, tmp64, a);

> -

> -    tcg_temp_free_i64(tmp64);

> -    return a;

> -}

> -

> -/* Return (b << 32) - a. Mark inputs as dead. */

> -static TCGv_i64 gen_subq_msw(TCGv_i64 a, TCGv_i32 b)

> -{

> -    TCGv_i64 tmp64 = tcg_temp_new_i64();

> -

> -    tcg_gen_extu_i32_i64(tmp64, b);

> -    tcg_temp_free_i32(b);

> -    tcg_gen_shli_i64(tmp64, tmp64, 32);

> -    tcg_gen_sub_i64(a, tmp64, a);

> -

> -    tcg_temp_free_i64(tmp64);

> -    return a;

> -}

> -

>  /* 32x32->64 multiply.  Marks inputs as dead.  */

>  static TCGv_i64 gen_mulu_i64_i32(TCGv_i32 a, TCGv_i32 b)

>  {

> @@ -8872,23 +8844,27 @@ static void disas_arm_insn(DisasContext *s, unsigned int insn)

>                             (SMMUL, SMMLA, SMMLS) */

>                          tmp = load_reg(s, rm);

>                          tmp2 = load_reg(s, rs);

> -                        tmp64 = gen_muls_i64_i32(tmp, tmp2);

> +                        tcg_gen_muls2_i32(tmp2, tmp, tmp, tmp2);

>

>                          if (rd != 15) {

> -                            tmp = load_reg(s, rd);

> +                            tmp3 = load_reg(s, rd);

>                              if (insn & (1 << 6)) {

> -                                tmp64 = gen_subq_msw(tmp64, tmp);

> +                                tcg_gen_sub_i32(tmp, tmp, tmp3);


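Shouldn't you subtract tmp from tmp3?  The removed gen_subq_msw computed
(b << 32) - a, i.e. the accumulator minus the product, whereas this
computes the high word of the product minus the accumulator.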

>                              } else {

> -                                tmp64 = gen_addq_msw(tmp64, tmp);

> +                                tcg_gen_add_i32(tmp, tmp, tmp3);

>                              }

> +                            tcg_temp_free_i32(tmp3);

>                          }

>                          if (insn & (1 << 5)) {

> -                            tcg_gen_addi_i64(tmp64, tmp64, 0x80000000u);

> +                            /*

> +                             * Adding 0x80000000 to the 64-bit quantity

> +                             * means that we have carry in to the high

> +                             * word when the low word has the high bit set.

> +                             */

> +                            tcg_gen_shri_i32(tmp2, tmp2, 31);

> +                            tcg_gen_add_i32(tmp, tmp, tmp2);

>                          }

> -                        tcg_gen_shri_i64(tmp64, tmp64, 32);

> -                        tmp = tcg_temp_new_i32();

> -                        tcg_gen_extrl_i64_i32(tmp, tmp64);

> -                        tcg_temp_free_i64(tmp64);

> +                        tcg_temp_free_i32(tmp2);

>                          store_reg(s, rn, tmp);

>                          break;

>                      case 0:

> @@ -10114,22 +10090,26 @@ static void disas_thumb2_insn(DisasContext *s, uint32_t insn)

>                    }

>                  break;

>              case 5: case 6: /* 32 * 32 -> 32msb (SMMUL, SMMLA, SMMLS) */

> -                tmp64 = gen_muls_i64_i32(tmp, tmp2);

> +                tcg_gen_muls2_i32(tmp2, tmp, tmp, tmp2);

>                  if (rs != 15) {

> -                    tmp = load_reg(s, rs);

> +                    tmp3 = load_reg(s, rs);

>                      if (insn & (1 << 20)) {

> -                        tmp64 = gen_addq_msw(tmp64, tmp);

> +                        tcg_gen_add_i32(tmp, tmp, tmp3);

>                      } else {

> -                        tmp64 = gen_subq_msw(tmp64, tmp);

> +                        tcg_gen_sub_i32(tmp, tmp, tmp3);


Same here.

Also, the old code subtracted the full 64-bit product; doing the
subtraction on the high words only means the borrow out of the low
32 bits of the 64-bit product is no longer propagated into the result.

Thanks,

Laurent

>                      }

> +                    tcg_temp_free_i32(tmp3);

>                  }

>                  if (insn & (1 << 4)) {

> -                    tcg_gen_addi_i64(tmp64, tmp64, 0x80000000u);

> +                    /*

> +                     * Adding 0x80000000 to the 64-bit quantity

> +                     * means that we have carry in to the high

> +                     * word when the low word has the high bit set.

> +                     */

> +                    tcg_gen_shri_i32(tmp2, tmp2, 31);

> +                    tcg_gen_add_i32(tmp, tmp, tmp2);

>                  }

> -                tcg_gen_shri_i64(tmp64, tmp64, 32);

> -                tmp = tcg_temp_new_i32();

> -                tcg_gen_extrl_i64_i32(tmp, tmp64);

> -                tcg_temp_free_i64(tmp64);

> +                tcg_temp_free_i32(tmp2);

>                  break;

>              case 7: /* Unsigned sum of absolute differences.  */

>                  gen_helper_usad8(tmp, tmp, tmp2);

> --

> 2.17.1

>

>

diff --git a/target/arm/translate.c b/target/arm/translate.c
index ddc54e77e4..77154be743 100644
--- a/target/arm/translate.c
+++ b/target/arm/translate.c
@@ -391,34 +391,6 @@  static void gen_revsh(TCGv_i32 var)
     tcg_gen_ext16s_i32(var, var);
 }
 
-/* Return (b << 32) + a. Mark inputs as dead */
-static TCGv_i64 gen_addq_msw(TCGv_i64 a, TCGv_i32 b)
-{
-    TCGv_i64 tmp64 = tcg_temp_new_i64();
-
-    tcg_gen_extu_i32_i64(tmp64, b);
-    tcg_temp_free_i32(b);
-    tcg_gen_shli_i64(tmp64, tmp64, 32);
-    tcg_gen_add_i64(a, tmp64, a);
-
-    tcg_temp_free_i64(tmp64);
-    return a;
-}
-
-/* Return (b << 32) - a. Mark inputs as dead. */
-static TCGv_i64 gen_subq_msw(TCGv_i64 a, TCGv_i32 b)
-{
-    TCGv_i64 tmp64 = tcg_temp_new_i64();
-
-    tcg_gen_extu_i32_i64(tmp64, b);
-    tcg_temp_free_i32(b);
-    tcg_gen_shli_i64(tmp64, tmp64, 32);
-    tcg_gen_sub_i64(a, tmp64, a);
-
-    tcg_temp_free_i64(tmp64);
-    return a;
-}
-
 /* 32x32->64 multiply.  Marks inputs as dead.  */
 static TCGv_i64 gen_mulu_i64_i32(TCGv_i32 a, TCGv_i32 b)
 {
@@ -8872,23 +8844,27 @@  static void disas_arm_insn(DisasContext *s, unsigned int insn)
                            (SMMUL, SMMLA, SMMLS) */
                         tmp = load_reg(s, rm);
                         tmp2 = load_reg(s, rs);
-                        tmp64 = gen_muls_i64_i32(tmp, tmp2);
+                        tcg_gen_muls2_i32(tmp2, tmp, tmp, tmp2);
 
                         if (rd != 15) {
-                            tmp = load_reg(s, rd);
+                            tmp3 = load_reg(s, rd);
                             if (insn & (1 << 6)) {
-                                tmp64 = gen_subq_msw(tmp64, tmp);
+                                tcg_gen_sub_i32(tmp, tmp, tmp3);
                             } else {
-                                tmp64 = gen_addq_msw(tmp64, tmp);
+                                tcg_gen_add_i32(tmp, tmp, tmp3);
                             }
+                            tcg_temp_free_i32(tmp3);
                         }
                         if (insn & (1 << 5)) {
-                            tcg_gen_addi_i64(tmp64, tmp64, 0x80000000u);
+                            /*
+                             * Adding 0x80000000 to the 64-bit quantity
+                             * means that we have carry in to the high
+                             * word when the low word has the high bit set.
+                             */
+                            tcg_gen_shri_i32(tmp2, tmp2, 31);
+                            tcg_gen_add_i32(tmp, tmp, tmp2);
                         }
-                        tcg_gen_shri_i64(tmp64, tmp64, 32);
-                        tmp = tcg_temp_new_i32();
-                        tcg_gen_extrl_i64_i32(tmp, tmp64);
-                        tcg_temp_free_i64(tmp64);
+                        tcg_temp_free_i32(tmp2);
                         store_reg(s, rn, tmp);
                         break;
                     case 0:
@@ -10114,22 +10090,26 @@  static void disas_thumb2_insn(DisasContext *s, uint32_t insn)
                   }
                 break;
             case 5: case 6: /* 32 * 32 -> 32msb (SMMUL, SMMLA, SMMLS) */
-                tmp64 = gen_muls_i64_i32(tmp, tmp2);
+                tcg_gen_muls2_i32(tmp2, tmp, tmp, tmp2);
                 if (rs != 15) {
-                    tmp = load_reg(s, rs);
+                    tmp3 = load_reg(s, rs);
                     if (insn & (1 << 20)) {
-                        tmp64 = gen_addq_msw(tmp64, tmp);
+                        tcg_gen_add_i32(tmp, tmp, tmp3);
                     } else {
-                        tmp64 = gen_subq_msw(tmp64, tmp);
+                        tcg_gen_sub_i32(tmp, tmp, tmp3);
                     }
+                    tcg_temp_free_i32(tmp3);
                 }
                 if (insn & (1 << 4)) {
-                    tcg_gen_addi_i64(tmp64, tmp64, 0x80000000u);
+                    /*
+                     * Adding 0x80000000 to the 64-bit quantity
+                     * means that we have carry in to the high
+                     * word when the low word has the high bit set.
+                     */
+                    tcg_gen_shri_i32(tmp2, tmp2, 31);
+                    tcg_gen_add_i32(tmp, tmp, tmp2);
                 }
-                tcg_gen_shri_i64(tmp64, tmp64, 32);
-                tmp = tcg_temp_new_i32();
-                tcg_gen_extrl_i64_i32(tmp, tmp64);
-                tcg_temp_free_i64(tmp64);
+                tcg_temp_free_i32(tmp2);
                 break;
             case 7: /* Unsigned sum of absolute differences.  */
                 gen_helper_usad8(tmp, tmp, tmp2);

[6/7] target/arm: Simplify SMMLA, SMMLAR, SMMLS, SMMLSR

Commit Message

Comments

Patch