diff mbox series

LoongArch: Optimize for conditional move operations

Message ID 20241230023913.10260-1-guojie@loongson.cn
State Accepted
Commit d55d40afd42a280c80729b538e3cce994f20961d
Headers show
Series LoongArch: Optimize for conditional move operations | expand

Commit Message

Guo Jie Dec. 30, 2024, 2:39 a.m. UTC
The optimization example is as follows.

From:
  if (condition)
    dest += 1 << 16;
To:
  dest += (condition ? 1 : 0) << 16;

It does not use maskeqz and masknez, thus reducing the number of
instructions.

gcc/ChangeLog:

	* config/loongarch/loongarch.cc
	(loongarch_expand_conditional_move): Add some optimization
	implementations based on noce_try_cmove_arith.

gcc/testsuite/ChangeLog:

	* gcc.target/loongarch/conditional-move-opt-1.c: New test.
	* gcc.target/loongarch/conditional-move-opt-2.c: New test.

---
 gcc/config/loongarch/loongarch.cc             | 103 +++++++++++++++++-
 .../loongarch/conditional-move-opt-1.c        |  58 ++++++++++
 .../loongarch/conditional-move-opt-2.c        |  42 +++++++
 3 files changed, 202 insertions(+), 1 deletion(-)
 create mode 100644 gcc/testsuite/gcc.target/loongarch/conditional-move-opt-1.c
 create mode 100644 gcc/testsuite/gcc.target/loongarch/conditional-move-opt-2.c

Comments

Xi Ruoyao Dec. 30, 2024, 4:06 a.m. UTC | #1
On Mon, 2024-12-30 at 10:39 +0800, Guo Jie wrote:
> +	      /* Make sure that imm is a positive integer power of
> 2.  */

Maybe we should also consider the case $imm = 2^k + 1$ as they can be
implemented with sl[te] and bstrins.[wd].  But it can be done in another
patch anyway.

> +	      if (val > 0 && !(val & (val - 1)))
> +		can_be_optimized = true;
Guo Jie Dec. 30, 2024, 7:18 a.m. UTC | #2
Thanks for your suggestion!

Indeed, there are still some scenarios that can be optimized and
improved in future patches.

在 2024/12/30 下午12:06, Xi Ruoyao 写道:
> On Mon, 2024-12-30 at 10:39 +0800, Guo Jie wrote:
>> +	      /* Make sure that imm is a positive integer power of
>> 2.  */
> Maybe we should also consider the case $imm = 2^k + 1$ as they can be
> implemented with sl[te] and bstrins.[wd].  But it can be done in another
> patch anyway.
>
>> +	      if (val > 0 && !(val & (val - 1)))
>> +		can_be_optimized = true;
Lulu Cheng Jan. 2, 2025, 3:25 a.m. UTC | #3
Pushed to r15-6493.

在 2024/12/30 上午10:39, Guo Jie 写道:
> The optimization example is as follows.
>
> From:
>    if (condition)
>      dest += 1 << 16;
> To:
>    dest += (condition ? 1 : 0) << 16;
>
> It does not use maskeqz and masknez, thus reducing the number of
> instructions.
>
> gcc/ChangeLog:
>
> 	* config/loongarch/loongarch.cc
> 	(loongarch_expand_conditional_move): Add some optimization
> 	implementations based on noce_try_cmove_arith.
>
> gcc/testsuite/ChangeLog:
>
> 	* gcc.target/loongarch/conditional-move-opt-1.c: New test.
> 	* gcc.target/loongarch/conditional-move-opt-2.c: New test.
>
> ---
>   gcc/config/loongarch/loongarch.cc             | 103 +++++++++++++++++-
>   .../loongarch/conditional-move-opt-1.c        |  58 ++++++++++
>   .../loongarch/conditional-move-opt-2.c        |  42 +++++++
>   3 files changed, 202 insertions(+), 1 deletion(-)
>   create mode 100644 gcc/testsuite/gcc.target/loongarch/conditional-move-opt-1.c
>   create mode 100644 gcc/testsuite/gcc.target/loongarch/conditional-move-opt-2.c
>
> diff --git a/gcc/config/loongarch/loongarch.cc b/gcc/config/loongarch/loongarch.cc
> index 2d4290bc2d1..32fd1697813 100644
> --- a/gcc/config/loongarch/loongarch.cc
> +++ b/gcc/config/loongarch/loongarch.cc
> @@ -5294,6 +5294,81 @@ loongarch_expand_conditional_move (rtx *operands)
>       loongarch_emit_float_compare (&code, &op0, &op1);
>     else
>       {
> +      /* Optimize to reduce the number of instructions for ternary operations.
> +	 Mainly implemented based on noce_try_cmove_arith.
> +	 For dest = (condition) ? value_if_true : value_if_false;
> +	 the optimization requires:
> +	  a. value_if_false = var;
> +	  b. value_if_true = var OP C (a positive integer power of 2).
> +
> +	 Situations similar to the following:
> +	    if (condition)
> +	      dest += 1 << imm;
> +	 to:
> +	    dest += (condition ? 1 : 0) << imm;  */
> +
> +      rtx_insn *insn;
> +      HOST_WIDE_INT val = 0; /* The value of rtx C.  */
> +      /* INSN with operands[2] as the output.  */
> +      rtx_insn *value_if_true_insn = NULL;
> +      /* INSN with operands[3] as the output.  */
> +      rtx_insn *value_if_false_insn = NULL;
> +      rtx value_if_true_insn_src = NULL_RTX;
> +      /* Common operand var in value_if_true and value_if_false.  */
> +      rtx comm_var = NULL_RTX;
> +      bool can_be_optimized = false;
> +
> +      /* Search value_if_true_insn and value_if_false_insn.  */
> +      struct sequence_stack *seq = get_current_sequence ()->next;
> +      for (insn = seq->last; insn; insn = PREV_INSN (insn))
> +	{
> +	  if (single_set (insn))
> +	    {
> +	      rtx set_dest = SET_DEST (single_set (insn));
> +	      if (rtx_equal_p (set_dest, operands[2]))
> +		value_if_true_insn = insn;
> +	      else if (rtx_equal_p (set_dest, operands[3]))
> +		value_if_false_insn = insn;
> +	      if (value_if_true_insn && value_if_false_insn)
> +		break;
> +	    }
> +	}
> +
> +      /* Check if the optimization conditions are met.  */
> +      if (value_if_true_insn
> +	  && value_if_false_insn
> +	  /* Make sure that value_if_false and var are the same.  */
> +	  && BINARY_P (value_if_true_insn_src
> +		       = SET_SRC (single_set (value_if_true_insn)))
> +	  /* Make sure that both value_if_true and value_if_false
> +	     have the same var.  */
> +	  && rtx_equal_p (XEXP (value_if_true_insn_src, 0),
> +			  SET_SRC (single_set (value_if_false_insn))))
> +	{
> +	  comm_var = SET_SRC (single_set (value_if_false_insn));
> +	  rtx src = XEXP (value_if_true_insn_src, 1);
> +	  rtx imm = NULL_RTX;
> +	  if (CONST_INT_P (src))
> +	    imm = src;
> +	  else
> +	    for (insn = seq->last; insn; insn = PREV_INSN (insn))
> +	      {
> +		rtx set = single_set (insn);
> +		if (set && rtx_equal_p (SET_DEST (set), src))
> +		  {
> +		    imm = SET_SRC (set);
> +		    break;
> +		  }
> +	      }
> +	  if (imm && CONST_INT_P (imm))
> +	    {
> +	      val = INTVAL (imm);
> +	      /* Make sure that imm is a positive integer power of 2.  */
> +	      if (val > 0 && !(val & (val - 1)))
> +		can_be_optimized = true;
> +	    }
> +	}
> +
>         if (GET_MODE_SIZE (GET_MODE (op0)) < UNITS_PER_WORD)
>   	{
>   	  promote_op[0] = (REG_P (op0) && REG_P (operands[2]) &&
> @@ -5314,22 +5389,48 @@ loongarch_expand_conditional_move (rtx *operands)
>         op0_extend = op0;
>         op1_extend = force_reg (word_mode, op1);
>   
> +      rtx target = gen_reg_rtx (GET_MODE (op0));
> +
>         if (code == EQ || code == NE)
>   	{
>   	  op0 = loongarch_zero_if_equal (op0, op1);
>   	  op1 = const0_rtx;
> +	  /* For EQ, set target to 1 if op0 and op1 are the same,
> +	     otherwise set to 0.
> +	     For NE, set target to 0 if op0 and op1 are the same,
> +	     otherwise set to 1.  */
> +	  if (can_be_optimized)
> +	    loongarch_emit_binary (code, target, op0, const0_rtx);
>   	}
>         else
>   	{
>   	  /* The comparison needs a separate scc instruction.  Store the
>   	     result of the scc in *OP0 and compare it against zero.  */
>   	  bool invert = false;
> -	  rtx target = gen_reg_rtx (GET_MODE (op0));
>   	  loongarch_emit_int_order_test (code, &invert, target, op0, op1);
> +	  if (can_be_optimized && invert)
> +	    loongarch_emit_binary (EQ, target, target, const0_rtx);
>   	  code = invert ? EQ : NE;
>   	  op0 = target;
>   	  op1 = const0_rtx;
>   	}
> +
> +      if (can_be_optimized)
> +	{
> +	  /* Perform (condition ? 1 : 0) << log2 (C).  */
> +	  loongarch_emit_binary (ASHIFT, target, target,
> +				 GEN_INT (exact_log2 (val)));
> +	  /* Shift-related insn patterns only support SImode operands[2].  */
> +	  enum rtx_code opcode = GET_CODE (value_if_true_insn_src);
> +	  if (opcode == ASHIFT || opcode == ASHIFTRT || opcode == LSHIFTRT
> +	      || opcode == ROTATE || opcode == ROTATERT)
> +	    target = gen_lowpart (SImode, target);
> +	  /* Perform target = target OP ((condition ? 1 : 0) << log2 (C)).  */
> +	  loongarch_emit_binary (opcode, operands[0],
> +				 force_reg (GET_MODE (operands[3]), comm_var),
> +				 target);
> +	  return;
> +	}
>       }
>   
>     rtx cond = gen_rtx_fmt_ee (code, GET_MODE (op0), op0, op1);
> diff --git a/gcc/testsuite/gcc.target/loongarch/conditional-move-opt-1.c b/gcc/testsuite/gcc.target/loongarch/conditional-move-opt-1.c
> new file mode 100644
> index 00000000000..ed13471aa90
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/loongarch/conditional-move-opt-1.c
> @@ -0,0 +1,58 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2" } */
> +/* { dg-final { scan-assembler-not "maskeqz" } } */
> +/* { dg-final { scan-assembler-not "masknez" } } */
> +
> +extern long lm, ln, lr;
> +
> +void
> +test_ne ()
> +{
> +  if (lm != ln)
> +    lr += (1 << 16);
> +  lr += lm;
> +}
> +
> +void
> +test_eq ()
> +{
> +  if (lm == ln)
> +    lr = lm + (1 << 16);
> +  else
> +    lr = lm;
> +  lr += lm;
> +}
> +
> +void
> +test_lt ()
> +{
> +  if (lm < ln)
> +    lr *= (1 << 16);
> +  lr += lm;
> +}
> +
> +void
> +test_le ()
> +{
> +  if (lm <= ln)
> +    lr = lm * ((long)1 << 32);
> +  else
> +    lr = lm;
> +  lr += lm;
> +}
> +
> +void
> +test_nez ()
> +{
> +  if (lm != 0)
> +    lr <<= (1 << 4);
> +  lr += lm;
> +}
> +
> +void
> +test_eqz ()
> +{
> +  if (lm == 0)
> +    lr >>= (1 << 2);
> +  lr += lm;
> +}
> diff --git a/gcc/testsuite/gcc.target/loongarch/conditional-move-opt-2.c b/gcc/testsuite/gcc.target/loongarch/conditional-move-opt-2.c
> new file mode 100644
> index 00000000000..ac72d4d933a
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/loongarch/conditional-move-opt-2.c
> @@ -0,0 +1,42 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 --param max-rtl-if-conversion-insns=1" } */
> +/* { dg-final { scan-assembler-not "maskeqz" } } */
> +/* { dg-final { scan-assembler-not "masknez" } } */
> +
> +/* The relevant optimization is currently only based on noce_try_cmove_arith,
> +   so it bypasses noce_convert_multiple_sets by
> +   --param max-rtl-if-conversion-insns=1 to execute noce_try_cmove_arith.  */
> +
> +extern long lm, ln, lr;
> +
> +void
> +test_ge ()
> +{
> +  if (lm >= ln)
> +    lr += ((long)1 << 32);
> +  lr += lm;
> +}
> +
> +void
> +test_ltz ()
> +{
> +  if (lm < 0)
> +    lr |= (1 << 16);
> +  lr += lm;
> +}
> +
> +void
> +test_lez ()
> +{
> +  if (lm <= 0)
> +    lr &= (1 << 16);
> +  lr += lm;
> +}
> +
> +void
> +test_gez ()
> +{
> +  if (lm >= 0)
> +    lr ^= (1 << 16);
> +  lr += lm;
> +}
diff mbox series

Patch

diff --git a/gcc/config/loongarch/loongarch.cc b/gcc/config/loongarch/loongarch.cc
index 2d4290bc2d1..32fd1697813 100644
--- a/gcc/config/loongarch/loongarch.cc
+++ b/gcc/config/loongarch/loongarch.cc
@@ -5294,6 +5294,81 @@  loongarch_expand_conditional_move (rtx *operands)
     loongarch_emit_float_compare (&code, &op0, &op1);
   else
     {
+      /* Optimize to reduce the number of instructions for ternary operations.
+	 Mainly implemented based on noce_try_cmove_arith.
+	 For dest = (condition) ? value_if_true : value_if_false;
+	 the optimization requires:
+	  a. value_if_false = var;
+	  b. value_if_true = var OP C (a positive integer power of 2).
+
+	 Situations similar to the following:
+	    if (condition)
+	      dest += 1 << imm;
+	 to:
+	    dest += (condition ? 1 : 0) << imm;  */
+
+      rtx_insn *insn;
+      HOST_WIDE_INT val = 0; /* The value of rtx C.  */
+      /* INSN with operands[2] as the output.  */
+      rtx_insn *value_if_true_insn = NULL;
+      /* INSN with operands[3] as the output.  */
+      rtx_insn *value_if_false_insn = NULL;
+      rtx value_if_true_insn_src = NULL_RTX;
+      /* Common operand var in value_if_true and value_if_false.  */
+      rtx comm_var = NULL_RTX;
+      bool can_be_optimized = false;
+
+      /* Search value_if_true_insn and value_if_false_insn.  */
+      struct sequence_stack *seq = get_current_sequence ()->next;
+      for (insn = seq->last; insn; insn = PREV_INSN (insn))
+	{
+	  if (single_set (insn))
+	    {
+	      rtx set_dest = SET_DEST (single_set (insn));
+	      if (rtx_equal_p (set_dest, operands[2]))
+		value_if_true_insn = insn;
+	      else if (rtx_equal_p (set_dest, operands[3]))
+		value_if_false_insn = insn;
+	      if (value_if_true_insn && value_if_false_insn)
+		break;
+	    }
+	}
+
+      /* Check if the optimization conditions are met.  */
+      if (value_if_true_insn
+	  && value_if_false_insn
+	  /* Make sure that value_if_false and var are the same.  */
+	  && BINARY_P (value_if_true_insn_src
+		       = SET_SRC (single_set (value_if_true_insn)))
+	  /* Make sure that both value_if_true and value_if_false
+	     have the same var.  */
+	  && rtx_equal_p (XEXP (value_if_true_insn_src, 0),
+			  SET_SRC (single_set (value_if_false_insn))))
+	{
+	  comm_var = SET_SRC (single_set (value_if_false_insn));
+	  rtx src = XEXP (value_if_true_insn_src, 1);
+	  rtx imm = NULL_RTX;
+	  if (CONST_INT_P (src))
+	    imm = src;
+	  else
+	    for (insn = seq->last; insn; insn = PREV_INSN (insn))
+	      {
+		rtx set = single_set (insn);
+		if (set && rtx_equal_p (SET_DEST (set), src))
+		  {
+		    imm = SET_SRC (set);
+		    break;
+		  }
+	      }
+	  if (imm && CONST_INT_P (imm))
+	    {
+	      val = INTVAL (imm);
+	      /* Make sure that imm is a positive integer power of 2.  */
+	      if (val > 0 && !(val & (val - 1)))
+		can_be_optimized = true;
+	    }
+	}
+
       if (GET_MODE_SIZE (GET_MODE (op0)) < UNITS_PER_WORD)
 	{
 	  promote_op[0] = (REG_P (op0) && REG_P (operands[2]) &&
@@ -5314,22 +5389,48 @@  loongarch_expand_conditional_move (rtx *operands)
       op0_extend = op0;
       op1_extend = force_reg (word_mode, op1);
 
+      rtx target = gen_reg_rtx (GET_MODE (op0));
+
       if (code == EQ || code == NE)
 	{
 	  op0 = loongarch_zero_if_equal (op0, op1);
 	  op1 = const0_rtx;
+	  /* For EQ, set target to 1 if op0 and op1 are the same,
+	     otherwise set to 0.
+	     For NE, set target to 0 if op0 and op1 are the same,
+	     otherwise set to 1.  */
+	  if (can_be_optimized)
+	    loongarch_emit_binary (code, target, op0, const0_rtx);
 	}
       else
 	{
 	  /* The comparison needs a separate scc instruction.  Store the
 	     result of the scc in *OP0 and compare it against zero.  */
 	  bool invert = false;
-	  rtx target = gen_reg_rtx (GET_MODE (op0));
 	  loongarch_emit_int_order_test (code, &invert, target, op0, op1);
+	  if (can_be_optimized && invert)
+	    loongarch_emit_binary (EQ, target, target, const0_rtx);
 	  code = invert ? EQ : NE;
 	  op0 = target;
 	  op1 = const0_rtx;
 	}
+
+      if (can_be_optimized)
+	{
+	  /* Perform (condition ? 1 : 0) << log2 (C).  */
+	  loongarch_emit_binary (ASHIFT, target, target,
+				 GEN_INT (exact_log2 (val)));
+	  /* Shift-related insn patterns only support SImode operands[2].  */
+	  enum rtx_code opcode = GET_CODE (value_if_true_insn_src);
+	  if (opcode == ASHIFT || opcode == ASHIFTRT || opcode == LSHIFTRT
+	      || opcode == ROTATE || opcode == ROTATERT)
+	    target = gen_lowpart (SImode, target);
+	  /* Perform target = target OP ((condition ? 1 : 0) << log2 (C)).  */
+	  loongarch_emit_binary (opcode, operands[0],
+				 force_reg (GET_MODE (operands[3]), comm_var),
+				 target);
+	  return;
+	}
     }
 
   rtx cond = gen_rtx_fmt_ee (code, GET_MODE (op0), op0, op1);
diff --git a/gcc/testsuite/gcc.target/loongarch/conditional-move-opt-1.c b/gcc/testsuite/gcc.target/loongarch/conditional-move-opt-1.c
new file mode 100644
index 00000000000..ed13471aa90
--- /dev/null
+++ b/gcc/testsuite/gcc.target/loongarch/conditional-move-opt-1.c
@@ -0,0 +1,58 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O2" } */
+/* { dg-final { scan-assembler-not "maskeqz" } } */
+/* { dg-final { scan-assembler-not "masknez" } } */
+
+extern long lm, ln, lr;
+
+void
+test_ne ()
+{
+  if (lm != ln)
+    lr += (1 << 16);
+  lr += lm;
+}
+
+void
+test_eq ()
+{
+  if (lm == ln)
+    lr = lm + (1 << 16);
+  else
+    lr = lm;
+  lr += lm;
+}
+
+void
+test_lt ()
+{
+  if (lm < ln)
+    lr *= (1 << 16);
+  lr += lm;
+}
+
+void
+test_le ()
+{
+  if (lm <= ln)
+    lr = lm * ((long)1 << 32);
+  else
+    lr = lm;
+  lr += lm;
+}
+
+void
+test_nez ()
+{
+  if (lm != 0)
+    lr <<= (1 << 4);
+  lr += lm;
+}
+
+void
+test_eqz ()
+{
+  if (lm == 0)
+    lr >>= (1 << 2);
+  lr += lm;
+}
diff --git a/gcc/testsuite/gcc.target/loongarch/conditional-move-opt-2.c b/gcc/testsuite/gcc.target/loongarch/conditional-move-opt-2.c
new file mode 100644
index 00000000000..ac72d4d933a
--- /dev/null
+++ b/gcc/testsuite/gcc.target/loongarch/conditional-move-opt-2.c
@@ -0,0 +1,42 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O2 --param max-rtl-if-conversion-insns=1" } */
+/* { dg-final { scan-assembler-not "maskeqz" } } */
+/* { dg-final { scan-assembler-not "masknez" } } */
+
+/* The relevant optimization is currently only based on noce_try_cmove_arith,
+   so it bypasses noce_convert_multiple_sets by
+   --param max-rtl-if-conversion-insns=1 to execute noce_try_cmove_arith.  */
+
+extern long lm, ln, lr;
+
+void
+test_ge ()
+{
+  if (lm >= ln)
+    lr += ((long)1 << 32);
+  lr += lm;
+}
+
+void
+test_ltz ()
+{
+  if (lm < 0)
+    lr |= (1 << 16);
+  lr += lm;
+}
+
+void
+test_lez ()
+{
+  if (lm <= 0)
+    lr &= (1 << 16);
+  lr += lm;
+}
+
+void
+test_gez ()
+{
+  if (lm >= 0)
+    lr ^= (1 << 16);
+  lr += lm;
+}