Message ID: 20210907091704.1034380-10-christophe.lyon@foss.st.com
State:      New
Series:     None
From: Christophe Lyon <christophe.lyon@linaro.org>

The problem in this PR is that we call VPSEL with a mask of vector
type instead of HImode.  This happens because operand 3 in vcond_mask
is the pre-computed vector comparison and has vector type.

This patch fixes it by implementing TARGET_VECTORIZE_GET_MASK_MODE,
returning the appropriate VxBI mode when targeting MVE.  In turn, this
implies implementing vec_cmp<mode><MVE_vpred>,
vec_cmpu<mode><MVE_vpred> and vcond_mask_<mode><MVE_vpred>, and we can
move vec_cmp<mode><v_cmp_result>, vec_cmpu<mode><mode> and
vcond_mask_<mode><v_cmp_result> back to neon.md since they are no
longer used by MVE.  The new *<MVE_vpred> patterns listed above are
implemented in mve.md since they are only valid for MVE.  However this
may make maintenance/comparison more painful than having all of them
in vec-common.md.

In the process, we can get rid of the recently added vcond_mve
parameter of arm_expand_vector_compare.

Compared to neon.md's vcond_mask_<mode><v_cmp_result> before my "arm:
Auto-vectorization for MVE: vcmp" patch (r12-834), it keeps the VDQWH
iterator added in r12-835 (to have V4HF/V8HF support), as well as the
(!<Is_float_mode> || flag_unsafe_math_optimizations) condition, which
was not present before r12-834 although SF modes were enabled by VDQW
(I think this was a bug).

Using TARGET_VECTORIZE_GET_MASK_MODE has the advantage that we no
longer need to generate vpsel with vectors of 0 and 1: the masks are
now merged via scalar 'ands' instructions operating on 16-bit masks
after converting the boolean vectors.

In addition, this patch fixes a problem in arm_expand_vcond() where
the result would be a vector of 0 or 1 instead of operand 1 or 2.

Reducing the number of iterations in pr100757-3.c from 32 to 8, we
generate the code below:

float a[32];
float fn1(int d) {
  float c = 4.0f;
  for (int b = 0; b < 8; b++)
    if (a[b] != 2.0f)
      c = 5.0f;
  return c;
}

fn1:
	ldr	r3, .L3+48
	vldr.64	d4, .L3         // q2=(2.0,2.0,2.0,2.0)
	vldr.64	d5, .L3+8
	vldrw.32	q0, [r3]        // q0=a(0..3)
	adds	r3, r3, #16
	vcmp.f32	eq, q0, q2      // cmp a(0..3) == (2.0,2.0,2.0,2.0)
	vldrw.32	q1, [r3]        // q1=a(4..7)
	vmrs	r3, P0
	vcmp.f32	eq, q1, q2      // cmp a(4..7) == (2.0,2.0,2.0,2.0)
	vmrs	r2, P0  @ movhi
	ands	r3, r3, r2      // r3=select(a(0..3)) & select(a(4..7))
	vldr.64	d4, .L3+16      // q2=(5.0,5.0,5.0,5.0)
	vldr.64	d5, .L3+24
	vmsr	P0, r3
	vldr.64	d6, .L3+32      // q3=(4.0,4.0,4.0,4.0)
	vldr.64	d7, .L3+40
	vpsel	q3, q3, q2      // q3=vcond_mask(4.0,5.0)
	vmov.32	r2, q3[1]       // keep the scalar max
	vmov.32	r0, q3[3]
	vmov.32	r3, q3[2]
	vmov.f32	s11, s12
	vmov	s15, r2
	vmov	s14, r3
	vmaxnm.f32	s15, s11, s15
	vmaxnm.f32	s15, s15, s14
	vmov	s14, r0
	vmaxnm.f32	s15, s15, s14
	vmov	r0, s15
	bx	lr
.L4:
	.align	3
.L3:
	.word	1073741824      // 2.0f
	.word	1073741824
	.word	1073741824
	.word	1073741824
	.word	1084227584      // 5.0f
	.word	1084227584
	.word	1084227584
	.word	1084227584
	.word	1082130432      // 4.0f
	.word	1082130432
	.word	1082130432
	.word	1082130432

2021-09-02  Christophe Lyon  <christophe.lyon@linaro.org>

	PR target/100757
	gcc/
	* config/arm/arm-protos.h (arm_get_mask_mode): New prototype.
	(arm_expand_vector_compare): Update prototype.
	* config/arm/arm.c (TARGET_VECTORIZE_GET_MASK_MODE): New.
	(arm_vector_mode_supported_p): Add support for VxBI modes.
	(arm_expand_vector_compare): Remove useless generation of vpsel.
	(arm_expand_vcond): Fix select operands.
	(arm_get_mask_mode): New.
	* config/arm/mve.md (vec_cmp<mode><MVE_vpred>): New.
	(vec_cmpu<mode><MVE_vpred>): New.
	(vcond_mask_<mode><MVE_vpred>): New.
	* config/arm/vec-common.md (vec_cmp<mode><v_cmp_result>)
	(vec_cmpu<mode><mode>, vcond_mask_<mode><v_cmp_result>): Move to ...
	* config/arm/neon.md (vec_cmp<mode><v_cmp_result>)
	(vec_cmpu<mode><mode>, vcond_mask_<mode><v_cmp_result>): ... here
	and disable for MVE.
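For reference, the predicate-mode mapping the new hook provides can be
sketched as follows.  This is an illustration only: the helper name
mve_mask_mode_for is hypothetical, and the patch itself routes the
mapping through the existing arm_mode_to_pred_mode (see the diff
below).

/* Illustrative sketch: the VxBI predicate mode the vectorizer now
   receives for each MVE vector mode.  The switch mirrors
   arm_mode_to_pred_mode.  */
static machine_mode
mve_mask_mode_for (machine_mode mode)
{
  switch (GET_MODE_NUNITS (mode))
    {
    case 16:
      return V16BImode;		/* e.g. V16QI.  */
    case 8:
      return V8BImode;		/* e.g. V8HI, V8HF.  */
    case 4:
      return V4BImode;		/* e.g. V4SI, V4SF.  */
    }
  gcc_unreachable ();
}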
diff --git a/gcc/config/arm/arm-protos.h b/gcc/config/arm/arm-protos.h
index 9b1f61394ad..9e3d71e0c29 100644
--- a/gcc/config/arm/arm-protos.h
+++ b/gcc/config/arm/arm-protos.h
@@ -201,6 +201,7 @@ extern void arm_init_cumulative_args (CUMULATIVE_ARGS *, tree, rtx, tree);
 extern bool arm_pad_reg_upward (machine_mode, tree, int);
 #endif
 extern int arm_apply_result_size (void);
+extern opt_machine_mode arm_get_mask_mode (machine_mode mode);
 
 #endif /* RTX_CODE */
 
@@ -372,7 +373,7 @@ extern void arm_emit_coreregs_64bit_shift (enum rtx_code, rtx, rtx, rtx, rtx,
 extern bool arm_fusion_enabled_p (tune_params::fuse_ops);
 extern bool arm_valid_symbolic_address_p (rtx);
 extern bool arm_validize_comparison (rtx *, rtx *, rtx *);
-extern bool arm_expand_vector_compare (rtx, rtx_code, rtx, rtx, bool, bool);
+extern bool arm_expand_vector_compare (rtx, rtx_code, rtx, rtx, bool);
 #endif /* RTX_CODE */
 
 extern bool arm_gen_setmem (rtx *);
diff --git a/gcc/config/arm/arm.c b/gcc/config/arm/arm.c
index 5f6637d9a5f..3326cd163a2 100644
--- a/gcc/config/arm/arm.c
+++ b/gcc/config/arm/arm.c
@@ -835,6 +835,10 @@ static const struct attribute_spec arm_attribute_table[] =
 
 #undef TARGET_MD_ASM_ADJUST
 #define TARGET_MD_ASM_ADJUST arm_md_asm_adjust
+
+#undef TARGET_VECTORIZE_GET_MASK_MODE
+#define TARGET_VECTORIZE_GET_MASK_MODE arm_get_mask_mode
+
 
 /* Obstack for minipool constant handling.  */
 static struct obstack minipool_obstack;
@@ -29193,7 +29197,8 @@ arm_vector_mode_supported_p (machine_mode mode)
 
   if (TARGET_HAVE_MVE
       && (mode == V2DImode || mode == V4SImode || mode == V8HImode
	  || mode == V16QImode))
+	  || mode == V16QImode
+	  || mode == V16BImode || mode == V8BImode || mode == V4BImode))
     return true;
 
   if (TARGET_HAVE_MVE_FLOAT
@@ -31012,16 +31017,12 @@ arm_mode_to_pred_mode (machine_mode mode)
    and return true if TARGET contains the inverse.  If !CAN_INVERT,
    always store the result in TARGET, never its inverse.
 
-   If VCOND_MVE, do not emit the vpsel instruction here, let arm_expand_vcond do
-   it with the right destination type to avoid emiting two vpsel, one here and
-   one in arm_expand_vcond.
-
    Note that the handling of floating-point comparisons is not
    IEEE compliant.  */
 
 bool
 arm_expand_vector_compare (rtx target, rtx_code code, rtx op0, rtx op1,
-			   bool can_invert, bool vcond_mve)
+			   bool can_invert)
 {
   machine_mode cmp_result_mode = GET_MODE (target);
   machine_mode cmp_mode = GET_MODE (op0);
@@ -31050,7 +31051,7 @@ arm_expand_vector_compare (rtx target, rtx_code code, rtx op0, rtx op1,
	 and then store its inverse in TARGET.  This avoids reusing
	 TARGET (which for integer NE could be one of the inputs).  */
       rtx tmp = gen_reg_rtx (cmp_result_mode);
-      if (arm_expand_vector_compare (tmp, code, op0, op1, true, vcond_mve))
+      if (arm_expand_vector_compare (tmp, code, op0, op1, true))
	gcc_unreachable ();
       emit_insn (gen_rtx_SET (target, gen_rtx_NOT (cmp_result_mode, tmp)));
       return false;
@@ -31086,36 +31087,20 @@ arm_expand_vector_compare (rtx target, rtx_code code, rtx op0, rtx op1,
     case NE:
       if (TARGET_HAVE_MVE)
	{
-	  rtx vpr_p0;
-	  if (vcond_mve)
-	    vpr_p0 = target;
-	  else
-	    vpr_p0 = gen_reg_rtx (arm_mode_to_pred_mode (cmp_mode));
-
	  switch (GET_MODE_CLASS (cmp_mode))
	    {
	    case MODE_VECTOR_INT:
-	      emit_insn (gen_mve_vcmpq (code, cmp_mode, vpr_p0, op0, force_reg (cmp_mode, op1)));
+	      emit_insn (gen_mve_vcmpq (code, cmp_mode, target, op0, force_reg (cmp_mode, op1)));
	      break;
	    case MODE_VECTOR_FLOAT:
	      if (TARGET_HAVE_MVE_FLOAT)
-		emit_insn (gen_mve_vcmpq_f (code, cmp_mode, vpr_p0, op0, force_reg (cmp_mode, op1)));
+		emit_insn (gen_mve_vcmpq_f (code, cmp_mode, target, op0, force_reg (cmp_mode, op1)));
	      else
		gcc_unreachable ();
	      break;
	    default:
	      gcc_unreachable ();
	    }
-
-	  /* If we are not expanding a vcond, build the result here.  */
-	  if (!vcond_mve)
-	    {
-	      rtx zero = gen_reg_rtx (cmp_result_mode);
-	      rtx one = gen_reg_rtx (cmp_result_mode);
-	      emit_move_insn (zero, CONST0_RTX (cmp_result_mode));
-	      emit_move_insn (one, CONST1_RTX (cmp_result_mode));
-	      emit_insn (gen_mve_vpselq (VPSELQ_S, cmp_result_mode, target, one, zero, vpr_p0));
-	    }
	}
       else
	emit_insn (gen_neon_vc (code, cmp_mode, target, op0, op1));
@@ -31127,23 +31112,7 @@ arm_expand_vector_compare (rtx target, rtx_code code, rtx op0, rtx op1,
     case GEU:
     case GTU:
       if (TARGET_HAVE_MVE)
-	{
-	  rtx vpr_p0;
-	  if (vcond_mve)
-	    vpr_p0 = target;
-	  else
-	    vpr_p0 = gen_reg_rtx (arm_mode_to_pred_mode (cmp_mode));
-
-	  emit_insn (gen_mve_vcmpq (code, cmp_mode, vpr_p0, op0, force_reg (cmp_mode, op1)));
-	  if (!vcond_mve)
-	    {
-	      rtx zero = gen_reg_rtx (cmp_result_mode);
-	      rtx one = gen_reg_rtx (cmp_result_mode);
-	      emit_move_insn (zero, CONST0_RTX (cmp_result_mode));
-	      emit_move_insn (one, CONST1_RTX (cmp_result_mode));
-	      emit_insn (gen_mve_vpselq (VPSELQ_S, cmp_result_mode, target, one, zero, vpr_p0));
-	    }
-	}
+	emit_insn (gen_mve_vcmpq (code, cmp_mode, target, op0, force_reg (cmp_mode, op1)));
       else
	emit_insn (gen_neon_vc (code, cmp_mode, target,
				op0, force_reg (cmp_mode, op1)));
@@ -31154,23 +31123,7 @@ arm_expand_vector_compare (rtx target, rtx_code code, rtx op0, rtx op1,
     case LEU:
     case LTU:
       if (TARGET_HAVE_MVE)
-	{
-	  rtx vpr_p0;
-	  if (vcond_mve)
-	    vpr_p0 = target;
-	  else
-	    vpr_p0 = gen_reg_rtx (arm_mode_to_pred_mode (cmp_mode));
-
-	  emit_insn (gen_mve_vcmpq (swap_condition (code), cmp_mode, vpr_p0, force_reg (cmp_mode, op1), op0));
-	  if (!vcond_mve)
-	    {
-	      rtx zero = gen_reg_rtx (cmp_result_mode);
-	      rtx one = gen_reg_rtx (cmp_result_mode);
-	      emit_move_insn (zero, CONST0_RTX (cmp_result_mode));
-	      emit_move_insn (one, CONST1_RTX (cmp_result_mode));
-	      emit_insn (gen_mve_vpselq (VPSELQ_S, cmp_result_mode, target, one, zero, vpr_p0));
-	    }
-	}
+	emit_insn (gen_mve_vcmpq (swap_condition (code), cmp_mode, target, force_reg (cmp_mode, op1), op0));
       else
	emit_insn (gen_neon_vc (swap_condition (code), cmp_mode,
				target, force_reg (cmp_mode, op1), op0));
@@ -31185,8 +31138,8 @@ arm_expand_vector_compare (rtx target, rtx_code code, rtx op0, rtx op1,
	rtx gt_res = gen_reg_rtx (cmp_result_mode);
	rtx alt_res = gen_reg_rtx (cmp_result_mode);
	rtx_code alt_code = (code == LTGT ? LT : LE);
-	if (arm_expand_vector_compare (gt_res, GT, op0, op1, true, vcond_mve)
-	    || arm_expand_vector_compare (alt_res, alt_code, op0, op1, true, vcond_mve))
+	if (arm_expand_vector_compare (gt_res, GT, op0, op1, true)
+	    || arm_expand_vector_compare (alt_res, alt_code, op0, op1, true))
	  gcc_unreachable ();
	emit_insn (gen_rtx_SET (target, gen_rtx_IOR (cmp_result_mode,
						     gt_res, alt_res)));
@@ -31206,19 +31159,15 @@ arm_expand_vcond (rtx *operands, machine_mode cmp_result_mode)
 {
   /* When expanding for MVE, we do not want to emit a (useless) vpsel in
      arm_expand_vector_compare, and another one here.  */
-  bool vcond_mve=false;
   rtx mask;
 
   if (TARGET_HAVE_MVE)
-    {
-      vcond_mve=true;
-      mask = gen_reg_rtx (arm_mode_to_pred_mode (cmp_result_mode));
-    }
+    mask = gen_reg_rtx (arm_mode_to_pred_mode (cmp_result_mode));
   else
     mask = gen_reg_rtx (cmp_result_mode);
 
   bool inverted = arm_expand_vector_compare (mask, GET_CODE (operands[3]),
-					     operands[4], operands[5], true, vcond_mve);
+					     operands[4], operands[5], true);
   if (inverted)
     std::swap (operands[1], operands[2]);
   if (TARGET_NEON)
@@ -31226,20 +31175,20 @@ arm_expand_vcond (rtx *operands, machine_mode cmp_result_mode)
			       mask, operands[1], operands[2]));
   else
     {
-      machine_mode cmp_mode = GET_MODE (operands[4]);
-      rtx vpr_p0 = mask;
-      rtx zero = gen_reg_rtx (cmp_mode);
-      rtx one = gen_reg_rtx (cmp_mode);
-      emit_move_insn (zero, CONST0_RTX (cmp_mode));
-      emit_move_insn (one, CONST1_RTX (cmp_mode));
+      machine_mode cmp_mode = GET_MODE (operands[0]);
+
       switch (GET_MODE_CLASS (cmp_mode))
	{
	case MODE_VECTOR_INT:
-	  emit_insn (gen_mve_vpselq (VPSELQ_S, cmp_result_mode, operands[0], one, zero, vpr_p0));
+	  emit_insn (gen_mve_vpselq (VPSELQ_S, cmp_mode, operands[0],
+				     operands[1], operands[2], mask));
	  break;
	case MODE_VECTOR_FLOAT:
	  if (TARGET_HAVE_MVE_FLOAT)
-	    emit_insn (gen_mve_vpselq_f (cmp_mode, operands[0], one, zero, vpr_p0));
+	    emit_insn (gen_mve_vpselq_f (cmp_mode, operands[0],
+					 operands[1], operands[2], mask));
+	  else
+	    gcc_unreachable ();
	  break;
	default:
	  gcc_unreachable ();
@@ -34149,4 +34098,15 @@ arm_mode_base_reg_class (machine_mode mode)
 
 struct gcc_target targetm = TARGET_INITIALIZER;
 
+/* Implement TARGET_VECTORIZE_GET_MASK_MODE.  */
+
+opt_machine_mode
+arm_get_mask_mode (machine_mode mode)
+{
+  if (TARGET_HAVE_MVE)
+    return arm_mode_to_pred_mode (mode);
+
+  return default_get_mask_mode (mode);
+}
+
 #include "gt-arm.h"
diff --git a/gcc/config/arm/mve.md b/gcc/config/arm/mve.md
index c9c8e2c13fe..d663c698cfb 100644
--- a/gcc/config/arm/mve.md
+++ b/gcc/config/arm/mve.md
@@ -10550,3 +10550,58 @@ (define_insn "*mve_mov<mode>"
    vmsr%?\t P0, %1
    vmrs%?\t %0, P0"
 )
+
+;; Expanders for vec_cmp and vcond
+
+(define_expand "vec_cmp<mode><MVE_vpred>"
+  [(set (match_operand:<MVE_VPRED> 0 "s_register_operand")
+	(match_operator:<MVE_VPRED> 1 "comparison_operator"
+	  [(match_operand:MVE_VLD_ST 2 "s_register_operand")
+	   (match_operand:MVE_VLD_ST 3 "reg_or_zero_operand")]))]
+  "TARGET_HAVE_MVE
+   && (!<Is_float_mode> || flag_unsafe_math_optimizations)"
+{
+  arm_expand_vector_compare (operands[0], GET_CODE (operands[1]),
+			     operands[2], operands[3], false);
+  DONE;
+})
+
+(define_expand "vec_cmpu<mode><MVE_vpred>"
+  [(set (match_operand:<MVE_VPRED> 0 "s_register_operand")
+	(match_operator:<MVE_VPRED> 1 "comparison_operator"
+	  [(match_operand:MVE_2 2 "s_register_operand")
+	   (match_operand:MVE_2 3 "reg_or_zero_operand")]))]
+  "TARGET_HAVE_MVE
+   && (!<Is_float_mode> || flag_unsafe_math_optimizations)"
+{
+  arm_expand_vector_compare (operands[0], GET_CODE (operands[1]),
+			     operands[2], operands[3], false);
+  DONE;
+})
+
+(define_expand "vcond_mask_<mode><MVE_vpred>"
+  [(set (match_operand:MVE_VLD_ST 0 "s_register_operand")
+	(if_then_else:MVE_VLD_ST
+	  (match_operand:<MVE_VPRED> 3 "s_register_operand")
+	  (match_operand:MVE_VLD_ST 1 "s_register_operand")
+	  (match_operand:MVE_VLD_ST 2 "s_register_operand")))]
+  "TARGET_HAVE_MVE"
+{
+  switch (GET_MODE_CLASS (<MODE>mode))
+    {
+    case MODE_VECTOR_INT:
+      emit_insn (gen_mve_vpselq (VPSELQ_S, <MODE>mode, operands[0],
+				 operands[1], operands[2], operands[3]));
+      break;
+    case MODE_VECTOR_FLOAT:
+      if (TARGET_HAVE_MVE_FLOAT)
+	emit_insn (gen_mve_vpselq_f (<MODE>mode, operands[0],
+				     operands[1], operands[2], operands[3]));
+      else
+	gcc_unreachable ();
+      break;
+    default:
+      gcc_unreachable ();
+    }
+  DONE;
+})
diff --git a/gcc/config/arm/neon.md b/gcc/config/arm/neon.md
index 8b0a396947c..28310d93a4e 100644
--- a/gcc/config/arm/neon.md
+++ b/gcc/config/arm/neon.md
@@ -1394,6 +1394,45 @@ (define_insn "*us_sub<mode>_neon"
   [(set_attr "type" "neon_qsub<q>")]
 )
 
+(define_expand "vec_cmp<mode><v_cmp_result>"
+  [(set (match_operand:<V_cmp_result> 0 "s_register_operand")
+	(match_operator:<V_cmp_result> 1 "comparison_operator"
+	  [(match_operand:VDQWH 2 "s_register_operand")
+	   (match_operand:VDQWH 3 "reg_or_zero_operand")]))]
+  "TARGET_NEON
+   && (!<Is_float_mode> || flag_unsafe_math_optimizations)"
+{
+  arm_expand_vector_compare (operands[0], GET_CODE (operands[1]),
+			     operands[2], operands[3], false);
+  DONE;
+})
+
+(define_expand "vec_cmpu<mode><mode>"
+  [(set (match_operand:VDQIW 0 "s_register_operand")
+	(match_operator:VDQIW 1 "comparison_operator"
+	  [(match_operand:VDQIW 2 "s_register_operand")
+	   (match_operand:VDQIW 3 "reg_or_zero_operand")]))]
+  "TARGET_NEON"
+{
+  arm_expand_vector_compare (operands[0], GET_CODE (operands[1]),
+			     operands[2], operands[3], false);
+  DONE;
+})
+
+(define_expand "vcond_mask_<mode><v_cmp_result>"
+  [(set (match_operand:VDQWH 0 "s_register_operand")
+	(if_then_else:VDQWH
+	  (match_operand:<V_cmp_result> 3 "s_register_operand")
+	  (match_operand:VDQWH 1 "s_register_operand")
+	  (match_operand:VDQWH 2 "s_register_operand")))]
+  "TARGET_NEON
+   && (!<Is_float_mode> || flag_unsafe_math_optimizations)"
+{
+  emit_insn (gen_neon_vbsl<mode> (operands[0], operands[3], operands[1],
+				  operands[2]));
+  DONE;
+})
+
 ;; Patterns for builtins.
 
 ; good for plain vadd, vaddq.
diff --git a/gcc/config/arm/vec-common.md b/gcc/config/arm/vec-common.md
index 68de4f0f943..9b461a76155 100644
--- a/gcc/config/arm/vec-common.md
+++ b/gcc/config/arm/vec-common.md
@@ -363,33 +363,6 @@ (define_expand "vlshr<mode>3"
     }
 })
 
-(define_expand "vec_cmp<mode><v_cmp_result>"
-  [(set (match_operand:<V_cmp_result> 0 "s_register_operand")
-	(match_operator:<V_cmp_result> 1 "comparison_operator"
-	  [(match_operand:VDQWH 2 "s_register_operand")
-	   (match_operand:VDQWH 3 "reg_or_zero_operand")]))]
-  "ARM_HAVE_<MODE>_ARITH
-   && !TARGET_REALLY_IWMMXT
-   && (!<Is_float_mode> || flag_unsafe_math_optimizations)"
-{
-  arm_expand_vector_compare (operands[0], GET_CODE (operands[1]),
-			     operands[2], operands[3], false, false);
-  DONE;
-})
-
-(define_expand "vec_cmpu<mode><mode>"
-  [(set (match_operand:VDQIW 0 "s_register_operand")
-	(match_operator:VDQIW 1 "comparison_operator"
-	  [(match_operand:VDQIW 2 "s_register_operand")
-	   (match_operand:VDQIW 3 "reg_or_zero_operand")]))]
-  "ARM_HAVE_<MODE>_ARITH
-   && !TARGET_REALLY_IWMMXT"
-{
-  arm_expand_vector_compare (operands[0], GET_CODE (operands[1]),
-			     operands[2], operands[3], false, false);
-  DONE;
-})
-
 ;; Conditional instructions.  These are comparisons with conditional moves for
 ;; vectors.  They perform the assignment:
 ;;
@@ -461,31 +434,6 @@ (define_expand "vcondu<mode><v_cmp_result>"
   DONE;
 })
 
-(define_expand "vcond_mask_<mode><v_cmp_result>"
-  [(set (match_operand:VDQWH 0 "s_register_operand")
-	(if_then_else:VDQWH
-	  (match_operand:<V_cmp_result> 3 "s_register_operand")
-	  (match_operand:VDQWH 1 "s_register_operand")
-	  (match_operand:VDQWH 2 "s_register_operand")))]
-  "ARM_HAVE_<MODE>_ARITH
-   && !TARGET_REALLY_IWMMXT
-   && (!<Is_float_mode> || flag_unsafe_math_optimizations)"
-{
-  if (TARGET_NEON)
-    {
-      emit_insn (gen_neon_vbsl (<MODE>mode, operands[0], operands[3],
-				operands[1], operands[2]));
-    }
-  else if (TARGET_HAVE_MVE)
-    {
-      emit_insn (gen_mve_vpselq (VPSELQ_S, <MODE>mode, operands[0],
-				 operands[1], operands[2], operands[3]));
-    }
-  else
-    gcc_unreachable ();
-  DONE;
-})
-
 (define_expand "vec_load_lanesoi<mode>"
   [(set (match_operand:OI 0 "s_register_operand")
	(unspec:OI [(match_operand:OI 1 "neon_struct_operand")

-- 
2.25.1
Christophe Lyon via Gcc-patches <gcc-patches@gcc.gnu.org> writes:
> [...]
> +/* Implement TARGET_VECTORIZE_GET_MASK_MODE.  */
> +
> +opt_machine_mode
> +arm_get_mask_mode (machine_mode mode)
> +{
> +  if (TARGET_HAVE_MVE)
> +    return arm_mode_to_pred_mode (mode);

I think this needs to check whether arm_mode_to_pred_mode accepts the
mode first.  (Alternatively, arm_mode_to_pred_mode could return an
opt_machine_mode and punt for modes that it doesn't understand.)

> [...]
> +(define_expand "vec_cmpu<mode><MVE_vpred>"
> +  [(set (match_operand:<MVE_VPRED> 0 "s_register_operand")
> +	(match_operator:<MVE_VPRED> 1 "comparison_operator"
> +	  [(match_operand:MVE_2 2 "s_register_operand")
> +	   (match_operand:MVE_2 3 "reg_or_zero_operand")]))]
> +  "TARGET_HAVE_MVE
> +   && (!<Is_float_mode> || flag_unsafe_math_optimizations)"

The float check should be redundant here, since MVE_2 only includes
integer modes.

Looks good otherwise, thanks.

Richard
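A minimal sketch of the alternative suggested above, in which
arm_mode_to_pred_mode returns an opt_machine_mode and punts on modes
it does not handle, might look like this (an illustration of the
suggestion, not necessarily the committed fix):

/* Sketch: return the MVE predicate mode for MODE, or nothing if MODE
   has no corresponding predicate mode.  */
static opt_machine_mode
arm_mode_to_pred_mode (machine_mode mode)
{
  switch (GET_MODE_NUNITS (mode))
    {
    case 16: return V16BImode;
    case 8: return V8BImode;
    case 4: return V4BImode;
    }
  return opt_machine_mode ();
}

/* Implement TARGET_VECTORIZE_GET_MASK_MODE.  The MVE case now punts
   automatically for modes without a predicate equivalent.  */
opt_machine_mode
arm_get_mask_mode (machine_mode mode)
{
  if (TARGET_HAVE_MVE)
    return arm_mode_to_pred_mode (mode);

  return default_get_mask_mode (mode);
}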