Message ID | CAAgBjMndm8upw++qigGPQNTA4nqm+Uck=CcVBpt3cZ74fEGEwA@mail.gmail.com |
---|---|
State | New |
Headers | show |
Series | [SVE,fwprop] PR88833 - Redundant moves for WHILELO-based loops | expand |
Prathamesh Kulkarni <prathamesh.kulkarni@linaro.org> writes: > diff --git a/gcc/fwprop.c b/gcc/fwprop.c > index 45703fe5f01..93a1a10c9a6 100644 > --- a/gcc/fwprop.c > +++ b/gcc/fwprop.c > @@ -547,6 +547,54 @@ propagate_rtx_1 (rtx *px, rtx old_rtx, rtx new_rtx, int flags) > tem = simplify_gen_subreg (mode, op0, GET_MODE (SUBREG_REG (x)), > SUBREG_BYTE (x)); > } > + > + else > + { > + rtvec vec; > + rtvec newvec; > + const char *fmt = GET_RTX_FORMAT (code); > + rtx op; > + > + for (int i = 0; fmt[i]; i++) > + switch (fmt[i]) > + { > + case 'E': > + vec = XVEC (x, i); > + newvec = vec; > + for (int j = 0; j < GET_NUM_ELEM (vec); j++) > + { > + op = RTVEC_ELT (vec, j); > + valid_ops &= propagate_rtx_1 (&op, old_rtx, new_rtx, flags); > + if (op != RTVEC_ELT (vec, j)) > + { > + if (newvec == vec) > + { > + newvec = shallow_copy_rtvec (vec); > + if (!tem) > + tem = shallow_copy_rtx (x); > + XVEC (tem, i) = newvec; > + } > + RTVEC_ELT (newvec, j) = op; > + } > + } > + break; Misindented break: should be indented two columns further. > + > + case 'e': > + if (XEXP (x, i)) > + { > + op = XEXP (x, i); > + valid_ops &= propagate_rtx_1 (&op, old_rtx, new_rtx, flags); > + if (op != XEXP (x, i)) > + { > + if (!tem) > + tem = shallow_copy_rtx (x); > + XEXP (tem, i) = op; > + } > + } > + break; Same here. > + } > + } > + > break; > > case RTX_OBJ: > @@ -1370,10 +1418,11 @@ forward_propagate_and_simplify (df_ref use, rtx_insn *def_insn, rtx def_set) > > /* Given a use USE of an insn, if it has a single reaching > definition, try to forward propagate it into that insn. > - Return true if cfg cleanup will be needed. */ > + Return true if cfg cleanup will be needed. > + REG_PROP_ONLY is true if we should only propagate register copies. */ > > static bool > -forward_propagate_into (df_ref use) > +forward_propagate_into (df_ref use, bool reg_prop_only = false) > { > df_ref def; > rtx_insn *def_insn, *use_insn; > @@ -1394,10 +1443,6 @@ forward_propagate_into (df_ref use) > if (DF_REF_IS_ARTIFICIAL (def)) > return false; > > - /* Do not propagate loop invariant definitions inside the loop. */ > - if (DF_REF_BB (def)->loop_father != DF_REF_BB (use)->loop_father) > - return false; > - > /* Check if the use is still present in the insn! */ > use_insn = DF_REF_INSN (use); > if (DF_REF_FLAGS (use) & DF_REF_IN_NOTE) > @@ -1415,6 +1460,16 @@ forward_propagate_into (df_ref use) > if (!def_set) > return false; > > + if (reg_prop_only && !REG_P (SET_SRC (def_set))) > + return false; > + > + /* Allow propagating def inside loop only if source of def_set is > + reg, since replacing reg by source reg shouldn't increase cost. */ Maybe: /* Allow propagations into a loop only for reg-to-reg copies, since replacing one register by another shouldn't increase the cost. */ > + > + if (DF_REF_BB (def)->loop_father != DF_REF_BB (use)->loop_father > + && !REG_P (SET_SRC (def_set))) > + return false; To be extra safe, I think we should check that the SET_DEST is a REG_P in both cases, to exclude REG-to-SUBREG copies. Thanks, Richard
On Mon, 24 Jun 2019 at 14:59, Richard Sandiford <richard.sandiford@arm.com> wrote: > > Prathamesh Kulkarni <prathamesh.kulkarni@linaro.org> writes: > > diff --git a/gcc/fwprop.c b/gcc/fwprop.c > > index 45703fe5f01..93a1a10c9a6 100644 > > --- a/gcc/fwprop.c > > +++ b/gcc/fwprop.c > > @@ -547,6 +547,54 @@ propagate_rtx_1 (rtx *px, rtx old_rtx, rtx new_rtx, int flags) > > tem = simplify_gen_subreg (mode, op0, GET_MODE (SUBREG_REG (x)), > > SUBREG_BYTE (x)); > > } > > + > > + else > > + { > > + rtvec vec; > > + rtvec newvec; > > + const char *fmt = GET_RTX_FORMAT (code); > > + rtx op; > > + > > + for (int i = 0; fmt[i]; i++) > > + switch (fmt[i]) > > + { > > + case 'E': > > + vec = XVEC (x, i); > > + newvec = vec; > > + for (int j = 0; j < GET_NUM_ELEM (vec); j++) > > + { > > + op = RTVEC_ELT (vec, j); > > + valid_ops &= propagate_rtx_1 (&op, old_rtx, new_rtx, flags); > > + if (op != RTVEC_ELT (vec, j)) > > + { > > + if (newvec == vec) > > + { > > + newvec = shallow_copy_rtvec (vec); > > + if (!tem) > > + tem = shallow_copy_rtx (x); > > + XVEC (tem, i) = newvec; > > + } > > + RTVEC_ELT (newvec, j) = op; > > + } > > + } > > + break; > > Misindented break: should be indented two columns further. > > > + > > + case 'e': > > + if (XEXP (x, i)) > > + { > > + op = XEXP (x, i); > > + valid_ops &= propagate_rtx_1 (&op, old_rtx, new_rtx, flags); > > + if (op != XEXP (x, i)) > > + { > > + if (!tem) > > + tem = shallow_copy_rtx (x); > > + XEXP (tem, i) = op; > > + } > > + } > > + break; > > Same here. > > > + } > > + } > > + > > break; > > > > case RTX_OBJ: > > @@ -1370,10 +1418,11 @@ forward_propagate_and_simplify (df_ref use, rtx_insn *def_insn, rtx def_set) > > > > /* Given a use USE of an insn, if it has a single reaching > > definition, try to forward propagate it into that insn. > > - Return true if cfg cleanup will be needed. */ > > + Return true if cfg cleanup will be needed. > > + REG_PROP_ONLY is true if we should only propagate register copies. */ > > > > static bool > > -forward_propagate_into (df_ref use) > > +forward_propagate_into (df_ref use, bool reg_prop_only = false) > > { > > df_ref def; > > rtx_insn *def_insn, *use_insn; > > @@ -1394,10 +1443,6 @@ forward_propagate_into (df_ref use) > > if (DF_REF_IS_ARTIFICIAL (def)) > > return false; > > > > - /* Do not propagate loop invariant definitions inside the loop. */ > > - if (DF_REF_BB (def)->loop_father != DF_REF_BB (use)->loop_father) > > - return false; > > - > > /* Check if the use is still present in the insn! */ > > use_insn = DF_REF_INSN (use); > > if (DF_REF_FLAGS (use) & DF_REF_IN_NOTE) > > @@ -1415,6 +1460,16 @@ forward_propagate_into (df_ref use) > > if (!def_set) > > return false; > > > > + if (reg_prop_only && !REG_P (SET_SRC (def_set))) > > + return false; > > + > > + /* Allow propagating def inside loop only if source of def_set is > > + reg, since replacing reg by source reg shouldn't increase cost. */ > > Maybe: > > /* Allow propagations into a loop only for reg-to-reg copies, since > replacing one register by another shouldn't increase the cost. */ > > > + > > + if (DF_REF_BB (def)->loop_father != DF_REF_BB (use)->loop_father > > + && !REG_P (SET_SRC (def_set))) > > + return false; > > To be extra safe, I think we should check that the SET_DEST is a REG_P > in both cases, to exclude REG-to-SUBREG copies. Thanks for the suggestions. Does the attached version look OK ? Thanks, Prathamesh > > Thanks, > Richard diff --git a/gcc/fwprop.c b/gcc/fwprop.c index 45703fe5f01..c5abebb7832 100644 --- a/gcc/fwprop.c +++ b/gcc/fwprop.c @@ -547,6 +547,54 @@ propagate_rtx_1 (rtx *px, rtx old_rtx, rtx new_rtx, int flags) tem = simplify_gen_subreg (mode, op0, GET_MODE (SUBREG_REG (x)), SUBREG_BYTE (x)); } + + else + { + rtvec vec; + rtvec newvec; + const char *fmt = GET_RTX_FORMAT (code); + rtx op; + + for (int i = 0; fmt[i]; i++) + switch (fmt[i]) + { + case 'E': + vec = XVEC (x, i); + newvec = vec; + for (int j = 0; j < GET_NUM_ELEM (vec); j++) + { + op = RTVEC_ELT (vec, j); + valid_ops &= propagate_rtx_1 (&op, old_rtx, new_rtx, flags); + if (op != RTVEC_ELT (vec, j)) + { + if (newvec == vec) + { + newvec = shallow_copy_rtvec (vec); + if (!tem) + tem = shallow_copy_rtx (x); + XVEC (tem, i) = newvec; + } + RTVEC_ELT (newvec, j) = op; + } + } + break; + + case 'e': + if (XEXP (x, i)) + { + op = XEXP (x, i); + valid_ops &= propagate_rtx_1 (&op, old_rtx, new_rtx, flags); + if (op != XEXP (x, i)) + { + if (!tem) + tem = shallow_copy_rtx (x); + XEXP (tem, i) = op; + } + } + break; + } + } + break; case RTX_OBJ: @@ -1370,10 +1418,11 @@ forward_propagate_and_simplify (df_ref use, rtx_insn *def_insn, rtx def_set) /* Given a use USE of an insn, if it has a single reaching definition, try to forward propagate it into that insn. - Return true if cfg cleanup will be needed. */ + Return true if cfg cleanup will be needed. + REG_PROP_ONLY is true if we should only propagate register copies. */ static bool -forward_propagate_into (df_ref use) +forward_propagate_into (df_ref use, bool reg_prop_only = false) { df_ref def; rtx_insn *def_insn, *use_insn; @@ -1394,10 +1443,6 @@ forward_propagate_into (df_ref use) if (DF_REF_IS_ARTIFICIAL (def)) return false; - /* Do not propagate loop invariant definitions inside the loop. */ - if (DF_REF_BB (def)->loop_father != DF_REF_BB (use)->loop_father) - return false; - /* Check if the use is still present in the insn! */ use_insn = DF_REF_INSN (use); if (DF_REF_FLAGS (use) & DF_REF_IN_NOTE) @@ -1415,6 +1460,19 @@ forward_propagate_into (df_ref use) if (!def_set) return false; + if (reg_prop_only + && !REG_P (SET_SRC (def_set)) + && !REG_P (SET_DEST (def_set))) + return false; + + /* Allow propagations into a loop only for reg-to-reg copies, since + replacing one register by another shouldn't increase the cost. */ + + if (DF_REF_BB (def)->loop_father != DF_REF_BB (use)->loop_father + && !REG_P (SET_SRC (def_set)) + && !REG_P (SET_DEST (def_set))) + return false; + /* Only try one kind of propagation. If two are possible, we'll do it on the following iterations. */ if (forward_propagate_and_simplify (use, def_insn, def_set) @@ -1483,7 +1541,7 @@ gate_fwprop (void) } static unsigned int -fwprop (void) +fwprop (bool fwprop_addr_p) { unsigned i; @@ -1502,11 +1560,16 @@ fwprop (void) df_ref use = DF_USES_GET (i); if (use) - if (DF_REF_TYPE (use) == DF_REF_REG_USE - || DF_REF_BB (use)->loop_father == NULL - /* The outer most loop is not really a loop. */ - || loop_outer (DF_REF_BB (use)->loop_father) == NULL) - forward_propagate_into (use); + { + if (DF_REF_TYPE (use) == DF_REF_REG_USE + || DF_REF_BB (use)->loop_father == NULL + /* The outer most loop is not really a loop. */ + || loop_outer (DF_REF_BB (use)->loop_father) == NULL) + forward_propagate_into (use, fwprop_addr_p); + + else if (fwprop_addr_p) + forward_propagate_into (use, false); + } } fwprop_done (); @@ -1537,7 +1600,7 @@ public: /* opt_pass methods: */ virtual bool gate (function *) { return gate_fwprop (); } - virtual unsigned int execute (function *) { return fwprop (); } + virtual unsigned int execute (function *) { return fwprop (false); } }; // class pass_rtl_fwprop @@ -1549,33 +1612,6 @@ make_pass_rtl_fwprop (gcc::context *ctxt) return new pass_rtl_fwprop (ctxt); } -static unsigned int -fwprop_addr (void) -{ - unsigned i; - - fwprop_init (); - - /* Go through all the uses. df_uses_create will create new ones at the - end, and we'll go through them as well. */ - for (i = 0; i < DF_USES_TABLE_SIZE (); i++) - { - if (!propagations_left) - break; - - df_ref use = DF_USES_GET (i); - if (use) - if (DF_REF_TYPE (use) != DF_REF_REG_USE - && DF_REF_BB (use)->loop_father != NULL - /* The outer most loop is not really a loop. */ - && loop_outer (DF_REF_BB (use)->loop_father) != NULL) - forward_propagate_into (use); - } - - fwprop_done (); - return 0; -} - namespace { const pass_data pass_data_rtl_fwprop_addr = @@ -1600,7 +1636,7 @@ public: /* opt_pass methods: */ virtual bool gate (function *) { return gate_fwprop (); } - virtual unsigned int execute (function *) { return fwprop_addr (); } + virtual unsigned int execute (function *) { return fwprop (true); } }; // class pass_rtl_fwprop_addr diff --git a/gcc/testsuite/gfortran.dg/pr88833.f90 b/gcc/testsuite/gfortran.dg/pr88833.f90 new file mode 100644 index 00000000000..224e6ce5f3d --- /dev/null +++ b/gcc/testsuite/gfortran.dg/pr88833.f90 @@ -0,0 +1,9 @@ +! { dg-do assemble { target aarch64_asm_sve_ok } } +! { dg-options "-O3 -march=armv8.2-a+sve --save-temps" } + +subroutine foo(x) + real :: x(100) + x = x + 10 +end subroutine foo + +! { dg-final { scan-assembler {\twhilelo\tp[0-9]+\.s, wzr, (w[0-9]+).*\twhilelo\tp[0-9]+\.s, w[0-9]+, \1} } }
Prathamesh Kulkarni <prathamesh.kulkarni@linaro.org> writes: > @@ -1415,6 +1460,19 @@ forward_propagate_into (df_ref use) > if (!def_set) > return false; > > + if (reg_prop_only > + && !REG_P (SET_SRC (def_set)) > + && !REG_P (SET_DEST (def_set))) > + return false; This should be: if (reg_prop_only && (!REG_P (SET_SRC (def_set)) || !REG_P (SET_DEST (def_set)))) return false; so that we return false if either operand isn't a register. > + > + /* Allow propagations into a loop only for reg-to-reg copies, since > + replacing one register by another shouldn't increase the cost. */ > + > + if (DF_REF_BB (def)->loop_father != DF_REF_BB (use)->loop_father > + && !REG_P (SET_SRC (def_set)) > + && !REG_P (SET_DEST (def_set))) > + return false; Same here. OK with that change, thanks. Richard
On Mon, 24 Jun 2019 at 19:51, Richard Sandiford <richard.sandiford@arm.com> wrote: > > Prathamesh Kulkarni <prathamesh.kulkarni@linaro.org> writes: > > @@ -1415,6 +1460,19 @@ forward_propagate_into (df_ref use) > > if (!def_set) > > return false; > > > > + if (reg_prop_only > > + && !REG_P (SET_SRC (def_set)) > > + && !REG_P (SET_DEST (def_set))) > > + return false; > > This should be: > > if (reg_prop_only > && (!REG_P (SET_SRC (def_set)) || !REG_P (SET_DEST (def_set)))) > return false; > > so that we return false if either operand isn't a register. Oops, sorry about that -:( > > > + > > + /* Allow propagations into a loop only for reg-to-reg copies, since > > + replacing one register by another shouldn't increase the cost. */ > > + > > + if (DF_REF_BB (def)->loop_father != DF_REF_BB (use)->loop_father > > + && !REG_P (SET_SRC (def_set)) > > + && !REG_P (SET_DEST (def_set))) > > + return false; > > Same here. > > OK with that change, thanks. Thanks for the review, will make the changes and commit the patch after re-testing. Thanks, Prathamesh > > Richard
Prathamesh Kulkarni <prathamesh.kulkarni@linaro.org> writes: > On Mon, 24 Jun 2019 at 21:41, Prathamesh Kulkarni > <prathamesh.kulkarni@linaro.org> wrote: >> >> On Mon, 24 Jun 2019 at 19:51, Richard Sandiford >> <richard.sandiford@arm.com> wrote: >> > >> > Prathamesh Kulkarni <prathamesh.kulkarni@linaro.org> writes: >> > > @@ -1415,6 +1460,19 @@ forward_propagate_into (df_ref use) >> > > if (!def_set) >> > > return false; >> > > >> > > + if (reg_prop_only >> > > + && !REG_P (SET_SRC (def_set)) >> > > + && !REG_P (SET_DEST (def_set))) >> > > + return false; >> > >> > This should be: >> > >> > if (reg_prop_only >> > && (!REG_P (SET_SRC (def_set)) || !REG_P (SET_DEST (def_set)))) >> > return false; >> > >> > so that we return false if either operand isn't a register. >> Oops, sorry about that -:( >> > >> > > + >> > > + /* Allow propagations into a loop only for reg-to-reg copies, since >> > > + replacing one register by another shouldn't increase the cost. */ >> > > + >> > > + if (DF_REF_BB (def)->loop_father != DF_REF_BB (use)->loop_father >> > > + && !REG_P (SET_SRC (def_set)) >> > > + && !REG_P (SET_DEST (def_set))) >> > > + return false; >> > >> > Same here. >> > >> > OK with that change, thanks. >> Thanks for the review, will make the changes and commit the patch >> after re-testing. > Hi, > Testing the patch showed following failures on 32-bit x86: > > Executed from: g++.target/i386/i386.exp > g++:g++.target/i386/pr88152.C scan-assembler-not vpcmpgt|vpcmpeq|vpsra > Executed from: gcc.target/i386/i386.exp > gcc:gcc.target/i386/pr66768.c scan-assembler add*.[ \t]%gs: > gcc:gcc.target/i386/pr90178.c scan-assembler-times xorl[\\t > ]*\\%eax,[\\t ]*%eax 1 > > The failure of pr88152.C is also seen on x86_64. > > For pr66768.c, and pr90178.c, forwprop replaces register which is > volatile and frame related respectively. > To avoid that, the attached patch, makes a stronger constraint that > src and dest should be a register > and not have frame_related or volatil flags set, which is checked in > usable_reg_p(). > Which avoids the failures for both the cases. > Does it look OK ? That's not the reason it's a bad transform. In both cases we're propagating r2 <- r1 even though (a) r1 dies in the copy and (b) fwprop can't replace all uses of r2, because some have multiple definitions This has the effect of making both values live in cases where only one was previously. In the case of pr66768.c, fwprop2 is undoing the effect of cse.c:canon_reg, which tries to pick the best register to use (see cse.c:make_regs_eqv). fwprop1 makes the same change, and made it even before the patch, but the cse.c choice should win. A (hopefully) conservative fix would be to propagate the copy only if both registers have a single definition, which you can test with: (DF_REG_DEF_COUNT (regno) == 1 && !bitmap_bit_p (DF_LR_OUT (ENTRY_BLOCK_PTR_FOR_FN (m_fn)), regno)) In that case, fwprop should see all uses of the destination, and should be able to replace it in all cases with the source. > For g++.target/i386/pr88152.C, the issue is that after the patch, > forwprop1 does following propagation (in f10) which wasn't done > before: > > In insn 10, replacing > (unspec:SI [ > (reg:V2DF 91) > ] UNSPEC_MOVMSK) > with (unspec:SI [ > (subreg:V2DF (reg:V2DI 90) 0) > ] UNSPEC_MOVMSK) > > This later defeats combine because insn 9 gets deleted. > Without patch, the following combination takes place: > > Trying 7 -> 9: > 7: r90:V2DI=r89:V2DI>r93:V2DI > REG_DEAD r93:V2DI > REG_DEAD r89:V2DI > 9: r91:V2DF=r90:V2DI#0 > REG_DEAD r90:V2DI > Successfully matched this instruction: > (set (subreg:V2DI (reg:V2DF 91) 0) > (gt:V2DI (reg:V2DI 89) > (reg:V2DI 93))) > allowing combination of insns 7 and 9 > > and then: > Trying 6, 9 -> 10: > 6: r89:V2DI=const_vector > 9: r91:V2DF#0=r89:V2DI>r93:V2DI > REG_DEAD r89:V2DI > REG_DEAD r93:V2DI > 10: r87:SI=unspec[r91:V2DF] 43 > REG_DEAD r91:V2DF > Successfully matched this instruction: > (set (reg:SI 87) > (unspec:SI [ > (lt:V2DF (reg:V2DI 93) > (const_vector:V2DI [ > (const_int 0 [0]) repeated x2 > ])) > ] UNSPEC_MOVMSK)) Eh? lt:*V2DF*? Does that mean that it's 0 for false and an all-1 NaN for true? Looks like a bug that we manage to fold to that, and manage to match it. Thanks, Richard > allowing combination of insns 6, 9 and 10 > original costs 4 + 8 + 4 = 16 > replacement cost 12 > deferring deletion of insn with uid = 9. > deferring deletion of insn with uid = 6. > which deletes insns 2, 3, 6, 7, 9. > > With patch, it fails to combine 7->10: > Trying 7 -> 10: > 7: r90:V2DI=r89:V2DI>r93:V2DI > REG_DEAD r93:V2DI > REG_DEAD r89:V2DI > 10: r87:SI=unspec[r90:V2DI#0] 43 > REG_DEAD r90:V2DI > Failed to match this instruction: > (set (reg:SI 87) > (unspec:SI [ > (subreg:V2DF (gt:V2DI (reg:V2DI 89) > (reg:V2DI 93)) 0) > ] UNSPEC_MOVMSK)) > > and subsequently 6, 7 -> 10 > (attached combine dumps before and after patch). > > So IIUC, the issue is that the target does not have a pattern that can > match the above insn ? > I tried a simple workaround to "pessimize" the else condition in > propagate_rtx_1 in patch, to require old_rtx and new_rtx have same > rtx_code, which at least > works for this test-case, but not sure if that's the correct approach. > Could you suggest how to proceed ? > > Thanks, > Prathamesh >> >> Thanks, >> Prathamesh >> > >> > Richard
On Tue, 25 Jun 2019 at 20:05, Richard Sandiford <richard.sandiford@arm.com> wrote: > > Prathamesh Kulkarni <prathamesh.kulkarni@linaro.org> writes: > > On Mon, 24 Jun 2019 at 21:41, Prathamesh Kulkarni > > <prathamesh.kulkarni@linaro.org> wrote: > >> > >> On Mon, 24 Jun 2019 at 19:51, Richard Sandiford > >> <richard.sandiford@arm.com> wrote: > >> > > >> > Prathamesh Kulkarni <prathamesh.kulkarni@linaro.org> writes: > >> > > @@ -1415,6 +1460,19 @@ forward_propagate_into (df_ref use) > >> > > if (!def_set) > >> > > return false; > >> > > > >> > > + if (reg_prop_only > >> > > + && !REG_P (SET_SRC (def_set)) > >> > > + && !REG_P (SET_DEST (def_set))) > >> > > + return false; > >> > > >> > This should be: > >> > > >> > if (reg_prop_only > >> > && (!REG_P (SET_SRC (def_set)) || !REG_P (SET_DEST (def_set)))) > >> > return false; > >> > > >> > so that we return false if either operand isn't a register. > >> Oops, sorry about that -:( > >> > > >> > > + > >> > > + /* Allow propagations into a loop only for reg-to-reg copies, since > >> > > + replacing one register by another shouldn't increase the cost. */ > >> > > + > >> > > + if (DF_REF_BB (def)->loop_father != DF_REF_BB (use)->loop_father > >> > > + && !REG_P (SET_SRC (def_set)) > >> > > + && !REG_P (SET_DEST (def_set))) > >> > > + return false; > >> > > >> > Same here. > >> > > >> > OK with that change, thanks. > >> Thanks for the review, will make the changes and commit the patch > >> after re-testing. > > Hi, > > Testing the patch showed following failures on 32-bit x86: > > > > Executed from: g++.target/i386/i386.exp > > g++:g++.target/i386/pr88152.C scan-assembler-not vpcmpgt|vpcmpeq|vpsra > > Executed from: gcc.target/i386/i386.exp > > gcc:gcc.target/i386/pr66768.c scan-assembler add*.[ \t]%gs: > > gcc:gcc.target/i386/pr90178.c scan-assembler-times xorl[\\t > > ]*\\%eax,[\\t ]*%eax 1 > > > > The failure of pr88152.C is also seen on x86_64. > > > > For pr66768.c, and pr90178.c, forwprop replaces register which is > > volatile and frame related respectively. > > To avoid that, the attached patch, makes a stronger constraint that > > src and dest should be a register > > and not have frame_related or volatil flags set, which is checked in > > usable_reg_p(). > > Which avoids the failures for both the cases. > > Does it look OK ? > > That's not the reason it's a bad transform. In both cases we're > propagating r2 <- r1 even though > > (a) r1 dies in the copy and > (b) fwprop can't replace all uses of r2, because some have multiple > definitions > > This has the effect of making both values live in cases where only one > was previously. > > In the case of pr66768.c, fwprop2 is undoing the effect of > cse.c:canon_reg, which tries to pick the best register to use > (see cse.c:make_regs_eqv). fwprop1 makes the same change, > and made it even before the patch, but the cse.c choice should win. > > A (hopefully) conservative fix would be to propagate the copy only if > both registers have a single definition, which you can test with: > > (DF_REG_DEF_COUNT (regno) == 1 > && !bitmap_bit_p (DF_LR_OUT (ENTRY_BLOCK_PTR_FOR_FN (m_fn)), regno)) > > In that case, fwprop should see all uses of the destination, and should > be able to replace it in all cases with the source. Ah I see, thanks for the explanation! The above check works to resolve the failure. IIUC, !bitmap_bit_p (...) above checks that reg isn't used uninitialized ? > > > For g++.target/i386/pr88152.C, the issue is that after the patch, > > forwprop1 does following propagation (in f10) which wasn't done > > before: > > > > In insn 10, replacing > > (unspec:SI [ > > (reg:V2DF 91) > > ] UNSPEC_MOVMSK) > > with (unspec:SI [ > > (subreg:V2DF (reg:V2DI 90) 0) > > ] UNSPEC_MOVMSK) > > > > This later defeats combine because insn 9 gets deleted. > > Without patch, the following combination takes place: > > > > Trying 7 -> 9: > > 7: r90:V2DI=r89:V2DI>r93:V2DI > > REG_DEAD r93:V2DI > > REG_DEAD r89:V2DI > > 9: r91:V2DF=r90:V2DI#0 > > REG_DEAD r90:V2DI > > Successfully matched this instruction: > > (set (subreg:V2DI (reg:V2DF 91) 0) > > (gt:V2DI (reg:V2DI 89) > > (reg:V2DI 93))) > > allowing combination of insns 7 and 9 > > > > and then: > > Trying 6, 9 -> 10: > > 6: r89:V2DI=const_vector > > 9: r91:V2DF#0=r89:V2DI>r93:V2DI > > REG_DEAD r89:V2DI > > REG_DEAD r93:V2DI > > 10: r87:SI=unspec[r91:V2DF] 43 > > REG_DEAD r91:V2DF > > Successfully matched this instruction: > > (set (reg:SI 87) > > (unspec:SI [ > > (lt:V2DF (reg:V2DI 93) > > (const_vector:V2DI [ > > (const_int 0 [0]) repeated x2 > > ])) > > ] UNSPEC_MOVMSK)) > > Eh? lt:*V2DF*? Does that mean that it's 0 for false and an all-1 NaN > for true? Well looking at .optimized dump: vector(2) long int _2; vector(2) double _3; int _6; signed long _7; long int _8; signed long _9; long int _10; <bb 2> [local count: 1073741824]: _7 = BIT_FIELD_REF <a_4(D), 64, 0>; _8 = _7 < 0 ? -1 : 0; _9 = BIT_FIELD_REF <a_4(D), 64, 64>; _10 = _9 < 0 ? -1 : 0; _2 = {_8, _10}; _3 = VIEW_CONVERT_EXPR<__m128d>(_2); _6 = __builtin_ia32_movmskpd (_3); [tail call] return _6; So AFAIU, we're using -1 for representing true and 0 for false and casting -1 (literally) to double would change it's value to -NaN ? The above insn 10 created by combine is then transformed to following insn 22: (insn 22 9 16 2 (set (reg:SI 0 ax [87]) (unspec:SI [ (reg:V2DF 20 xmm0 [93]) ] UNSPEC_MOVMSK)) "../../stage1-build/gcc/include/emmintrin.h":958:34 4222 {sse2_movmskpd} (nil)) deleting insn 10. Thanks, Prathamesh > > Looks like a bug that we manage to fold to that, and manage to match it. > > Thanks, > Richard > > > allowing combination of insns 6, 9 and 10 > > original costs 4 + 8 + 4 = 16 > > replacement cost 12 > > deferring deletion of insn with uid = 9. > > deferring deletion of insn with uid = 6. > > which deletes insns 2, 3, 6, 7, 9. > > > > With patch, it fails to combine 7->10: > > Trying 7 -> 10: > > 7: r90:V2DI=r89:V2DI>r93:V2DI > > REG_DEAD r93:V2DI > > REG_DEAD r89:V2DI > > 10: r87:SI=unspec[r90:V2DI#0] 43 > > REG_DEAD r90:V2DI > > Failed to match this instruction: > > (set (reg:SI 87) > > (unspec:SI [ > > (subreg:V2DF (gt:V2DI (reg:V2DI 89) > > (reg:V2DI 93)) 0) > > ] UNSPEC_MOVMSK)) > > > > and subsequently 6, 7 -> 10 > > (attached combine dumps before and after patch). > > > > So IIUC, the issue is that the target does not have a pattern that can > > match the above insn ? > > I tried a simple workaround to "pessimize" the else condition in > > propagate_rtx_1 in patch, to require old_rtx and new_rtx have same > > rtx_code, which at least > > works for this test-case, but not sure if that's the correct approach. > > Could you suggest how to proceed ? > > > > Thanks, > > Prathamesh > >> > >> Thanks, > >> Prathamesh > >> > > >> > Richard
Prathamesh Kulkarni <prathamesh.kulkarni@linaro.org> writes: > On Tue, 25 Jun 2019 at 20:05, Richard Sandiford > <richard.sandiford@arm.com> wrote: >> >> Prathamesh Kulkarni <prathamesh.kulkarni@linaro.org> writes: >> > On Mon, 24 Jun 2019 at 21:41, Prathamesh Kulkarni >> > <prathamesh.kulkarni@linaro.org> wrote: >> >> >> >> On Mon, 24 Jun 2019 at 19:51, Richard Sandiford >> >> <richard.sandiford@arm.com> wrote: >> >> > >> >> > Prathamesh Kulkarni <prathamesh.kulkarni@linaro.org> writes: >> >> > > @@ -1415,6 +1460,19 @@ forward_propagate_into (df_ref use) >> >> > > if (!def_set) >> >> > > return false; >> >> > > >> >> > > + if (reg_prop_only >> >> > > + && !REG_P (SET_SRC (def_set)) >> >> > > + && !REG_P (SET_DEST (def_set))) >> >> > > + return false; >> >> > >> >> > This should be: >> >> > >> >> > if (reg_prop_only >> >> > && (!REG_P (SET_SRC (def_set)) || !REG_P (SET_DEST (def_set)))) >> >> > return false; >> >> > >> >> > so that we return false if either operand isn't a register. >> >> Oops, sorry about that -:( >> >> > >> >> > > + >> >> > > + /* Allow propagations into a loop only for reg-to-reg copies, since >> >> > > + replacing one register by another shouldn't increase the cost. */ >> >> > > + >> >> > > + if (DF_REF_BB (def)->loop_father != DF_REF_BB (use)->loop_father >> >> > > + && !REG_P (SET_SRC (def_set)) >> >> > > + && !REG_P (SET_DEST (def_set))) >> >> > > + return false; >> >> > >> >> > Same here. >> >> > >> >> > OK with that change, thanks. >> >> Thanks for the review, will make the changes and commit the patch >> >> after re-testing. >> > Hi, >> > Testing the patch showed following failures on 32-bit x86: >> > >> > Executed from: g++.target/i386/i386.exp >> > g++:g++.target/i386/pr88152.C scan-assembler-not vpcmpgt|vpcmpeq|vpsra >> > Executed from: gcc.target/i386/i386.exp >> > gcc:gcc.target/i386/pr66768.c scan-assembler add*.[ \t]%gs: >> > gcc:gcc.target/i386/pr90178.c scan-assembler-times xorl[\\t >> > ]*\\%eax,[\\t ]*%eax 1 >> > >> > The failure of pr88152.C is also seen on x86_64. >> > >> > For pr66768.c, and pr90178.c, forwprop replaces register which is >> > volatile and frame related respectively. >> > To avoid that, the attached patch, makes a stronger constraint that >> > src and dest should be a register >> > and not have frame_related or volatil flags set, which is checked in >> > usable_reg_p(). >> > Which avoids the failures for both the cases. >> > Does it look OK ? >> >> That's not the reason it's a bad transform. In both cases we're >> propagating r2 <- r1 even though >> >> (a) r1 dies in the copy and >> (b) fwprop can't replace all uses of r2, because some have multiple >> definitions >> >> This has the effect of making both values live in cases where only one >> was previously. >> >> In the case of pr66768.c, fwprop2 is undoing the effect of >> cse.c:canon_reg, which tries to pick the best register to use >> (see cse.c:make_regs_eqv). fwprop1 makes the same change, >> and made it even before the patch, but the cse.c choice should win. >> >> A (hopefully) conservative fix would be to propagate the copy only if >> both registers have a single definition, which you can test with: >> >> (DF_REG_DEF_COUNT (regno) == 1 >> && !bitmap_bit_p (DF_LR_OUT (ENTRY_BLOCK_PTR_FOR_FN (m_fn)), regno)) >> >> In that case, fwprop should see all uses of the destination, and should >> be able to replace it in all cases with the source. > Ah I see, thanks for the explanation! > The above check works to resolve the failure. > IIUC, !bitmap_bit_p (...) above checks that reg isn't used uninitialized ? Right. >> > For g++.target/i386/pr88152.C, the issue is that after the patch, >> > forwprop1 does following propagation (in f10) which wasn't done >> > before: >> > >> > In insn 10, replacing >> > (unspec:SI [ >> > (reg:V2DF 91) >> > ] UNSPEC_MOVMSK) >> > with (unspec:SI [ >> > (subreg:V2DF (reg:V2DI 90) 0) >> > ] UNSPEC_MOVMSK) >> > >> > This later defeats combine because insn 9 gets deleted. >> > Without patch, the following combination takes place: >> > >> > Trying 7 -> 9: >> > 7: r90:V2DI=r89:V2DI>r93:V2DI >> > REG_DEAD r93:V2DI >> > REG_DEAD r89:V2DI >> > 9: r91:V2DF=r90:V2DI#0 >> > REG_DEAD r90:V2DI >> > Successfully matched this instruction: >> > (set (subreg:V2DI (reg:V2DF 91) 0) >> > (gt:V2DI (reg:V2DI 89) >> > (reg:V2DI 93))) >> > allowing combination of insns 7 and 9 >> > >> > and then: >> > Trying 6, 9 -> 10: >> > 6: r89:V2DI=const_vector >> > 9: r91:V2DF#0=r89:V2DI>r93:V2DI >> > REG_DEAD r89:V2DI >> > REG_DEAD r93:V2DI >> > 10: r87:SI=unspec[r91:V2DF] 43 >> > REG_DEAD r91:V2DF >> > Successfully matched this instruction: >> > (set (reg:SI 87) >> > (unspec:SI [ >> > (lt:V2DF (reg:V2DI 93) >> > (const_vector:V2DI [ >> > (const_int 0 [0]) repeated x2 >> > ])) >> > ] UNSPEC_MOVMSK)) >> >> Eh? lt:*V2DF*? Does that mean that it's 0 for false and an all-1 NaN >> for true? > Well looking at .optimized dump: > > vector(2) long int _2; > vector(2) double _3; > int _6; > signed long _7; > long int _8; > signed long _9; > long int _10; > > <bb 2> [local count: 1073741824]: > _7 = BIT_FIELD_REF <a_4(D), 64, 0>; > _8 = _7 < 0 ? -1 : 0; > _9 = BIT_FIELD_REF <a_4(D), 64, 64>; > _10 = _9 < 0 ? -1 : 0; > _2 = {_8, _10}; > _3 = VIEW_CONVERT_EXPR<__m128d>(_2); > _6 = __builtin_ia32_movmskpd (_3); [tail call] > return _6; > > So AFAIU, we're using -1 for representing true and 0 for false > and casting -1 (literally) to double would change it's value to -NaN ? Exactly. And for -ffinite-math-only, we might as well then fold the condition to false. :-) IMO rtl condition results have to have integral modes and not folding (subreg:V2DF (lt:V2DI ...) 0) is the correct behaviour. So I think this is just a latent bug and shouldn't hold up the patch. I'm not sure whether: reinterpret_cast<__m128d> (a > ...) is something we expect users to write, or whether it was just included for completeness. Thanks, Richard > The above insn 10 created by combine is then transformed to following insn 22: > (insn 22 9 16 2 (set (reg:SI 0 ax [87]) > (unspec:SI [ > (reg:V2DF 20 xmm0 [93]) > ] UNSPEC_MOVMSK)) > "../../stage1-build/gcc/include/emmintrin.h":958:34 4222 > {sse2_movmskpd} > (nil)) > > deleting insn 10. > > Thanks, > Prathamesh > >> >> Looks like a bug that we manage to fold to that, and manage to match it. >> >> Thanks, >> Richard >> >> > allowing combination of insns 6, 9 and 10 >> > original costs 4 + 8 + 4 = 16 >> > replacement cost 12 >> > deferring deletion of insn with uid = 9. >> > deferring deletion of insn with uid = 6. >> > which deletes insns 2, 3, 6, 7, 9. >> > >> > With patch, it fails to combine 7->10: >> > Trying 7 -> 10: >> > 7: r90:V2DI=r89:V2DI>r93:V2DI >> > REG_DEAD r93:V2DI >> > REG_DEAD r89:V2DI >> > 10: r87:SI=unspec[r90:V2DI#0] 43 >> > REG_DEAD r90:V2DI >> > Failed to match this instruction: >> > (set (reg:SI 87) >> > (unspec:SI [ >> > (subreg:V2DF (gt:V2DI (reg:V2DI 89) >> > (reg:V2DI 93)) 0) >> > ] UNSPEC_MOVMSK)) >> > >> > and subsequently 6, 7 -> 10 >> > (attached combine dumps before and after patch). >> > >> > So IIUC, the issue is that the target does not have a pattern that can >> > match the above insn ? >> > I tried a simple workaround to "pessimize" the else condition in >> > propagate_rtx_1 in patch, to require old_rtx and new_rtx have same >> > rtx_code, which at least >> > works for this test-case, but not sure if that's the correct approach. >> > Could you suggest how to proceed ? >> > >> > Thanks, >> > Prathamesh >> >> >> >> Thanks, >> >> Prathamesh >> >> > >> >> > Richard
On Wed, 26 Jun 2019 at 16:05, Richard Sandiford <richard.sandiford@arm.com> wrote: > > Prathamesh Kulkarni <prathamesh.kulkarni@linaro.org> writes: > > On Tue, 25 Jun 2019 at 20:05, Richard Sandiford > > <richard.sandiford@arm.com> wrote: > >> > >> Prathamesh Kulkarni <prathamesh.kulkarni@linaro.org> writes: > >> > On Mon, 24 Jun 2019 at 21:41, Prathamesh Kulkarni > >> > <prathamesh.kulkarni@linaro.org> wrote: > >> >> > >> >> On Mon, 24 Jun 2019 at 19:51, Richard Sandiford > >> >> <richard.sandiford@arm.com> wrote: > >> >> > > >> >> > Prathamesh Kulkarni <prathamesh.kulkarni@linaro.org> writes: > >> >> > > @@ -1415,6 +1460,19 @@ forward_propagate_into (df_ref use) > >> >> > > if (!def_set) > >> >> > > return false; > >> >> > > > >> >> > > + if (reg_prop_only > >> >> > > + && !REG_P (SET_SRC (def_set)) > >> >> > > + && !REG_P (SET_DEST (def_set))) > >> >> > > + return false; > >> >> > > >> >> > This should be: > >> >> > > >> >> > if (reg_prop_only > >> >> > && (!REG_P (SET_SRC (def_set)) || !REG_P (SET_DEST (def_set)))) > >> >> > return false; > >> >> > > >> >> > so that we return false if either operand isn't a register. > >> >> Oops, sorry about that -:( > >> >> > > >> >> > > + > >> >> > > + /* Allow propagations into a loop only for reg-to-reg copies, since > >> >> > > + replacing one register by another shouldn't increase the cost. */ > >> >> > > + > >> >> > > + if (DF_REF_BB (def)->loop_father != DF_REF_BB (use)->loop_father > >> >> > > + && !REG_P (SET_SRC (def_set)) > >> >> > > + && !REG_P (SET_DEST (def_set))) > >> >> > > + return false; > >> >> > > >> >> > Same here. > >> >> > > >> >> > OK with that change, thanks. > >> >> Thanks for the review, will make the changes and commit the patch > >> >> after re-testing. > >> > Hi, > >> > Testing the patch showed following failures on 32-bit x86: > >> > > >> > Executed from: g++.target/i386/i386.exp > >> > g++:g++.target/i386/pr88152.C scan-assembler-not vpcmpgt|vpcmpeq|vpsra > >> > Executed from: gcc.target/i386/i386.exp > >> > gcc:gcc.target/i386/pr66768.c scan-assembler add*.[ \t]%gs: > >> > gcc:gcc.target/i386/pr90178.c scan-assembler-times xorl[\\t > >> > ]*\\%eax,[\\t ]*%eax 1 > >> > > >> > The failure of pr88152.C is also seen on x86_64. > >> > > >> > For pr66768.c, and pr90178.c, forwprop replaces register which is > >> > volatile and frame related respectively. > >> > To avoid that, the attached patch, makes a stronger constraint that > >> > src and dest should be a register > >> > and not have frame_related or volatil flags set, which is checked in > >> > usable_reg_p(). > >> > Which avoids the failures for both the cases. > >> > Does it look OK ? > >> > >> That's not the reason it's a bad transform. In both cases we're > >> propagating r2 <- r1 even though > >> > >> (a) r1 dies in the copy and > >> (b) fwprop can't replace all uses of r2, because some have multiple > >> definitions > >> > >> This has the effect of making both values live in cases where only one > >> was previously. > >> > >> In the case of pr66768.c, fwprop2 is undoing the effect of > >> cse.c:canon_reg, which tries to pick the best register to use > >> (see cse.c:make_regs_eqv). fwprop1 makes the same change, > >> and made it even before the patch, but the cse.c choice should win. > >> > >> A (hopefully) conservative fix would be to propagate the copy only if > >> both registers have a single definition, which you can test with: > >> > >> (DF_REG_DEF_COUNT (regno) == 1 > >> && !bitmap_bit_p (DF_LR_OUT (ENTRY_BLOCK_PTR_FOR_FN (m_fn)), regno)) > >> > >> In that case, fwprop should see all uses of the destination, and should > >> be able to replace it in all cases with the source. > > Ah I see, thanks for the explanation! > > The above check works to resolve the failure. > > IIUC, !bitmap_bit_p (...) above checks that reg isn't used uninitialized ? > > Right. > > >> > For g++.target/i386/pr88152.C, the issue is that after the patch, > >> > forwprop1 does following propagation (in f10) which wasn't done > >> > before: > >> > > >> > In insn 10, replacing > >> > (unspec:SI [ > >> > (reg:V2DF 91) > >> > ] UNSPEC_MOVMSK) > >> > with (unspec:SI [ > >> > (subreg:V2DF (reg:V2DI 90) 0) > >> > ] UNSPEC_MOVMSK) > >> > > >> > This later defeats combine because insn 9 gets deleted. > >> > Without patch, the following combination takes place: > >> > > >> > Trying 7 -> 9: > >> > 7: r90:V2DI=r89:V2DI>r93:V2DI > >> > REG_DEAD r93:V2DI > >> > REG_DEAD r89:V2DI > >> > 9: r91:V2DF=r90:V2DI#0 > >> > REG_DEAD r90:V2DI > >> > Successfully matched this instruction: > >> > (set (subreg:V2DI (reg:V2DF 91) 0) > >> > (gt:V2DI (reg:V2DI 89) > >> > (reg:V2DI 93))) > >> > allowing combination of insns 7 and 9 > >> > > >> > and then: > >> > Trying 6, 9 -> 10: > >> > 6: r89:V2DI=const_vector > >> > 9: r91:V2DF#0=r89:V2DI>r93:V2DI > >> > REG_DEAD r89:V2DI > >> > REG_DEAD r93:V2DI > >> > 10: r87:SI=unspec[r91:V2DF] 43 > >> > REG_DEAD r91:V2DF > >> > Successfully matched this instruction: > >> > (set (reg:SI 87) > >> > (unspec:SI [ > >> > (lt:V2DF (reg:V2DI 93) > >> > (const_vector:V2DI [ > >> > (const_int 0 [0]) repeated x2 > >> > ])) > >> > ] UNSPEC_MOVMSK)) > >> > >> Eh? lt:*V2DF*? Does that mean that it's 0 for false and an all-1 NaN > >> for true? > > Well looking at .optimized dump: > > > > vector(2) long int _2; > > vector(2) double _3; > > int _6; > > signed long _7; > > long int _8; > > signed long _9; > > long int _10; > > > > <bb 2> [local count: 1073741824]: > > _7 = BIT_FIELD_REF <a_4(D), 64, 0>; > > _8 = _7 < 0 ? -1 : 0; > > _9 = BIT_FIELD_REF <a_4(D), 64, 64>; > > _10 = _9 < 0 ? -1 : 0; > > _2 = {_8, _10}; > > _3 = VIEW_CONVERT_EXPR<__m128d>(_2); > > _6 = __builtin_ia32_movmskpd (_3); [tail call] > > return _6; > > > > So AFAIU, we're using -1 for representing true and 0 for false > > and casting -1 (literally) to double would change it's value to -NaN ? > > Exactly. And for -ffinite-math-only, we might as well then fold the > condition to false. :-) > > IMO rtl condition results have to have integral modes and not folding > (subreg:V2DF (lt:V2DI ...) 0) is the correct behaviour. So I think this > is just a latent bug and shouldn't hold up the patch. > > I'm not sure whether: > > reinterpret_cast<__m128d> (a > ...) > > is something we expect users to write, or whether it was just > included for completeness. I will report the issue and commit after re-testing patch. Thanks a lot for the helpful reviews! Thanks, Prathamesh > > Thanks, > Richard > > > The above insn 10 created by combine is then transformed to following insn 22: > > (insn 22 9 16 2 (set (reg:SI 0 ax [87]) > > (unspec:SI [ > > (reg:V2DF 20 xmm0 [93]) > > ] UNSPEC_MOVMSK)) > > "../../stage1-build/gcc/include/emmintrin.h":958:34 4222 > > {sse2_movmskpd} > > (nil)) > > > > deleting insn 10. > > > > Thanks, > > Prathamesh > > > >> > >> Looks like a bug that we manage to fold to that, and manage to match it. > >> > >> Thanks, > >> Richard > >> > >> > allowing combination of insns 6, 9 and 10 > >> > original costs 4 + 8 + 4 = 16 > >> > replacement cost 12 > >> > deferring deletion of insn with uid = 9. > >> > deferring deletion of insn with uid = 6. > >> > which deletes insns 2, 3, 6, 7, 9. > >> > > >> > With patch, it fails to combine 7->10: > >> > Trying 7 -> 10: > >> > 7: r90:V2DI=r89:V2DI>r93:V2DI > >> > REG_DEAD r93:V2DI > >> > REG_DEAD r89:V2DI > >> > 10: r87:SI=unspec[r90:V2DI#0] 43 > >> > REG_DEAD r90:V2DI > >> > Failed to match this instruction: > >> > (set (reg:SI 87) > >> > (unspec:SI [ > >> > (subreg:V2DF (gt:V2DI (reg:V2DI 89) > >> > (reg:V2DI 93)) 0) > >> > ] UNSPEC_MOVMSK)) > >> > > >> > and subsequently 6, 7 -> 10 > >> > (attached combine dumps before and after patch). > >> > > >> > So IIUC, the issue is that the target does not have a pattern that can > >> > match the above insn ? > >> > I tried a simple workaround to "pessimize" the else condition in > >> > propagate_rtx_1 in patch, to require old_rtx and new_rtx have same > >> > rtx_code, which at least > >> > works for this test-case, but not sure if that's the correct approach. > >> > Could you suggest how to proceed ? > >> > > >> > Thanks, > >> > Prathamesh > >> >> > >> >> Thanks, > >> >> Prathamesh > >> >> > > >> >> > Richard
Prathamesh Kulkarni <prathamesh.kulkarni@linaro.org> writes: > On Wed, 26 Jun 2019 at 16:05, Richard Sandiford > <richard.sandiford@arm.com> wrote: >> >> Prathamesh Kulkarni <prathamesh.kulkarni@linaro.org> writes: >> > On Tue, 25 Jun 2019 at 20:05, Richard Sandiford >> > <richard.sandiford@arm.com> wrote: >> >> >> >> Prathamesh Kulkarni <prathamesh.kulkarni@linaro.org> writes: >> >> > On Mon, 24 Jun 2019 at 21:41, Prathamesh Kulkarni >> >> > <prathamesh.kulkarni@linaro.org> wrote: >> >> >> >> >> >> On Mon, 24 Jun 2019 at 19:51, Richard Sandiford >> >> >> <richard.sandiford@arm.com> wrote: >> >> >> > >> >> >> > Prathamesh Kulkarni <prathamesh.kulkarni@linaro.org> writes: >> >> >> > > @@ -1415,6 +1460,19 @@ forward_propagate_into (df_ref use) >> >> >> > > if (!def_set) >> >> >> > > return false; >> >> >> > > >> >> >> > > + if (reg_prop_only >> >> >> > > + && !REG_P (SET_SRC (def_set)) >> >> >> > > + && !REG_P (SET_DEST (def_set))) >> >> >> > > + return false; >> >> >> > >> >> >> > This should be: >> >> >> > >> >> >> > if (reg_prop_only >> >> >> > && (!REG_P (SET_SRC (def_set)) || !REG_P (SET_DEST (def_set)))) >> >> >> > return false; >> >> >> > >> >> >> > so that we return false if either operand isn't a register. >> >> >> Oops, sorry about that -:( >> >> >> > >> >> >> > > + >> >> >> > > + /* Allow propagations into a loop only for reg-to-reg copies, since >> >> >> > > + replacing one register by another shouldn't increase the cost. */ >> >> >> > > + >> >> >> > > + if (DF_REF_BB (def)->loop_father != DF_REF_BB (use)->loop_father >> >> >> > > + && !REG_P (SET_SRC (def_set)) >> >> >> > > + && !REG_P (SET_DEST (def_set))) >> >> >> > > + return false; >> >> >> > >> >> >> > Same here. >> >> >> > >> >> >> > OK with that change, thanks. >> >> >> Thanks for the review, will make the changes and commit the patch >> >> >> after re-testing. >> >> > Hi, >> >> > Testing the patch showed following failures on 32-bit x86: >> >> > >> >> > Executed from: g++.target/i386/i386.exp >> >> > g++:g++.target/i386/pr88152.C scan-assembler-not vpcmpgt|vpcmpeq|vpsra >> >> > Executed from: gcc.target/i386/i386.exp >> >> > gcc:gcc.target/i386/pr66768.c scan-assembler add*.[ \t]%gs: >> >> > gcc:gcc.target/i386/pr90178.c scan-assembler-times xorl[\\t >> >> > ]*\\%eax,[\\t ]*%eax 1 >> >> > >> >> > The failure of pr88152.C is also seen on x86_64. >> >> > >> >> > For pr66768.c, and pr90178.c, forwprop replaces register which is >> >> > volatile and frame related respectively. >> >> > To avoid that, the attached patch, makes a stronger constraint that >> >> > src and dest should be a register >> >> > and not have frame_related or volatil flags set, which is checked in >> >> > usable_reg_p(). >> >> > Which avoids the failures for both the cases. >> >> > Does it look OK ? >> >> >> >> That's not the reason it's a bad transform. In both cases we're >> >> propagating r2 <- r1 even though >> >> >> >> (a) r1 dies in the copy and >> >> (b) fwprop can't replace all uses of r2, because some have multiple >> >> definitions >> >> >> >> This has the effect of making both values live in cases where only one >> >> was previously. >> >> >> >> In the case of pr66768.c, fwprop2 is undoing the effect of >> >> cse.c:canon_reg, which tries to pick the best register to use >> >> (see cse.c:make_regs_eqv). fwprop1 makes the same change, >> >> and made it even before the patch, but the cse.c choice should win. >> >> >> >> A (hopefully) conservative fix would be to propagate the copy only if >> >> both registers have a single definition, which you can test with: >> >> >> >> (DF_REG_DEF_COUNT (regno) == 1 >> >> && !bitmap_bit_p (DF_LR_OUT (ENTRY_BLOCK_PTR_FOR_FN (m_fn)), regno)) >> >> >> >> In that case, fwprop should see all uses of the destination, and should >> >> be able to replace it in all cases with the source. >> > Ah I see, thanks for the explanation! >> > The above check works to resolve the failure. >> > IIUC, !bitmap_bit_p (...) above checks that reg isn't used uninitialized ? >> >> Right. >> >> >> > For g++.target/i386/pr88152.C, the issue is that after the patch, >> >> > forwprop1 does following propagation (in f10) which wasn't done >> >> > before: >> >> > >> >> > In insn 10, replacing >> >> > (unspec:SI [ >> >> > (reg:V2DF 91) >> >> > ] UNSPEC_MOVMSK) >> >> > with (unspec:SI [ >> >> > (subreg:V2DF (reg:V2DI 90) 0) >> >> > ] UNSPEC_MOVMSK) >> >> > >> >> > This later defeats combine because insn 9 gets deleted. >> >> > Without patch, the following combination takes place: >> >> > >> >> > Trying 7 -> 9: >> >> > 7: r90:V2DI=r89:V2DI>r93:V2DI >> >> > REG_DEAD r93:V2DI >> >> > REG_DEAD r89:V2DI >> >> > 9: r91:V2DF=r90:V2DI#0 >> >> > REG_DEAD r90:V2DI >> >> > Successfully matched this instruction: >> >> > (set (subreg:V2DI (reg:V2DF 91) 0) >> >> > (gt:V2DI (reg:V2DI 89) >> >> > (reg:V2DI 93))) >> >> > allowing combination of insns 7 and 9 >> >> > >> >> > and then: >> >> > Trying 6, 9 -> 10: >> >> > 6: r89:V2DI=const_vector >> >> > 9: r91:V2DF#0=r89:V2DI>r93:V2DI >> >> > REG_DEAD r89:V2DI >> >> > REG_DEAD r93:V2DI >> >> > 10: r87:SI=unspec[r91:V2DF] 43 >> >> > REG_DEAD r91:V2DF >> >> > Successfully matched this instruction: >> >> > (set (reg:SI 87) >> >> > (unspec:SI [ >> >> > (lt:V2DF (reg:V2DI 93) >> >> > (const_vector:V2DI [ >> >> > (const_int 0 [0]) repeated x2 >> >> > ])) >> >> > ] UNSPEC_MOVMSK)) >> >> >> >> Eh? lt:*V2DF*? Does that mean that it's 0 for false and an all-1 NaN >> >> for true? >> > Well looking at .optimized dump: >> > >> > vector(2) long int _2; >> > vector(2) double _3; >> > int _6; >> > signed long _7; >> > long int _8; >> > signed long _9; >> > long int _10; >> > >> > <bb 2> [local count: 1073741824]: >> > _7 = BIT_FIELD_REF <a_4(D), 64, 0>; >> > _8 = _7 < 0 ? -1 : 0; >> > _9 = BIT_FIELD_REF <a_4(D), 64, 64>; >> > _10 = _9 < 0 ? -1 : 0; >> > _2 = {_8, _10}; >> > _3 = VIEW_CONVERT_EXPR<__m128d>(_2); >> > _6 = __builtin_ia32_movmskpd (_3); [tail call] >> > return _6; >> > >> > So AFAIU, we're using -1 for representing true and 0 for false >> > and casting -1 (literally) to double would change it's value to -NaN ? >> >> Exactly. And for -ffinite-math-only, we might as well then fold the >> condition to false. :-) >> >> IMO rtl condition results have to have integral modes and not folding >> (subreg:V2DF (lt:V2DI ...) 0) is the correct behaviour. So I think this >> is just a latent bug and shouldn't hold up the patch. >> >> I'm not sure whether: >> >> reinterpret_cast<__m128d> (a > ...) >> >> is something we expect users to write, or whether it was just >> included for completeness. > I will report the issue and commit after re-testing patch. > Thanks a lot for the helpful reviews! Since it seems FP comparison results are OK, I guess it needs to be fixed after all. I think the problem is that the simplification is only done by gen_lowpart_for_combine: /* If X is a comparison operator, rewrite it in a new mode. This probably won't match, but may allow further simplifications. */ else if (COMPARISON_P (x)) return gen_rtx_fmt_ee (GET_CODE (x), omode, XEXP (x, 0), XEXP (x, 1)); and triggers via expand_field_assignment when the subreg is on the lhs of the SET. For it to work for a general subreg on the rhs, it probably needs to be moved from here to simplify_subreg. We'll need to be careful about the conditions under which it happens though. The above doesn't for example check whether the new mode has the same number of elements as the original, so could generate things like: (gt:V4SF (reg:V2DI X) (reg:V2DI Y)) for: (set (reg:V2DI res) (gt:V2DI (reg:V2DI X) (reg:V2DI Y))) (subreg:V4SF (reg:V2DI res) 0) I think most code would expect the result of a comparison to have the same number of elements as the inputs and could ICE if they don't, so I don't think it's up to the target to decide whether mismatches are OK. (But there again, I thought that last time. :-)) We'd also need to check the element sizes, since e.g. a lowpart (subreg:V2HI ...) of a V2SI comparison isn't the same as a V2HI comparison. Thanks, Richard
On Wed, 26 Jun 2019 at 23:45, Richard Sandiford <richard.sandiford@arm.com> wrote: > > Prathamesh Kulkarni <prathamesh.kulkarni@linaro.org> writes: > > On Wed, 26 Jun 2019 at 16:05, Richard Sandiford > > <richard.sandiford@arm.com> wrote: > >> > >> Prathamesh Kulkarni <prathamesh.kulkarni@linaro.org> writes: > >> > On Tue, 25 Jun 2019 at 20:05, Richard Sandiford > >> > <richard.sandiford@arm.com> wrote: > >> >> > >> >> Prathamesh Kulkarni <prathamesh.kulkarni@linaro.org> writes: > >> >> > On Mon, 24 Jun 2019 at 21:41, Prathamesh Kulkarni > >> >> > <prathamesh.kulkarni@linaro.org> wrote: > >> >> >> > >> >> >> On Mon, 24 Jun 2019 at 19:51, Richard Sandiford > >> >> >> <richard.sandiford@arm.com> wrote: > >> >> >> > > >> >> >> > Prathamesh Kulkarni <prathamesh.kulkarni@linaro.org> writes: > >> >> >> > > @@ -1415,6 +1460,19 @@ forward_propagate_into (df_ref use) > >> >> >> > > if (!def_set) > >> >> >> > > return false; > >> >> >> > > > >> >> >> > > + if (reg_prop_only > >> >> >> > > + && !REG_P (SET_SRC (def_set)) > >> >> >> > > + && !REG_P (SET_DEST (def_set))) > >> >> >> > > + return false; > >> >> >> > > >> >> >> > This should be: > >> >> >> > > >> >> >> > if (reg_prop_only > >> >> >> > && (!REG_P (SET_SRC (def_set)) || !REG_P (SET_DEST (def_set)))) > >> >> >> > return false; > >> >> >> > > >> >> >> > so that we return false if either operand isn't a register. > >> >> >> Oops, sorry about that -:( > >> >> >> > > >> >> >> > > + > >> >> >> > > + /* Allow propagations into a loop only for reg-to-reg copies, since > >> >> >> > > + replacing one register by another shouldn't increase the cost. */ > >> >> >> > > + > >> >> >> > > + if (DF_REF_BB (def)->loop_father != DF_REF_BB (use)->loop_father > >> >> >> > > + && !REG_P (SET_SRC (def_set)) > >> >> >> > > + && !REG_P (SET_DEST (def_set))) > >> >> >> > > + return false; > >> >> >> > > >> >> >> > Same here. > >> >> >> > > >> >> >> > OK with that change, thanks. > >> >> >> Thanks for the review, will make the changes and commit the patch > >> >> >> after re-testing. > >> >> > Hi, > >> >> > Testing the patch showed following failures on 32-bit x86: > >> >> > > >> >> > Executed from: g++.target/i386/i386.exp > >> >> > g++:g++.target/i386/pr88152.C scan-assembler-not vpcmpgt|vpcmpeq|vpsra > >> >> > Executed from: gcc.target/i386/i386.exp > >> >> > gcc:gcc.target/i386/pr66768.c scan-assembler add*.[ \t]%gs: > >> >> > gcc:gcc.target/i386/pr90178.c scan-assembler-times xorl[\\t > >> >> > ]*\\%eax,[\\t ]*%eax 1 > >> >> > > >> >> > The failure of pr88152.C is also seen on x86_64. > >> >> > > >> >> > For pr66768.c, and pr90178.c, forwprop replaces register which is > >> >> > volatile and frame related respectively. > >> >> > To avoid that, the attached patch, makes a stronger constraint that > >> >> > src and dest should be a register > >> >> > and not have frame_related or volatil flags set, which is checked in > >> >> > usable_reg_p(). > >> >> > Which avoids the failures for both the cases. > >> >> > Does it look OK ? > >> >> > >> >> That's not the reason it's a bad transform. In both cases we're > >> >> propagating r2 <- r1 even though > >> >> > >> >> (a) r1 dies in the copy and > >> >> (b) fwprop can't replace all uses of r2, because some have multiple > >> >> definitions > >> >> > >> >> This has the effect of making both values live in cases where only one > >> >> was previously. > >> >> > >> >> In the case of pr66768.c, fwprop2 is undoing the effect of > >> >> cse.c:canon_reg, which tries to pick the best register to use > >> >> (see cse.c:make_regs_eqv). fwprop1 makes the same change, > >> >> and made it even before the patch, but the cse.c choice should win. > >> >> > >> >> A (hopefully) conservative fix would be to propagate the copy only if > >> >> both registers have a single definition, which you can test with: > >> >> > >> >> (DF_REG_DEF_COUNT (regno) == 1 > >> >> && !bitmap_bit_p (DF_LR_OUT (ENTRY_BLOCK_PTR_FOR_FN (m_fn)), regno)) > >> >> > >> >> In that case, fwprop should see all uses of the destination, and should > >> >> be able to replace it in all cases with the source. > >> > Ah I see, thanks for the explanation! > >> > The above check works to resolve the failure. > >> > IIUC, !bitmap_bit_p (...) above checks that reg isn't used uninitialized ? > >> > >> Right. > >> > >> >> > For g++.target/i386/pr88152.C, the issue is that after the patch, > >> >> > forwprop1 does following propagation (in f10) which wasn't done > >> >> > before: > >> >> > > >> >> > In insn 10, replacing > >> >> > (unspec:SI [ > >> >> > (reg:V2DF 91) > >> >> > ] UNSPEC_MOVMSK) > >> >> > with (unspec:SI [ > >> >> > (subreg:V2DF (reg:V2DI 90) 0) > >> >> > ] UNSPEC_MOVMSK) > >> >> > > >> >> > This later defeats combine because insn 9 gets deleted. > >> >> > Without patch, the following combination takes place: > >> >> > > >> >> > Trying 7 -> 9: > >> >> > 7: r90:V2DI=r89:V2DI>r93:V2DI > >> >> > REG_DEAD r93:V2DI > >> >> > REG_DEAD r89:V2DI > >> >> > 9: r91:V2DF=r90:V2DI#0 > >> >> > REG_DEAD r90:V2DI > >> >> > Successfully matched this instruction: > >> >> > (set (subreg:V2DI (reg:V2DF 91) 0) > >> >> > (gt:V2DI (reg:V2DI 89) > >> >> > (reg:V2DI 93))) > >> >> > allowing combination of insns 7 and 9 > >> >> > > >> >> > and then: > >> >> > Trying 6, 9 -> 10: > >> >> > 6: r89:V2DI=const_vector > >> >> > 9: r91:V2DF#0=r89:V2DI>r93:V2DI > >> >> > REG_DEAD r89:V2DI > >> >> > REG_DEAD r93:V2DI > >> >> > 10: r87:SI=unspec[r91:V2DF] 43 > >> >> > REG_DEAD r91:V2DF > >> >> > Successfully matched this instruction: > >> >> > (set (reg:SI 87) > >> >> > (unspec:SI [ > >> >> > (lt:V2DF (reg:V2DI 93) > >> >> > (const_vector:V2DI [ > >> >> > (const_int 0 [0]) repeated x2 > >> >> > ])) > >> >> > ] UNSPEC_MOVMSK)) > >> >> > >> >> Eh? lt:*V2DF*? Does that mean that it's 0 for false and an all-1 NaN > >> >> for true? > >> > Well looking at .optimized dump: > >> > > >> > vector(2) long int _2; > >> > vector(2) double _3; > >> > int _6; > >> > signed long _7; > >> > long int _8; > >> > signed long _9; > >> > long int _10; > >> > > >> > <bb 2> [local count: 1073741824]: > >> > _7 = BIT_FIELD_REF <a_4(D), 64, 0>; > >> > _8 = _7 < 0 ? -1 : 0; > >> > _9 = BIT_FIELD_REF <a_4(D), 64, 64>; > >> > _10 = _9 < 0 ? -1 : 0; > >> > _2 = {_8, _10}; > >> > _3 = VIEW_CONVERT_EXPR<__m128d>(_2); > >> > _6 = __builtin_ia32_movmskpd (_3); [tail call] > >> > return _6; > >> > > >> > So AFAIU, we're using -1 for representing true and 0 for false > >> > and casting -1 (literally) to double would change it's value to -NaN ? > >> > >> Exactly. And for -ffinite-math-only, we might as well then fold the > >> condition to false. :-) > >> > >> IMO rtl condition results have to have integral modes and not folding > >> (subreg:V2DF (lt:V2DI ...) 0) is the correct behaviour. So I think this > >> is just a latent bug and shouldn't hold up the patch. > >> > >> I'm not sure whether: > >> > >> reinterpret_cast<__m128d> (a > ...) > >> > >> is something we expect users to write, or whether it was just > >> included for completeness. > > I will report the issue and commit after re-testing patch. > > Thanks a lot for the helpful reviews! > > Since it seems FP comparison results are OK, I guess it needs to be > fixed after all. I think the problem is that the simplification > is only done by gen_lowpart_for_combine: > > /* If X is a comparison operator, rewrite it in a new mode. This > probably won't match, but may allow further simplifications. */ > else if (COMPARISON_P (x)) > return gen_rtx_fmt_ee (GET_CODE (x), omode, XEXP (x, 0), XEXP (x, 1)); > > and triggers via expand_field_assignment when the subreg is on the lhs > of the SET. For it to work for a general subreg on the rhs, it probably > needs to be moved from here to simplify_subreg. > > We'll need to be careful about the conditions under which it > happens though. The above doesn't for example check whether > the new mode has the same number of elements as the original, > so could generate things like: > > (gt:V4SF (reg:V2DI X) (reg:V2DI Y)) > > for: > > (set (reg:V2DI res) (gt:V2DI (reg:V2DI X) (reg:V2DI Y))) > (subreg:V4SF (reg:V2DI res) 0) > > I think most code would expect the result of a comparison to have the > same number of elements as the inputs and could ICE if they don't, > so I don't think it's up to the target to decide whether mismatches > are OK. (But there again, I thought that last time. :-)) > > We'd also need to check the element sizes, since e.g. a lowpart > (subreg:V2HI ...) of a V2SI comparison isn't the same as a V2HI > comparison. Hi Richard, Thanks for the detailed suggestions and sorry for late response, for some reason, I missed this email earlier -;( With the changes to simplify_subreg, the test doesn't regress anymore. IIUC it does the following change: (subreg:V2DF (lt:V2DI (reg:V2DI 93) (const_vector 0))) -> (lt:V2DF (reg:V2DI 93) (const_vector 0)) which allowed the 6, 7 -> 10 combination which was failing earlier. Does the attached version look OK ? Testing in progress. Thanks, Prathamesh > > Thanks, > Richard
Thanks for fixing this. Prathamesh Kulkarni <prathamesh.kulkarni@linaro.org> writes: > diff --git a/gcc/simplify-rtx.c b/gcc/simplify-rtx.c > index 89a46a933fa..79bd0cfbd28 100644 > --- a/gcc/simplify-rtx.c > +++ b/gcc/simplify-rtx.c > @@ -6697,6 +6697,19 @@ simplify_subreg (machine_mode outermode, rtx op, > } > } > > + /* If op is a vector comparison operator, rewrite it in a new mode. > + This probably won't match, but may allow further simplifications. > + Also check if number of elements and size of each element > + match for outermode and innermode. */ > + Excess blank line after the comment. IMO the second part of the comment reads too much like an afterthought. How about: /* If OP is a vector comparison and the subreg is not changing the number of elements or the size of the elements, change the result of the comparison to the new mode. */ > + if (COMPARISON_P (op) > + && VECTOR_MODE_P (outermode) > + && VECTOR_MODE_P (innermode) > + && (known_eq (GET_MODE_NUNITS (outermode), GET_MODE_NUNITS (innermode))) > + && (known_eq (GET_MODE_UNIT_SIZE (outermode), > + GET_MODE_UNIT_SIZE (innermode)))) Redundant brackets around the known_eq calls. > + return gen_rtx_fmt_ee (GET_CODE (op), outermode, XEXP (op, 0), XEXP (op, 1)); This should use simplify_gen_relational, so that we try to simplify the new expression. Richard
On Tue, 2 Jul 2019 at 16:59, Richard Sandiford <richard.sandiford@arm.com> wrote: > > Thanks for fixing this. > > Prathamesh Kulkarni <prathamesh.kulkarni@linaro.org> writes: > > diff --git a/gcc/simplify-rtx.c b/gcc/simplify-rtx.c > > index 89a46a933fa..79bd0cfbd28 100644 > > --- a/gcc/simplify-rtx.c > > +++ b/gcc/simplify-rtx.c > > @@ -6697,6 +6697,19 @@ simplify_subreg (machine_mode outermode, rtx op, > > } > > } > > > > + /* If op is a vector comparison operator, rewrite it in a new mode. > > + This probably won't match, but may allow further simplifications. > > + Also check if number of elements and size of each element > > + match for outermode and innermode. */ > > + > > Excess blank line after the comment. IMO the second part of the comment > reads too much like an afterthought. How about: > > /* If OP is a vector comparison and the subreg is not changing the > number of elements or the size of the elements, change the result > of the comparison to the new mode. */ > > > + if (COMPARISON_P (op) > > + && VECTOR_MODE_P (outermode) > > + && VECTOR_MODE_P (innermode) > > + && (known_eq (GET_MODE_NUNITS (outermode), GET_MODE_NUNITS (innermode))) > > + && (known_eq (GET_MODE_UNIT_SIZE (outermode), > > + GET_MODE_UNIT_SIZE (innermode)))) > > Redundant brackets around the known_eq calls. > > > + return gen_rtx_fmt_ee (GET_CODE (op), outermode, XEXP (op, 0), XEXP (op, 1)); > > This should use simplify_gen_relational, so that we try to simplify > the new expression. Does the attached version look OK ? Thanks, Prathamesh > > Richard diff --git a/gcc/fwprop.c b/gcc/fwprop.c index 45703fe5f01..e6f37527192 100644 --- a/gcc/fwprop.c +++ b/gcc/fwprop.c @@ -448,6 +448,18 @@ enum { PR_OPTIMIZE_FOR_SPEED = 4 }; +/* Check that X has a single def. */ + +static bool +reg_single_def_p (rtx x) +{ + if (!REG_P (x)) + return false; + + int regno = REGNO (x); + return (DF_REG_DEF_COUNT (regno) == 1 + && !bitmap_bit_p (DF_LR_OUT (ENTRY_BLOCK_PTR_FOR_FN (cfun)), regno)); +} /* Replace all occurrences of OLD in *PX with NEW and try to simplify the resulting expression. Replace *PX with a new RTL expression if an @@ -547,6 +559,54 @@ propagate_rtx_1 (rtx *px, rtx old_rtx, rtx new_rtx, int flags) tem = simplify_gen_subreg (mode, op0, GET_MODE (SUBREG_REG (x)), SUBREG_BYTE (x)); } + + else + { + rtvec vec; + rtvec newvec; + const char *fmt = GET_RTX_FORMAT (code); + rtx op; + + for (int i = 0; fmt[i]; i++) + switch (fmt[i]) + { + case 'E': + vec = XVEC (x, i); + newvec = vec; + for (int j = 0; j < GET_NUM_ELEM (vec); j++) + { + op = RTVEC_ELT (vec, j); + valid_ops &= propagate_rtx_1 (&op, old_rtx, new_rtx, flags); + if (op != RTVEC_ELT (vec, j)) + { + if (newvec == vec) + { + newvec = shallow_copy_rtvec (vec); + if (!tem) + tem = shallow_copy_rtx (x); + XVEC (tem, i) = newvec; + } + RTVEC_ELT (newvec, j) = op; + } + } + break; + + case 'e': + if (XEXP (x, i)) + { + op = XEXP (x, i); + valid_ops &= propagate_rtx_1 (&op, old_rtx, new_rtx, flags); + if (op != XEXP (x, i)) + { + if (!tem) + tem = shallow_copy_rtx (x); + XEXP (tem, i) = op; + } + } + break; + } + } + break; case RTX_OBJ: @@ -1370,10 +1430,11 @@ forward_propagate_and_simplify (df_ref use, rtx_insn *def_insn, rtx def_set) /* Given a use USE of an insn, if it has a single reaching definition, try to forward propagate it into that insn. - Return true if cfg cleanup will be needed. */ + Return true if cfg cleanup will be needed. + REG_PROP_ONLY is true if we should only propagate register copies. */ static bool -forward_propagate_into (df_ref use) +forward_propagate_into (df_ref use, bool reg_prop_only = false) { df_ref def; rtx_insn *def_insn, *use_insn; @@ -1394,10 +1455,6 @@ forward_propagate_into (df_ref use) if (DF_REF_IS_ARTIFICIAL (def)) return false; - /* Do not propagate loop invariant definitions inside the loop. */ - if (DF_REF_BB (def)->loop_father != DF_REF_BB (use)->loop_father) - return false; - /* Check if the use is still present in the insn! */ use_insn = DF_REF_INSN (use); if (DF_REF_FLAGS (use) & DF_REF_IN_NOTE) @@ -1415,6 +1472,19 @@ forward_propagate_into (df_ref use) if (!def_set) return false; + if (reg_prop_only + && (!reg_single_def_p (SET_SRC (def_set)) + || !reg_single_def_p (SET_DEST (def_set)))) + return false; + + /* Allow propagations into a loop only for reg-to-reg copies, since + replacing one register by another shouldn't increase the cost. */ + + if (DF_REF_BB (def)->loop_father != DF_REF_BB (use)->loop_father + && (!reg_single_def_p (SET_SRC (def_set)) + || !reg_single_def_p (SET_DEST (def_set)))) + return false; + /* Only try one kind of propagation. If two are possible, we'll do it on the following iterations. */ if (forward_propagate_and_simplify (use, def_insn, def_set) @@ -1483,7 +1553,7 @@ gate_fwprop (void) } static unsigned int -fwprop (void) +fwprop (bool fwprop_addr_p) { unsigned i; @@ -1502,11 +1572,16 @@ fwprop (void) df_ref use = DF_USES_GET (i); if (use) - if (DF_REF_TYPE (use) == DF_REF_REG_USE - || DF_REF_BB (use)->loop_father == NULL - /* The outer most loop is not really a loop. */ - || loop_outer (DF_REF_BB (use)->loop_father) == NULL) - forward_propagate_into (use); + { + if (DF_REF_TYPE (use) == DF_REF_REG_USE + || DF_REF_BB (use)->loop_father == NULL + /* The outer most loop is not really a loop. */ + || loop_outer (DF_REF_BB (use)->loop_father) == NULL) + forward_propagate_into (use, fwprop_addr_p); + + else if (fwprop_addr_p) + forward_propagate_into (use, false); + } } fwprop_done (); @@ -1537,7 +1612,7 @@ public: /* opt_pass methods: */ virtual bool gate (function *) { return gate_fwprop (); } - virtual unsigned int execute (function *) { return fwprop (); } + virtual unsigned int execute (function *) { return fwprop (false); } }; // class pass_rtl_fwprop @@ -1549,33 +1624,6 @@ make_pass_rtl_fwprop (gcc::context *ctxt) return new pass_rtl_fwprop (ctxt); } -static unsigned int -fwprop_addr (void) -{ - unsigned i; - - fwprop_init (); - - /* Go through all the uses. df_uses_create will create new ones at the - end, and we'll go through them as well. */ - for (i = 0; i < DF_USES_TABLE_SIZE (); i++) - { - if (!propagations_left) - break; - - df_ref use = DF_USES_GET (i); - if (use) - if (DF_REF_TYPE (use) != DF_REF_REG_USE - && DF_REF_BB (use)->loop_father != NULL - /* The outer most loop is not really a loop. */ - && loop_outer (DF_REF_BB (use)->loop_father) != NULL) - forward_propagate_into (use); - } - - fwprop_done (); - return 0; -} - namespace { const pass_data pass_data_rtl_fwprop_addr = @@ -1600,7 +1648,7 @@ public: /* opt_pass methods: */ virtual bool gate (function *) { return gate_fwprop (); } - virtual unsigned int execute (function *) { return fwprop_addr (); } + virtual unsigned int execute (function *) { return fwprop (true); } }; // class pass_rtl_fwprop_addr diff --git a/gcc/simplify-rtx.c b/gcc/simplify-rtx.c index 89a46a933fa..c588c91562b 100644 --- a/gcc/simplify-rtx.c +++ b/gcc/simplify-rtx.c @@ -6697,6 +6697,17 @@ simplify_subreg (machine_mode outermode, rtx op, } } + /* If OP is a vector comparison and the subreg is not changing the + number of elements or the size of the elements, change the result + of the comparison to the new mode. */ + if (COMPARISON_P (op) + && VECTOR_MODE_P (outermode) + && VECTOR_MODE_P (innermode) + && known_eq (GET_MODE_NUNITS (outermode), GET_MODE_NUNITS (innermode)) + && known_eq (GET_MODE_UNIT_SIZE (outermode), + GET_MODE_UNIT_SIZE (innermode))) + return simplify_gen_relational (GET_CODE (op), outermode, innermode, + XEXP (op, 0), XEXP (op, 1)); return NULL_RTX; } diff --git a/gcc/testsuite/gfortran.dg/pr88833.f90 b/gcc/testsuite/gfortran.dg/pr88833.f90 new file mode 100644 index 00000000000..224e6ce5f3d --- /dev/null +++ b/gcc/testsuite/gfortran.dg/pr88833.f90 @@ -0,0 +1,9 @@ +! { dg-do assemble { target aarch64_asm_sve_ok } } +! { dg-options "-O3 -march=armv8.2-a+sve --save-temps" } + +subroutine foo(x) + real :: x(100) + x = x + 10 +end subroutine foo + +! { dg-final { scan-assembler {\twhilelo\tp[0-9]+\.s, wzr, (w[0-9]+).*\twhilelo\tp[0-9]+\.s, w[0-9]+, \1} } }
Prathamesh Kulkarni <prathamesh.kulkarni@linaro.org> writes: > On Tue, 2 Jul 2019 at 16:59, Richard Sandiford > <richard.sandiford@arm.com> wrote: >> >> Thanks for fixing this. >> >> Prathamesh Kulkarni <prathamesh.kulkarni@linaro.org> writes: >> > diff --git a/gcc/simplify-rtx.c b/gcc/simplify-rtx.c >> > index 89a46a933fa..79bd0cfbd28 100644 >> > --- a/gcc/simplify-rtx.c >> > +++ b/gcc/simplify-rtx.c >> > @@ -6697,6 +6697,19 @@ simplify_subreg (machine_mode outermode, rtx op, >> > } >> > } >> > >> > + /* If op is a vector comparison operator, rewrite it in a new mode. >> > + This probably won't match, but may allow further simplifications. >> > + Also check if number of elements and size of each element >> > + match for outermode and innermode. */ >> > + >> >> Excess blank line after the comment. IMO the second part of the comment >> reads too much like an afterthought. How about: >> >> /* If OP is a vector comparison and the subreg is not changing the >> number of elements or the size of the elements, change the result >> of the comparison to the new mode. */ >> >> > + if (COMPARISON_P (op) >> > + && VECTOR_MODE_P (outermode) >> > + && VECTOR_MODE_P (innermode) >> > + && (known_eq (GET_MODE_NUNITS (outermode), GET_MODE_NUNITS (innermode))) >> > + && (known_eq (GET_MODE_UNIT_SIZE (outermode), >> > + GET_MODE_UNIT_SIZE (innermode)))) >> >> Redundant brackets around the known_eq calls. >> >> > + return gen_rtx_fmt_ee (GET_CODE (op), outermode, XEXP (op, 0), XEXP (op, 1)); >> >> This should use simplify_gen_relational, so that we try to simplify >> the new expression. > Does the attached version look OK ? OK with a suitable changelog (remember to post those!) if it passes testing. Thanks, Richard
On Tue, 2 Jul 2019 at 18:22, Richard Sandiford <richard.sandiford@arm.com> wrote: > > Prathamesh Kulkarni <prathamesh.kulkarni@linaro.org> writes: > > On Tue, 2 Jul 2019 at 16:59, Richard Sandiford > > <richard.sandiford@arm.com> wrote: > >> > >> Thanks for fixing this. > >> > >> Prathamesh Kulkarni <prathamesh.kulkarni@linaro.org> writes: > >> > diff --git a/gcc/simplify-rtx.c b/gcc/simplify-rtx.c > >> > index 89a46a933fa..79bd0cfbd28 100644 > >> > --- a/gcc/simplify-rtx.c > >> > +++ b/gcc/simplify-rtx.c > >> > @@ -6697,6 +6697,19 @@ simplify_subreg (machine_mode outermode, rtx op, > >> > } > >> > } > >> > > >> > + /* If op is a vector comparison operator, rewrite it in a new mode. > >> > + This probably won't match, but may allow further simplifications. > >> > + Also check if number of elements and size of each element > >> > + match for outermode and innermode. */ > >> > + > >> > >> Excess blank line after the comment. IMO the second part of the comment > >> reads too much like an afterthought. How about: > >> > >> /* If OP is a vector comparison and the subreg is not changing the > >> number of elements or the size of the elements, change the result > >> of the comparison to the new mode. */ > >> > >> > + if (COMPARISON_P (op) > >> > + && VECTOR_MODE_P (outermode) > >> > + && VECTOR_MODE_P (innermode) > >> > + && (known_eq (GET_MODE_NUNITS (outermode), GET_MODE_NUNITS (innermode))) > >> > + && (known_eq (GET_MODE_UNIT_SIZE (outermode), > >> > + GET_MODE_UNIT_SIZE (innermode)))) > >> > >> Redundant brackets around the known_eq calls. > >> > >> > + return gen_rtx_fmt_ee (GET_CODE (op), outermode, XEXP (op, 0), XEXP (op, 1)); > >> > >> This should use simplify_gen_relational, so that we try to simplify > >> the new expression. > > Does the attached version look OK ? > > OK with a suitable changelog (remember to post those!) if it passes testing. The change to simplify_subreg regressed avx2-pr54700-1.C -;) For following test-case: __attribute__((noipa)) __v8sf f7 (__v8si a, __v8sf b, __v8sf c) { return a < 0 ? b : c; } Before patch, combine shows: Trying 8, 9 -> 10: 8: r87:V8SI=const_vector 9: r89:V8SI=r87:V8SI>r90:V8SI REG_DEAD r90:V8SI REG_DEAD r87:V8SI 10: r86:V8SF=unspec[r92:V8SF,r91:V8SF,r89:V8SI#0] 104 REG_DEAD r92:V8SF REG_DEAD r89:V8SI REG_DEAD r91:V8SF Successfully matched this instruction: (set (reg:V8SF 86) (unspec:V8SF [ (reg:V8SF 92) (reg:V8SF 91) (subreg:V8SF (lt:V8SI (reg:V8SI 90) (const_vector:V8SI [ (const_int 0 [0]) repeated x8 ])) 0) ] UNSPEC_BLENDV)) allowing combination of insns 8, 9 and 10 After applying patch, combine shows: Trying 8, 9 -> 10: 8: r87:V8SI=const_vector 9: r89:V8SI=r87:V8SI>r90:V8SI REG_DEAD r90:V8SI REG_DEAD r87:V8SI 10: r86:V8SF=unspec[r92:V8SF,r91:V8SF,r89:V8SI#0] 104 REG_DEAD r92:V8SF REG_DEAD r89:V8SI REG_DEAD r91:V8SF Failed to match this instruction: (set (reg:V8SF 86) (unspec:V8SF [ (reg:V8SF 92) (reg:V8SF 91) (lt:V8SF (reg:V8SI 90) (const_vector:V8SI [ (const_int 0 [0]) repeated x8 ])) ] UNSPEC_BLENDV)) Adjusting UNSPEC_BLENDV pattern to match the simplified subreg seems to work. Does it look OK ? Testing the attached patch in progress. (A quick comparison for i386.exp now shows no difference before/after patch). Thanks, Prathamesh > > Thanks, > Richard 2019-07-03 Prathamesh Kulkarni <prathamesh.kulkarni@linaro.org> * fwprop.c (reg_single_def_p): New function. (propagate_rtx_1): Add unconditional else inside RTX_EXTRA case. (forward_propagate_into): New parameter reg_prop_only with default value false. Propagate def's src into loop only if SET_SRC and SET_DEST of def_set have single definitions. Likewise if reg_prop_only is set to true. (fwprop): New param fwprop_addr_p. Integrate fwprop_addr into fwprop. (fwprop_addr): Remove. (pass_rtl_fwprop_addr::execute): Call fwprop with arg set to true. (pass_rtl_fwprop::execute): Call fwprop with arg set to false. * simplify-rtx.c (simplify_subreg): Add case for vector comparison. * i386/config/sse.md (UNSPEC_BLENDV): Adjust pattern. testsuite/ * gfortran.dg/pr88833.f90: New test. diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md index d7d542524fb..5384fe218f9 100644 --- a/gcc/config/i386/sse.md +++ b/gcc/config/i386/sse.md @@ -16623,10 +16623,9 @@ (unspec:VF_128_256 [(match_operand:VF_128_256 1 "register_operand" "0,0,x") (match_operand:VF_128_256 2 "vector_operand" "YrBm,*xBm,xm") - (subreg:VF_128_256 - (lt:<sseintvecmode> + (lt:VF_128_256 (match_operand:<sseintvecmode> 3 "register_operand" "Yz,Yz,x") - (match_operand:<sseintvecmode> 4 "const0_operand" "C,C,C")) 0)] + (match_operand:<sseintvecmode> 4 "const0_operand" "C,C,C"))] UNSPEC_BLENDV))] "TARGET_SSE4_1" "#" diff --git a/gcc/fwprop.c b/gcc/fwprop.c index 45703fe5f01..e6f37527192 100644 --- a/gcc/fwprop.c +++ b/gcc/fwprop.c @@ -448,6 +448,18 @@ enum { PR_OPTIMIZE_FOR_SPEED = 4 }; +/* Check that X has a single def. */ + +static bool +reg_single_def_p (rtx x) +{ + if (!REG_P (x)) + return false; + + int regno = REGNO (x); + return (DF_REG_DEF_COUNT (regno) == 1 + && !bitmap_bit_p (DF_LR_OUT (ENTRY_BLOCK_PTR_FOR_FN (cfun)), regno)); +} /* Replace all occurrences of OLD in *PX with NEW and try to simplify the resulting expression. Replace *PX with a new RTL expression if an @@ -547,6 +559,54 @@ propagate_rtx_1 (rtx *px, rtx old_rtx, rtx new_rtx, int flags) tem = simplify_gen_subreg (mode, op0, GET_MODE (SUBREG_REG (x)), SUBREG_BYTE (x)); } + + else + { + rtvec vec; + rtvec newvec; + const char *fmt = GET_RTX_FORMAT (code); + rtx op; + + for (int i = 0; fmt[i]; i++) + switch (fmt[i]) + { + case 'E': + vec = XVEC (x, i); + newvec = vec; + for (int j = 0; j < GET_NUM_ELEM (vec); j++) + { + op = RTVEC_ELT (vec, j); + valid_ops &= propagate_rtx_1 (&op, old_rtx, new_rtx, flags); + if (op != RTVEC_ELT (vec, j)) + { + if (newvec == vec) + { + newvec = shallow_copy_rtvec (vec); + if (!tem) + tem = shallow_copy_rtx (x); + XVEC (tem, i) = newvec; + } + RTVEC_ELT (newvec, j) = op; + } + } + break; + + case 'e': + if (XEXP (x, i)) + { + op = XEXP (x, i); + valid_ops &= propagate_rtx_1 (&op, old_rtx, new_rtx, flags); + if (op != XEXP (x, i)) + { + if (!tem) + tem = shallow_copy_rtx (x); + XEXP (tem, i) = op; + } + } + break; + } + } + break; case RTX_OBJ: @@ -1370,10 +1430,11 @@ forward_propagate_and_simplify (df_ref use, rtx_insn *def_insn, rtx def_set) /* Given a use USE of an insn, if it has a single reaching definition, try to forward propagate it into that insn. - Return true if cfg cleanup will be needed. */ + Return true if cfg cleanup will be needed. + REG_PROP_ONLY is true if we should only propagate register copies. */ static bool -forward_propagate_into (df_ref use) +forward_propagate_into (df_ref use, bool reg_prop_only = false) { df_ref def; rtx_insn *def_insn, *use_insn; @@ -1394,10 +1455,6 @@ forward_propagate_into (df_ref use) if (DF_REF_IS_ARTIFICIAL (def)) return false; - /* Do not propagate loop invariant definitions inside the loop. */ - if (DF_REF_BB (def)->loop_father != DF_REF_BB (use)->loop_father) - return false; - /* Check if the use is still present in the insn! */ use_insn = DF_REF_INSN (use); if (DF_REF_FLAGS (use) & DF_REF_IN_NOTE) @@ -1415,6 +1472,19 @@ forward_propagate_into (df_ref use) if (!def_set) return false; + if (reg_prop_only + && (!reg_single_def_p (SET_SRC (def_set)) + || !reg_single_def_p (SET_DEST (def_set)))) + return false; + + /* Allow propagations into a loop only for reg-to-reg copies, since + replacing one register by another shouldn't increase the cost. */ + + if (DF_REF_BB (def)->loop_father != DF_REF_BB (use)->loop_father + && (!reg_single_def_p (SET_SRC (def_set)) + || !reg_single_def_p (SET_DEST (def_set)))) + return false; + /* Only try one kind of propagation. If two are possible, we'll do it on the following iterations. */ if (forward_propagate_and_simplify (use, def_insn, def_set) @@ -1483,7 +1553,7 @@ gate_fwprop (void) } static unsigned int -fwprop (void) +fwprop (bool fwprop_addr_p) { unsigned i; @@ -1502,11 +1572,16 @@ fwprop (void) df_ref use = DF_USES_GET (i); if (use) - if (DF_REF_TYPE (use) == DF_REF_REG_USE - || DF_REF_BB (use)->loop_father == NULL - /* The outer most loop is not really a loop. */ - || loop_outer (DF_REF_BB (use)->loop_father) == NULL) - forward_propagate_into (use); + { + if (DF_REF_TYPE (use) == DF_REF_REG_USE + || DF_REF_BB (use)->loop_father == NULL + /* The outer most loop is not really a loop. */ + || loop_outer (DF_REF_BB (use)->loop_father) == NULL) + forward_propagate_into (use, fwprop_addr_p); + + else if (fwprop_addr_p) + forward_propagate_into (use, false); + } } fwprop_done (); @@ -1537,7 +1612,7 @@ public: /* opt_pass methods: */ virtual bool gate (function *) { return gate_fwprop (); } - virtual unsigned int execute (function *) { return fwprop (); } + virtual unsigned int execute (function *) { return fwprop (false); } }; // class pass_rtl_fwprop @@ -1549,33 +1624,6 @@ make_pass_rtl_fwprop (gcc::context *ctxt) return new pass_rtl_fwprop (ctxt); } -static unsigned int -fwprop_addr (void) -{ - unsigned i; - - fwprop_init (); - - /* Go through all the uses. df_uses_create will create new ones at the - end, and we'll go through them as well. */ - for (i = 0; i < DF_USES_TABLE_SIZE (); i++) - { - if (!propagations_left) - break; - - df_ref use = DF_USES_GET (i); - if (use) - if (DF_REF_TYPE (use) != DF_REF_REG_USE - && DF_REF_BB (use)->loop_father != NULL - /* The outer most loop is not really a loop. */ - && loop_outer (DF_REF_BB (use)->loop_father) != NULL) - forward_propagate_into (use); - } - - fwprop_done (); - return 0; -} - namespace { const pass_data pass_data_rtl_fwprop_addr = @@ -1600,7 +1648,7 @@ public: /* opt_pass methods: */ virtual bool gate (function *) { return gate_fwprop (); } - virtual unsigned int execute (function *) { return fwprop_addr (); } + virtual unsigned int execute (function *) { return fwprop (true); } }; // class pass_rtl_fwprop_addr diff --git a/gcc/simplify-rtx.c b/gcc/simplify-rtx.c index 89a46a933fa..dd2acd4eca9 100644 --- a/gcc/simplify-rtx.c +++ b/gcc/simplify-rtx.c @@ -6697,6 +6697,17 @@ simplify_subreg (machine_mode outermode, rtx op, } } + /* If OP is a vector comparison and the subreg is not changing the + number of elements or the size of the elements, change the result + of the comparison to the new mode. */ + if (COMPARISON_P (op) + && VECTOR_MODE_P (outermode) + && VECTOR_MODE_P (innermode) + && known_eq (GET_MODE_NUNITS (outermode), GET_MODE_NUNITS (innermode)) + && known_eq (GET_MODE_UNIT_SIZE (outermode), + GET_MODE_UNIT_SIZE (innermode))) + return simplify_gen_relational (GET_CODE (op), outermode, innermode, + XEXP (op, 0), XEXP (op, 1)); return NULL_RTX; } diff --git a/gcc/testsuite/gfortran.dg/pr88833.f90 b/gcc/testsuite/gfortran.dg/pr88833.f90 new file mode 100644 index 00000000000..224e6ce5f3d --- /dev/null +++ b/gcc/testsuite/gfortran.dg/pr88833.f90 @@ -0,0 +1,9 @@ +! { dg-do assemble { target aarch64_asm_sve_ok } } +! { dg-options "-O3 -march=armv8.2-a+sve --save-temps" } + +subroutine foo(x) + real :: x(100) + x = x + 10 +end subroutine foo + +! { dg-final { scan-assembler {\twhilelo\tp[0-9]+\.s, wzr, (w[0-9]+).*\twhilelo\tp[0-9]+\.s, w[0-9]+, \1} } }
Prathamesh Kulkarni <prathamesh.kulkarni@linaro.org> writes: > On Tue, 2 Jul 2019 at 18:22, Richard Sandiford > <richard.sandiford@arm.com> wrote: >> >> Prathamesh Kulkarni <prathamesh.kulkarni@linaro.org> writes: >> > On Tue, 2 Jul 2019 at 16:59, Richard Sandiford >> > <richard.sandiford@arm.com> wrote: >> >> >> >> Thanks for fixing this. >> >> >> >> Prathamesh Kulkarni <prathamesh.kulkarni@linaro.org> writes: >> >> > diff --git a/gcc/simplify-rtx.c b/gcc/simplify-rtx.c >> >> > index 89a46a933fa..79bd0cfbd28 100644 >> >> > --- a/gcc/simplify-rtx.c >> >> > +++ b/gcc/simplify-rtx.c >> >> > @@ -6697,6 +6697,19 @@ simplify_subreg (machine_mode outermode, rtx op, >> >> > } >> >> > } >> >> > >> >> > + /* If op is a vector comparison operator, rewrite it in a new mode. >> >> > + This probably won't match, but may allow further simplifications. >> >> > + Also check if number of elements and size of each element >> >> > + match for outermode and innermode. */ >> >> > + >> >> >> >> Excess blank line after the comment. IMO the second part of the comment >> >> reads too much like an afterthought. How about: >> >> >> >> /* If OP is a vector comparison and the subreg is not changing the >> >> number of elements or the size of the elements, change the result >> >> of the comparison to the new mode. */ >> >> >> >> > + if (COMPARISON_P (op) >> >> > + && VECTOR_MODE_P (outermode) >> >> > + && VECTOR_MODE_P (innermode) >> >> > + && (known_eq (GET_MODE_NUNITS (outermode), GET_MODE_NUNITS (innermode))) >> >> > + && (known_eq (GET_MODE_UNIT_SIZE (outermode), >> >> > + GET_MODE_UNIT_SIZE (innermode)))) >> >> >> >> Redundant brackets around the known_eq calls. >> >> >> >> > + return gen_rtx_fmt_ee (GET_CODE (op), outermode, XEXP (op, 0), XEXP (op, 1)); >> >> >> >> This should use simplify_gen_relational, so that we try to simplify >> >> the new expression. >> > Does the attached version look OK ? >> >> OK with a suitable changelog (remember to post those!) if it passes testing. > The change to simplify_subreg regressed avx2-pr54700-1.C -;) > > For following test-case: > __attribute__((noipa)) __v8sf > f7 (__v8si a, __v8sf b, __v8sf c) > { > return a < 0 ? b : c; > } > > Before patch, combine shows: > Trying 8, 9 -> 10: > 8: r87:V8SI=const_vector > 9: r89:V8SI=r87:V8SI>r90:V8SI > REG_DEAD r90:V8SI > REG_DEAD r87:V8SI > 10: r86:V8SF=unspec[r92:V8SF,r91:V8SF,r89:V8SI#0] 104 > REG_DEAD r92:V8SF > REG_DEAD r89:V8SI > REG_DEAD r91:V8SF > Successfully matched this instruction: > (set (reg:V8SF 86) > (unspec:V8SF [ > (reg:V8SF 92) > (reg:V8SF 91) > (subreg:V8SF (lt:V8SI (reg:V8SI 90) > (const_vector:V8SI [ > (const_int 0 [0]) repeated x8 > ])) 0) > ] UNSPEC_BLENDV)) > allowing combination of insns 8, 9 and 10 > > After applying patch, combine shows: > > Trying 8, 9 -> 10: > 8: r87:V8SI=const_vector > 9: r89:V8SI=r87:V8SI>r90:V8SI > REG_DEAD r90:V8SI > REG_DEAD r87:V8SI > 10: r86:V8SF=unspec[r92:V8SF,r91:V8SF,r89:V8SI#0] 104 > REG_DEAD r92:V8SF > REG_DEAD r89:V8SI > REG_DEAD r91:V8SF > Failed to match this instruction: > (set (reg:V8SF 86) > (unspec:V8SF [ > (reg:V8SF 92) > (reg:V8SF 91) > (lt:V8SF (reg:V8SI 90) > (const_vector:V8SI [ > (const_int 0 [0]) repeated x8 > ])) > ] UNSPEC_BLENDV)) Bah. If the port isn't self-consistent about whether a subreg should be used, it's tempting to say that a subreg should be used and fix the places that don't. At least that way we'd avoid the abomination - ABOMINATION! - of using NaNs to represent truth. But I agree it looks like this is the only pattern affected. > Adjusting UNSPEC_BLENDV pattern to match the simplified subreg seems to work. > Does it look OK ? > > Testing the attached patch in progress. > (A quick comparison for i386.exp now shows no difference before/after patch). > > Thanks, > Prathamesh >> >> Thanks, >> Richard > > 2019-07-03 Prathamesh Kulkarni <prathamesh.kulkarni@linaro.org> > > * fwprop.c (reg_single_def_p): New function. > (propagate_rtx_1): Add unconditional else inside RTX_EXTRA case. > (forward_propagate_into): New parameter reg_prop_only > with default value false. > Propagate def's src into loop only if SET_SRC and SET_DEST > of def_set have single definitions. > Likewise if reg_prop_only is set to true. > (fwprop): New param fwprop_addr_p. > Integrate fwprop_addr into fwprop. > (fwprop_addr): Remove. > (pass_rtl_fwprop_addr::execute): Call fwprop with arg set > to true. > (pass_rtl_fwprop::execute): Call fwprop with arg set to false. > * simplify-rtx.c (simplify_subreg): Add case for vector comparison. > * i386/config/sse.md (UNSPEC_BLENDV): Adjust pattern. typo: config/i386/sse.md > > testsuite/ > * gfortran.dg/pr88833.f90: New test. > > diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md > index d7d542524fb..5384fe218f9 100644 > --- a/gcc/config/i386/sse.md > +++ b/gcc/config/i386/sse.md > @@ -16623,10 +16623,9 @@ > (unspec:VF_128_256 > [(match_operand:VF_128_256 1 "register_operand" "0,0,x") > (match_operand:VF_128_256 2 "vector_operand" "YrBm,*xBm,xm") > - (subreg:VF_128_256 > - (lt:<sseintvecmode> > + (lt:VF_128_256 > (match_operand:<sseintvecmode> 3 "register_operand" "Yz,Yz,x") > - (match_operand:<sseintvecmode> 4 "const0_operand" "C,C,C")) 0)] > + (match_operand:<sseintvecmode> 4 "const0_operand" "C,C,C"))] > UNSPEC_BLENDV))] > "TARGET_SSE4_1" > "#" The (lt:...) and its operands should now be indented by two columns fewer than previously. OK with that change, thanks. Richard
On Wed, 3 Jul 2019 at 17:06, Richard Sandiford <richard.sandiford@arm.com> wrote: > > Prathamesh Kulkarni <prathamesh.kulkarni@linaro.org> writes: > > On Tue, 2 Jul 2019 at 18:22, Richard Sandiford > > <richard.sandiford@arm.com> wrote: > >> > >> Prathamesh Kulkarni <prathamesh.kulkarni@linaro.org> writes: > >> > On Tue, 2 Jul 2019 at 16:59, Richard Sandiford > >> > <richard.sandiford@arm.com> wrote: > >> >> > >> >> Thanks for fixing this. > >> >> > >> >> Prathamesh Kulkarni <prathamesh.kulkarni@linaro.org> writes: > >> >> > diff --git a/gcc/simplify-rtx.c b/gcc/simplify-rtx.c > >> >> > index 89a46a933fa..79bd0cfbd28 100644 > >> >> > --- a/gcc/simplify-rtx.c > >> >> > +++ b/gcc/simplify-rtx.c > >> >> > @@ -6697,6 +6697,19 @@ simplify_subreg (machine_mode outermode, rtx op, > >> >> > } > >> >> > } > >> >> > > >> >> > + /* If op is a vector comparison operator, rewrite it in a new mode. > >> >> > + This probably won't match, but may allow further simplifications. > >> >> > + Also check if number of elements and size of each element > >> >> > + match for outermode and innermode. */ > >> >> > + > >> >> > >> >> Excess blank line after the comment. IMO the second part of the comment > >> >> reads too much like an afterthought. How about: > >> >> > >> >> /* If OP is a vector comparison and the subreg is not changing the > >> >> number of elements or the size of the elements, change the result > >> >> of the comparison to the new mode. */ > >> >> > >> >> > + if (COMPARISON_P (op) > >> >> > + && VECTOR_MODE_P (outermode) > >> >> > + && VECTOR_MODE_P (innermode) > >> >> > + && (known_eq (GET_MODE_NUNITS (outermode), GET_MODE_NUNITS (innermode))) > >> >> > + && (known_eq (GET_MODE_UNIT_SIZE (outermode), > >> >> > + GET_MODE_UNIT_SIZE (innermode)))) > >> >> > >> >> Redundant brackets around the known_eq calls. > >> >> > >> >> > + return gen_rtx_fmt_ee (GET_CODE (op), outermode, XEXP (op, 0), XEXP (op, 1)); > >> >> > >> >> This should use simplify_gen_relational, so that we try to simplify > >> >> the new expression. > >> > Does the attached version look OK ? > >> > >> OK with a suitable changelog (remember to post those!) if it passes testing. > > The change to simplify_subreg regressed avx2-pr54700-1.C -;) > > > > For following test-case: > > __attribute__((noipa)) __v8sf > > f7 (__v8si a, __v8sf b, __v8sf c) > > { > > return a < 0 ? b : c; > > } > > > > Before patch, combine shows: > > Trying 8, 9 -> 10: > > 8: r87:V8SI=const_vector > > 9: r89:V8SI=r87:V8SI>r90:V8SI > > REG_DEAD r90:V8SI > > REG_DEAD r87:V8SI > > 10: r86:V8SF=unspec[r92:V8SF,r91:V8SF,r89:V8SI#0] 104 > > REG_DEAD r92:V8SF > > REG_DEAD r89:V8SI > > REG_DEAD r91:V8SF > > Successfully matched this instruction: > > (set (reg:V8SF 86) > > (unspec:V8SF [ > > (reg:V8SF 92) > > (reg:V8SF 91) > > (subreg:V8SF (lt:V8SI (reg:V8SI 90) > > (const_vector:V8SI [ > > (const_int 0 [0]) repeated x8 > > ])) 0) > > ] UNSPEC_BLENDV)) > > allowing combination of insns 8, 9 and 10 > > > > After applying patch, combine shows: > > > > Trying 8, 9 -> 10: > > 8: r87:V8SI=const_vector > > 9: r89:V8SI=r87:V8SI>r90:V8SI > > REG_DEAD r90:V8SI > > REG_DEAD r87:V8SI > > 10: r86:V8SF=unspec[r92:V8SF,r91:V8SF,r89:V8SI#0] 104 > > REG_DEAD r92:V8SF > > REG_DEAD r89:V8SI > > REG_DEAD r91:V8SF > > Failed to match this instruction: > > (set (reg:V8SF 86) > > (unspec:V8SF [ > > (reg:V8SF 92) > > (reg:V8SF 91) > > (lt:V8SF (reg:V8SI 90) > > (const_vector:V8SI [ > > (const_int 0 [0]) repeated x8 > > ])) > > ] UNSPEC_BLENDV)) > > Bah. If the port isn't self-consistent about whether a subreg should > be used, it's tempting to say that a subreg should be used and fix the > places that don't. At least that way we'd avoid the abomination - > ABOMINATION! - of using NaNs to represent truth. > > But I agree it looks like this is the only pattern affected. > > > Adjusting UNSPEC_BLENDV pattern to match the simplified subreg seems to work. > > Does it look OK ? > > > > Testing the attached patch in progress. > > (A quick comparison for i386.exp now shows no difference before/after patch). > > > > Thanks, > > Prathamesh > >> > >> Thanks, > >> Richard > > > > 2019-07-03 Prathamesh Kulkarni <prathamesh.kulkarni@linaro.org> > > > > * fwprop.c (reg_single_def_p): New function. > > (propagate_rtx_1): Add unconditional else inside RTX_EXTRA case. > > (forward_propagate_into): New parameter reg_prop_only > > with default value false. > > Propagate def's src into loop only if SET_SRC and SET_DEST > > of def_set have single definitions. > > Likewise if reg_prop_only is set to true. > > (fwprop): New param fwprop_addr_p. > > Integrate fwprop_addr into fwprop. > > (fwprop_addr): Remove. > > (pass_rtl_fwprop_addr::execute): Call fwprop with arg set > > to true. > > (pass_rtl_fwprop::execute): Call fwprop with arg set to false. > > * simplify-rtx.c (simplify_subreg): Add case for vector comparison. > > * i386/config/sse.md (UNSPEC_BLENDV): Adjust pattern. > > typo: config/i386/sse.md > > > > > testsuite/ > > * gfortran.dg/pr88833.f90: New test. > > > > diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md > > index d7d542524fb..5384fe218f9 100644 > > --- a/gcc/config/i386/sse.md > > +++ b/gcc/config/i386/sse.md > > @@ -16623,10 +16623,9 @@ > > (unspec:VF_128_256 > > [(match_operand:VF_128_256 1 "register_operand" "0,0,x") > > (match_operand:VF_128_256 2 "vector_operand" "YrBm,*xBm,xm") > > - (subreg:VF_128_256 > > - (lt:<sseintvecmode> > > + (lt:VF_128_256 > > (match_operand:<sseintvecmode> 3 "register_operand" "Yz,Yz,x") > > - (match_operand:<sseintvecmode> 4 "const0_operand" "C,C,C")) 0)] > > + (match_operand:<sseintvecmode> 4 "const0_operand" "C,C,C"))] > > UNSPEC_BLENDV))] > > "TARGET_SSE4_1" > > "#" > > The (lt:...) and its operands should now be indented by two columns fewer > than previously. > > OK with that change, thanks. Thanks, committed in r273040. Thanks, Prathamesh > > Richard
diff --git a/gcc/fwprop.c b/gcc/fwprop.c index 45703fe5f01..93a1a10c9a6 100644 --- a/gcc/fwprop.c +++ b/gcc/fwprop.c @@ -547,6 +547,54 @@ propagate_rtx_1 (rtx *px, rtx old_rtx, rtx new_rtx, int flags) tem = simplify_gen_subreg (mode, op0, GET_MODE (SUBREG_REG (x)), SUBREG_BYTE (x)); } + + else + { + rtvec vec; + rtvec newvec; + const char *fmt = GET_RTX_FORMAT (code); + rtx op; + + for (int i = 0; fmt[i]; i++) + switch (fmt[i]) + { + case 'E': + vec = XVEC (x, i); + newvec = vec; + for (int j = 0; j < GET_NUM_ELEM (vec); j++) + { + op = RTVEC_ELT (vec, j); + valid_ops &= propagate_rtx_1 (&op, old_rtx, new_rtx, flags); + if (op != RTVEC_ELT (vec, j)) + { + if (newvec == vec) + { + newvec = shallow_copy_rtvec (vec); + if (!tem) + tem = shallow_copy_rtx (x); + XVEC (tem, i) = newvec; + } + RTVEC_ELT (newvec, j) = op; + } + } + break; + + case 'e': + if (XEXP (x, i)) + { + op = XEXP (x, i); + valid_ops &= propagate_rtx_1 (&op, old_rtx, new_rtx, flags); + if (op != XEXP (x, i)) + { + if (!tem) + tem = shallow_copy_rtx (x); + XEXP (tem, i) = op; + } + } + break; + } + } + break; case RTX_OBJ: @@ -1370,10 +1418,11 @@ forward_propagate_and_simplify (df_ref use, rtx_insn *def_insn, rtx def_set) /* Given a use USE of an insn, if it has a single reaching definition, try to forward propagate it into that insn. - Return true if cfg cleanup will be needed. */ + Return true if cfg cleanup will be needed. + REG_PROP_ONLY is true if we should only propagate register copies. */ static bool -forward_propagate_into (df_ref use) +forward_propagate_into (df_ref use, bool reg_prop_only = false) { df_ref def; rtx_insn *def_insn, *use_insn; @@ -1394,10 +1443,6 @@ forward_propagate_into (df_ref use) if (DF_REF_IS_ARTIFICIAL (def)) return false; - /* Do not propagate loop invariant definitions inside the loop. */ - if (DF_REF_BB (def)->loop_father != DF_REF_BB (use)->loop_father) - return false; - /* Check if the use is still present in the insn! */ use_insn = DF_REF_INSN (use); if (DF_REF_FLAGS (use) & DF_REF_IN_NOTE) @@ -1415,6 +1460,16 @@ forward_propagate_into (df_ref use) if (!def_set) return false; + if (reg_prop_only && !REG_P (SET_SRC (def_set))) + return false; + + /* Allow propagating def inside loop only if source of def_set is + reg, since replacing reg by source reg shouldn't increase cost. */ + + if (DF_REF_BB (def)->loop_father != DF_REF_BB (use)->loop_father + && !REG_P (SET_SRC (def_set))) + return false; + /* Only try one kind of propagation. If two are possible, we'll do it on the following iterations. */ if (forward_propagate_and_simplify (use, def_insn, def_set) @@ -1483,7 +1538,7 @@ gate_fwprop (void) } static unsigned int -fwprop (void) +fwprop (bool fwprop_addr_p) { unsigned i; @@ -1502,11 +1557,16 @@ fwprop (void) df_ref use = DF_USES_GET (i); if (use) - if (DF_REF_TYPE (use) == DF_REF_REG_USE - || DF_REF_BB (use)->loop_father == NULL - /* The outer most loop is not really a loop. */ - || loop_outer (DF_REF_BB (use)->loop_father) == NULL) - forward_propagate_into (use); + { + if (DF_REF_TYPE (use) == DF_REF_REG_USE + || DF_REF_BB (use)->loop_father == NULL + /* The outer most loop is not really a loop. */ + || loop_outer (DF_REF_BB (use)->loop_father) == NULL) + forward_propagate_into (use, fwprop_addr_p); + + else if (fwprop_addr_p) + forward_propagate_into (use, false); + } } fwprop_done (); @@ -1537,7 +1597,7 @@ public: /* opt_pass methods: */ virtual bool gate (function *) { return gate_fwprop (); } - virtual unsigned int execute (function *) { return fwprop (); } + virtual unsigned int execute (function *) { return fwprop (false); } }; // class pass_rtl_fwprop @@ -1549,33 +1609,6 @@ make_pass_rtl_fwprop (gcc::context *ctxt) return new pass_rtl_fwprop (ctxt); } -static unsigned int -fwprop_addr (void) -{ - unsigned i; - - fwprop_init (); - - /* Go through all the uses. df_uses_create will create new ones at the - end, and we'll go through them as well. */ - for (i = 0; i < DF_USES_TABLE_SIZE (); i++) - { - if (!propagations_left) - break; - - df_ref use = DF_USES_GET (i); - if (use) - if (DF_REF_TYPE (use) != DF_REF_REG_USE - && DF_REF_BB (use)->loop_father != NULL - /* The outer most loop is not really a loop. */ - && loop_outer (DF_REF_BB (use)->loop_father) != NULL) - forward_propagate_into (use); - } - - fwprop_done (); - return 0; -} - namespace { const pass_data pass_data_rtl_fwprop_addr = @@ -1600,7 +1633,7 @@ public: /* opt_pass methods: */ virtual bool gate (function *) { return gate_fwprop (); } - virtual unsigned int execute (function *) { return fwprop_addr (); } + virtual unsigned int execute (function *) { return fwprop (true); } }; // class pass_rtl_fwprop_addr diff --git a/gcc/testsuite/gfortran.dg/pr88833.f90 b/gcc/testsuite/gfortran.dg/pr88833.f90 new file mode 100644 index 00000000000..224e6ce5f3d --- /dev/null +++ b/gcc/testsuite/gfortran.dg/pr88833.f90 @@ -0,0 +1,9 @@ +! { dg-do assemble { target aarch64_asm_sve_ok } } +! { dg-options "-O3 -march=armv8.2-a+sve --save-temps" } + +subroutine foo(x) + real :: x(100) + x = x + 10 +end subroutine foo + +! { dg-final { scan-assembler {\twhilelo\tp[0-9]+\.s, wzr, (w[0-9]+).*\twhilelo\tp[0-9]+\.s, w[0-9]+, \1} } }