Message ID | 87d13nlc6v.fsf@linaro.org |
---|---|
State | New |
Headers | show |
Series | Make VEC_PERM_EXPR work for variable-length vectors | expand |
Ping Richard Sandiford <richard.sandiford@linaro.org> writes: > This patch makes users of vec_perm_builders use the compressed encoding > where possible. This means that they work with variable-length vectors. > > > 2017-12-09 Richard Sandiford <richard.sandiford@linaro.org> > > gcc/ > * optabs.c (expand_vec_perm_var): Use an explicit encoding for > the broadcast of the low byte. > (expand_mult_highpart): Use an explicit encoding for the permutes. > * optabs-query.c (can_mult_highpart_p): Likewise. > * tree-vect-loop.c (calc_vec_perm_mask_for_shift): Likewise. > * tree-vect-stmts.c (perm_mask_for_reverse): Likewise. > (vectorizable_bswap): Likewise. > * tree-vect-data-refs.c (vect_grouped_store_supported): Use an > explicit encoding for the power-of-2 permutes. > (vect_permute_store_chain): Likewise. > (vect_grouped_load_supported): Likewise. > (vect_permute_load_chain): Likewise. > > Index: gcc/optabs.c > =================================================================== > --- gcc/optabs.c 2017-12-09 22:48:47.546825312 +0000 > +++ gcc/optabs.c 2017-12-09 22:48:52.266015836 +0000 > @@ -5625,15 +5625,14 @@ expand_vec_perm_var (machine_mode mode, > NULL, 0, OPTAB_DIRECT); > gcc_assert (sel != NULL); > > - /* Broadcast the low byte each element into each of its bytes. */ > - vec_perm_builder const_sel (w, w, 1); > - for (i = 0; i < w; ++i) > - { > - int this_e = i / u * u; > - if (BYTES_BIG_ENDIAN) > - this_e += u - 1; > - const_sel.quick_push (this_e); > - } > + /* Broadcast the low byte each element into each of its bytes. > + The encoding has U interleaved stepped patterns, one for each > + byte of an element. */ > + vec_perm_builder const_sel (w, u, 3); > + unsigned int low_byte_in_u = BYTES_BIG_ENDIAN ? u - 1 : 0; > + for (i = 0; i < 3; ++i) > + for (unsigned int j = 0; j < u; ++j) > + const_sel.quick_push (i * u + low_byte_in_u); > sel = gen_lowpart (qimode, sel); > sel = expand_vec_perm_const (qimode, sel, sel, const_sel, qimode, NULL); > gcc_assert (sel != NULL); > @@ -5853,16 +5852,20 @@ expand_mult_highpart (machine_mode mode, > expand_insn (optab_handler (tab2, mode), 3, eops); > m2 = gen_lowpart (mode, eops[0].value); > > - vec_perm_builder sel (nunits, nunits, 1); > + vec_perm_builder sel; > if (method == 2) > { > - for (i = 0; i < nunits; ++i) > + /* The encoding has 2 interleaved stepped patterns. */ > + sel.new_vector (nunits, 2, 3); > + for (i = 0; i < 6; ++i) > sel.quick_push (!BYTES_BIG_ENDIAN + (i & ~1) > + ((i & 1) ? nunits : 0)); > } > else > { > - for (i = 0; i < nunits; ++i) > + /* The encoding has a single interleaved stepped pattern. */ > + sel.new_vector (nunits, 1, 3); > + for (i = 0; i < 3; ++i) > sel.quick_push (2 * i + (BYTES_BIG_ENDIAN ? 0 : 1)); > } > > Index: gcc/optabs-query.c > =================================================================== > --- gcc/optabs-query.c 2017-12-09 22:48:47.545825268 +0000 > +++ gcc/optabs-query.c 2017-12-09 22:48:52.265015799 +0000 > @@ -501,8 +501,9 @@ can_mult_highpart_p (machine_mode mode, > op = uns_p ? vec_widen_umult_odd_optab : vec_widen_smult_odd_optab; > if (optab_handler (op, mode) != CODE_FOR_nothing) > { > - vec_perm_builder sel (nunits, nunits, 1); > - for (i = 0; i < nunits; ++i) > + /* The encoding has 2 interleaved stepped patterns. */ > + vec_perm_builder sel (nunits, 2, 3); > + for (i = 0; i < 6; ++i) > sel.quick_push (!BYTES_BIG_ENDIAN > + (i & ~1) > + ((i & 1) ? nunits : 0)); > @@ -518,8 +519,9 @@ can_mult_highpart_p (machine_mode mode, > op = uns_p ? vec_widen_umult_lo_optab : vec_widen_smult_lo_optab; > if (optab_handler (op, mode) != CODE_FOR_nothing) > { > - vec_perm_builder sel (nunits, nunits, 1); > - for (i = 0; i < nunits; ++i) > + /* The encoding has a single stepped pattern. */ > + vec_perm_builder sel (nunits, 1, 3); > + for (int i = 0; i < 3; ++i) > sel.quick_push (2 * i + (BYTES_BIG_ENDIAN ? 0 : 1)); > vec_perm_indices indices (sel, 2, nunits); > if (can_vec_perm_const_p (mode, indices)) > Index: gcc/tree-vect-loop.c > =================================================================== > --- gcc/tree-vect-loop.c 2017-12-09 22:48:47.547825355 +0000 > +++ gcc/tree-vect-loop.c 2017-12-09 22:48:52.267015873 +0000 > @@ -3716,8 +3716,10 @@ vect_estimate_min_profitable_iters (loop > calc_vec_perm_mask_for_shift (unsigned int offset, unsigned int nelt, > vec_perm_builder *sel) > { > - sel->new_vector (nelt, nelt, 1); > - for (unsigned int i = 0; i < nelt; i++) > + /* The encoding is a single stepped pattern. Any wrap-around is handled > + by vec_perm_indices. */ > + sel->new_vector (nelt, 1, 3); > + for (unsigned int i = 0; i < 3; i++) > sel->quick_push (i + offset); > } > > Index: gcc/tree-vect-stmts.c > =================================================================== > --- gcc/tree-vect-stmts.c 2017-12-09 22:48:50.360942531 +0000 > +++ gcc/tree-vect-stmts.c 2017-12-09 22:48:52.268015910 +0000 > @@ -1717,8 +1717,9 @@ perm_mask_for_reverse (tree vectype) > > nunits = TYPE_VECTOR_SUBPARTS (vectype); > > - vec_perm_builder sel (nunits, nunits, 1); > - for (i = 0; i < nunits; ++i) > + /* The encoding has a single stepped pattern. */ > + vec_perm_builder sel (nunits, 1, 3); > + for (i = 0; i < 3; ++i) > sel.quick_push (nunits - 1 - i); > > vec_perm_indices indices (sel, 1, nunits); > @@ -2504,8 +2505,9 @@ vectorizable_bswap (gimple *stmt, gimple > unsigned int num_bytes = TYPE_VECTOR_SUBPARTS (char_vectype); > unsigned word_bytes = num_bytes / nunits; > > - vec_perm_builder elts (num_bytes, num_bytes, 1); > - for (unsigned i = 0; i < nunits; ++i) > + /* The encoding uses one stepped pattern for each byte in the word. */ > + vec_perm_builder elts (num_bytes, word_bytes, 3); > + for (unsigned i = 0; i < 3; ++i) > for (unsigned j = 0; j < word_bytes; ++j) > elts.quick_push ((i + 1) * word_bytes - j - 1); > > Index: gcc/tree-vect-data-refs.c > =================================================================== > --- gcc/tree-vect-data-refs.c 2017-12-09 22:48:47.546825312 +0000 > +++ gcc/tree-vect-data-refs.c 2017-12-09 22:48:52.267015873 +0000 > @@ -4566,14 +4566,13 @@ vect_grouped_store_supported (tree vecty > if (VECTOR_MODE_P (mode)) > { > unsigned int i, nelt = GET_MODE_NUNITS (mode); > - vec_perm_builder sel (nelt, nelt, 1); > - sel.quick_grow (nelt); > - > if (count == 3) > { > unsigned int j0 = 0, j1 = 0, j2 = 0; > unsigned int i, j; > > + vec_perm_builder sel (nelt, nelt, 1); > + sel.quick_grow (nelt); > vec_perm_indices indices; > for (j = 0; j < 3; j++) > { > @@ -4623,7 +4622,10 @@ vect_grouped_store_supported (tree vecty > /* If length is not equal to 3 then only power of 2 is supported. */ > gcc_assert (pow2p_hwi (count)); > > - for (i = 0; i < nelt / 2; i++) > + /* The encoding has 2 interleaved stepped patterns. */ > + vec_perm_builder sel (nelt, 2, 3); > + sel.quick_grow (6); > + for (i = 0; i < 3; i++) > { > sel[i * 2] = i; > sel[i * 2 + 1] = i + nelt; > @@ -4631,7 +4633,7 @@ vect_grouped_store_supported (tree vecty > vec_perm_indices indices (sel, 2, nelt); > if (can_vec_perm_const_p (mode, indices)) > { > - for (i = 0; i < nelt; i++) > + for (i = 0; i < 6; i++) > sel[i] += nelt / 2; > indices.new_vector (sel, 2, nelt); > if (can_vec_perm_const_p (mode, indices)) > @@ -4736,9 +4738,6 @@ vect_permute_store_chain (vec<tree> dr_c > unsigned int i, n, log_length = exact_log2 (length); > unsigned int j, nelt = TYPE_VECTOR_SUBPARTS (vectype); > > - vec_perm_builder sel (nelt, nelt, 1); > - sel.quick_grow (nelt); > - > result_chain->quick_grow (length); > memcpy (result_chain->address (), dr_chain.address (), > length * sizeof (tree)); > @@ -4747,6 +4746,8 @@ vect_permute_store_chain (vec<tree> dr_c > { > unsigned int j0 = 0, j1 = 0, j2 = 0; > > + vec_perm_builder sel (nelt, nelt, 1); > + sel.quick_grow (nelt); > vec_perm_indices indices; > for (j = 0; j < 3; j++) > { > @@ -4808,7 +4809,10 @@ vect_permute_store_chain (vec<tree> dr_c > /* If length is not equal to 3 then only power of 2 is supported. */ > gcc_assert (pow2p_hwi (length)); > > - for (i = 0, n = nelt / 2; i < n; i++) > + /* The encoding has 2 interleaved stepped patterns. */ > + vec_perm_builder sel (nelt, 2, 3); > + sel.quick_grow (6); > + for (i = 0; i < 3; i++) > { > sel[i * 2] = i; > sel[i * 2 + 1] = i + nelt; > @@ -4816,7 +4820,7 @@ vect_permute_store_chain (vec<tree> dr_c > vec_perm_indices indices (sel, 2, nelt); > perm_mask_high = vect_gen_perm_mask_checked (vectype, indices); > > - for (i = 0; i < nelt; i++) > + for (i = 0; i < 6; i++) > sel[i] += nelt / 2; > indices.new_vector (sel, 2, nelt); > perm_mask_low = vect_gen_perm_mask_checked (vectype, indices); > @@ -5164,11 +5168,11 @@ vect_grouped_load_supported (tree vectyp > if (VECTOR_MODE_P (mode)) > { > unsigned int i, j, nelt = GET_MODE_NUNITS (mode); > - vec_perm_builder sel (nelt, nelt, 1); > - sel.quick_grow (nelt); > > if (count == 3) > { > + vec_perm_builder sel (nelt, nelt, 1); > + sel.quick_grow (nelt); > vec_perm_indices indices; > unsigned int k; > for (k = 0; k < 3; k++) > @@ -5209,12 +5213,15 @@ vect_grouped_load_supported (tree vectyp > /* If length is not equal to 3 then only power of 2 is supported. */ > gcc_assert (pow2p_hwi (count)); > > - for (i = 0; i < nelt; i++) > + /* The encoding has a single stepped pattern. */ > + vec_perm_builder sel (nelt, 1, 3); > + sel.quick_grow (3); > + for (i = 0; i < 3; i++) > sel[i] = i * 2; > vec_perm_indices indices (sel, 2, nelt); > if (can_vec_perm_const_p (mode, indices)) > { > - for (i = 0; i < nelt; i++) > + for (i = 0; i < 3; i++) > sel[i] = i * 2 + 1; > indices.new_vector (sel, 2, nelt); > if (can_vec_perm_const_p (mode, indices)) > @@ -5332,9 +5339,6 @@ vect_permute_load_chain (vec<tree> dr_ch > unsigned int i, j, log_length = exact_log2 (length); > unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype); > > - vec_perm_builder sel (nelt, nelt, 1); > - sel.quick_grow (nelt); > - > result_chain->quick_grow (length); > memcpy (result_chain->address (), dr_chain.address (), > length * sizeof (tree)); > @@ -5343,6 +5347,8 @@ vect_permute_load_chain (vec<tree> dr_ch > { > unsigned int k; > > + vec_perm_builder sel (nelt, nelt, 1); > + sel.quick_grow (nelt); > vec_perm_indices indices; > for (k = 0; k < 3; k++) > { > @@ -5390,12 +5396,15 @@ vect_permute_load_chain (vec<tree> dr_ch > /* If length is not equal to 3 then only power of 2 is supported. */ > gcc_assert (pow2p_hwi (length)); > > - for (i = 0; i < nelt; ++i) > + /* The encoding has a single stepped pattern. */ > + vec_perm_builder sel (nelt, 1, 3); > + sel.quick_grow (3); > + for (i = 0; i < 3; ++i) > sel[i] = i * 2; > vec_perm_indices indices (sel, 2, nelt); > perm_mask_even = vect_gen_perm_mask_checked (vectype, indices); > > - for (i = 0; i < nelt; ++i) > + for (i = 0; i < 3; ++i) > sel[i] = i * 2 + 1; > indices.new_vector (sel, 2, nelt); > perm_mask_odd = vect_gen_perm_mask_checked (vectype, indices);
On Sun, Dec 10, 2017 at 12:21 AM, Richard Sandiford <richard.sandiford@linaro.org> wrote: > This patch makes users of vec_perm_builders use the compressed encoding > where possible. This means that they work with variable-length vectors. > Ok. Richard. > 2017-12-09 Richard Sandiford <richard.sandiford@linaro.org> > > gcc/ > * optabs.c (expand_vec_perm_var): Use an explicit encoding for > the broadcast of the low byte. > (expand_mult_highpart): Use an explicit encoding for the permutes. > * optabs-query.c (can_mult_highpart_p): Likewise. > * tree-vect-loop.c (calc_vec_perm_mask_for_shift): Likewise. > * tree-vect-stmts.c (perm_mask_for_reverse): Likewise. > (vectorizable_bswap): Likewise. > * tree-vect-data-refs.c (vect_grouped_store_supported): Use an > explicit encoding for the power-of-2 permutes. > (vect_permute_store_chain): Likewise. > (vect_grouped_load_supported): Likewise. > (vect_permute_load_chain): Likewise. > > Index: gcc/optabs.c > =================================================================== > --- gcc/optabs.c 2017-12-09 22:48:47.546825312 +0000 > +++ gcc/optabs.c 2017-12-09 22:48:52.266015836 +0000 > @@ -5625,15 +5625,14 @@ expand_vec_perm_var (machine_mode mode, > NULL, 0, OPTAB_DIRECT); > gcc_assert (sel != NULL); > > - /* Broadcast the low byte each element into each of its bytes. */ > - vec_perm_builder const_sel (w, w, 1); > - for (i = 0; i < w; ++i) > - { > - int this_e = i / u * u; > - if (BYTES_BIG_ENDIAN) > - this_e += u - 1; > - const_sel.quick_push (this_e); > - } > + /* Broadcast the low byte each element into each of its bytes. > + The encoding has U interleaved stepped patterns, one for each > + byte of an element. */ > + vec_perm_builder const_sel (w, u, 3); > + unsigned int low_byte_in_u = BYTES_BIG_ENDIAN ? u - 1 : 0; > + for (i = 0; i < 3; ++i) > + for (unsigned int j = 0; j < u; ++j) > + const_sel.quick_push (i * u + low_byte_in_u); > sel = gen_lowpart (qimode, sel); > sel = expand_vec_perm_const (qimode, sel, sel, const_sel, qimode, NULL); > gcc_assert (sel != NULL); > @@ -5853,16 +5852,20 @@ expand_mult_highpart (machine_mode mode, > expand_insn (optab_handler (tab2, mode), 3, eops); > m2 = gen_lowpart (mode, eops[0].value); > > - vec_perm_builder sel (nunits, nunits, 1); > + vec_perm_builder sel; > if (method == 2) > { > - for (i = 0; i < nunits; ++i) > + /* The encoding has 2 interleaved stepped patterns. */ > + sel.new_vector (nunits, 2, 3); > + for (i = 0; i < 6; ++i) > sel.quick_push (!BYTES_BIG_ENDIAN + (i & ~1) > + ((i & 1) ? nunits : 0)); > } > else > { > - for (i = 0; i < nunits; ++i) > + /* The encoding has a single interleaved stepped pattern. */ > + sel.new_vector (nunits, 1, 3); > + for (i = 0; i < 3; ++i) > sel.quick_push (2 * i + (BYTES_BIG_ENDIAN ? 0 : 1)); > } > > Index: gcc/optabs-query.c > =================================================================== > --- gcc/optabs-query.c 2017-12-09 22:48:47.545825268 +0000 > +++ gcc/optabs-query.c 2017-12-09 22:48:52.265015799 +0000 > @@ -501,8 +501,9 @@ can_mult_highpart_p (machine_mode mode, > op = uns_p ? vec_widen_umult_odd_optab : vec_widen_smult_odd_optab; > if (optab_handler (op, mode) != CODE_FOR_nothing) > { > - vec_perm_builder sel (nunits, nunits, 1); > - for (i = 0; i < nunits; ++i) > + /* The encoding has 2 interleaved stepped patterns. */ > + vec_perm_builder sel (nunits, 2, 3); > + for (i = 0; i < 6; ++i) > sel.quick_push (!BYTES_BIG_ENDIAN > + (i & ~1) > + ((i & 1) ? nunits : 0)); > @@ -518,8 +519,9 @@ can_mult_highpart_p (machine_mode mode, > op = uns_p ? vec_widen_umult_lo_optab : vec_widen_smult_lo_optab; > if (optab_handler (op, mode) != CODE_FOR_nothing) > { > - vec_perm_builder sel (nunits, nunits, 1); > - for (i = 0; i < nunits; ++i) > + /* The encoding has a single stepped pattern. */ > + vec_perm_builder sel (nunits, 1, 3); > + for (int i = 0; i < 3; ++i) > sel.quick_push (2 * i + (BYTES_BIG_ENDIAN ? 0 : 1)); > vec_perm_indices indices (sel, 2, nunits); > if (can_vec_perm_const_p (mode, indices)) > Index: gcc/tree-vect-loop.c > =================================================================== > --- gcc/tree-vect-loop.c 2017-12-09 22:48:47.547825355 +0000 > +++ gcc/tree-vect-loop.c 2017-12-09 22:48:52.267015873 +0000 > @@ -3716,8 +3716,10 @@ vect_estimate_min_profitable_iters (loop > calc_vec_perm_mask_for_shift (unsigned int offset, unsigned int nelt, > vec_perm_builder *sel) > { > - sel->new_vector (nelt, nelt, 1); > - for (unsigned int i = 0; i < nelt; i++) > + /* The encoding is a single stepped pattern. Any wrap-around is handled > + by vec_perm_indices. */ > + sel->new_vector (nelt, 1, 3); > + for (unsigned int i = 0; i < 3; i++) > sel->quick_push (i + offset); > } > > Index: gcc/tree-vect-stmts.c > =================================================================== > --- gcc/tree-vect-stmts.c 2017-12-09 22:48:50.360942531 +0000 > +++ gcc/tree-vect-stmts.c 2017-12-09 22:48:52.268015910 +0000 > @@ -1717,8 +1717,9 @@ perm_mask_for_reverse (tree vectype) > > nunits = TYPE_VECTOR_SUBPARTS (vectype); > > - vec_perm_builder sel (nunits, nunits, 1); > - for (i = 0; i < nunits; ++i) > + /* The encoding has a single stepped pattern. */ > + vec_perm_builder sel (nunits, 1, 3); > + for (i = 0; i < 3; ++i) > sel.quick_push (nunits - 1 - i); > > vec_perm_indices indices (sel, 1, nunits); > @@ -2504,8 +2505,9 @@ vectorizable_bswap (gimple *stmt, gimple > unsigned int num_bytes = TYPE_VECTOR_SUBPARTS (char_vectype); > unsigned word_bytes = num_bytes / nunits; > > - vec_perm_builder elts (num_bytes, num_bytes, 1); > - for (unsigned i = 0; i < nunits; ++i) > + /* The encoding uses one stepped pattern for each byte in the word. */ > + vec_perm_builder elts (num_bytes, word_bytes, 3); > + for (unsigned i = 0; i < 3; ++i) > for (unsigned j = 0; j < word_bytes; ++j) > elts.quick_push ((i + 1) * word_bytes - j - 1); > > Index: gcc/tree-vect-data-refs.c > =================================================================== > --- gcc/tree-vect-data-refs.c 2017-12-09 22:48:47.546825312 +0000 > +++ gcc/tree-vect-data-refs.c 2017-12-09 22:48:52.267015873 +0000 > @@ -4566,14 +4566,13 @@ vect_grouped_store_supported (tree vecty > if (VECTOR_MODE_P (mode)) > { > unsigned int i, nelt = GET_MODE_NUNITS (mode); > - vec_perm_builder sel (nelt, nelt, 1); > - sel.quick_grow (nelt); > - > if (count == 3) > { > unsigned int j0 = 0, j1 = 0, j2 = 0; > unsigned int i, j; > > + vec_perm_builder sel (nelt, nelt, 1); > + sel.quick_grow (nelt); > vec_perm_indices indices; > for (j = 0; j < 3; j++) > { > @@ -4623,7 +4622,10 @@ vect_grouped_store_supported (tree vecty > /* If length is not equal to 3 then only power of 2 is supported. */ > gcc_assert (pow2p_hwi (count)); > > - for (i = 0; i < nelt / 2; i++) > + /* The encoding has 2 interleaved stepped patterns. */ > + vec_perm_builder sel (nelt, 2, 3); > + sel.quick_grow (6); > + for (i = 0; i < 3; i++) > { > sel[i * 2] = i; > sel[i * 2 + 1] = i + nelt; > @@ -4631,7 +4633,7 @@ vect_grouped_store_supported (tree vecty > vec_perm_indices indices (sel, 2, nelt); > if (can_vec_perm_const_p (mode, indices)) > { > - for (i = 0; i < nelt; i++) > + for (i = 0; i < 6; i++) > sel[i] += nelt / 2; > indices.new_vector (sel, 2, nelt); > if (can_vec_perm_const_p (mode, indices)) > @@ -4736,9 +4738,6 @@ vect_permute_store_chain (vec<tree> dr_c > unsigned int i, n, log_length = exact_log2 (length); > unsigned int j, nelt = TYPE_VECTOR_SUBPARTS (vectype); > > - vec_perm_builder sel (nelt, nelt, 1); > - sel.quick_grow (nelt); > - > result_chain->quick_grow (length); > memcpy (result_chain->address (), dr_chain.address (), > length * sizeof (tree)); > @@ -4747,6 +4746,8 @@ vect_permute_store_chain (vec<tree> dr_c > { > unsigned int j0 = 0, j1 = 0, j2 = 0; > > + vec_perm_builder sel (nelt, nelt, 1); > + sel.quick_grow (nelt); > vec_perm_indices indices; > for (j = 0; j < 3; j++) > { > @@ -4808,7 +4809,10 @@ vect_permute_store_chain (vec<tree> dr_c > /* If length is not equal to 3 then only power of 2 is supported. */ > gcc_assert (pow2p_hwi (length)); > > - for (i = 0, n = nelt / 2; i < n; i++) > + /* The encoding has 2 interleaved stepped patterns. */ > + vec_perm_builder sel (nelt, 2, 3); > + sel.quick_grow (6); > + for (i = 0; i < 3; i++) > { > sel[i * 2] = i; > sel[i * 2 + 1] = i + nelt; > @@ -4816,7 +4820,7 @@ vect_permute_store_chain (vec<tree> dr_c > vec_perm_indices indices (sel, 2, nelt); > perm_mask_high = vect_gen_perm_mask_checked (vectype, indices); > > - for (i = 0; i < nelt; i++) > + for (i = 0; i < 6; i++) > sel[i] += nelt / 2; > indices.new_vector (sel, 2, nelt); > perm_mask_low = vect_gen_perm_mask_checked (vectype, indices); > @@ -5164,11 +5168,11 @@ vect_grouped_load_supported (tree vectyp > if (VECTOR_MODE_P (mode)) > { > unsigned int i, j, nelt = GET_MODE_NUNITS (mode); > - vec_perm_builder sel (nelt, nelt, 1); > - sel.quick_grow (nelt); > > if (count == 3) > { > + vec_perm_builder sel (nelt, nelt, 1); > + sel.quick_grow (nelt); > vec_perm_indices indices; > unsigned int k; > for (k = 0; k < 3; k++) > @@ -5209,12 +5213,15 @@ vect_grouped_load_supported (tree vectyp > /* If length is not equal to 3 then only power of 2 is supported. */ > gcc_assert (pow2p_hwi (count)); > > - for (i = 0; i < nelt; i++) > + /* The encoding has a single stepped pattern. */ > + vec_perm_builder sel (nelt, 1, 3); > + sel.quick_grow (3); > + for (i = 0; i < 3; i++) > sel[i] = i * 2; > vec_perm_indices indices (sel, 2, nelt); > if (can_vec_perm_const_p (mode, indices)) > { > - for (i = 0; i < nelt; i++) > + for (i = 0; i < 3; i++) > sel[i] = i * 2 + 1; > indices.new_vector (sel, 2, nelt); > if (can_vec_perm_const_p (mode, indices)) > @@ -5332,9 +5339,6 @@ vect_permute_load_chain (vec<tree> dr_ch > unsigned int i, j, log_length = exact_log2 (length); > unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype); > > - vec_perm_builder sel (nelt, nelt, 1); > - sel.quick_grow (nelt); > - > result_chain->quick_grow (length); > memcpy (result_chain->address (), dr_chain.address (), > length * sizeof (tree)); > @@ -5343,6 +5347,8 @@ vect_permute_load_chain (vec<tree> dr_ch > { > unsigned int k; > > + vec_perm_builder sel (nelt, nelt, 1); > + sel.quick_grow (nelt); > vec_perm_indices indices; > for (k = 0; k < 3; k++) > { > @@ -5390,12 +5396,15 @@ vect_permute_load_chain (vec<tree> dr_ch > /* If length is not equal to 3 then only power of 2 is supported. */ > gcc_assert (pow2p_hwi (length)); > > - for (i = 0; i < nelt; ++i) > + /* The encoding has a single stepped pattern. */ > + vec_perm_builder sel (nelt, 1, 3); > + sel.quick_grow (3); > + for (i = 0; i < 3; ++i) > sel[i] = i * 2; > vec_perm_indices indices (sel, 2, nelt); > perm_mask_even = vect_gen_perm_mask_checked (vectype, indices); > > - for (i = 0; i < nelt; ++i) > + for (i = 0; i < 3; ++i) > sel[i] = i * 2 + 1; > indices.new_vector (sel, 2, nelt); > perm_mask_odd = vect_gen_perm_mask_checked (vectype, indices);
Index: gcc/optabs.c =================================================================== --- gcc/optabs.c 2017-12-09 22:48:47.546825312 +0000 +++ gcc/optabs.c 2017-12-09 22:48:52.266015836 +0000 @@ -5625,15 +5625,14 @@ expand_vec_perm_var (machine_mode mode, NULL, 0, OPTAB_DIRECT); gcc_assert (sel != NULL); - /* Broadcast the low byte each element into each of its bytes. */ - vec_perm_builder const_sel (w, w, 1); - for (i = 0; i < w; ++i) - { - int this_e = i / u * u; - if (BYTES_BIG_ENDIAN) - this_e += u - 1; - const_sel.quick_push (this_e); - } + /* Broadcast the low byte each element into each of its bytes. + The encoding has U interleaved stepped patterns, one for each + byte of an element. */ + vec_perm_builder const_sel (w, u, 3); + unsigned int low_byte_in_u = BYTES_BIG_ENDIAN ? u - 1 : 0; + for (i = 0; i < 3; ++i) + for (unsigned int j = 0; j < u; ++j) + const_sel.quick_push (i * u + low_byte_in_u); sel = gen_lowpart (qimode, sel); sel = expand_vec_perm_const (qimode, sel, sel, const_sel, qimode, NULL); gcc_assert (sel != NULL); @@ -5853,16 +5852,20 @@ expand_mult_highpart (machine_mode mode, expand_insn (optab_handler (tab2, mode), 3, eops); m2 = gen_lowpart (mode, eops[0].value); - vec_perm_builder sel (nunits, nunits, 1); + vec_perm_builder sel; if (method == 2) { - for (i = 0; i < nunits; ++i) + /* The encoding has 2 interleaved stepped patterns. */ + sel.new_vector (nunits, 2, 3); + for (i = 0; i < 6; ++i) sel.quick_push (!BYTES_BIG_ENDIAN + (i & ~1) + ((i & 1) ? nunits : 0)); } else { - for (i = 0; i < nunits; ++i) + /* The encoding has a single interleaved stepped pattern. */ + sel.new_vector (nunits, 1, 3); + for (i = 0; i < 3; ++i) sel.quick_push (2 * i + (BYTES_BIG_ENDIAN ? 0 : 1)); } Index: gcc/optabs-query.c =================================================================== --- gcc/optabs-query.c 2017-12-09 22:48:47.545825268 +0000 +++ gcc/optabs-query.c 2017-12-09 22:48:52.265015799 +0000 @@ -501,8 +501,9 @@ can_mult_highpart_p (machine_mode mode, op = uns_p ? vec_widen_umult_odd_optab : vec_widen_smult_odd_optab; if (optab_handler (op, mode) != CODE_FOR_nothing) { - vec_perm_builder sel (nunits, nunits, 1); - for (i = 0; i < nunits; ++i) + /* The encoding has 2 interleaved stepped patterns. */ + vec_perm_builder sel (nunits, 2, 3); + for (i = 0; i < 6; ++i) sel.quick_push (!BYTES_BIG_ENDIAN + (i & ~1) + ((i & 1) ? nunits : 0)); @@ -518,8 +519,9 @@ can_mult_highpart_p (machine_mode mode, op = uns_p ? vec_widen_umult_lo_optab : vec_widen_smult_lo_optab; if (optab_handler (op, mode) != CODE_FOR_nothing) { - vec_perm_builder sel (nunits, nunits, 1); - for (i = 0; i < nunits; ++i) + /* The encoding has a single stepped pattern. */ + vec_perm_builder sel (nunits, 1, 3); + for (int i = 0; i < 3; ++i) sel.quick_push (2 * i + (BYTES_BIG_ENDIAN ? 0 : 1)); vec_perm_indices indices (sel, 2, nunits); if (can_vec_perm_const_p (mode, indices)) Index: gcc/tree-vect-loop.c =================================================================== --- gcc/tree-vect-loop.c 2017-12-09 22:48:47.547825355 +0000 +++ gcc/tree-vect-loop.c 2017-12-09 22:48:52.267015873 +0000 @@ -3716,8 +3716,10 @@ vect_estimate_min_profitable_iters (loop calc_vec_perm_mask_for_shift (unsigned int offset, unsigned int nelt, vec_perm_builder *sel) { - sel->new_vector (nelt, nelt, 1); - for (unsigned int i = 0; i < nelt; i++) + /* The encoding is a single stepped pattern. Any wrap-around is handled + by vec_perm_indices. */ + sel->new_vector (nelt, 1, 3); + for (unsigned int i = 0; i < 3; i++) sel->quick_push (i + offset); } Index: gcc/tree-vect-stmts.c =================================================================== --- gcc/tree-vect-stmts.c 2017-12-09 22:48:50.360942531 +0000 +++ gcc/tree-vect-stmts.c 2017-12-09 22:48:52.268015910 +0000 @@ -1717,8 +1717,9 @@ perm_mask_for_reverse (tree vectype) nunits = TYPE_VECTOR_SUBPARTS (vectype); - vec_perm_builder sel (nunits, nunits, 1); - for (i = 0; i < nunits; ++i) + /* The encoding has a single stepped pattern. */ + vec_perm_builder sel (nunits, 1, 3); + for (i = 0; i < 3; ++i) sel.quick_push (nunits - 1 - i); vec_perm_indices indices (sel, 1, nunits); @@ -2504,8 +2505,9 @@ vectorizable_bswap (gimple *stmt, gimple unsigned int num_bytes = TYPE_VECTOR_SUBPARTS (char_vectype); unsigned word_bytes = num_bytes / nunits; - vec_perm_builder elts (num_bytes, num_bytes, 1); - for (unsigned i = 0; i < nunits; ++i) + /* The encoding uses one stepped pattern for each byte in the word. */ + vec_perm_builder elts (num_bytes, word_bytes, 3); + for (unsigned i = 0; i < 3; ++i) for (unsigned j = 0; j < word_bytes; ++j) elts.quick_push ((i + 1) * word_bytes - j - 1); Index: gcc/tree-vect-data-refs.c =================================================================== --- gcc/tree-vect-data-refs.c 2017-12-09 22:48:47.546825312 +0000 +++ gcc/tree-vect-data-refs.c 2017-12-09 22:48:52.267015873 +0000 @@ -4566,14 +4566,13 @@ vect_grouped_store_supported (tree vecty if (VECTOR_MODE_P (mode)) { unsigned int i, nelt = GET_MODE_NUNITS (mode); - vec_perm_builder sel (nelt, nelt, 1); - sel.quick_grow (nelt); - if (count == 3) { unsigned int j0 = 0, j1 = 0, j2 = 0; unsigned int i, j; + vec_perm_builder sel (nelt, nelt, 1); + sel.quick_grow (nelt); vec_perm_indices indices; for (j = 0; j < 3; j++) { @@ -4623,7 +4622,10 @@ vect_grouped_store_supported (tree vecty /* If length is not equal to 3 then only power of 2 is supported. */ gcc_assert (pow2p_hwi (count)); - for (i = 0; i < nelt / 2; i++) + /* The encoding has 2 interleaved stepped patterns. */ + vec_perm_builder sel (nelt, 2, 3); + sel.quick_grow (6); + for (i = 0; i < 3; i++) { sel[i * 2] = i; sel[i * 2 + 1] = i + nelt; @@ -4631,7 +4633,7 @@ vect_grouped_store_supported (tree vecty vec_perm_indices indices (sel, 2, nelt); if (can_vec_perm_const_p (mode, indices)) { - for (i = 0; i < nelt; i++) + for (i = 0; i < 6; i++) sel[i] += nelt / 2; indices.new_vector (sel, 2, nelt); if (can_vec_perm_const_p (mode, indices)) @@ -4736,9 +4738,6 @@ vect_permute_store_chain (vec<tree> dr_c unsigned int i, n, log_length = exact_log2 (length); unsigned int j, nelt = TYPE_VECTOR_SUBPARTS (vectype); - vec_perm_builder sel (nelt, nelt, 1); - sel.quick_grow (nelt); - result_chain->quick_grow (length); memcpy (result_chain->address (), dr_chain.address (), length * sizeof (tree)); @@ -4747,6 +4746,8 @@ vect_permute_store_chain (vec<tree> dr_c { unsigned int j0 = 0, j1 = 0, j2 = 0; + vec_perm_builder sel (nelt, nelt, 1); + sel.quick_grow (nelt); vec_perm_indices indices; for (j = 0; j < 3; j++) { @@ -4808,7 +4809,10 @@ vect_permute_store_chain (vec<tree> dr_c /* If length is not equal to 3 then only power of 2 is supported. */ gcc_assert (pow2p_hwi (length)); - for (i = 0, n = nelt / 2; i < n; i++) + /* The encoding has 2 interleaved stepped patterns. */ + vec_perm_builder sel (nelt, 2, 3); + sel.quick_grow (6); + for (i = 0; i < 3; i++) { sel[i * 2] = i; sel[i * 2 + 1] = i + nelt; @@ -4816,7 +4820,7 @@ vect_permute_store_chain (vec<tree> dr_c vec_perm_indices indices (sel, 2, nelt); perm_mask_high = vect_gen_perm_mask_checked (vectype, indices); - for (i = 0; i < nelt; i++) + for (i = 0; i < 6; i++) sel[i] += nelt / 2; indices.new_vector (sel, 2, nelt); perm_mask_low = vect_gen_perm_mask_checked (vectype, indices); @@ -5164,11 +5168,11 @@ vect_grouped_load_supported (tree vectyp if (VECTOR_MODE_P (mode)) { unsigned int i, j, nelt = GET_MODE_NUNITS (mode); - vec_perm_builder sel (nelt, nelt, 1); - sel.quick_grow (nelt); if (count == 3) { + vec_perm_builder sel (nelt, nelt, 1); + sel.quick_grow (nelt); vec_perm_indices indices; unsigned int k; for (k = 0; k < 3; k++) @@ -5209,12 +5213,15 @@ vect_grouped_load_supported (tree vectyp /* If length is not equal to 3 then only power of 2 is supported. */ gcc_assert (pow2p_hwi (count)); - for (i = 0; i < nelt; i++) + /* The encoding has a single stepped pattern. */ + vec_perm_builder sel (nelt, 1, 3); + sel.quick_grow (3); + for (i = 0; i < 3; i++) sel[i] = i * 2; vec_perm_indices indices (sel, 2, nelt); if (can_vec_perm_const_p (mode, indices)) { - for (i = 0; i < nelt; i++) + for (i = 0; i < 3; i++) sel[i] = i * 2 + 1; indices.new_vector (sel, 2, nelt); if (can_vec_perm_const_p (mode, indices)) @@ -5332,9 +5339,6 @@ vect_permute_load_chain (vec<tree> dr_ch unsigned int i, j, log_length = exact_log2 (length); unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype); - vec_perm_builder sel (nelt, nelt, 1); - sel.quick_grow (nelt); - result_chain->quick_grow (length); memcpy (result_chain->address (), dr_chain.address (), length * sizeof (tree)); @@ -5343,6 +5347,8 @@ vect_permute_load_chain (vec<tree> dr_ch { unsigned int k; + vec_perm_builder sel (nelt, nelt, 1); + sel.quick_grow (nelt); vec_perm_indices indices; for (k = 0; k < 3; k++) { @@ -5390,12 +5396,15 @@ vect_permute_load_chain (vec<tree> dr_ch /* If length is not equal to 3 then only power of 2 is supported. */ gcc_assert (pow2p_hwi (length)); - for (i = 0; i < nelt; ++i) + /* The encoding has a single stepped pattern. */ + vec_perm_builder sel (nelt, 1, 3); + sel.quick_grow (3); + for (i = 0; i < 3; ++i) sel[i] = i * 2; vec_perm_indices indices (sel, 2, nelt); perm_mask_even = vect_gen_perm_mask_checked (vectype, indices); - for (i = 0; i < nelt; ++i) + for (i = 0; i < 3; ++i) sel[i] = i * 2 + 1; indices.new_vector (sel, 2, nelt); perm_mask_odd = vect_gen_perm_mask_checked (vectype, indices);