===================================================================
@@ -351,6 +351,12 @@ typedef struct _loop_vec_info : public v
on inactive scalars. */
vec_loop_masks masks;
+ /* If we are using a loop mask to align memory addresses, this variable
+ contains the number of vector elements that we should skip in the
+ first iteration of the vector loop (i.e. the number of leading
+ elements that should be false in the first mask). */
+ tree mask_skip_niters;
+
/* Type of the variables to use in the WHILE_ULT call for fully-masked
loops. */
tree mask_compare_type;
@@ -480,6 +486,7 @@ #define LOOP_VINFO_FULLY_MASKED_P(L)
#define LOOP_VINFO_VECT_FACTOR(L) (L)->vectorization_factor
#define LOOP_VINFO_MAX_VECT_FACTOR(L) (L)->max_vectorization_factor
#define LOOP_VINFO_MASKS(L) (L)->masks
+#define LOOP_VINFO_MASK_SKIP_NITERS(L) (L)->mask_skip_niters
#define LOOP_VINFO_MASK_COMPARE_TYPE(L) (L)->mask_compare_type
#define LOOP_VINFO_PTR_MASK(L) (L)->ptr_mask
#define LOOP_VINFO_LOOP_NEST(L) (L)->loop_nest
@@ -1230,6 +1237,17 @@ unlimited_cost_model (loop_p loop)
return (flag_vect_cost_model == VECT_COST_MODEL_UNLIMITED);
}
+/* Return true if the loop described by LOOP_VINFO is fully-masked and
+ if the first iteration should use a partial mask in order to achieve
+ alignment. */
+
+static inline bool
+vect_use_loop_mask_for_alignment_p (loop_vec_info loop_vinfo)
+{
+ return (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
+ && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo));
+}
+
/* Return the number of vectors of type VECTYPE that are needed to get
NUNITS elements. NUNITS should be based on the vectorization factor,
so it is always a known multiple of the number of elements in VECTYPE. */
@@ -1328,6 +1346,7 @@ extern void vect_loop_versioning (loop_v
poly_uint64);
extern struct loop *vect_do_peeling (loop_vec_info, tree, tree,
tree *, tree *, tree *, int, bool, bool);
+extern void vect_prepare_for_masked_peels (loop_vec_info);
extern source_location find_loop_location (struct loop *);
extern bool vect_can_advance_ivs_p (loop_vec_info);
@@ -1392,6 +1411,7 @@ extern tree vect_gen_perm_mask_any (tree
extern tree vect_gen_perm_mask_checked (tree, vec_perm_indices);
extern void optimize_mask_stores (struct loop*);
extern gcall *vect_gen_while (tree, tree, tree);
+extern tree vect_gen_while_not (gimple_seq *, tree, tree, tree);
/* In tree-vect-data-refs.c. */
extern bool vect_can_force_dr_alignment_p (const_tree, unsigned int);
===================================================================
@@ -362,6 +362,11 @@ vect_maybe_permute_loop_masks (gimple_se
times and has been vectorized according to LOOP_VINFO. Each iteration
of the vectorized loop handles VF iterations of the scalar loop.
+ If NITERS_SKIP is nonnull, the first iteration of the vectorized loop
+ starts with NITERS_SKIP dummy iterations of the scalar loop before
+ the real work starts. The mask elements for these dummy iterations
+ must be 0, to ensure that the extra iterations do not have an effect.
+
It is known that:
NITERS * RGM->max_nscalars_per_iter
@@ -373,7 +378,7 @@ vect_maybe_permute_loop_masks (gimple_se
might overflow before hitting a value above:
- NITERS * RGM->max_nscalars_per_iter
+ (NITERS + NITERS_SKIP) * RGM->max_nscalars_per_iter
This means that we cannot guarantee that such an induction variable
would ever hit a value that produces a set of all-false masks for RGM. */
@@ -383,7 +388,8 @@ vect_set_loop_masks_directly (struct loo
gimple_seq *preheader_seq,
gimple_stmt_iterator loop_cond_gsi,
rgroup_masks *rgm, tree vf,
- tree niters, bool might_wrap_p)
+ tree niters, tree niters_skip,
+ bool might_wrap_p)
{
tree compare_type = LOOP_VINFO_MASK_COMPARE_TYPE (loop_vinfo);
tree mask_type = rgm->mask_type;
@@ -391,10 +397,12 @@ vect_set_loop_masks_directly (struct loo
poly_uint64 nscalars_per_mask = TYPE_VECTOR_SUBPARTS (mask_type);
/* Calculate the maximum number of scalar values that the rgroup
- handles in total and the number that it handles for each iteration
- of the vector loop. */
+ handles in total, the number that it handles for each iteration
+ of the vector loop, and the number that it should skip during the
+ first iteration of the vector loop. */
tree nscalars_total = niters;
tree nscalars_step = vf;
+ tree nscalars_skip = niters_skip;
if (nscalars_per_iter != 1)
{
/* We checked before choosing to use a fully-masked loop that these
@@ -404,6 +412,9 @@ vect_set_loop_masks_directly (struct loo
nscalars_total, factor);
nscalars_step = gimple_build (preheader_seq, MULT_EXPR, compare_type,
nscalars_step, factor);
+ if (nscalars_skip)
+ nscalars_skip = gimple_build (preheader_seq, MULT_EXPR, compare_type,
+ nscalars_skip, factor);
}
/* Create an induction variable that counts the number of scalars
@@ -416,29 +427,66 @@ vect_set_loop_masks_directly (struct loo
create_iv (zero_index, nscalars_step, NULL_TREE, loop, &incr_gsi,
insert_after, &index_before_incr, &index_after_incr);
- tree test_index, test_limit;
+ tree test_index, test_limit, first_limit;
gimple_stmt_iterator *test_gsi;
if (might_wrap_p)
{
/* In principle the loop should stop iterating once the incremented
- IV reaches a value greater than or equal to NSCALAR_TOTAL.
- However, there's no guarantee that the IV hits a value above
- this value before wrapping around. We therefore adjust the
- limit down by one IV step:
+ IV reaches a value greater than or equal to:
+
+ NSCALARS_TOTAL +[infinite-prec] NSCALARS_SKIP
+
+ However, there's no guarantee that this addition doesn't overflow
+ the comparison type, or that the IV hits a value above it before
+ wrapping around. We therefore adjust the limit down by one
+ IV step:
- NSCALARS_TOTAL -[infinite-prec] NSCALARS_STEP
+ (NSCALARS_TOTAL +[infinite-prec] NSCALARS_SKIP)
+ -[infinite-prec] NSCALARS_STEP
and compare the IV against this limit _before_ incrementing it.
Since the comparison type is unsigned, we actually want the
subtraction to saturate at zero:
- NSCALARS_TOTAL -[sat] NSCALARS_STEP. */
+ (NSCALARS_TOTAL +[infinite-prec] NSCALARS_SKIP)
+ -[sat] NSCALARS_STEP
+
+ And since NSCALARS_SKIP < NSCALARS_STEP, we can reassociate this as:
+
+ NSCALARS_TOTAL -[sat] (NSCALARS_STEP - NSCALARS_SKIP)
+
+ where the rightmost subtraction can be done directly in
+ COMPARE_TYPE. */
test_index = index_before_incr;
+ tree adjust = nscalars_step;
+ if (nscalars_skip)
+ adjust = gimple_build (preheader_seq, MINUS_EXPR, compare_type,
+ adjust, nscalars_skip);
test_limit = gimple_build (preheader_seq, MAX_EXPR, compare_type,
- nscalars_total, nscalars_step);
+ nscalars_total, adjust);
test_limit = gimple_build (preheader_seq, MINUS_EXPR, compare_type,
- test_limit, nscalars_step);
+ test_limit, adjust);
test_gsi = &incr_gsi;
+
+ /* Get a safe limit for the first iteration. */
+ if (nscalars_skip)
+ {
+ /* The first vector iteration can handle at most NSCALARS_STEP
+ scalars. NSCALARS_STEP <= CONST_LIMIT, and adding
+ NSCALARS_SKIP to that cannot overflow. */
+ tree const_limit = build_int_cst (compare_type,
+ LOOP_VINFO_VECT_FACTOR (loop_vinfo)
+ * nscalars_per_iter);
+ first_limit = gimple_build (preheader_seq, MIN_EXPR, compare_type,
+ nscalars_total, const_limit);
+ first_limit = gimple_build (preheader_seq, PLUS_EXPR, compare_type,
+ first_limit, nscalars_skip);
+ }
+ else
+ /* For the first iteration it doesn't matter whether the IV hits
+ a value above NSCALARS_TOTAL. That only matters for the latch
+ condition. */
+ first_limit = nscalars_total;
}
else
{
@@ -446,7 +494,12 @@ vect_set_loop_masks_directly (struct loo
the bound before wrapping. */
test_index = index_after_incr;
test_limit = nscalars_total;
+ if (nscalars_skip)
+ test_limit = gimple_build (preheader_seq, PLUS_EXPR, compare_type,
+ test_limit, nscalars_skip);
test_gsi = &loop_cond_gsi;
+
+ first_limit = test_limit;
}
/* Provide a definition of each mask in the group. */
@@ -465,7 +518,7 @@ vect_set_loop_masks_directly (struct loo
to have a full mask. */
poly_uint64 const_limit;
bool first_iteration_full
- = (poly_int_tree_p (nscalars_total, &const_limit)
+ = (poly_int_tree_p (first_limit, &const_limit)
&& must_ge (const_limit, (i + 1) * nscalars_per_mask));
/* Rather than have a new IV that starts at BIAS and goes up to
@@ -482,12 +535,13 @@ vect_set_loop_masks_directly (struct loo
bias_tree);
}
- /* Create the initial mask. */
+ /* Create the initial mask. First include all scalars that
+ are within the loop limit. */
tree init_mask = NULL_TREE;
if (!first_iteration_full)
{
tree start, end;
- if (nscalars_total == test_limit)
+ if (first_limit == test_limit)
{
/* Use a natural test between zero (the initial IV value)
and the loop limit. The "else" block would be valid too,
@@ -498,8 +552,11 @@ vect_set_loop_masks_directly (struct loo
}
else
{
+ /* FIRST_LIMIT is the maximum number of scalars handled by the
+ first iteration of the vector loop. Test the portion
+ associated with this mask. */
start = bias_tree;
- end = nscalars_total;
+ end = first_limit;
}
init_mask = make_temp_ssa_name (mask_type, NULL, "max_mask");
@@ -507,6 +564,22 @@ vect_set_loop_masks_directly (struct loo
gimple_seq_add_stmt (preheader_seq, tmp_stmt);
}
+ /* Now AND out the bits that are within the number of skipped
+ scalars. */
+ poly_uint64 const_skip;
+ if (nscalars_skip
+ && !(poly_int_tree_p (nscalars_skip, &const_skip)
+ && must_le (const_skip, bias)))
+ {
+ tree unskipped_mask = vect_gen_while_not (preheader_seq, mask_type,
+ bias_tree, nscalars_skip);
+ if (init_mask)
+ init_mask = gimple_build (preheader_seq, BIT_AND_EXPR, mask_type,
+ init_mask, unskipped_mask);
+ else
+ init_mask = unskipped_mask;
+ }
+
if (!init_mask)
/* First iteration is full. */
init_mask = build_minus_one_cst (mask_type);
@@ -564,6 +637,9 @@ vect_set_loop_condition_masked (struct l
else
niters = gimple_convert (&preheader_seq, compare_type, niters);
+ /* Get the number of initially-skipped elements; vect_prepare_for_masked_peels
+ has already recorded it in the mask compare type. */
+ tree niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
+
/* Now calculate the value that the induction variable must be able
to hit in order to ensure that we end the loop with an all-false mask.
This involves adding the maximum number of inactive trailing scalar
@@ -572,6 +648,15 @@ vect_set_loop_condition_masked (struct l
bool known_max_iters = max_loop_iterations (loop, &iv_limit);
if (known_max_iters)
{
+ if (niters_skip)
+ {
+ /* Add the maximum number of skipped iterations to the
+ maximum iteration count. */
+ if (TREE_CODE (niters_skip) == INTEGER_CST)
+ iv_limit += wi::to_widest (niters_skip);
+ else
+ iv_limit += max_vf - 1;
+ }
/* IV_LIMIT is the maximum number of latch iterations, which is also
the maximum in-range IV value. Round this value down to the previous
vector alignment boundary and then add an extra full iteration. */
@@ -617,7 +702,8 @@ vect_set_loop_condition_masked (struct l
test_mask = vect_set_loop_masks_directly (loop, loop_vinfo,
&preheader_seq,
loop_cond_gsi, rgm, vf,
- niters, might_wrap_p);
+ niters, niters_skip,
+ might_wrap_p);
}
/* Emit all accumulated statements. */
@@ -1439,6 +1525,46 @@ vect_update_ivs_after_vectorizer (loop_v
}
}
+/* Return a gimple value containing the misalignment (measured in vector
+ elements) for the loop described by LOOP_VINFO, i.e. how many elements
+ it is away from a perfectly aligned address. Add any new statements
+ to SEQ. */
+
+static tree
+get_misalign_in_elems (gimple **seq, loop_vec_info loop_vinfo)
+{
+ struct data_reference *dr = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
+ gimple *dr_stmt = DR_STMT (dr);
+ stmt_vec_info stmt_info = vinfo_for_stmt (dr_stmt);
+ tree vectype = STMT_VINFO_VECTYPE (stmt_info);
+
+ unsigned int target_align = DR_TARGET_ALIGNMENT (dr);
+ gcc_assert (target_align != 0);
+
+ bool negative = tree_int_cst_compare (DR_STEP (dr), size_zero_node) < 0;
+ tree offset = (negative
+ ? size_int (-TYPE_VECTOR_SUBPARTS (vectype) + 1)
+ : size_zero_node);
+ tree start_addr = vect_create_addr_base_for_vector_ref (dr_stmt, seq,
+ offset);
+ tree type = unsigned_type_for (TREE_TYPE (start_addr));
+ tree target_align_minus_1 = build_int_cst (type, target_align - 1);
+ HOST_WIDE_INT elem_size
+ = int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
+ tree elem_size_log = build_int_cst (type, exact_log2 (elem_size));
+
+ /* Create: misalign_in_bytes = addr & (target_align - 1). */
+ tree int_start_addr = fold_convert (type, start_addr);
+ tree misalign_in_bytes = fold_build2 (BIT_AND_EXPR, type, int_start_addr,
+ target_align_minus_1);
+
+ /* Create: misalign_in_elems = misalign_in_bytes / element_size. */
+ tree misalign_in_elems = fold_build2 (RSHIFT_EXPR, type, misalign_in_bytes,
+ elem_size_log);
+
+ return misalign_in_elems;
+}
+
/* Function vect_gen_prolog_loop_niters
Generate the number of iterations which should be peeled as prolog for the
@@ -1450,7 +1576,7 @@ vect_update_ivs_after_vectorizer (loop_v
If the misalignment of DR is known at compile time:
addr_mis = int mis = DR_MISALIGNMENT (dr);
Else, compute address misalignment in bytes:
- addr_mis = addr & (vectype_align - 1)
+ addr_mis = addr & (target_align - 1)
prolog_niters = ((VF - addr_mis/elem_size)&(VF-1))/step
@@ -1497,33 +1623,17 @@ vect_gen_prolog_loop_niters (loop_vec_in
}
else
{
- bool negative = tree_int_cst_compare (DR_STEP (dr), size_zero_node) < 0;
- tree offset = negative
- ? size_int (-TYPE_VECTOR_SUBPARTS (vectype) + 1) : size_zero_node;
- tree start_addr = vect_create_addr_base_for_vector_ref (dr_stmt,
- &stmts, offset);
- tree type = unsigned_type_for (TREE_TYPE (start_addr));
- tree target_align_minus_1 = build_int_cst (type, target_align - 1);
+ tree misalign_in_elems = get_misalign_in_elems (&stmts, loop_vinfo);
+ tree type = TREE_TYPE (misalign_in_elems);
HOST_WIDE_INT elem_size
= int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
- tree elem_size_log = build_int_cst (type, exact_log2 (elem_size));
HOST_WIDE_INT align_in_elems = target_align / elem_size;
tree align_in_elems_minus_1 = build_int_cst (type, align_in_elems - 1);
tree align_in_elems_tree = build_int_cst (type, align_in_elems);
- tree misalign_in_bytes;
- tree misalign_in_elems;
-
- /* Create: misalign_in_bytes = addr & (target_align - 1). */
- misalign_in_bytes
- = fold_build2 (BIT_AND_EXPR, type, fold_convert (type, start_addr),
- target_align_minus_1);
-
- /* Create: misalign_in_elems = misalign_in_bytes / element_size. */
- misalign_in_elems
- = fold_build2 (RSHIFT_EXPR, type, misalign_in_bytes, elem_size_log);
/* Create: (niters_type) ((align_in_elems - misalign_in_elems)
& (align_in_elems - 1)). */
+ bool negative = tree_int_cst_compare (DR_STEP (dr), size_zero_node) < 0;
if (negative)
iters = fold_build2 (MINUS_EXPR, type, misalign_in_elems,
align_in_elems_tree);
@@ -1563,20 +1673,22 @@ vect_gen_prolog_loop_niters (loop_vec_in
/* Function vect_update_init_of_dr
- NITERS iterations were peeled from LOOP. DR represents a data reference
- in LOOP. This function updates the information recorded in DR to
- account for the fact that the first NITERS iterations had already been
- executed. Specifically, it updates the OFFSET field of DR. */
+ If CODE is PLUS, the vector loop starts NITERS iterations after the
+ scalar one, otherwise CODE is MINUS and the vector loop starts NITERS
+ iterations before the scalar one (using masking to skip inactive
+ elements). This function updates the information recorded in DR to
+ account for the difference. Specifically, it updates the OFFSET
+ field of DR. */
static void
-vect_update_init_of_dr (struct data_reference *dr, tree niters)
+vect_update_init_of_dr (struct data_reference *dr, tree niters, tree_code code)
{
tree offset = DR_OFFSET (dr);
niters = fold_build2 (MULT_EXPR, sizetype,
fold_convert (sizetype, niters),
fold_convert (sizetype, DR_STEP (dr)));
- offset = fold_build2 (PLUS_EXPR, sizetype,
+ offset = fold_build2 (code, sizetype,
fold_convert (sizetype, offset), niters);
DR_OFFSET (dr) = offset;
}
@@ -1584,14 +1696,12 @@ vect_update_init_of_dr (struct data_refe
/* Function vect_update_inits_of_drs
- NITERS iterations were peeled from the loop represented by LOOP_VINFO.
- This function updates the information recorded for the data references in
- the loop to account for the fact that the first NITERS iterations had
- already been executed. Specifically, it updates the initial_condition of
- the access_function of all the data_references in the loop. */
+ Apply vect_update_init_of_dr to all accesses in LOOP_VINFO.
+ CODE and NITERS are as for vect_update_init_of_dr. */
static void
-vect_update_inits_of_drs (loop_vec_info loop_vinfo, tree niters)
+vect_update_inits_of_drs (loop_vec_info loop_vinfo, tree niters,
+ tree_code code)
{
unsigned int i;
vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
@@ -1618,9 +1728,57 @@ vect_update_inits_of_drs (loop_vec_info
}
FOR_EACH_VEC_ELT (datarefs, i, dr)
- vect_update_init_of_dr (dr, niters);
+ vect_update_init_of_dr (dr, niters, code);
}
+/* For the information recorded in LOOP_VINFO prepare the loop for peeling
+ by masking. This involves calculating the number of iterations to
+ be peeled and then aligning all memory references appropriately. */
+
+void
+vect_prepare_for_masked_peels (loop_vec_info loop_vinfo)
+{
+ tree misalign_in_elems;
+ tree type = LOOP_VINFO_MASK_COMPARE_TYPE (loop_vinfo);
+
+ gcc_assert (vect_use_loop_mask_for_alignment_p (loop_vinfo));
+
+ /* From the information recorded in LOOP_VINFO get the number of iterations
+ that need to be skipped via masking. */
+ if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) > 0)
+ {
+ poly_int64 misalign = (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
+ - LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo));
+ misalign_in_elems = build_int_cst (type, misalign);
+ }
+ else
+ {
+ gimple_seq seq1 = NULL, seq2 = NULL;
+ misalign_in_elems = get_misalign_in_elems (&seq1, loop_vinfo);
+ misalign_in_elems = fold_convert (type, misalign_in_elems);
+ misalign_in_elems = force_gimple_operand (misalign_in_elems,
+ &seq2, true, NULL_TREE);
+ gimple_seq_add_seq (&seq1, seq2);
+ if (seq1)
+ {
+ edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
+ basic_block new_bb = gsi_insert_seq_on_edge_immediate (pe, seq1);
+ gcc_assert (!new_bb);
+ }
+ }
+
+ if (dump_enabled_p ())
+ {
+ dump_printf_loc (MSG_NOTE, vect_location,
+ "misalignment for fully-masked loop: ");
+ dump_generic_expr (MSG_NOTE, TDF_SLIM, misalign_in_elems);
+ dump_printf (MSG_NOTE, "\n");
+ }
+
+ LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo) = misalign_in_elems;
+
+ vect_update_inits_of_drs (loop_vinfo, misalign_in_elems, MINUS_EXPR);
+}
/* This function builds ni_name = number of iterations. Statements
are emitted on the loop preheader edge. If NEW_VAR_P is not NULL, set
@@ -2226,7 +2384,9 @@ vect_do_peeling (loop_vec_info loop_vinf
int bound_prolog = 0;
poly_uint64 bound_scalar = 0;
int estimated_vf;
- int prolog_peeling = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
+ int prolog_peeling = 0;
+ if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
+ prolog_peeling = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
bool epilog_peeling = (LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
|| LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo));
@@ -2343,7 +2503,7 @@ vect_do_peeling (loop_vec_info loop_vinf
scale_loop_profile (prolog, prob_prolog, bound_prolog);
}
/* Update init address of DRs. */
- vect_update_inits_of_drs (loop_vinfo, niters_prolog);
+ vect_update_inits_of_drs (loop_vinfo, niters_prolog, PLUS_EXPR);
/* Update niters for vector loop. */
LOOP_VINFO_NITERS (loop_vinfo)
= fold_build2 (MINUS_EXPR, type, niters, niters_prolog);
===================================================================
@@ -1119,6 +1119,7 @@ _loop_vec_info::_loop_vec_info (struct l
versioning_threshold (0),
vectorization_factor (0),
max_vectorization_factor (0),
+ mask_skip_niters (NULL_TREE),
mask_compare_type (NULL_TREE),
unaligned_dr (NULL),
peeling_for_alignment (0),
@@ -2266,16 +2267,6 @@ vect_analyze_loop_2 (loop_vec_info loop_
" gaps is required.\n");
}
- if (LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo)
- && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo))
- {
- LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
- if (dump_enabled_p ())
- dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
- "can't use a fully-masked loop because peeling for"
- " alignment is required.\n");
- }
-
/* Decide whether to use a fully-masked loop for this vectorization
factor. */
LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
@@ -2376,18 +2367,21 @@ vect_analyze_loop_2 (loop_vec_info loop_
increase threshold for this case if necessary. */
if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
{
- poly_uint64 niters_th;
+ poly_uint64 niters_th = 0;
- /* Niters for peeled prolog loop. */
- if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
+ if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
{
- struct data_reference *dr = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
- tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (DR_STMT (dr)));
-
- niters_th = TYPE_VECTOR_SUBPARTS (vectype) - 1;
+ /* Niters for peeled prolog loop. */
+ if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
+ {
+ struct data_reference *dr = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
+ tree vectype
+ = STMT_VINFO_VECTYPE (vinfo_for_stmt (DR_STMT (dr)));
+ niters_th += TYPE_VECTOR_SUBPARTS (vectype) - 1;
+ }
+ else
+ niters_th += LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
}
- else
- niters_th = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
/* Niters for at least one iteration of vectorized loop. */
if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
@@ -7167,9 +7161,28 @@ vectorizable_induction (gimple *phi,
init_expr = PHI_ARG_DEF_FROM_EDGE (phi,
loop_preheader_edge (iv_loop));
- /* Convert the step to the desired type. */
+ /* Convert the initial value and step to the desired type. */
stmts = NULL;
+ init_expr = gimple_convert (&stmts, TREE_TYPE (vectype), init_expr);
step_expr = gimple_convert (&stmts, TREE_TYPE (vectype), step_expr);
+
+ /* If we are using the loop mask to "peel" for alignment then we need
+ to adjust the start value here. */
+ tree skip_niters = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
+ if (skip_niters != NULL_TREE)
+ {
+ if (FLOAT_TYPE_P (vectype))
+ skip_niters = gimple_build (&stmts, FLOAT_EXPR, TREE_TYPE (vectype),
+ skip_niters);
+ else
+ skip_niters = gimple_convert (&stmts, TREE_TYPE (vectype),
+ skip_niters);
+ tree skip_step = gimple_build (&stmts, MULT_EXPR, TREE_TYPE (vectype),
+ skip_niters, step_expr);
+ init_expr = gimple_build (&stmts, MINUS_EXPR, TREE_TYPE (vectype),
+ init_expr, skip_step);
+ }
+
if (stmts)
{
new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
@@ -8040,6 +8053,11 @@ vect_transform_loop (loop_vec_info loop_
split_edge (loop_preheader_edge (loop));
+ if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
+ && vect_use_loop_mask_for_alignment_p (loop_vinfo))
+ /* This will deal with any possible peeling. */
+ vect_prepare_for_masked_peels (loop_vinfo);
+
/* FORNOW: the vectorizer supports only loops which body consist
of one basic block (header + empty latch). When the vectorizer will
support more involved loop forms, the order by which the BBs are
@@ -8319,29 +8337,40 @@ vect_transform_loop (loop_vec_info loop_
/* +1 to convert latch counts to loop iteration counts,
-min_epilogue_iters to remove iterations that cannot be performed
by the vector code. */
- int bias = 1 - min_epilogue_iters;
+ int bias_for_lowest = 1 - min_epilogue_iters;
+ int bias_for_assumed = bias_for_lowest;
+ int alignment_npeels = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
+ if (alignment_npeels && LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
+ {
+ /* When the amount of peeling is known at compile time, the first
+ iteration will have exactly alignment_npeels active elements.
+ In the worst case it will have at least one. */
+ int min_first_active = (alignment_npeels > 0 ? alignment_npeels : 1);
+ bias_for_lowest += lowest_vf - min_first_active;
+ bias_for_assumed += assumed_vf - min_first_active;
+ }
/* In these calculations the "- 1" converts loop iteration counts
back to latch counts. */
if (loop->any_upper_bound)
loop->nb_iterations_upper_bound
= (final_iter_may_be_partial
- ? wi::udiv_ceil (loop->nb_iterations_upper_bound + bias,
+ ? wi::udiv_ceil (loop->nb_iterations_upper_bound + bias_for_lowest,
lowest_vf) - 1
- : wi::udiv_floor (loop->nb_iterations_upper_bound + bias,
+ : wi::udiv_floor (loop->nb_iterations_upper_bound + bias_for_lowest,
lowest_vf) - 1);
if (loop->any_likely_upper_bound)
loop->nb_iterations_likely_upper_bound
= (final_iter_may_be_partial
- ? wi::udiv_ceil (loop->nb_iterations_likely_upper_bound + bias,
- lowest_vf) - 1
- : wi::udiv_floor (loop->nb_iterations_likely_upper_bound + bias,
- lowest_vf) - 1);
+ ? wi::udiv_ceil (loop->nb_iterations_likely_upper_bound
+ + bias_for_lowest, lowest_vf) - 1
+ : wi::udiv_floor (loop->nb_iterations_likely_upper_bound
+ + bias_for_lowest, lowest_vf) - 1);
if (loop->any_estimate)
loop->nb_iterations_estimate
= (final_iter_may_be_partial
- ? wi::udiv_ceil (loop->nb_iterations_estimate + bias,
+ ? wi::udiv_ceil (loop->nb_iterations_estimate + bias_for_assumed,
assumed_vf) - 1
- : wi::udiv_floor (loop->nb_iterations_estimate + bias,
+ : wi::udiv_floor (loop->nb_iterations_estimate + bias_for_assumed,
assumed_vf) - 1);
if (dump_enabled_p ())
===================================================================
@@ -9975,3 +9975,16 @@ vect_gen_while (tree mask, tree start_in
gimple_call_set_lhs (call, mask);
return call;
}
+
+/* Generate a vector mask of type MASK_TYPE for which index I is false iff
+ J + START_INDEX < END_INDEX for all J <= I. Add the statements to SEQ. */
+
+tree
+vect_gen_while_not (gimple_seq *seq, tree mask_type, tree start_index,
+ tree end_index)
+{
+ tree tmp = make_ssa_name (mask_type);
+ gcall *call = vect_gen_while (tmp, start_index, end_index);
+ gimple_seq_add_stmt (seq, call);
+ return gimple_build (seq, BIT_NOT_EXPR, mask_type, tmp);
+}
===================================================================
@@ -0,0 +1,39 @@
+/* { dg-options "-O2 -ftree-vectorize -march=armv8-a+sve -msve-vector-bits=256" } */
+
+#include <stdint.h>
+
+#define TEST(NAME, TYPE) \
+ void \
+ NAME##1 (TYPE *x, int n) \
+ { \
+ for (int i = 0; i < n; ++i) \
+ x[i] += 1; \
+ } \
+ TYPE NAME##_array[1024]; \
+ void \
+ NAME##2 (void) \
+ { \
+ for (int i = 1; i < 200; ++i) \
+ NAME##_array[i] += 1; \
+ }
+
+TEST (s8, int8_t)
+TEST (u8, uint8_t)
+TEST (s16, int16_t)
+TEST (u16, uint16_t)
+TEST (s32, int32_t)
+TEST (u32, uint32_t)
+TEST (s64, int64_t)
+TEST (u64, uint64_t)
+TEST (f16, _Float16)
+TEST (f32, float)
+TEST (f64, double)
+
+/* No scalar memory accesses. */
+/* { dg-final { scan-assembler-not {[wx][0-9]*, \[} } } */
+/* 2 for each NAME##1 test, one in the header and one in the main loop
+ and 1 for each NAME##2 test, in the main loop only. */
+/* { dg-final { scan-assembler-times {\twhilelo\tp[0-7]\.b,} 6 } } */
+/* { dg-final { scan-assembler-times {\twhilelo\tp[0-7]\.h,} 9 } } */
+/* { dg-final { scan-assembler-times {\twhilelo\tp[0-7]\.s,} 9 } } */
+/* { dg-final { scan-assembler-times {\twhilelo\tp[0-7]\.d,} 9 } } */
===================================================================
@@ -0,0 +1,27 @@
+/* { dg-do compile } */
+/* Pick an arbitrary target for which unaligned accesses are more
+ expensive. */
+/* { dg-options "-O3 -march=armv8-a+sve -msve-vector-bits=256 -mtune=thunderx" } */
+
+#define N 512
+#define START 1
+#define END 505
+
+int x[N] __attribute__((aligned(32)));
+
+void __attribute__((noinline, noclone))
+foo (void)
+{
+ unsigned int v = 0;
+ for (unsigned int i = START; i < END; ++i)
+ {
+ x[i] = v;
+ v += 5;
+ }
+}
+
+/* We should operate on aligned vectors. */
+/* { dg-final { scan-assembler {\tadrp\tx[0-9]+, x\n} } } */
+/* We should use an induction that starts at -5, with only the last
+ 7 elements of the first iteration being active. */
+/* { dg-final { scan-assembler {\tindex\tz[0-9]+\.s, #-5, #5\n} } } */
===================================================================
@@ -0,0 +1,18 @@
+/* { dg-do run { target aarch64_sve_hw } } */
+/* { dg-options "-O3 -march=armv8-a+sve -mtune=thunderx" } */
+/* { dg-options "-O3 -march=armv8-a+sve -mtune=thunderx -msve-vector-bits=256" { target aarch64_sve256_hw } } */
+
+#include "sve_peel_ind_1.c"
+
+int __attribute__ ((optimize (1)))
+main (void)
+{
+ foo ();
+ for (int i = 0; i < N; ++i)
+ {
+ if (x[i] != (i < START || i >= END ? 0 : (i - START) * 5))
+ __builtin_abort ();
+ asm volatile ("" ::: "memory");
+ }
+ return 0;
+}
===================================================================
@@ -0,0 +1,22 @@
+/* { dg-do compile } */
+/* Pick an arbitrary target for which unaligned accesses are more
+ expensive. */
+/* { dg-options "-O3 -march=armv8-a+sve -msve-vector-bits=256 -mtune=thunderx" } */
+
+#define N 512
+#define START 7
+#define END 22
+
+int x[N] __attribute__((aligned(32)));
+
+void __attribute__((noinline, noclone))
+foo (void)
+{
+ for (unsigned int i = START; i < END; ++i)
+ x[i] = i;
+}
+
+/* We should operate on aligned vectors. */
+/* { dg-final { scan-assembler {\tadrp\tx[0-9]+, x\n} } } */
+/* We should unroll the loop three times. */
+/* { dg-final { scan-assembler-times "\tst1w\t" 3 } } */
===================================================================
@@ -0,0 +1,18 @@
+/* { dg-do run { target aarch64_sve_hw } } */
+/* { dg-options "-O3 -march=armv8-a+sve -mtune=thunderx" } */
+/* { dg-options "-O3 -march=armv8-a+sve -mtune=thunderx -msve-vector-bits=256" { target aarch64_sve256_hw } } */
+
+#include "sve_peel_ind_2.c"
+
+int __attribute__ ((optimize (1)))
+main (void)
+{
+ foo ();
+ for (int i = 0; i < N; ++i)
+ {
+ if (x[i] != (i < START || i >= END ? 0 : i))
+ __builtin_abort ();
+ asm volatile ("" ::: "memory");
+ }
+ return 0;
+}
===================================================================
@@ -0,0 +1,21 @@
+/* { dg-do compile } */
+/* Pick an arbitrary target for which unaligned accesses are more
+ expensive. */
+/* { dg-options "-O3 -march=armv8-a+sve -msve-vector-bits=256 -mtune=thunderx" } */
+
+#define N 32
+#define MAX_START 8
+#define COUNT 16
+
+int x[MAX_START][N] __attribute__((aligned(32)));
+
+void __attribute__((noinline, noclone))
+foo (int start)
+{
+ for (int i = start; i < start + COUNT; ++i)
+ x[start][i] = i;
+}
+
+/* We should operate on aligned vectors. */
+/* { dg-final { scan-assembler {\tadrp\tx[0-9]+, x\n} } } */
+/* { dg-final { scan-assembler {\tubfx\t} } } */
===================================================================
@@ -0,0 +1,21 @@
+/* { dg-do run { target aarch64_sve_hw } } */
+/* { dg-options "-O3 -march=armv8-a+sve -mtune=thunderx" } */
+/* { dg-options "-O3 -march=armv8-a+sve -mtune=thunderx -msve-vector-bits=256" { target aarch64_sve256_hw } } */
+
+#include "sve_peel_ind_3.c"
+
+int __attribute__ ((optimize (1)))
+main (void)
+{
+ for (int start = 0; start < MAX_START; ++start)
+ {
+ foo (start);
+ for (int i = 0; i < N; ++i)
+ {
+ if (x[start][i] != (i < start || i >= start + COUNT ? 0 : i))
+ __builtin_abort ();
+ asm volatile ("" ::: "memory");
+ }
+ }
+ return 0;
+}
===================================================================
@@ -0,0 +1,21 @@
+/* { dg-do compile } */
+/* Pick an arbitrary target for which unaligned accesses are more
+ expensive. */
+/* { dg-options "-Ofast -march=armv8-a+sve -msve-vector-bits=256 -mtune=thunderx -fno-vect-cost-model" } */
+
+#define START 1
+#define END 505
+
+void __attribute__((noinline, noclone))
+foo (double *x)
+{
+ double v = 10.0;
+ for (unsigned int i = START; i < END; ++i)
+ {
+ x[i] = v;
+ v += 5.0;
+ }
+}
+
+/* We should operate on aligned vectors. */
+/* { dg-final { scan-assembler {\tubfx\t} } } */
===================================================================
@@ -0,0 +1,29 @@
+/* { dg-do run { target aarch64_sve_hw } } */
+/* { dg-options "-Ofast -march=armv8-a+sve -mtune=thunderx" } */
+/* { dg-options "-Ofast -march=armv8-a+sve -mtune=thunderx -msve-vector-bits=256" { target aarch64_sve256_hw } } */
+
+#include "sve_peel_ind_4.c"
+
+int __attribute__ ((optimize (1)))
+main (void)
+{
+ double x[END + 1];
+ for (int i = 0; i < END + 1; ++i)
+ {
+ x[i] = i;
+ asm volatile ("" ::: "memory");
+ }
+ foo (x);
+ for (int i = 0; i < END + 1; ++i)
+ {
+ double expected;
+ if (i < START || i >= END)
+ expected = i;
+ else
+ expected = 10 + (i - START) * 5;
+ if (x[i] != expected)
+ __builtin_abort ();
+ asm volatile ("" ::: "memory");
+ }
+ return 0;
+}