===================================================================
@@ -4905,6 +4905,35 @@ for (j = 0; j < GET_MODE_NUNITS (@var{n}
This pattern is not allowed to @code{FAIL}.
+@cindex @code{gather_load@var{m}} instruction pattern
+@item @samp{gather_load@var{m}}
+Load several separate memory locations into a vector of mode @var{m}.
+Operand 1 is a scalar base address and operand 2 is a vector of
+offsets from that base. Operand 0 is a destination vector with the
+same number of elements as the offsets. For each element index @var{i}:
+
+@itemize @bullet
+@item
+extend the offset element @var{i} to address width, using zero
+extension if operand 3 is 1 and sign extension if operand 3 is 0;
+@item
+multiply the extended offset by operand 4;
+@item
+add the result to the base; and
+@item
+load the value at that address into element @var{i} of operand 0.
+@end itemize
+
+The value of operand 3 does not matter if the offsets are already
+address width.
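+
+As an illustration, each element of the result is loaded roughly as in
+the following C-like pseudocode (a sketch only; @code{elem_type},
+@code{extend} and @code{scale} are illustrative names for the element
+type of mode @var{m}, the extension selected by operand 3 and the value
+of operand 4, with operand 1 treated as an address-width integer):
+
+@smallexample
+op0[i] = *(elem_type *) (op1 + extend (op2[i]) * scale);
+@end smallexample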
+
+@cindex @code{mask_gather_load@var{m}} instruction pattern
+@item @samp{mask_gather_load@var{m}}
+Like @samp{gather_load@var{m}}, but takes an extra mask operand as
+operand 5. Bit @var{i} of the mask is set if element @var{i}
+of the result should be loaded from memory and clear if element @var{i}
+of the result should be set to zero.
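+
+In the same pseudocode, and keeping the illustrative names above, the
+masked form behaves roughly as follows, with operand 5 written as
+@code{mask}:
+
+@smallexample
+op0[i] = mask[i] ? *(elem_type *) (op1 + extend (op2[i]) * scale) : 0;
+@end smallexample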
+
@cindex @code{vec_set@var{m}} instruction pattern
@item @samp{vec_set@var{m}}
Set given field in the vector value. Operand 0 is the vector to modify,
===================================================================
@@ -234,6 +234,11 @@ main (int argc, const char **argv)
"struct target_optabs {\n"
" /* Patterns that are used by optabs that are enabled for this target. */\n"
" bool pat_enable[NUM_OPTAB_PATTERNS];\n"
+ "\n"
+ " /* Cache if the target supports vec_gather_load for at least one vector\n"
+ " mode. */\n"
+ " bool supports_vec_gather_load;\n"
+ " bool supports_vec_gather_load_cached;\n"
"};\n"
"extern void init_all_optabs (struct target_optabs *);\n"
"\n"
===================================================================
@@ -380,7 +380,7 @@ init_tree_optimization_optabs (tree optn
if (tmp_optabs)
memset (tmp_optabs, 0, sizeof (struct target_optabs));
else
- tmp_optabs = ggc_alloc<target_optabs> ();
+ tmp_optabs = ggc_cleared_alloc<target_optabs> ();
/* Generate a new set of optabs into tmp_optabs. */
init_all_optabs (tmp_optabs);
===================================================================
@@ -390,6 +390,9 @@ OPTAB_D (atomic_xor_optab, "atomic_xor$I
OPTAB_D (get_thread_pointer_optab, "get_thread_pointer$I$a")
OPTAB_D (set_thread_pointer_optab, "set_thread_pointer$I$a")
+OPTAB_D (gather_load_optab, "gather_load$a")
+OPTAB_D (mask_gather_load_optab, "mask_gather_load$a")
+
OPTAB_DC (vec_duplicate_optab, "vec_duplicate$a", VEC_DUPLICATE)
OPTAB_DC (vec_series_optab, "vec_series$a", VEC_SERIES)
OPTAB_D (vec_shl_insert_optab, "vec_shl_insert_$a")
===================================================================
@@ -47,6 +47,7 @@ along with GCC; see the file COPYING3.
- mask_load: currently just maskload
- load_lanes: currently just vec_load_lanes
- mask_load_lanes: currently just vec_mask_load_lanes
+ - gather_load: used for {mask_,}gather_load
- mask_store: currently just maskstore
- store_lanes: currently just vec_store_lanes
@@ -110,6 +111,10 @@ DEF_INTERNAL_OPTAB_FN (LOAD_LANES, ECF_C
DEF_INTERNAL_OPTAB_FN (MASK_LOAD_LANES, ECF_PURE,
vec_mask_load_lanes, mask_load_lanes)
+DEF_INTERNAL_OPTAB_FN (GATHER_LOAD, ECF_PURE, gather_load, gather_load)
+DEF_INTERNAL_OPTAB_FN (MASK_GATHER_LOAD, ECF_PURE,
+ mask_gather_load, gather_load)
+
DEF_INTERNAL_OPTAB_FN (MASK_STORE, 0, maskstore, mask_store)
DEF_INTERNAL_OPTAB_FN (STORE_LANES, ECF_CONST, vec_store_lanes, store_lanes)
DEF_INTERNAL_OPTAB_FN (MASK_STORE_LANES, 0,
===================================================================
@@ -192,6 +192,12 @@ extern bool set_edom_supported_p (void);
extern internal_fn get_conditional_internal_fn (tree_code, tree);
+extern bool internal_load_fn_p (internal_fn);
+extern bool internal_gather_scatter_fn_p (internal_fn);
+extern int internal_fn_mask_index (internal_fn);
+extern bool internal_gather_scatter_fn_supported_p (internal_fn, tree,
+ tree, signop, int);
+
extern void expand_internal_call (gcall *);
extern void expand_internal_call (internal_fn, gcall *);
extern void expand_PHI (internal_fn, gcall *);
===================================================================
@@ -83,6 +83,7 @@ #define not_direct { -2, -2, false }
#define mask_load_direct { -1, 2, false }
#define load_lanes_direct { -1, -1, false }
#define mask_load_lanes_direct { -1, -1, false }
+#define gather_load_direct { -1, 1, false }
#define mask_store_direct { 3, 2, false }
#define store_lanes_direct { 0, 0, false }
#define mask_store_lanes_direct { 0, 0, false }
@@ -2676,6 +2677,38 @@ expand_LAUNDER (internal_fn, gcall *call
expand_assignment (lhs, gimple_call_arg (call, 0), false);
}
+/* Expand {MASK_,}GATHER_LOAD call STMT using optab OPTAB. */
+
+static void
+expand_gather_load_optab_fn (internal_fn, gcall *stmt, direct_optab optab)
+{
+ tree lhs = gimple_call_lhs (stmt);
+ tree base = gimple_call_arg (stmt, 0);
+ tree offset = gimple_call_arg (stmt, 1);
+ tree scale = gimple_call_arg (stmt, 2);
+
+ rtx lhs_rtx = expand_expr (lhs, NULL_RTX, VOIDmode, EXPAND_WRITE);
+ rtx base_rtx = expand_normal (base);
+ rtx offset_rtx = expand_normal (offset);
+ HOST_WIDE_INT scale_int = tree_to_shwi (scale);
+
+ int i = 0;
+ struct expand_operand ops[6];
+ create_output_operand (&ops[i++], lhs_rtx, TYPE_MODE (TREE_TYPE (lhs)));
+ create_address_operand (&ops[i++], base_rtx);
+ create_input_operand (&ops[i++], offset_rtx, TYPE_MODE (TREE_TYPE (offset)));
+ create_integer_operand (&ops[i++], TYPE_UNSIGNED (TREE_TYPE (offset)));
+ create_integer_operand (&ops[i++], scale_int);
+ if (optab == mask_gather_load_optab)
+ {
+ tree mask = gimple_call_arg (stmt, 3);
+ rtx mask_rtx = expand_normal (mask);
+ create_input_operand (&ops[i++], mask_rtx, TYPE_MODE (TREE_TYPE (mask)));
+ }
+ insn_code icode = direct_optab_handler (optab, TYPE_MODE (TREE_TYPE (lhs)));
+ expand_insn (icode, i, ops);
+}
+
/* Expand DIVMOD() using:
a) optab handler for udivmod/sdivmod if it is available.
b) If optab_handler doesn't exist, generate call to
@@ -2915,12 +2948,32 @@ #define direct_cond_binary_optab_support
#define direct_mask_load_optab_supported_p direct_optab_supported_p
#define direct_load_lanes_optab_supported_p multi_vector_optab_supported_p
#define direct_mask_load_lanes_optab_supported_p multi_vector_optab_supported_p
+#define direct_gather_load_optab_supported_p direct_optab_supported_p
#define direct_mask_store_optab_supported_p direct_optab_supported_p
#define direct_store_lanes_optab_supported_p multi_vector_optab_supported_p
#define direct_mask_store_lanes_optab_supported_p multi_vector_optab_supported_p
#define direct_while_optab_supported_p convert_optab_supported_p
#define direct_fold_extract_optab_supported_p direct_optab_supported_p
+/* Return the optab used by internal function FN. */
+
+static optab
+direct_internal_fn_optab (internal_fn fn)
+{
+ switch (fn)
+ {
+#define DEF_INTERNAL_FN(CODE, FLAGS, FNSPEC) \
+ case IFN_##CODE: break;
+#define DEF_INTERNAL_OPTAB_FN(CODE, FLAGS, OPTAB, TYPE) \
+ case IFN_##CODE: return OPTAB##_optab;
+#include "internal-fn.def"
+
+ case IFN_LAST:
+ break;
+ }
+ gcc_unreachable ();
+}
+
/* Return true if FN is supported for the types in TYPES when the
optimization type is OPT_TYPE. The types are those associated with
the "type0" and "type1" fields of FN's direct_internal_fn_info
@@ -3022,6 +3075,87 @@ get_conditional_internal_fn (tree_code c
}
}
+/* Return true if IFN is some form of load from memory. */
+
+bool
+internal_load_fn_p (internal_fn fn)
+{
+ switch (fn)
+ {
+ case IFN_MASK_LOAD:
+ case IFN_LOAD_LANES:
+ case IFN_MASK_LOAD_LANES:
+ case IFN_GATHER_LOAD:
+ case IFN_MASK_GATHER_LOAD:
+ return true;
+
+ default:
+ return false;
+ }
+}
+
+/* Return true if IFN is some form of gather load or scatter store. */
+
+bool
+internal_gather_scatter_fn_p (internal_fn fn)
+{
+ switch (fn)
+ {
+ case IFN_GATHER_LOAD:
+ case IFN_MASK_GATHER_LOAD:
+ return true;
+
+ default:
+ return false;
+ }
+}
+
+/* If FN takes a vector mask argument, return the index of that argument,
+ otherwise return -1. */
+
+int
+internal_fn_mask_index (internal_fn fn)
+{
+ switch (fn)
+ {
+ case IFN_MASK_LOAD:
+ case IFN_MASK_LOAD_LANES:
+ case IFN_MASK_STORE:
+ case IFN_MASK_STORE_LANES:
+ return 2;
+
+ case IFN_MASK_GATHER_LOAD:
+ return 3;
+
+ default:
+ return -1;
+ }
+}
+
+/* Return true if the target supports gather load or scatter store function
+ IFN. For loads, VECTOR_TYPE is the vector type of the load result,
+ while for stores it is the vector type of the stored data argument.
+ MEMORY_ELEMENT_TYPE is the type of the memory elements being loaded
+ or stored. OFFSET_SIGN is the sign of the offset argument, which is
+ only relevant when the offset is narrower than an address. SCALE is
+ the amount by which the offset should be multiplied *after* it has
+ been extended to address width. */
+
+bool
+internal_gather_scatter_fn_supported_p (internal_fn ifn, tree vector_type,
+ tree memory_element_type,
+ signop offset_sign, int scale)
+{
+ if (!tree_int_cst_equal (TYPE_SIZE (TREE_TYPE (vector_type)),
+ TYPE_SIZE (memory_element_type)))
+ return false;
+ optab optab = direct_internal_fn_optab (ifn);
+ insn_code icode = direct_optab_handler (optab, TYPE_MODE (vector_type));
+ return (icode != CODE_FOR_nothing
+ && insn_operand_matches (icode, 3, GEN_INT (offset_sign == UNSIGNED))
+ && insn_operand_matches (icode, 4, GEN_INT (scale)));
+}
+
/* Expand STMT as though it were a call to internal function FN. */
void
===================================================================
@@ -621,3 +621,32 @@ lshift_cheap_p (bool speed_p)
return cheap[speed_p];
}
+
+/* Return true if optab OP supports at least one mode. */
+
+static bool
+supports_at_least_one_mode_p (optab op)
+{
+ for (int i = 0; i < NUM_MACHINE_MODES; ++i)
+ if (direct_optab_handler (op, (machine_mode) i) != CODE_FOR_nothing)
+ return true;
+
+ return false;
+}
+
+/* Return true if vec_gather_load is available for at least one vector
+ mode. */
+
+bool
+supports_vec_gather_load_p ()
+{
+ if (this_fn_optabs->supports_vec_gather_load_cached)
+ return this_fn_optabs->supports_vec_gather_load;
+
+ this_fn_optabs->supports_vec_gather_load_cached = true;
+
+ this_fn_optabs->supports_vec_gather_load
+ = supports_at_least_one_mode_p (gather_load_optab);
+
+ return this_fn_optabs->supports_vec_gather_load;
+}
===================================================================
@@ -187,6 +187,7 @@ bool can_compare_and_swap_p (machine_mod
bool can_atomic_exchange_p (machine_mode, bool);
bool can_atomic_load_p (machine_mode);
bool lshift_cheap_p (bool);
+bool supports_vec_gather_load_p ();
/* Version of find_widening_optab_handler_and_mode that operates on
specific mode types. */
===================================================================
@@ -844,7 +844,12 @@ typedef struct _stmt_vec_info {
/* Information about a gather/scatter call. */
struct gather_scatter_info {
- /* The FUNCTION_DECL for the built-in gather/scatter function. */
+ /* The internal function to use for the gather/scatter operation,
+ or IFN_LAST if a built-in function should be used instead. */
+ internal_fn ifn;
+
+ /* The FUNCTION_DECL for the built-in gather/scatter function,
+ or null if an internal function should be used instead. */
tree decl;
/* The loop-invariant base value. */
@@ -862,6 +867,12 @@ struct gather_scatter_info {
/* The type of the vectorized offset. */
tree offset_vectype;
+
+ /* The type of the scalar elements after loading or before storing. */
+ tree element_type;
+
+ /* The type of the scalar elements being loaded or stored. */
+ tree memory_type;
};
/* Access Functions. */
@@ -1529,7 +1540,7 @@ extern void duplicate_and_interleave (gi
Additional pattern recognition functions can (and will) be added
in the future. */
typedef gimple *(* vect_recog_func_ptr) (vec<gimple *> *, tree *, tree *);
-#define NUM_PATTERNS 14
+#define NUM_PATTERNS 15
void vect_pattern_recog (vec_info *);
/* In tree-vectorizer.c. */
===================================================================
@@ -3296,6 +3296,74 @@ vect_prune_runtime_alias_test_list (loop
return true;
}
+/* Check whether we can use an internal function for a gather load
+ or scatter store. READ_P is true for loads and false for stores.
+ MASKED_P is true if the load or store is conditional. MEMORY_TYPE is
+ the type of the memory elements being loaded or stored. OFFSET_BITS
+ is the number of bits in each scalar offset and OFFSET_SIGN is the
+ sign of the offset. SCALE is the amount by which the offset should
+ be multiplied *after* it has been converted to address width.
+
+ Return true if the function is supported, storing the function
+ id in *IFN_OUT and the type of a vector element in *ELEMENT_TYPE_OUT. */
+
+static bool
+vect_gather_scatter_fn_p (bool read_p, bool masked_p, tree vectype,
+ tree memory_type, unsigned int offset_bits,
+ signop offset_sign, int scale,
+ internal_fn *ifn_out, tree *element_type_out)
+{
+ unsigned int memory_bits = tree_to_uhwi (TYPE_SIZE (memory_type));
+ unsigned int element_bits = tree_to_uhwi (TYPE_SIZE (TREE_TYPE (vectype)));
+ if (offset_bits > element_bits)
+ /* Internal functions require the offset to be the same width as
+ the vector elements. We can extend narrower offsets, but it isn't
+ safe to truncate wider offsets. */
+ return false;
+
+ if (element_bits != memory_bits)
+ /* For now the vector elements must be the same width as the
+ memory elements. */
+ return false;
+
+ /* Work out which function we need. */
+ internal_fn ifn;
+ if (read_p)
+ ifn = masked_p ? IFN_MASK_GATHER_LOAD : IFN_GATHER_LOAD;
+ else
+ return false;
+
+ /* Test whether the target supports this combination. */
+ if (!internal_gather_scatter_fn_supported_p (ifn, vectype, memory_type,
+ offset_sign, scale))
+ return false;
+
+ *ifn_out = ifn;
+ *element_type_out = TREE_TYPE (vectype);
+ return true;
+}
+
+/* CALL is a call to an internal gather load or scatter store function.
+ Describe the operation in INFO. */
+
+static void
+vect_describe_gather_scatter_call (gcall *call, gather_scatter_info *info)
+{
+ stmt_vec_info stmt_info = vinfo_for_stmt (call);
+ tree vectype = STMT_VINFO_VECTYPE (stmt_info);
+ data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
+
+ info->ifn = gimple_call_internal_fn (call);
+ info->decl = NULL_TREE;
+ info->base = gimple_call_arg (call, 0);
+ info->offset = gimple_call_arg (call, 1);
+ info->offset_dt = vect_unknown_def_type;
+ info->offset_vectype = NULL_TREE;
+ info->scale = TREE_INT_CST_LOW (gimple_call_arg (call, 2));
+ info->element_type = TREE_TYPE (vectype);
+ info->memory_type = TREE_TYPE (DR_REF (dr));
+}
+
/* Return true if a non-affine read or write in STMT is suitable for a
gather load or scatter store. Describe the operation in *INFO if so. */
@@ -3309,17 +3377,38 @@ vect_check_gather_scatter (gimple *stmt,
stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
tree offtype = NULL_TREE;
- tree decl, base, off;
+ tree decl = NULL_TREE, base, off;
+ tree vectype = STMT_VINFO_VECTYPE (stmt_info);
+ tree memory_type = TREE_TYPE (DR_REF (dr));
machine_mode pmode;
int punsignedp, reversep, pvolatilep = 0;
+ internal_fn ifn;
+ tree element_type;
+ bool masked_p = false;
+
+ /* See whether this is already a call to a gather/scatter internal function.
+ If not, see whether it's a masked load or store. */
+ gcall *call = dyn_cast <gcall *> (stmt);
+ if (call && gimple_call_internal_p (call))
+ {
+ ifn = gimple_call_internal_fn (stmt);
+ if (internal_gather_scatter_fn_p (ifn))
+ {
+ vect_describe_gather_scatter_call (call, info);
+ return true;
+ }
+ masked_p = (ifn == IFN_MASK_LOAD || ifn == IFN_MASK_STORE);
+ }
+
+ /* True if we should aim to use internal functions rather than
+ built-in functions. */
+ bool use_ifn_p = (DR_IS_READ (dr)
+ && supports_vec_gather_load_p ());
base = DR_REF (dr);
/* For masked loads/stores, DR_REF (dr) is an artificial MEM_REF,
see if we can use the def stmt of the address. */
- if (is_gimple_call (stmt)
- && gimple_call_internal_p (stmt)
- && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
- || gimple_call_internal_fn (stmt) == IFN_MASK_STORE)
+ if (masked_p
&& TREE_CODE (base) == MEM_REF
&& TREE_CODE (TREE_OPERAND (base, 0)) == SSA_NAME
&& integer_zerop (TREE_OPERAND (base, 1))
@@ -3450,7 +3539,17 @@ vect_check_gather_scatter (gimple *stmt,
case MULT_EXPR:
if (scale == 1 && tree_fits_shwi_p (op1))
{
- scale = tree_to_shwi (op1);
+ int new_scale = tree_to_shwi (op1);
+ /* Only treat this as a scaling operation if the target
+ supports it. */
+ if (use_ifn_p
+ && !vect_gather_scatter_fn_p (DR_IS_READ (dr), masked_p,
+ vectype, memory_type, 1,
+ TYPE_SIGN (TREE_TYPE (op0)),
+ new_scale, &ifn,
+ &element_type))
+ break;
+ scale = new_scale;
off = op0;
continue;
}
@@ -3468,6 +3567,15 @@ vect_check_gather_scatter (gimple *stmt,
off = op0;
continue;
}
+
+ /* The internal functions need the offset to be the same width
+ as the elements of VECTYPE. Don't include operations that
+ cast the offset from that width to a different width. */
+ if (use_ifn_p
+ && (int_size_in_bytes (TREE_TYPE (vectype))
+ == int_size_in_bytes (TREE_TYPE (off))))
+ break;
+
if (TYPE_PRECISION (TREE_TYPE (op0))
< TYPE_PRECISION (TREE_TYPE (off)))
{
@@ -3492,22 +3600,37 @@ vect_check_gather_scatter (gimple *stmt,
if (offtype == NULL_TREE)
offtype = TREE_TYPE (off);
- if (DR_IS_READ (dr))
- decl = targetm.vectorize.builtin_gather (STMT_VINFO_VECTYPE (stmt_info),
- offtype, scale);
+ if (use_ifn_p)
+ {
+ if (!vect_gather_scatter_fn_p (DR_IS_READ (dr), masked_p, vectype,
+ memory_type, TYPE_PRECISION (offtype),
+ TYPE_SIGN (offtype), scale, &ifn,
+ &element_type))
+ return false;
+ }
else
- decl = targetm.vectorize.builtin_scatter (STMT_VINFO_VECTYPE (stmt_info),
- offtype, scale);
+ {
+ if (DR_IS_READ (dr))
+ decl = targetm.vectorize.builtin_gather (vectype, offtype, scale);
+ else
+ decl = targetm.vectorize.builtin_scatter (vectype, offtype, scale);
- if (decl == NULL_TREE)
- return false;
+ if (!decl)
+ return false;
+
+ ifn = IFN_LAST;
+ element_type = TREE_TYPE (vectype);
+ }
+ info->ifn = ifn;
info->decl = decl;
info->base = base;
info->offset = off;
info->offset_dt = vect_unknown_def_type;
info->offset_vectype = NULL_TREE;
info->scale = scale;
+ info->element_type = element_type;
+ info->memory_type = memory_type;
return true;
}
@@ -3588,7 +3711,8 @@ vect_analyze_data_refs (vec_info *vinfo,
bool maybe_gather
= DR_IS_READ (dr)
&& !TREE_THIS_VOLATILE (DR_REF (dr))
- && targetm.vectorize.builtin_gather != NULL;
+ && (targetm.vectorize.builtin_gather != NULL
+ || supports_vec_gather_load_p ());
bool maybe_scatter
= DR_IS_WRITE (dr)
&& !TREE_THIS_VOLATILE (DR_REF (dr))
===================================================================
@@ -69,6 +69,7 @@ static gimple *vect_recog_mixed_size_con
tree *, tree *);
static gimple *vect_recog_bool_pattern (vec<gimple *> *, tree *, tree *);
static gimple *vect_recog_mask_conversion_pattern (vec<gimple *> *, tree *, tree *);
+static gimple *vect_recog_gather_scatter_pattern (vec<gimple *> *, tree *, tree *);
struct vect_recog_func
{
@@ -93,6 +94,10 @@ static vect_recog_func vect_vect_recog_f
{ vect_recog_mult_pattern, "mult" },
{ vect_recog_mixed_size_cond_pattern, "mixed_size_cond" },
{ vect_recog_bool_pattern, "bool" },
+ /* This must come before mask conversion, and includes the parts
+ of mask conversion that are needed for gather and scatter
+ internal functions. */
+ { vect_recog_gather_scatter_pattern, "gather_scatter" },
{ vect_recog_mask_conversion_pattern, "mask_conversion" }
};
@@ -4090,6 +4095,202 @@ vect_recog_mask_conversion_pattern (vec<
return pattern_stmt;
}
+/* STMT is a load or store. If the load or store is conditional, return
+ the boolean condition under which it occurs, otherwise return null. */
+
+static tree
+vect_get_load_store_mask (gimple *stmt)
+{
+ if (gassign *def_assign = dyn_cast <gassign *> (stmt))
+ {
+ gcc_assert (gimple_assign_single_p (def_assign));
+ return NULL_TREE;
+ }
+
+ if (gcall *def_call = dyn_cast <gcall *> (stmt))
+ {
+ internal_fn ifn = gimple_call_internal_fn (def_call);
+ int mask_index = internal_fn_mask_index (ifn);
+ return gimple_call_arg (def_call, mask_index);
+ }
+
+ gcc_unreachable ();
+}
+
+/* Return the scalar offset type that an internal gather/scatter function
+ should use. GS_INFO describes the gather/scatter operation. */
+
+static tree
+vect_get_gather_scatter_offset_type (gather_scatter_info *gs_info)
+{
+ tree offset_type = TREE_TYPE (gs_info->offset);
+ unsigned int element_bits = tree_to_uhwi (TYPE_SIZE (gs_info->element_type));
+
+ /* Enforced by vect_check_gather_scatter. */
+ unsigned int offset_bits = TYPE_PRECISION (offset_type);
+ gcc_assert (element_bits >= offset_bits);
+
+ /* If the offset is narrower than the elements, extend it according
+ to its sign. */
+ if (element_bits > offset_bits)
+ return build_nonstandard_integer_type (element_bits,
+ TYPE_UNSIGNED (offset_type));
+
+ return offset_type;
+}
+
+/* Return MASK if MASK is suitable for masking an operation on vectors
+ of type VECTYPE, otherwise convert it into such a form and return
+ the result. Associate any conversion statements with STMT_INFO's
+ pattern. */
+
+static tree
+vect_convert_mask_for_vectype (tree mask, tree vectype,
+ stmt_vec_info stmt_info, vec_info *vinfo)
+{
+ tree mask_type = search_type_for_mask (mask, vinfo);
+ if (mask_type)
+ {
+ tree mask_vectype = get_mask_type_for_scalar_type (mask_type);
+ if (mask_vectype
+ && may_ne (TYPE_VECTOR_SUBPARTS (vectype),
+ TYPE_VECTOR_SUBPARTS (mask_vectype)))
+ mask = build_mask_conversion (mask, vectype, stmt_info, vinfo);
+ }
+ return mask;
+}
+
+/* Return the equivalent of:
+
+ fold_convert (TYPE, VALUE)
+
+ with the expectation that the operation will be vectorized.
+ If new statements are needed, add them as pattern statements
+ to STMT_INFO. */
+
+static tree
+vect_add_conversion_to_pattern (tree type, tree value,
+ stmt_vec_info stmt_info,
+ vec_info *vinfo)
+{
+ if (useless_type_conversion_p (type, TREE_TYPE (value)))
+ return value;
+
+ tree new_value = vect_recog_temp_ssa_var (type, NULL);
+ gassign *conversion = gimple_build_assign (new_value, CONVERT_EXPR, value);
+ stmt_vec_info new_stmt_info = new_stmt_vec_info (conversion, vinfo);
+ set_vinfo_for_stmt (conversion, new_stmt_info);
+ STMT_VINFO_VECTYPE (new_stmt_info) = get_vectype_for_scalar_type (type);
+ append_pattern_def_seq (stmt_info, conversion);
+ return new_value;
+}
+
+/* Try to convert STMT into a call to a gather load or scatter store
+ internal function. Return the final statement on success and set
+ *TYPE_IN and *TYPE_OUT to the vector type being loaded or stored.
+
+ This function only handles gathers and scatters that were recognized
+ as such from the outset (indicated by STMT_VINFO_GATHER_SCATTER_P). */
+
+static gimple *
+vect_try_gather_scatter_pattern (gimple *stmt, stmt_vec_info last_stmt_info,
+ tree *type_in, tree *type_out)
+{
+ /* Currently we only support this for loop vectorization. */
+ stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
+ loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (stmt_info->vinfo);
+ if (!loop_vinfo)
+ return NULL;
+
+ /* Make sure that we're looking at a gather load or scatter store. */
+ data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
+ if (!dr || !STMT_VINFO_GATHER_SCATTER_P (stmt_info))
+ return NULL;
+
+ /* Reject stores for now. */
+ if (!DR_IS_READ (dr))
+ return NULL;
+
+ /* Get the boolean that controls whether the load or store happens.
+ This is null if the operation is unconditional. */
+ tree mask = vect_get_load_store_mask (stmt);
+
+ /* Make sure that the target supports an appropriate internal
+ function for the gather/scatter operation. */
+ gather_scatter_info gs_info;
+ if (!vect_check_gather_scatter (stmt, loop_vinfo, &gs_info))
+ return NULL;
+
+ /* Convert the mask to the right form. */
+ tree gs_vectype = get_vectype_for_scalar_type (gs_info.element_type);
+ if (mask)
+ mask = vect_convert_mask_for_vectype (mask, gs_vectype, last_stmt_info,
+ loop_vinfo);
+
+ /* Get the invariant base and non-invariant offset, converting the
+ latter to the same width as the vector elements. */
+ tree base = gs_info.base;
+ tree offset_type = vect_get_gather_scatter_offset_type (&gs_info);
+ tree offset = vect_add_conversion_to_pattern (offset_type, gs_info.offset,
+ last_stmt_info, loop_vinfo);
+
+ /* Build the new pattern statement. */
+ tree scale = size_int (gs_info.scale);
+ gcall *pattern_stmt;
+ if (DR_IS_READ (dr))
+ {
+ if (mask != NULL)
+ pattern_stmt = gimple_build_call_internal (gs_info.ifn, 4, base,
+ offset, scale, mask);
+ else
+ pattern_stmt = gimple_build_call_internal (gs_info.ifn, 3, base,
+ offset, scale);
+ tree load_lhs = vect_recog_temp_ssa_var (gs_info.element_type, NULL);
+ gimple_call_set_lhs (pattern_stmt, load_lhs);
+ }
+ else
+ /* Not yet supported. */
+ gcc_unreachable ();
+ gimple_call_set_nothrow (pattern_stmt, true);
+
+ /* Copy across relevant vectorization info and associate DR with the
+ new pattern statement instead of the original statement. */
+ stmt_vec_info pattern_stmt_info = new_stmt_vec_info (pattern_stmt,
+ loop_vinfo);
+ set_vinfo_for_stmt (pattern_stmt, pattern_stmt_info);
+ STMT_VINFO_DATA_REF (pattern_stmt_info) = dr;
+ STMT_VINFO_DR_WRT_VEC_LOOP (pattern_stmt_info)
+ = STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info);
+ STMT_VINFO_GATHER_SCATTER_P (pattern_stmt_info)
+ = STMT_VINFO_GATHER_SCATTER_P (stmt_info);
+ DR_STMT (dr) = pattern_stmt;
+
+ tree vectype = STMT_VINFO_VECTYPE (stmt_info);
+ *type_out = vectype;
+ *type_in = vectype;
+
+ if (dump_enabled_p ())
+ dump_printf_loc (MSG_NOTE, vect_location,
+ "gather/scatter pattern detected:\n");
+
+ return pattern_stmt;
+}
+
+/* Pattern wrapper around vect_try_gather_scatter_pattern. */
+
+static gimple *
+vect_recog_gather_scatter_pattern (vec<gimple *> *stmts, tree *type_in,
+ tree *type_out)
+{
+ gimple *last_stmt = stmts->pop ();
+ stmt_vec_info last_stmt_info = vinfo_for_stmt (last_stmt);
+ gimple *pattern_stmt = vect_try_gather_scatter_pattern (last_stmt,
+ last_stmt_info,
+ type_in, type_out);
+ if (pattern_stmt)
+ stmts->safe_push (last_stmt);
+ return pattern_stmt;
+}
/* Mark statements that are involved in a pattern. */
===================================================================
@@ -389,21 +389,19 @@ exist_non_indexing_operands_for_use_p (t
{
if (is_gimple_call (stmt)
&& gimple_call_internal_p (stmt))
- switch (gimple_call_internal_fn (stmt))
- {
- case IFN_MASK_STORE:
- operand = gimple_call_arg (stmt, 3);
- if (operand == use)
- return true;
- /* FALLTHRU */
- case IFN_MASK_LOAD:
- operand = gimple_call_arg (stmt, 2);
- if (operand == use)
- return true;
- break;
- default:
- break;
- }
+ {
+ internal_fn ifn = gimple_call_internal_fn (stmt);
+ int mask_index = internal_fn_mask_index (ifn);
+ if (mask_index >= 0
+ && use == gimple_call_arg (stmt, mask_index))
+ return true;
+ if (internal_gather_scatter_fn_p (ifn)
+ && use == gimple_call_arg (stmt, 1))
+ return true;
+ if (ifn == IFN_MASK_STORE
+ && use == gimple_call_arg (stmt, 3))
+ return true;
+ }
return false;
}
@@ -1725,6 +1723,8 @@ static tree permute_vec_elements (tree,
is the type of the vector being loaded or stored. MEMORY_ACCESS_TYPE
says how the load or store is going to be implemented and GROUP_SIZE
is the number of load or store statements in the containing group.
+ If the access is a gather load or scatter store, GS_INFO describes
+ its arguments.
Clear LOOP_VINFO_CAN_FULLY_MASK_P if a fully-masked loop is not
supported, otherwise record the required mask types. */
@@ -1732,7 +1732,8 @@ static tree permute_vec_elements (tree,
static void
check_load_store_masking (loop_vec_info loop_vinfo, tree vectype,
vec_load_store_type vls_type, int group_size,
- vect_memory_access_type memory_access_type)
+ vect_memory_access_type memory_access_type,
+ gather_scatter_info *gs_info)
{
/* Invariant loads need no special support. */
if (memory_access_type == VMAT_INVARIANT)
@@ -1760,6 +1761,29 @@ check_load_store_masking (loop_vec_info
return;
}
+ if (memory_access_type == VMAT_GATHER_SCATTER)
+ {
+ gcc_assert (is_load);
+ tree offset_type = TREE_TYPE (gs_info->offset);
+ if (!internal_gather_scatter_fn_supported_p (IFN_MASK_GATHER_LOAD,
+ vectype,
+ gs_info->memory_type,
+ TYPE_SIGN (offset_type),
+ gs_info->scale))
+ {
+ if (dump_enabled_p ())
+ dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+ "can't use a fully-masked loop because the"
+ " target doesn't have an appropriate masked"
+ " gather load instruction.\n");
+ LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
+ return;
+ }
+ unsigned int ncopies = vect_get_num_copies (loop_vinfo, vectype);
+ vect_record_loop_mask (loop_vinfo, masks, ncopies, vectype);
+ return;
+ }
+
if (memory_access_type != VMAT_CONTIGUOUS
&& memory_access_type != VMAT_CONTIGUOUS_PERMUTE)
{
@@ -2591,6 +2615,31 @@ vect_build_gather_load_calls (gimple *st
}
}
+/* Prepare the base and offset in GS_INFO for vectorization.
+ Set *DATAREF_PTR to the loop-invariant base address and *VEC_OFFSET
+ to the vectorized offset argument for the first copy of STMT. STMT
+ is the statement described by GS_INFO and LOOP is the containing loop. */
+
+static void
+vect_get_gather_scatter_ops (struct loop *loop, gimple *stmt,
+ gather_scatter_info *gs_info,
+ tree *dataref_ptr, tree *vec_offset)
+{
+ gimple_seq stmts = NULL;
+ *dataref_ptr = force_gimple_operand (gs_info->base, &stmts, true, NULL_TREE);
+ if (stmts != NULL)
+ {
+ basic_block new_bb;
+ edge pe = loop_preheader_edge (loop);
+ new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
+ gcc_assert (!new_bb);
+ }
+ tree offset_type = TREE_TYPE (gs_info->offset);
+ tree offset_vectype = get_vectype_for_scalar_type (offset_type);
+ *vec_offset = vect_get_vec_def_for_operand (gs_info->offset, stmt,
+ offset_vectype);
+}
+
/* Check and perform vectorization of BUILT_IN_BSWAP{16,32,64}. */
static bool
@@ -2780,7 +2829,7 @@ vectorizable_call (gimple *gs, gimple_st
return false;
if (gimple_call_internal_p (stmt)
- && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
+ && (internal_load_fn_p (gimple_call_internal_fn (stmt))
|| gimple_call_internal_fn (stmt) == IFN_MASK_STORE))
/* Handled by vectorizable_load and vectorizable_store. */
return false;
@@ -5965,7 +6014,7 @@ vectorizable_store (gimple *stmt, gimple
if (loop_vinfo
&& LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo))
check_load_store_masking (loop_vinfo, vectype, vls_type, group_size,
- memory_access_type);
+ memory_access_type, &gs_info);
STMT_VINFO_TYPE (stmt_info) = store_vec_info_type;
/* The SLP costs are calculated during SLP analysis. */
@@ -6937,7 +6986,11 @@ vectorizable_load (gimple *stmt, gimple_
else
{
gcall *call = dyn_cast <gcall *> (stmt);
- if (!call || !gimple_call_internal_p (call, IFN_MASK_LOAD))
+ if (!call || !gimple_call_internal_p (call))
+ return false;
+
+ internal_fn ifn = gimple_call_internal_fn (call);
+ if (!internal_load_fn_p (ifn))
return false;
scalar_dest = gimple_call_lhs (call);
@@ -6952,9 +7005,13 @@ vectorizable_load (gimple *stmt, gimple_
return false;
}
- mask = gimple_call_arg (call, 2);
- if (!vect_check_load_store_mask (stmt, mask, &mask_vectype))
- return false;
+ int mask_index = internal_fn_mask_index (ifn);
+ if (mask_index >= 0)
+ {
+ mask = gimple_call_arg (call, mask_index);
+ if (!vect_check_load_store_mask (stmt, mask, &mask_vectype))
+ return false;
+ }
}
if (!STMT_VINFO_DATA_REF (stmt_info))
@@ -7078,7 +7135,7 @@ vectorizable_load (gimple *stmt, gimple_
TYPE_MODE (mask_vectype), true))
return false;
}
- else if (memory_access_type == VMAT_GATHER_SCATTER)
+ else if (memory_access_type == VMAT_GATHER_SCATTER && gs_info.decl)
{
tree arglist = TYPE_ARG_TYPES (TREE_TYPE (gs_info.decl));
tree masktype
@@ -7092,7 +7149,8 @@ vectorizable_load (gimple *stmt, gimple_
return false;
}
}
- else if (memory_access_type != VMAT_LOAD_STORE_LANES)
+ else if (memory_access_type != VMAT_LOAD_STORE_LANES
+ && memory_access_type != VMAT_GATHER_SCATTER)
{
if (dump_enabled_p ())
dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
@@ -7109,7 +7167,7 @@ vectorizable_load (gimple *stmt, gimple_
if (loop_vinfo
&& LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo))
check_load_store_masking (loop_vinfo, vectype, VLS_LOAD, group_size,
- memory_access_type);
+ memory_access_type, &gs_info);
STMT_VINFO_TYPE (stmt_info) = load_vec_info_type;
/* The SLP costs are calculated during SLP analysis. */
@@ -7131,7 +7189,7 @@ vectorizable_load (gimple *stmt, gimple_
ensure_base_align (dr);
- if (memory_access_type == VMAT_GATHER_SCATTER)
+ if (memory_access_type == VMAT_GATHER_SCATTER && gs_info.decl)
{
vect_build_gather_load_calls (stmt, gsi, vec_stmt, &gs_info, mask);
return true;
@@ -7576,6 +7634,7 @@ vectorizable_load (gimple *stmt, gimple_
aggr_type = vectype;
tree vec_mask = NULL_TREE;
+ tree vec_offset = NULL_TREE;
prev_stmt_info = NULL;
poly_uint64 group_elt = 0;
vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
@@ -7618,6 +7677,12 @@ vectorizable_load (gimple *stmt, gimple_
dataref_ptr = bump_vector_ptr (dataref_ptr, ptr_incr, gsi,
stmt, diff);
}
+ else if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
+ {
+ vect_get_gather_scatter_ops (loop, stmt, &gs_info,
+ &dataref_ptr, &vec_offset);
+ inv_p = false;
+ }
else
dataref_ptr
= vect_create_data_ref_ptr (first_stmt, aggr_type, at_loop,
@@ -7633,6 +7698,13 @@ vectorizable_load (gimple *stmt, gimple_
if (dataref_offset)
dataref_offset = int_const_binop (PLUS_EXPR, dataref_offset,
TYPE_SIZE_UNIT (aggr_type));
+ else if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
+ {
+ gimple *def_stmt;
+ vect_def_type dt;
+ vect_is_simple_use (vec_offset, loop_vinfo, &def_stmt, &dt);
+ vec_offset = vect_get_vec_def_for_stmt_copy (dt, vec_offset);
+ }
else
dataref_ptr = bump_vector_ptr (dataref_ptr, ptr_incr, gsi, stmt,
TYPE_SIZE_UNIT (aggr_type));
@@ -7721,6 +7793,24 @@ vectorizable_load (gimple *stmt, gimple_
{
unsigned int align, misalign;
+ if (memory_access_type == VMAT_GATHER_SCATTER)
+ {
+ tree scale = size_int (gs_info.scale);
+ gcall *call;
+ if (masked_loop_p)
+ call = gimple_build_call_internal
+ (IFN_MASK_GATHER_LOAD, 4, dataref_ptr,
+ vec_offset, scale, final_mask);
+ else
+ call = gimple_build_call_internal
+ (IFN_GATHER_LOAD, 3, dataref_ptr,
+ vec_offset, scale);
+ gimple_call_set_nothrow (call, true);
+ new_stmt = call;
+ data_ref = NULL_TREE;
+ break;
+ }
+
align = DR_TARGET_ALIGNMENT (dr);
if (alignment_support_scheme == dr_aligned)
{
===================================================================
@@ -151,6 +151,7 @@ (define_c_enum "unspec" [
UNSPEC_XPACLRI
UNSPEC_LD1_SVE
UNSPEC_ST1_SVE
+ UNSPEC_LD1_GATHER
UNSPEC_MERGE_PTRUE
UNSPEC_PTEST_PTRUE
UNSPEC_UNPACKSHI
===================================================================
@@ -276,6 +276,12 @@ (define_mode_iterator SVE_HSF [VNx8HF VN
;; All SVE vector modes that have 32-bit or 64-bit elements.
(define_mode_iterator SVE_SD [VNx4SI VNx2DI VNx4SF VNx2DF])
+;; All SVE vector modes that have 32-bit elements.
+(define_mode_iterator SVE_S [VNx4SI VNx4SF])
+
+;; All SVE vector modes that have 64-bit elements.
+(define_mode_iterator SVE_D [VNx2DI VNx2DF])
+
;; All SVE integer vector modes that have 32-bit or 64-bit elements.
(define_mode_iterator SVE_SDI [VNx4SI VNx2DI])
===================================================================
@@ -596,3 +596,11 @@ (define_predicate "aarch64_sve_float_mul
(define_predicate "aarch64_sve_vec_perm_operand"
(ior (match_operand 0 "register_operand")
(match_operand 0 "aarch64_constant_vector_operand")))
+
+(define_predicate "aarch64_gather_scale_operand_w"
+ (and (match_code "const_int")
+ (match_test "INTVAL (op) == 1 || INTVAL (op) == 4")))
+
+(define_predicate "aarch64_gather_scale_operand_d"
+ (and (match_code "const_int")
+ (match_test "INTVAL (op) == 1 || INTVAL (op) == 8")))
===================================================================
@@ -189,6 +189,63 @@ (define_insn "maskstore<mode><vpred>"
"st1<Vesize>\t%1.<Vetype>, %2, %0"
)
+;; Unpredicated gather loads.
+(define_expand "gather_load<mode>"
+ [(set (match_operand:SVE_SD 0 "register_operand")
+ (unspec:SVE_SD
+ [(match_dup 5)
+ (match_operand:DI 1 "aarch64_reg_or_zero")
+ (match_operand:<V_INT_EQUIV> 2 "register_operand")
+ (match_operand:DI 3 "const_int_operand")
+ (match_operand:DI 4 "aarch64_gather_scale_operand_<Vesize>")
+ (mem:BLK (scratch))]
+ UNSPEC_LD1_GATHER))]
+ "TARGET_SVE"
+ {
+ operands[5] = force_reg (<VPRED>mode, CONSTM1_RTX (<VPRED>mode));
+ }
+)
+
+;; Predicated gather loads for 32-bit elements. Operand 3 is true for
+;; unsigned extension and false for signed extension.
+(define_insn "mask_gather_load<mode>"
+ [(set (match_operand:SVE_S 0 "register_operand" "=w, w, w, w, w")
+ (unspec:SVE_S
+ [(match_operand:<VPRED> 5 "register_operand" "Upl, Upl, Upl, Upl, Upl")
+ (match_operand:DI 1 "aarch64_reg_or_zero" "Z, rk, rk, rk, rk")
+ (match_operand:<V_INT_EQUIV> 2 "register_operand" "w, w, w, w, w")
+ (match_operand:DI 3 "const_int_operand" "i, Z, Ui1, Z, Ui1")
+ (match_operand:DI 4 "aarch64_gather_scale_operand_w" "Ui1, Ui1, Ui1, i, i")
+ (mem:BLK (scratch))]
+ UNSPEC_LD1_GATHER))]
+ "TARGET_SVE"
+ "@
+ ld1w\t%0.s, %5/z, [%2.s]
+ ld1w\t%0.s, %5/z, [%1, %2.s, sxtw]
+ ld1w\t%0.s, %5/z, [%1, %2.s, uxtw]
+ ld1w\t%0.s, %5/z, [%1, %2.s, sxtw %p4]
+ ld1w\t%0.s, %5/z, [%1, %2.s, uxtw %p4]"
+)
+
+;; Predicated gather loads for 64-bit elements. The value of operand 3
+;; doesn't matter in this case.
+(define_insn "mask_gather_load<mode>"
+ [(set (match_operand:SVE_D 0 "register_operand" "=w, w, w")
+ (unspec:SVE_D
+ [(match_operand:<VPRED> 5 "register_operand" "Upl, Upl, Upl")
+ (match_operand:DI 1 "aarch64_reg_or_zero" "Z, rk, rk")
+ (match_operand:<V_INT_EQUIV> 2 "register_operand" "w, w, w")
+ (match_operand:DI 3 "const_int_operand")
+ (match_operand:DI 4 "aarch64_gather_scale_operand_d" "Ui1, Ui1, i")
+ (mem:BLK (scratch))]
+ UNSPEC_LD1_GATHER))]
+ "TARGET_SVE"
+ "@
+ ld1d\t%0.d, %5/z, [%2.d]
+ ld1d\t%0.d, %5/z, [%1, %2.d]
+ ld1d\t%0.d, %5/z, [%1, %2.d, lsl %p4]"
+)
+
;; SVE structure moves.
(define_expand "mov<mode>"
[(set (match_operand:SVE_STRUCT 0 "nonimmediate_operand")
===================================================================
@@ -0,0 +1,32 @@
+/* { dg-do assemble } */
+/* { dg-options "-O2 -ftree-vectorize -march=armv8-a+sve --save-temps" } */
+
+#include <stdint.h>
+
+#ifndef INDEX32
+#define INDEX32 int32_t
+#define INDEX64 int64_t
+#endif
+
+/* Invoked 18 times for each data size. */
+#define TEST_LOOP(DATA_TYPE, BITS) \
+ void __attribute__ ((noinline, noclone)) \
+ f_##DATA_TYPE (DATA_TYPE *restrict dest, DATA_TYPE *restrict src, \
+ INDEX##BITS *indices, int n) \
+ { \
+ for (int i = 9; i < n; ++i) \
+ dest[i] += src[indices[i]]; \
+ }
+
+#define TEST_ALL(T) \
+ T (int32_t, 32) \
+ T (uint32_t, 32) \
+ T (float, 32) \
+ T (int64_t, 64) \
+ T (uint64_t, 64) \
+ T (double, 64)
+
+TEST_ALL (TEST_LOOP)
+
+/* { dg-final { scan-assembler-times {\tld1w\tz[0-9]+\.s, p[0-7]/z, \[x[0-9]+, z[0-9]+.s, sxtw 2\]\n} 3 } } */
+/* { dg-final { scan-assembler-times {\tld1d\tz[0-9]+\.d, p[0-7]/z, \[x[0-9]+, z[0-9]+.d, lsl 3\]\n} 3 } } */
===================================================================
@@ -0,0 +1,10 @@
+/* { dg-do assemble } */
+/* { dg-options "-O2 -ftree-vectorize -march=armv8-a+sve --save-temps" } */
+
+#define INDEX32 uint32_t
+#define INDEX64 uint64_t
+
+#include "sve_gather_load_1.c"
+
+/* { dg-final { scan-assembler-times {\tld1w\tz[0-9]+\.s, p[0-7]/z, \[x[0-9]+, z[0-9]+.s, uxtw 2\]\n} 3 } } */
+/* { dg-final { scan-assembler-times {\tld1d\tz[0-9]+\.d, p[0-7]/z, \[x[0-9]+, z[0-9]+.d, lsl 3\]\n} 3 } } */
===================================================================
@@ -0,0 +1,32 @@
+/* { dg-do assemble } */
+/* { dg-options "-O2 -ftree-vectorize -march=armv8-a+sve --save-temps" } */
+
+#include <stdint.h>
+
+#ifndef INDEX32
+#define INDEX32 int32_t
+#define INDEX64 int64_t
+#endif
+
+/* Invoked 18 times for each data size. */
+#define TEST_LOOP(DATA_TYPE, BITS) \
+ void __attribute__ ((noinline, noclone)) \
+ f_##DATA_TYPE (DATA_TYPE *restrict dest, DATA_TYPE *restrict src, \
+ INDEX##BITS *indices, int n) \
+ { \
+ for (int i = 9; i < n; ++i) \
+ dest[i] += *(DATA_TYPE *) ((char *) src + indices[i]); \
+ }
+
+#define TEST_ALL(T) \
+ T (int32_t, 32) \
+ T (uint32_t, 32) \
+ T (float, 32) \
+ T (int64_t, 64) \
+ T (uint64_t, 64) \
+ T (double, 64)
+
+TEST_ALL (TEST_LOOP)
+
+/* { dg-final { scan-assembler-times {\tld1w\tz[0-9]+\.s, p[0-7]/z, \[x[0-9]+, z[0-9]+.s, sxtw\]\n} 3 } } */
+/* { dg-final { scan-assembler-times {\tld1d\tz[0-9]+\.d, p[0-7]/z, \[x[0-9]+, z[0-9]+.d\]\n} 3 } } */
===================================================================
@@ -0,0 +1,10 @@
+/* { dg-do assemble } */
+/* { dg-options "-O2 -ftree-vectorize -march=armv8-a+sve --save-temps" } */
+
+#define INDEX32 uint32_t
+#define INDEX64 uint64_t
+
+#include "sve_gather_load_3.c"
+
+/* { dg-final { scan-assembler-times {\tld1w\tz[0-9]+\.s, p[0-7]/z, \[x[0-9]+, z[0-9]+.s, uxtw\]\n} 3 } } */
+/* { dg-final { scan-assembler-times {\tld1d\tz[0-9]+\.d, p[0-7]/z, \[x[0-9]+, z[0-9]+.d\]\n} 3 } } */
===================================================================
@@ -0,0 +1,23 @@
+/* { dg-do assemble } */
+/* { dg-options "-O2 -ftree-vectorize -march=armv8-a+sve --save-temps" } */
+
+#include <stdint.h>
+
+/* Invoked 18 times for each data size. */
+#define TEST_LOOP(DATA_TYPE) \
+ void __attribute__ ((noinline, noclone)) \
+ f_##DATA_TYPE (DATA_TYPE *restrict dest, DATA_TYPE *restrict *src, \
+ int n) \
+ { \
+ for (int i = 9; i < n; ++i) \
+ dest[i] += *src[i]; \
+ }
+
+#define TEST_ALL(T) \
+ T (int64_t) \
+ T (uint64_t) \
+ T (double)
+
+TEST_ALL (TEST_LOOP)
+
+/* { dg-final { scan-assembler-times {\tld1d\tz[0-9]+\.d, p[0-7]/z, \[z[0-9]+.d\]\n} 3 } } */
===================================================================
@@ -0,0 +1,36 @@
+/* { dg-do assemble } */
+/* { dg-options "-O2 -ftree-vectorize -fwrapv -march=armv8-a+sve --save-temps" } */
+
+#include <stdint.h>
+
+#ifndef INDEX32
+#define INDEX16 int16_t
+#define INDEX32 int32_t
+#endif
+
+/* Invoked 18 times for each data size. */
+#define TEST_LOOP(DATA_TYPE, BITS) \
+ void __attribute__ ((noinline, noclone)) \
+ f_##DATA_TYPE (DATA_TYPE *restrict dest, DATA_TYPE *restrict src, \
+ INDEX##BITS *indices, INDEX##BITS mask, int n) \
+ { \
+ for (int i = 9; i < n; ++i) \
+ dest[i] = src[(INDEX##BITS) (indices[i] | mask)]; \
+ }
+
+#define TEST_ALL(T) \
+ T (int32_t, 16) \
+ T (uint32_t, 16) \
+ T (float, 16) \
+ T (int64_t, 32) \
+ T (uint64_t, 32) \
+ T (double, 32)
+
+TEST_ALL (TEST_LOOP)
+
+/* { dg-final { scan-assembler-times {\tsunpkhi\tz[0-9]+\.s, z[0-9]+\.h\n} 3 } } */
+/* { dg-final { scan-assembler-times {\tsunpklo\tz[0-9]+\.s, z[0-9]+\.h\n} 3 } } */
+/* { dg-final { scan-assembler-times {\tsunpkhi\tz[0-9]+\.d, z[0-9]+\.s\n} 3 } } */
+/* { dg-final { scan-assembler-times {\tsunpklo\tz[0-9]+\.d, z[0-9]+\.s\n} 3 } } */
+/* { dg-final { scan-assembler-times {\tld1w\tz[0-9]+\.s, p[0-7]/z, \[x[0-9]+, z[0-9]+.s, sxtw 2\]\n} 6 } } */
+/* { dg-final { scan-assembler-times {\tld1d\tz[0-9]+\.d, p[0-7]/z, \[x[0-9]+, z[0-9]+.d, lsl 3\]\n} 6 } } */
===================================================================
@@ -0,0 +1,15 @@
+/* { dg-do assemble } */
+/* { dg-options "-O2 -ftree-vectorize -march=armv8-a+sve --save-temps" } */
+
+#define INDEX16 uint16_t
+#define INDEX32 uint32_t
+
+#include "sve_gather_load_6.c"
+
+/* { dg-final { scan-assembler-times {\tuunpkhi\tz[0-9]+\.s, z[0-9]+\.h\n} 3 } } */
+/* { dg-final { scan-assembler-times {\tuunpklo\tz[0-9]+\.s, z[0-9]+\.h\n} 3 } } */
+/* { dg-final { scan-assembler-times {\tuunpkhi\tz[0-9]+\.d, z[0-9]+\.s\n} 3 } } */
+/* { dg-final { scan-assembler-times {\tuunpklo\tz[0-9]+\.d, z[0-9]+\.s\n} 3 } } */
+/* Either extension type is OK here. */
+/* { dg-final { scan-assembler-times {\tld1w\tz[0-9]+\.s, p[0-7]/z, \[x[0-9]+, z[0-9]+.s, [us]xtw 2\]\n} 6 } } */
+/* { dg-final { scan-assembler-times {\tld1d\tz[0-9]+\.d, p[0-7]/z, \[x[0-9]+, z[0-9]+.d, lsl 3\]\n} 6 } } */
===================================================================
@@ -0,0 +1,52 @@
+/* { dg-do assemble } */
+/* { dg-options "-O2 -ftree-vectorize -ffast-math -march=armv8-a+sve --save-temps" } */
+
+#include <stdint.h>
+
+#ifndef INDEX32
+#define INDEX32 int32_t
+#define INDEX64 int64_t
+#endif
+
+#define TEST_LOOP(DATA_TYPE, CMP_TYPE, BITS) \
+ void \
+ f_##DATA_TYPE##_##CMP_TYPE \
+ (DATA_TYPE *restrict dest, DATA_TYPE *restrict src, \
+ CMP_TYPE *cmp1, CMP_TYPE *cmp2, INDEX##BITS *indices, int n) \
+ { \
+ for (int i = 0; i < n; ++i) \
+ if (cmp1[i] == cmp2[i]) \
+ dest[i] += src[indices[i]]; \
+ }
+
+#define TEST32(T, DATA_TYPE) \
+ T (DATA_TYPE, int32_t, 32) \
+ T (DATA_TYPE, uint32_t, 32) \
+ T (DATA_TYPE, float, 32)
+
+#define TEST64(T, DATA_TYPE) \
+ T (DATA_TYPE, int64_t, 64) \
+ T (DATA_TYPE, uint64_t, 64) \
+ T (DATA_TYPE, double, 64)
+
+#define TEST_ALL(T) \
+ TEST32 (T, int32_t) \
+ TEST32 (T, uint32_t) \
+ TEST32 (T, float) \
+ TEST64 (T, int64_t) \
+ TEST64 (T, uint64_t) \
+ TEST64 (T, double)
+
+TEST_ALL (TEST_LOOP)
+
+/* { dg-final { scan-assembler-times {\tld1w\tz[0-9]+\.s, p[0-7]/z, \[x[0-9]+, x[0-9]+, lsl 2\]\n} 36 } } */
+/* { dg-final { scan-assembler-times {\tcmpeq\tp[0-7]\.s, p[0-7]/z, z[0-9]+\.s, z[0-9]+\.s\n} 6 } } */
+/* { dg-final { scan-assembler-times {\tfcmeq\tp[0-7]\.s, p[0-7]/z, z[0-9]+\.s, z[0-9]+\.s\n} 3 } } */
+/* { dg-final { scan-assembler-times {\tld1w\tz[0-9]+\.s, p[0-7]/z, \[x[0-9]+, z[0-9]+\.s, sxtw 2\]\n} 9 } } */
+/* { dg-final { scan-assembler-times {\tst1w\tz[0-9]+\.s, p[0-7], \[x[0-9]+, x[0-9]+, lsl 2\]\n} 9 } } */
+
+/* { dg-final { scan-assembler-times {\tld1d\tz[0-9]+\.d, p[0-7]/z, \[x[0-9]+, x[0-9]+, lsl 3\]\n} 36 } } */
+/* { dg-final { scan-assembler-times {\tcmpeq\tp[0-7]\.d, p[0-7]/z, z[0-9]+\.d, z[0-9]+\.d\n} 6 } } */
+/* { dg-final { scan-assembler-times {\tfcmeq\tp[0-7]\.d, p[0-7]/z, z[0-9]+\.d, z[0-9]+\.d\n} 3 } } */
+/* { dg-final { scan-assembler-times {\tld1d\tz[0-9]+\.d, p[0-7]/z, \[x[0-9]+, z[0-9]+\.d, lsl 3\]\n} 9 } } */
+/* { dg-final { scan-assembler-times {\tst1d\tz[0-9]+\.d, p[0-7], \[x[0-9]+, x[0-9]+, lsl 3\]\n} 9 } } */
===================================================================
@@ -0,0 +1,19 @@
+/* { dg-do assemble } */
+/* { dg-options "-O2 -ftree-vectorize -ffast-math -march=armv8-a+sve --save-temps" } */
+
+#define INDEX32 uint32_t
+#define INDEX64 uint64_t
+
+#include "sve_mask_gather_load_1.c"
+
+/* { dg-final { scan-assembler-times {\tld1w\tz[0-9]+\.s, p[0-7]/z, \[x[0-9]+, x[0-9]+, lsl 2\]\n} 36 } } */
+/* { dg-final { scan-assembler-times {\tcmpeq\tp[0-7]\.s, p[0-7]/z, z[0-9]+\.s, z[0-9]+\.s\n} 6 } } */
+/* { dg-final { scan-assembler-times {\tfcmeq\tp[0-7]\.s, p[0-7]/z, z[0-9]+\.s, z[0-9]+\.s\n} 3 } } */
+/* { dg-final { scan-assembler-times {\tld1w\tz[0-9]+\.s, p[0-7]/z, \[x[0-9]+, z[0-9]+\.s, uxtw 2\]\n} 9 } } */
+/* { dg-final { scan-assembler-times {\tst1w\tz[0-9]+\.s, p[0-7], \[x[0-9]+, x[0-9]+, lsl 2\]\n} 9 } } */
+
+/* { dg-final { scan-assembler-times {\tld1d\tz[0-9]+\.d, p[0-7]/z, \[x[0-9]+, x[0-9]+, lsl 3\]\n} 36 } } */
+/* { dg-final { scan-assembler-times {\tcmpeq\tp[0-7]\.d, p[0-7]/z, z[0-9]+\.d, z[0-9]+\.d\n} 6 } } */
+/* { dg-final { scan-assembler-times {\tfcmeq\tp[0-7]\.d, p[0-7]/z, z[0-9]+\.d, z[0-9]+\.d\n} 3 } } */
+/* { dg-final { scan-assembler-times {\tld1d\tz[0-9]+\.d, p[0-7]/z, \[x[0-9]+, z[0-9]+\.d, lsl 3\]\n} 9 } } */
+/* { dg-final { scan-assembler-times {\tst1d\tz[0-9]+\.d, p[0-7], \[x[0-9]+, x[0-9]+, lsl 3\]\n} 9 } } */
===================================================================
@@ -0,0 +1,52 @@
+/* { dg-do assemble } */
+/* { dg-options "-O2 -ftree-vectorize -march=armv8-a+sve -ffast-math --save-temps" } */
+
+#include <stdint.h>
+
+#ifndef INDEX32
+#define INDEX32 int32_t
+#define INDEX64 int64_t
+#endif
+
+#define TEST_LOOP(DATA_TYPE, CMP_TYPE, BITS) \
+ void \
+ f_##DATA_TYPE##_##CMP_TYPE \
+ (DATA_TYPE *restrict dest, DATA_TYPE *restrict src, \
+ CMP_TYPE *cmp1, CMP_TYPE *cmp2, INDEX##BITS *indices, int n) \
+ { \
+ for (int i = 0; i < n; ++i) \
+ if (cmp1[i] == cmp2[i]) \
+ dest[i] += *(DATA_TYPE *) ((char *) src + indices[i]); \
+ }
+
+#define TEST32(T, DATA_TYPE) \
+ T (DATA_TYPE, int32_t, 32) \
+ T (DATA_TYPE, uint32_t, 32) \
+ T (DATA_TYPE, float, 32)
+
+#define TEST64(T, DATA_TYPE) \
+ T (DATA_TYPE, int64_t, 64) \
+ T (DATA_TYPE, uint64_t, 64) \
+ T (DATA_TYPE, double, 64)
+
+#define TEST_ALL(T) \
+ TEST32 (T, int32_t) \
+ TEST32 (T, uint32_t) \
+ TEST32 (T, float) \
+ TEST64 (T, int64_t) \
+ TEST64 (T, uint64_t) \
+ TEST64 (T, double)
+
+TEST_ALL (TEST_LOOP)
+
+/* { dg-final { scan-assembler-times {\tld1w\tz[0-9]+\.s, p[0-7]/z, \[x[0-9]+, x[0-9]+, lsl 2\]\n} 36 } } */
+/* { dg-final { scan-assembler-times {\tcmpeq\tp[0-7]\.s, p[0-7]/z, z[0-9]+\.s, z[0-9]+\.s\n} 6 } } */
+/* { dg-final { scan-assembler-times {\tfcmeq\tp[0-7]\.s, p[0-7]/z, z[0-9]+\.s, z[0-9]+\.s\n} 3 } } */
+/* { dg-final { scan-assembler-times {\tld1w\tz[0-9]+\.s, p[0-7]/z, \[x[0-9]+, z[0-9]+\.s, sxtw\]\n} 9 } } */
+/* { dg-final { scan-assembler-times {\tst1w\tz[0-9]+\.s, p[0-7], \[x[0-9]+, x[0-9]+, lsl 2\]\n} 9 } } */
+
+/* { dg-final { scan-assembler-times {\tld1d\tz[0-9]+\.d, p[0-7]/z, \[x[0-9]+, x[0-9]+, lsl 3\]\n} 36 } } */
+/* { dg-final { scan-assembler-times {\tcmpeq\tp[0-7]\.d, p[0-7]/z, z[0-9]+\.d, z[0-9]+\.d\n} 6 } } */
+/* { dg-final { scan-assembler-times {\tfcmeq\tp[0-7]\.d, p[0-7]/z, z[0-9]+\.d, z[0-9]+\.d\n} 3 } } */
+/* { dg-final { scan-assembler-times {\tld1d\tz[0-9]+\.d, p[0-7]/z, \[x[0-9]+, z[0-9]+\.d\]\n} 9 } } */
+/* { dg-final { scan-assembler-times {\tst1d\tz[0-9]+\.d, p[0-7], \[x[0-9]+, x[0-9]+, lsl 3\]\n} 9 } } */
===================================================================
@@ -0,0 +1,19 @@
+/* { dg-do assemble } */
+/* { dg-options "-O2 -ftree-vectorize -march=armv8-a+sve -ffast-math --save-temps" } */
+
+#define INDEX32 uint32_t
+#define INDEX64 uint64_t
+
+#include "sve_mask_gather_load_3.c"
+
+/* { dg-final { scan-assembler-times {\tld1w\tz[0-9]+\.s, p[0-7]/z, \[x[0-9]+, x[0-9]+, lsl 2\]\n} 36 } } */
+/* { dg-final { scan-assembler-times {\tcmpeq\tp[0-7]\.s, p[0-7]/z, z[0-9]+\.s, z[0-9]+\.s\n} 6 } } */
+/* { dg-final { scan-assembler-times {\tfcmeq\tp[0-7]\.s, p[0-7]/z, z[0-9]+\.s, z[0-9]+\.s\n} 3 } } */
+/* { dg-final { scan-assembler-times {\tld1w\tz[0-9]+\.s, p[0-7]/z, \[x[0-9]+, z[0-9]+\.s, uxtw\]\n} 9 } } */
+/* { dg-final { scan-assembler-times {\tst1w\tz[0-9]+\.s, p[0-7], \[x[0-9]+, x[0-9]+, lsl 2\]\n} 9 } } */
+
+/* { dg-final { scan-assembler-times {\tld1d\tz[0-9]+\.d, p[0-7]/z, \[x[0-9]+, x[0-9]+, lsl 3\]\n} 36 } } */
+/* { dg-final { scan-assembler-times {\tcmpeq\tp[0-7]\.d, p[0-7]/z, z[0-9]+\.d, z[0-9]+\.d\n} 6 } } */
+/* { dg-final { scan-assembler-times {\tfcmeq\tp[0-7]\.d, p[0-7]/z, z[0-9]+\.d, z[0-9]+\.d\n} 3 } } */
+/* { dg-final { scan-assembler-times {\tld1d\tz[0-9]+\.d, p[0-7]/z, \[x[0-9]+, z[0-9]+\.d\]\n} 9 } } */
+/* { dg-final { scan-assembler-times {\tst1d\tz[0-9]+\.d, p[0-7], \[x[0-9]+, x[0-9]+, lsl 3\]\n} 9 } } */
===================================================================
@@ -0,0 +1,38 @@
+/* { dg-do assemble } */
+/* { dg-options "-O2 -ftree-vectorize -march=armv8-a+sve -ffast-math --save-temps" } */
+
+#include <stdint.h>
+
+#ifndef INDEX32
+#define INDEX32 int32_t
+#define INDEX64 int64_t
+#endif
+
+#define TEST_LOOP(DATA_TYPE, CMP_TYPE) \
+ void \
+ f_##DATA_TYPE##_##CMP_TYPE \
+ (DATA_TYPE *restrict dest, DATA_TYPE *restrict *restrict src, \
+ CMP_TYPE *cmp1, CMP_TYPE *cmp2, int n) \
+ { \
+ for (int i = 0; i < n; ++i) \
+ if (cmp1[i] == cmp2[i]) \
+ dest[i] += *src[i]; \
+ }
+
+#define TEST_TYPE(T, DATA_TYPE) \
+ T (DATA_TYPE, int64_t) \
+ T (DATA_TYPE, uint64_t) \
+ T (DATA_TYPE, double)
+
+#define TEST_ALL(T) \
+ TEST_TYPE (T, int64_t) \
+ TEST_TYPE (T, uint64_t) \
+ TEST_TYPE (T, double)
+
+TEST_ALL (TEST_LOOP)
+
+/* { dg-final { scan-assembler-times {\tld1d\tz[0-9]+\.d, p[0-7]/z, \[x[0-9]+, x[0-9]+, lsl 3\]\n} 36 } } */
+/* { dg-final { scan-assembler-times {\tcmpeq\tp[0-7]\.d, p[0-7]/z, z[0-9]+\.d, z[0-9]+\.d\n} 6 } } */
+/* { dg-final { scan-assembler-times {\tfcmeq\tp[0-7]\.d, p[0-7]/z, z[0-9]+\.d, z[0-9]+\.d\n} 3 } } */
+/* { dg-final { scan-assembler-times {\tld1d\tz[0-9]+\.d, p[0-7]/z, \[z[0-9]+\.d\]\n} 9 } } */
+/* { dg-final { scan-assembler-times {\tst1d\tz[0-9]+\.d, p[0-7], \[x[0-9]+, x[0-9]+, lsl 3\]\n} 9 } } */
===================================================================
@@ -0,0 +1,38 @@
+/* { dg-do assemble } */
+/* { dg-options "-O2 -ftree-vectorize -ffast-math -march=armv8-a+sve --save-temps" } */
+
+#include <stdint.h>
+
+#define TEST_LOOP(DATA_TYPE, CMP_TYPE, INDEX_TYPE) \
+ void \
+ f_##DATA_TYPE##_##CMP_TYPE##_##INDEX_TYPE \
+ (DATA_TYPE *restrict dest, DATA_TYPE *restrict src, \
+ CMP_TYPE *cmp1, CMP_TYPE *cmp2, INDEX_TYPE *indices, int n) \
+ { \
+ for (int i = 0; i < n; ++i) \
+ if (cmp1[i] == cmp2[i]) \
+ dest[i] += src[indices[i]]; \
+ }
+
+#define TEST32(T, DATA_TYPE) \
+ T (DATA_TYPE, int64_t, int32_t) \
+ T (DATA_TYPE, uint64_t, int32_t) \
+ T (DATA_TYPE, double, int32_t) \
+ T (DATA_TYPE, int64_t, uint32_t) \
+ T (DATA_TYPE, uint64_t, uint32_t) \
+ T (DATA_TYPE, double, uint32_t)
+
+#define TEST_ALL(T) \
+ TEST32 (T, int32_t) \
+ TEST32 (T, uint32_t) \
+ TEST32 (T, float)
+
+TEST_ALL (TEST_LOOP)
+
+/* { dg-final { scan-assembler-times {\tld1d\tz[0-9]+\.d, p[0-7]/z, \[x[0-9]+, x[0-9]+, lsl 3\]\n} 72 } } */
+/* { dg-final { scan-assembler-times {\tcmpeq\tp[0-7]\.d, p[0-7]/z, z[0-9]+\.d, z[0-9]+\.d\n} 24 } } */
+/* { dg-final { scan-assembler-times {\tfcmeq\tp[0-7]\.d, p[0-7]/z, z[0-9]+\.d, z[0-9]+\.d\n} 12 } } */
+/* { dg-final { scan-assembler-times {\tld1w\tz[0-9]+\.s, p[0-7]/z, \[x[0-9]+, x[0-9]+, lsl 2\]\n} 36 } } */
+/* { dg-final { scan-assembler-times {\tld1w\tz[0-9]+\.s, p[0-7]/z, \[x[0-9]+, z[0-9]+\.s, sxtw 2\]\n} 9 } } */
+/* { dg-final { scan-assembler-times {\tld1w\tz[0-9]+\.s, p[0-7]/z, \[x[0-9]+, z[0-9]+\.s, uxtw 2\]\n} 9 } } */
+/* { dg-final { scan-assembler-times {\tst1w\tz[0-9]+\.s, p[0-7], \[x[0-9]+, x[0-9]+, lsl 2\]\n} 18 } } */
===================================================================
@@ -0,0 +1,53 @@
+/* { dg-do assemble } */
+/* { dg-options "-O2 -ftree-vectorize -ffast-math -march=armv8-a+sve --save-temps" } */
+
+#include <stdint.h>
+
+#define TEST_LOOP(DATA_TYPE, CMP_TYPE, INDEX_TYPE) \
+ void \
+ f_##DATA_TYPE##_##CMP_TYPE##_##INDEX_TYPE \
+ (DATA_TYPE *restrict dest, DATA_TYPE *restrict src, \
+ CMP_TYPE *cmp1, CMP_TYPE *cmp2, INDEX_TYPE *indices, int n) \
+ { \
+ for (int i = 0; i < n; ++i) \
+ if (cmp1[i] == cmp2[i]) \
+ dest[i] += src[indices[i]]; \
+ }
+
+#define TEST32(T, DATA_TYPE) \
+ T (DATA_TYPE, int16_t, int32_t) \
+ T (DATA_TYPE, uint16_t, int32_t) \
+ T (DATA_TYPE, _Float16, int32_t) \
+ T (DATA_TYPE, int16_t, uint32_t) \
+ T (DATA_TYPE, uint16_t, uint32_t) \
+ T (DATA_TYPE, _Float16, uint32_t)
+
+#define TEST64(T, DATA_TYPE) \
+ T (DATA_TYPE, int32_t, int64_t) \
+ T (DATA_TYPE, uint32_t, int64_t) \
+ T (DATA_TYPE, float, int64_t) \
+ T (DATA_TYPE, int32_t, uint64_t) \
+ T (DATA_TYPE, uint32_t, uint64_t) \
+ T (DATA_TYPE, float, uint64_t)
+
+#define TEST_ALL(T) \
+ TEST32 (T, int32_t) \
+ TEST32 (T, uint32_t) \
+ TEST32 (T, float) \
+ TEST64 (T, int64_t) \
+ TEST64 (T, uint64_t) \
+ TEST64 (T, double)
+
+TEST_ALL (TEST_LOOP)
+
+/* { dg-final { scan-assembler-times {\tld1h\tz[0-9]+\.h, p[0-7]/z, \[x[0-9]+, x[0-9]+, lsl 1\]\n} 36 } } */
+/* { dg-final { scan-assembler-times {\tcmpeq\tp[0-7]\.h, p[0-7]/z, z[0-9]+\.h, z[0-9]+\.h\n} 12 } } */
+/* { dg-final { scan-assembler-times {\tfcmeq\tp[0-7]\.h, p[0-7]/z, z[0-9]+\.h, z[0-9]+\.h\n} 6 } } */
+/* { dg-final { scan-assembler-times {\tld1w\tz[0-9]+\.s, p[0-7]/z, \[x[0-9]+, z[0-9]+\.s, sxtw 2\]\n} 18 } } */
+/* { dg-final { scan-assembler-times {\tld1w\tz[0-9]+\.s, p[0-7]/z, \[x[0-9]+, z[0-9]+\.s, uxtw 2\]\n} 18 } } */
+
+/* Also used for the TEST32 indices. */
+/* { dg-final { scan-assembler-times {\tld1w\tz[0-9]+\.s, p[0-7]/z, \[x[0-9]+, x[0-9]+, lsl 2\]\n} 72 } } */
+/* { dg-final { scan-assembler-times {\tcmpeq\tp[0-7]\.s, p[0-7]/z, z[0-9]+\.s, z[0-9]+\.s\n} 12 } } */
+/* { dg-final { scan-assembler-times {\tfcmeq\tp[0-7]\.s, p[0-7]/z, z[0-9]+\.s, z[0-9]+\.s\n} 6 } } */
+/* { dg-final { scan-assembler-times {\tld1d\tz[0-9]+\.d, p[0-7]/z, \[x[0-9]+, z[0-9]+\.d, lsl 3\]\n} 36 } } */