===================================================================
@@ -2168,6 +2168,14 @@ expand_GOACC_REDUCTION (internal_fn, gcall *)
gcc_unreachable ();
}
+/* This is expanded by oacc_device_lower pass. */
+
+static void
+expand_GOACC_TILE (internal_fn, gcall *)
+{
+ gcc_unreachable ();
+}
+
/* Set errno to EDOM. */
static void
===================================================================
@@ -175,7 +175,7 @@ DEF_INTERNAL_FN (UNIQUE, ECF_NOTHROW, NULL)
dimension. DIM_POS is pure (and not const) so that it isn't
thought to clobber memory and can be gcse'd within a single
parallel region, but not across FORK/JOIN boundaries. They take a
- single INTEGER_CST argument. */
+ single INTEGER_CST argument. This might be overly conservative. */
DEF_INTERNAL_FN (GOACC_DIM_SIZE, ECF_CONST | ECF_NOTHROW | ECF_LEAF, ".")
DEF_INTERNAL_FN (GOACC_DIM_POS, ECF_PURE | ECF_NOTHROW | ECF_LEAF, ".")
@@ -185,6 +185,10 @@ DEF_INTERNAL_FN (GOACC_LOOP, ECF_PURE | ECF_NOTHRO
/* OpenACC reduction abstraction. See internal-fn.h for usage. */
DEF_INTERNAL_FN (GOACC_REDUCTION, ECF_NOTHROW | ECF_LEAF, NULL)
+/* OpenACC tile abstraction. Describes the spans of the element loop.
+ GOACC_TILE (num-loops, loop-no, tile-arg, tile-mask, element-mask). */
+DEF_INTERNAL_FN (GOACC_TILE, ECF_NOTHROW | ECF_LEAF, NULL)
+
/* Set errno to EDOM, if GCC knows how to do that directly for the
current target. */
DEF_INTERNAL_FN (SET_EDOM, ECF_LEAF | ECF_NOTHROW, NULL)
===================================================================
@@ -213,7 +213,8 @@ struct omp_for_data
tree chunk_size;
gomp_for *for_stmt;
tree pre, iter_type;
- int collapse;
+ tree tiling; /* Tiling values (if non null). */
+ int collapse; /* Collapsed loops, 1 for a non-collapsed loop. */
int ordered;
bool have_nowait, have_ordered, simd_schedule;
unsigned char sched_modifiers;
@@ -242,9 +243,10 @@ struct oacc_loop
tree routine; /* Pseudo-loop enclosing a routine. */
unsigned mask; /* Partitioning mask. */
+ unsigned e_mask; /* Partitioning of element loops (when tiling). */
unsigned inner; /* Partitioning of inner loops. */
unsigned flags; /* Partitioning flags. */
- unsigned ifns; /* Contained loop abstraction functions. */
+ vec<gcall *> ifns; /* Contained loop abstraction functions. */
tree chunk_size; /* Chunk size. */
gcall *head_end; /* Final marker of head sequence. */
};
@@ -256,9 +258,10 @@ enum oacc_loop_flags {
OLF_AUTO = 1u << 1, /* Compiler chooses axes. */
OLF_INDEPENDENT = 1u << 2, /* Iterations are known independent. */
OLF_GANG_STATIC = 1u << 3, /* Gang partitioning is static (has op). */
-
+ OLF_TILE = 1u << 4, /* Tiled loop. */
+
/* Explicitly specified loop axes. */
- OLF_DIM_BASE = 4,
+ OLF_DIM_BASE = 5,
OLF_DIM_GANG = 1u << (OLF_DIM_BASE + GOMP_DIM_GANG),
OLF_DIM_WORKER = 1u << (OLF_DIM_BASE + GOMP_DIM_WORKER),
OLF_DIM_VECTOR = 1u << (OLF_DIM_BASE + GOMP_DIM_VECTOR),
@@ -536,13 +539,9 @@ extract_omp_for_data (gomp_for *for_stmt, struct o
fd->for_stmt = for_stmt;
fd->pre = NULL;
- if (gimple_omp_for_collapse (for_stmt) > 1)
- fd->loops = loops;
- else
- fd->loops = &fd->loop;
-
fd->have_nowait = distribute || simd;
fd->have_ordered = false;
+ fd->tiling = NULL;
fd->collapse = 1;
fd->ordered = 0;
fd->sched_kind = OMP_CLAUSE_SCHEDULE_STATIC;
@@ -587,9 +586,22 @@ extract_omp_for_data (gomp_for *for_stmt, struct o
collapse_count = &OMP_CLAUSE_COLLAPSE_COUNT (t);
}
break;
+ case OMP_CLAUSE_TILE:
+ fd->tiling = OMP_CLAUSE_TILE_LIST (t);
+ fd->collapse = list_length (fd->tiling);
+ gcc_assert (fd->collapse);
+ collapse_iter = &OMP_CLAUSE_TILE_ITERVAR (t);
+ collapse_count = &OMP_CLAUSE_TILE_COUNT (t);
+ break;
default:
break;
}
+
+ if (fd->collapse > 1 || fd->tiling)
+ fd->loops = loops;
+ else
+ fd->loops = &fd->loop;
+
if (fd->ordered && fd->collapse == 1 && loops != NULL)
{
fd->loops = loops;
@@ -608,7 +620,7 @@ extract_omp_for_data (gomp_for *for_stmt, struct o
fd->sched_kind = OMP_CLAUSE_SCHEDULE_STATIC;
gcc_assert (fd->chunk_size == NULL);
}
- gcc_assert (fd->collapse == 1 || collapse_iter != NULL);
+ gcc_assert ((fd->collapse == 1 && !fd->tiling) || collapse_iter != NULL);
if (taskloop)
fd->sched_kind = OMP_CLAUSE_SCHEDULE_RUNTIME;
if (fd->sched_kind == OMP_CLAUSE_SCHEDULE_RUNTIME)
@@ -626,7 +638,8 @@ extract_omp_for_data (gomp_for *for_stmt, struct o
int cnt = fd->ordered ? fd->ordered : fd->collapse;
for (i = 0; i < cnt; i++)
{
- if (i == 0 && fd->collapse == 1 && (fd->ordered == 0 || loops == NULL))
+ if (i == 0 && fd->collapse == 1 && !fd->tiling
+ && (fd->ordered == 0 || loops == NULL))
loop = &fd->loop;
else if (loops != NULL)
loop = loops + i;
@@ -655,7 +668,7 @@ extract_omp_for_data (gomp_for *for_stmt, struct o
|| (fd->sched_kind == OMP_CLAUSE_SCHEDULE_STATIC
&& !fd->have_ordered))
{
- if (fd->collapse == 1)
+ if (fd->collapse == 1 && !fd->tiling)
iter_type = TREE_TYPE (loop->v);
else if (i == 0
|| TYPE_PRECISION (iter_type)
@@ -786,7 +799,7 @@ extract_omp_for_data (gomp_for *for_stmt, struct o
*collapse_count = create_tmp_var (iter_type, ".count");
}
- if (fd->collapse > 1 || (fd->ordered && loops))
+ if (fd->collapse > 1 || fd->tiling || (fd->ordered && loops))
{
fd->loop.v = *collapse_iter;
fd->loop.n1 = build_int_cst (TREE_TYPE (fd->loop.v), 0);
@@ -2192,6 +2205,7 @@ scan_sharing_clauses (tree clauses, omp_context *c
case OMP_CLAUSE_INDEPENDENT:
case OMP_CLAUSE_AUTO:
case OMP_CLAUSE_SEQ:
+ case OMP_CLAUSE_TILE:
break;
case OMP_CLAUSE_ALIGNED:
@@ -2201,7 +2215,6 @@ scan_sharing_clauses (tree clauses, omp_context *c
install_var_local (decl, ctx);
break;
- case OMP_CLAUSE_TILE:
case OMP_CLAUSE__CACHE_:
default:
gcc_unreachable ();
@@ -2362,10 +2375,10 @@ scan_sharing_clauses (tree clauses, omp_context *c
case OMP_CLAUSE_INDEPENDENT:
case OMP_CLAUSE_AUTO:
case OMP_CLAUSE_SEQ:
+ case OMP_CLAUSE_TILE:
case OMP_CLAUSE__GRIDDIM_:
break;
- case OMP_CLAUSE_TILE:
case OMP_CLAUSE__CACHE_:
default:
gcc_unreachable ();
@@ -6349,6 +6362,10 @@ lower_oacc_head_mark (location_t loc, tree ddvar,
tag |= OLF_INDEPENDENT;
break;
+ case OMP_CLAUSE_TILE:
+ tag |= OLF_TILE;
+ break;
+
default:
continue;
}
@@ -6366,14 +6383,20 @@ lower_oacc_head_mark (location_t loc, tree ddvar,
if (!tgt || is_oacc_parallel (tgt))
tag |= OLF_INDEPENDENT;
- /* A loop lacking SEQ, GANG, WORKER and/or VECTOR is implicitly AUTO. */
- if (!(tag & (((GOMP_DIM_MASK (GOMP_DIM_MAX) - 1) << OLF_DIM_BASE)
- | OLF_SEQ)))
- tag |= OLF_AUTO;
+ if (tag & OLF_TILE)
+ /* Tiling could use all 3 levels. */
+ levels = 3;
+ else
+ {
+ /* A loop lacking SEQ, GANG, WORKER and/or VECTOR could be AUTO.
+ Ensure at least one level, or 2 for possible auto
+ partitioning. */
+ bool maybe_auto = !(tag & (((GOMP_DIM_MASK (GOMP_DIM_MAX) - 1)
+ << OLF_DIM_BASE) | OLF_SEQ));
- /* Ensure at least one level. */
- if (!levels)
- levels++;
+ if (levels < 1u + maybe_auto)
+ levels = 1u + maybe_auto;
+ }
args.quick_push (build_int_cst (integer_type_node, levels));
args.quick_push (build_int_cst (integer_type_node, tag));
@@ -7445,7 +7468,9 @@ struct oacc_collapse
{
tree base; /* Base value. */
tree iters; /* Number of steps. */
- tree step; /* step size. */
+ tree step; /* Step size. */
+ tree tile; /* Tile increment (if tiled). */
+ tree outer; /* Tile iterator var. */
};
/* Helper for expand_oacc_for. Determine collapsed loop information.
@@ -7455,15 +7480,20 @@ struct oacc_collapse
static tree
expand_oacc_collapse_init (const struct omp_for_data *fd,
gimple_stmt_iterator *gsi,
- oacc_collapse *counts, tree bound_type)
+ oacc_collapse *counts, tree bound_type,
+ location_t loc)
{
+ tree tiling = fd->tiling;
tree total = build_int_cst (bound_type, 1);
int ix;
gcc_assert (integer_onep (fd->loop.step));
gcc_assert (integer_zerop (fd->loop.n1));
- for (ix = 0; ix != fd->collapse; ix++)
+ /* When tiling, the first operand of the tile clause applies to the
+ innermost loop, and we work outwards from there. Seems
+ backwards, but whatever. */
+ for (ix = fd->collapse; ix--;)
{
const omp_for_data_loop *loop = &fd->loops[ix];
@@ -7478,6 +7508,30 @@ expand_oacc_collapse_init (const struct omp_for_da
if (POINTER_TYPE_P (diff_type) || TYPE_UNSIGNED (diff_type))
diff_type = signed_type_for (diff_type);
+ if (tiling)
+ {
+ tree num = build_int_cst (integer_type_node, fd->collapse);
+ tree loop_no = build_int_cst (integer_type_node, ix);
+ tree tile = TREE_VALUE (tiling);
+ gcall *call = gimple_build_call_internal
+ (IFN_GOACC_TILE, 5, num, loop_no, tile,
+ /* gwv-outer=*/integer_zero_node,
+ /* gwv-inner=*/integer_zero_node);
+
+ counts[ix].outer = create_tmp_var (iter_type, ".outer");
+ counts[ix].tile = create_tmp_var (diff_type, ".tile");
+ gimple_call_set_lhs (call, counts[ix].tile);
+ gimple_set_location (call, loc);
+ gsi_insert_before (gsi, call, GSI_SAME_STMT);
+
+ tiling = TREE_CHAIN (tiling);
+ }
+ else
+ {
+ counts[ix].tile = NULL;
+ counts[ix].outer = loop->v;
+ }
+
tree b = loop->n1;
tree e = loop->n2;
tree s = loop->step;
@@ -7531,13 +7585,14 @@ expand_oacc_collapse_init (const struct omp_for_da
return total;
}
-/* Emit initializers for collapsed loop members. IVAR is the outer
+/* Emit initializers for collapsed loop members. INNER is true if
+ this is for the element loop of a TILE. IVAR is the outer
loop iteration variable, from which collapsed loop iteration values
are calculated. COUNTS array has been initialized by
expand_oacc_collapse_inits. */
static void
-expand_oacc_collapse_vars (const struct omp_for_data *fd,
+expand_oacc_collapse_vars (const struct omp_for_data *fd, bool inner,
gimple_stmt_iterator *gsi,
const oacc_collapse *counts, tree ivar)
{
@@ -7549,7 +7604,8 @@ static void
{
const omp_for_data_loop *loop = &fd->loops[ix];
const oacc_collapse *collapse = &counts[ix];
- tree iter_type = TREE_TYPE (loop->v);
+ tree v = inner ? loop->v : collapse->outer;
+ tree iter_type = TREE_TYPE (v);
tree diff_type = TREE_TYPE (collapse->step);
tree plus_type = iter_type;
enum tree_code plus_code = PLUS_EXPR;
@@ -7561,24 +7617,25 @@ static void
plus_type = sizetype;
}
- expr = fold_build2 (TRUNC_MOD_EXPR, ivar_type, ivar,
- fold_convert (ivar_type, collapse->iters));
+ expr = ivar;
+ if (ix)
+ {
+ tree mod = fold_convert (ivar_type, collapse->iters);
+ ivar = fold_build2 (TRUNC_DIV_EXPR, ivar_type, expr, mod);
+ expr = fold_build2 (TRUNC_MOD_EXPR, ivar_type, expr, mod);
+ ivar = force_gimple_operand_gsi (gsi, ivar, true, NULL_TREE,
+ true, GSI_SAME_STMT);
+ }
+
expr = fold_build2 (MULT_EXPR, diff_type, fold_convert (diff_type, expr),
collapse->step);
- expr = fold_build2 (plus_code, iter_type, collapse->base,
+ expr = fold_build2 (plus_code, iter_type,
+ inner ? collapse->outer : collapse->base,
fold_convert (plus_type, expr));
expr = force_gimple_operand_gsi (gsi, expr, false, NULL_TREE,
true, GSI_SAME_STMT);
- gassign *ass = gimple_build_assign (loop->v, expr);
+ gassign *ass = gimple_build_assign (v, expr);
gsi_insert_before (gsi, ass, GSI_SAME_STMT);
-
- if (ix)
- {
- expr = fold_build2 (TRUNC_DIV_EXPR, ivar_type, ivar,
- fold_convert (ivar_type, collapse->iters));
- ivar = force_gimple_operand_gsi (gsi, expr, true, NULL_TREE,
- true, GSI_SAME_STMT);
- }
}
}
@@ -11148,7 +11205,8 @@ expand_omp_taskloop_for_inner (struct omp_region *
where LTGT is < or >. We may have a specified chunking size, CHUNKING
(constant 0 for no chunking) and we will have a GWV partitioning
mask, specifying dimensions over which the loop is to be
- partitioned (see note below). We generate code that looks like:
+ partitioned (see note below). We generate code that looks like
+ (this ignores tiling):
<entry_bb> [incoming FALL->body, BRANCH->exit]
typedef signedintify (typeof (V)) T; // underlying signed integral type
@@ -11178,12 +11236,8 @@ expand_omp_taskloop_for_inner (struct omp_region *
<exit_bb> [incoming]
V = B + ((range -/+ 1) / S +/- 1) * S [*]
- [*] Needed if V live at end of loop
+ [*] Needed if V live at end of loop. */
- Note: CHUNKING & GWV mask are specified explicitly here. This is a
- transition, and will be specified by a more general mechanism shortly.
- */
-
static void
expand_oacc_for (struct omp_region *region, struct omp_for_data *fd)
{
@@ -11245,9 +11299,16 @@ expand_oacc_for (struct omp_region *region, struct
tree step = create_tmp_var (diff_type, ".step");
bool up = cond_code == LT_EXPR;
tree dir = build_int_cst (diff_type, up ? +1 : -1);
- bool chunking = !gimple_in_ssa_p (cfun);;
+ bool chunking = !gimple_in_ssa_p (cfun);
bool negating;
+ /* Tiling vars. */
+ tree tile_size = NULL_TREE;
+ tree element_s = NULL_TREE;
+ tree e_bound = NULL_TREE, e_offset = NULL_TREE, e_step = NULL_TREE;
+ basic_block elem_body_bb = NULL;
+ basic_block elem_cont_bb = NULL;
+
/* SSA instances. */
tree offset_incr = NULL_TREE;
tree offset_init = NULL_TREE;
@@ -11278,11 +11339,12 @@ expand_oacc_for (struct omp_region *region, struct
gwv = build_int_cst (integer_type_node, GOMP_DIM_MASK (GOMP_DIM_GANG));
}
- if (fd->collapse > 1)
+ if (fd->collapse > 1 || fd->tiling)
{
+ gcc_assert (!gimple_in_ssa_p (cfun) && up);
counts = XALLOCAVEC (struct oacc_collapse, fd->collapse);
tree total = expand_oacc_collapse_init (fd, &gsi, counts,
- TREE_TYPE (fd->loop.n2));
+ TREE_TYPE (fd->loop.n2), loc);
if (SSA_VAR_P (fd->loop.n2))
{
@@ -11291,7 +11353,6 @@ expand_oacc_for (struct omp_region *region, struct
ass = gimple_build_assign (fd->loop.n2, total);
gsi_insert_before (&gsi, ass, GSI_SAME_STMT);
}
-
}
tree b = fd->loop.n1;
@@ -11315,6 +11376,29 @@ expand_oacc_for (struct omp_region *region, struct
expr = fold_convert (diff_type, chunk_size);
chunk_size = force_gimple_operand_gsi (&gsi, expr, true,
NULL_TREE, true, GSI_SAME_STMT);
+
+ if (fd->tiling)
+ {
+ /* Determine the tile size and element step,
+ modify the outer loop step size. */
+ tile_size = create_tmp_var (diff_type, ".tile_size");
+ expr = build_int_cst (diff_type, 1);
+ for (int ix = 0; ix < fd->collapse; ix++)
+ expr = fold_build2 (MULT_EXPR, diff_type, counts[ix].tile, expr);
+ expr = force_gimple_operand_gsi (&gsi, expr, true,
+ NULL_TREE, true, GSI_SAME_STMT);
+ ass = gimple_build_assign (tile_size, expr);
+ gsi_insert_before (&gsi, ass, GSI_SAME_STMT);
+
+ element_s = create_tmp_var (diff_type, ".element_s");
+ ass = gimple_build_assign (element_s, s);
+ gsi_insert_before (&gsi, ass, GSI_SAME_STMT);
+
+ expr = fold_build2 (MULT_EXPR, diff_type, s, tile_size);
+ s = force_gimple_operand_gsi (&gsi, expr, true,
+ NULL_TREE, true, GSI_SAME_STMT);
+ }
+
/* Determine the range, avoiding possible unsigned->signed overflow. */
negating = !up && TYPE_UNSIGNED (iter_type);
expr = fold_build2 (MINUS_EXPR, plus_type,
@@ -11419,8 +11503,75 @@ expand_oacc_for (struct omp_region *region, struct
true, GSI_SAME_STMT);
ass = gimple_build_assign (v, expr);
gsi_insert_before (&gsi, ass, GSI_SAME_STMT);
- if (fd->collapse > 1)
- expand_oacc_collapse_vars (fd, &gsi, counts, v);
+
+ if (fd->collapse > 1 || fd->tiling)
+ expand_oacc_collapse_vars (fd, false, &gsi, counts, v);
+
+ if (fd->tiling)
+ {
+ /* Determine the range of the element loop -- usually simply
+ the tile_size, but could be smaller if the final
+ iteration of the outer loop is a partial tile. */
+ tree e_range = create_tmp_var (diff_type, ".e_range");
+
+ expr = build2 (MIN_EXPR, diff_type,
+ build2 (MINUS_EXPR, diff_type, bound, offset),
+ build2 (MULT_EXPR, diff_type, tile_size,
+ element_s));
+ expr = force_gimple_operand_gsi (&gsi, expr, false, NULL_TREE,
+ true, GSI_SAME_STMT);
+ ass = gimple_build_assign (e_range, expr);
+ gsi_insert_before (&gsi, ass, GSI_SAME_STMT);
+
+ /* Determine bound, offset & step of inner loop. */
+ e_bound = create_tmp_var (diff_type, ".e_bound");
+ e_offset = create_tmp_var (diff_type, ".e_offset");
+ e_step = create_tmp_var (diff_type, ".e_step");
+
+ /* Mark these as element loops. */
+ tree e_gwv = integer_minus_one_node;
+ tree chunk = build_int_cst (diff_type, 0); /* Never chunked. */
+
+ call = gimple_build_call_internal
+ (IFN_GOACC_LOOP, 7,
+ build_int_cst (integer_type_node, IFN_GOACC_LOOP_OFFSET),
+ dir, e_range, element_s, chunk, e_gwv, chunk);
+ gimple_call_set_lhs (call, e_offset);
+ gimple_set_location (call, loc);
+ gsi_insert_before (&gsi, call, GSI_SAME_STMT);
+
+ call = gimple_build_call_internal
+ (IFN_GOACC_LOOP, 7,
+ build_int_cst (integer_type_node, IFN_GOACC_LOOP_BOUND),
+ dir, e_range, element_s, chunk, e_gwv, e_offset);
+ gimple_call_set_lhs (call, e_bound);
+ gimple_set_location (call, loc);
+ gsi_insert_before (&gsi, call, GSI_SAME_STMT);
+
+ call = gimple_build_call_internal
+ (IFN_GOACC_LOOP, 6,
+ build_int_cst (integer_type_node, IFN_GOACC_LOOP_STEP),
+ dir, e_range, element_s, chunk, e_gwv);
+ gimple_call_set_lhs (call, e_step);
+ gimple_set_location (call, loc);
+ gsi_insert_before (&gsi, call, GSI_SAME_STMT);
+
+ /* Add test and split block. */
+ expr = build2 (cond_code, boolean_type_node, e_offset, e_bound);
+ stmt = gimple_build_cond_empty (expr);
+ gsi_insert_before (&gsi, stmt, GSI_SAME_STMT);
+ split = split_block (body_bb, stmt);
+ elem_body_bb = split->dest;
+ if (cont_bb == body_bb)
+ cont_bb = elem_body_bb;
+ body_bb = split->src;
+
+ split->flags ^= EDGE_FALLTHRU | EDGE_TRUE_VALUE;
+
+ /* Initialize the user's loop vars. */
+ gsi = gsi_start_bb (elem_body_bb);
+ expand_oacc_collapse_vars (fd, true, &gsi, counts, e_offset);
+ }
}
/* Loop increment goes into cont_bb. If this is not a loop, we
@@ -11434,9 +11585,33 @@ expand_oacc_for (struct omp_region *region, struct
gomp_continue *cont_stmt = as_a <gomp_continue *> (gsi_stmt (gsi));
loc = gimple_location (cont_stmt);
+ if (fd->tiling)
+ {
+ /* Insert element loop increment and test. */
+ expr = build2 (PLUS_EXPR, diff_type, e_offset, e_step);
+ expr = force_gimple_operand_gsi (&gsi, expr, false, NULL_TREE,
+ true, GSI_SAME_STMT);
+ ass = gimple_build_assign (e_offset, expr);
+ gsi_insert_before (&gsi, ass, GSI_SAME_STMT);
+ expr = build2 (cond_code, boolean_type_node, e_offset, e_bound);
+
+ stmt = gimple_build_cond_empty (expr);
+ gsi_insert_before (&gsi, stmt, GSI_SAME_STMT);
+ split = split_block (cont_bb, stmt);
+ elem_cont_bb = split->src;
+ cont_bb = split->dest;
+
+ split->flags ^= EDGE_FALLTHRU | EDGE_FALSE_VALUE;
+ make_edge (elem_cont_bb, elem_body_bb, EDGE_TRUE_VALUE);
+
+ make_edge (body_bb, cont_bb, EDGE_FALSE_VALUE);
+
+ gsi = gsi_for_stmt (cont_stmt);
+ }
+
/* Increment offset. */
if (gimple_in_ssa_p (cfun))
- expr= build2 (plus_code, iter_type, offset,
+ expr = build2 (plus_code, iter_type, offset,
fold_convert (plus_type, step));
else
expr = build2 (PLUS_EXPR, diff_type, offset, step);
@@ -11510,7 +11685,7 @@ expand_oacc_for (struct omp_region *region, struct
if (cont_bb)
{
- /* We now have one or two nested loops. Update the loop
+ /* We now have one, two or three nested loops. Update the loop
structures. */
struct loop *parent = entry_bb->loop_father;
struct loop *body = body_bb->loop_father;
@@ -11537,6 +11712,15 @@ expand_oacc_for (struct omp_region *region, struct
body_loop->header = body_bb;
body_loop->latch = cont_bb;
add_loop (body_loop, parent);
+
+ if (fd->tiling)
+ {
+ /* Insert tiling's element loop. */
+ struct loop *inner_loop = alloc_loop ();
+ inner_loop->header = elem_body_bb;
+ inner_loop->latch = elem_cont_bb;
+ add_loop (inner_loop, body_loop);
+ }
}
}
}
@@ -18555,6 +18739,23 @@ omp_finish_file (void)
}
}
+/* Call dim_pos (POS == true) or dim_size (POS == false) builtins for
+ axis DIM. Return a tmp var holding the result. */
+
+static tree
+oacc_dim_call (bool pos, int dim, gimple_seq *seq)
+{
+ tree arg = build_int_cst (unsigned_type_node, dim);
+ tree size = create_tmp_var (integer_type_node);
+ enum internal_fn fn = pos ? IFN_GOACC_DIM_POS : IFN_GOACC_DIM_SIZE;
+ gimple *call = gimple_build_call_internal (fn, 1, arg);
+
+ gimple_call_set_lhs (call, size);
+ gimple_seq_add_stmt (seq, call);
+
+ return size;
+}
+
/* Find the number of threads (POS = false), or thread number (POS =
true) for an OpenACC region partitioned as MASK. Setup code
required for the calculation is added to SEQ. */
@@ -18569,29 +18770,17 @@ oacc_thread_numbers (bool pos, int mask, gimple_se
for (ix = GOMP_DIM_GANG; ix != GOMP_DIM_MAX; ix++)
if (GOMP_DIM_MASK (ix) & mask)
{
- tree arg = build_int_cst (unsigned_type_node, ix);
-
if (res)
{
/* We had an outer index, so scale that by the size of
this dimension. */
- tree n = create_tmp_var (integer_type_node);
- gimple *call
- = gimple_build_call_internal (IFN_GOACC_DIM_SIZE, 1, arg);
-
- gimple_call_set_lhs (call, n);
- gimple_seq_add_stmt (seq, call);
+ tree n = oacc_dim_call (false, ix, seq);
res = fold_build2 (MULT_EXPR, integer_type_node, res, n);
}
if (pos)
{
/* Determine index in this dimension. */
- tree id = create_tmp_var (integer_type_node);
- gimple *call = gimple_build_call_internal
- (IFN_GOACC_DIM_POS, 1, arg);
-
- gimple_call_set_lhs (call, id);
- gimple_seq_add_stmt (seq, call);
+ tree id = oacc_dim_call (true, ix, seq);
if (res)
res = fold_build2 (PLUS_EXPR, integer_type_node, res, id);
else
@@ -18790,6 +18979,84 @@ oacc_xform_loop (gcall *call)
gsi_replace_with_seq (&gsi, seq, true);
}
+/* Transform a GOACC_TILE call. Determines the element loop span for
+ the specified loop of the nest. This is 1 if we're not tiling.
+
+ GOACC_TILE (collapse_count, loop_no, tile_arg, gwv_tile, gwv_element); */
+
+static void
+oacc_xform_tile (gcall *call)
+{
+ gimple_stmt_iterator gsi = gsi_for_stmt (call);
+ unsigned collapse = (unsigned) TREE_INT_CST_LOW (gimple_call_arg (call, 0));
+ /* Inner loops have higher loop_nos. */
+ unsigned loop_no = (unsigned) TREE_INT_CST_LOW (gimple_call_arg (call, 1));
+ tree tile_size = gimple_call_arg (call, 2);
+ unsigned e_mask = (unsigned) TREE_INT_CST_LOW (gimple_call_arg (call, 4));
+ tree lhs = gimple_call_lhs (call);
+ tree type = TREE_TYPE (lhs);
+ gimple_seq seq = NULL;
+ tree span = build_int_cst (type, 1);
+
+ gcc_assert (!(e_mask
+ & ~(GOMP_DIM_MASK (GOMP_DIM_VECTOR)
+ | GOMP_DIM_MASK (GOMP_DIM_WORKER))));
+ push_gimplify_context (!seen_error ());
+ if (
+#ifndef ACCEL_COMPILER
+ 1 ||
+#endif
+ !e_mask)
+ /* Not partitioning. */
+ span = integer_one_node;
+ else if (!integer_zerop (tile_size))
+ /* User explicitly specified size. */
+ span = tile_size;
+ else
+ {
+ /* Pick a size based on the partitioning of the element loop and
+ the number of loop nests. */
+ tree first_size = NULL_TREE;
+ tree second_size = NULL_TREE;
+
+ if (e_mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR))
+ first_size = oacc_dim_call (false, GOMP_DIM_VECTOR, &seq);
+ if (e_mask & GOMP_DIM_MASK (GOMP_DIM_WORKER))
+ second_size = oacc_dim_call (false, GOMP_DIM_WORKER, &seq);
+
+ if (!first_size)
+ {
+ first_size = second_size;
+ second_size = NULL_TREE;
+ }
+
+ if (loop_no + 1 == collapse)
+ {
+ span = first_size;
+ if (!loop_no && second_size)
+ span = fold_build2 (MULT_EXPR, TREE_TYPE (span),
+ span, second_size);
+ }
+ else if (loop_no + 2 == collapse)
+ span = second_size;
+ else
+ span = NULL_TREE;
+
+ if (!span)
+ /* There's no obvious element size for this loop. Options
+ are 1, first_size or some non-unity constant (32 is my
+ favourite). We should gather some statistics. */
+ span = first_size;
+ }
+
+ span = fold_convert (type, span);
+ gimplify_assign (lhs, span, &seq);
+
+ pop_gimplify_context (NULL);
+
+ gsi_replace_with_seq (&gsi, seq, true);
+}
+
/* Default partitioned and minimum partitioned dimensions. */
static int oacc_default_dims[GOMP_DIM_MAX];
@@ -18948,8 +19215,7 @@ new_oacc_loop_raw (oacc_loop *parent, location_t l
memset (loop->tails, 0, sizeof (loop->tails));
loop->routine = NULL_TREE;
- loop->mask = loop->flags = loop->inner = 0;
- loop->ifns = 0;
+ loop->mask = loop->e_mask = loop->flags = loop->inner = 0;
loop->chunk_size = 0;
loop->head_end = NULL;
@@ -19012,7 +19278,7 @@ static oacc_loop *
finish_oacc_loop (oacc_loop *loop)
{
/* If the loop has been collapsed, don't partition it. */
- if (!loop->ifns)
+ if (loop->ifns.is_empty ())
loop->mask = loop->flags = 0;
return loop->parent;
}
@@ -19148,9 +19414,10 @@ oacc_loop_discover_walk (oacc_loop *loop, basic_bl
break;
case IFN_GOACC_LOOP:
- /* Count the goacc loop abstraction fns, to determine if the
- loop was collapsed already. */
- loop->ifns++;
+ case IFN_GOACC_TILE:
+ /* Record the abstraction function, so we can manipulate it
+ later. */
+ loop->ifns.safe_push (call);
break;
case IFN_UNIQUE:
@@ -19285,51 +19552,6 @@ oacc_loop_xform_head_tail (gcall *from, int level)
}
}
-/* Transform the IFN_GOACC_LOOP internal functions by providing the
- determined partitioning mask and chunking argument. END_MARKER
- points at the end IFN_HEAD_TAIL call intgroducing the loop. IFNS
- is the number of IFN_GOACC_LOOP calls for the loop. MASK_ARG is
- the replacement partitioning mask and CHUNK_ARG is the replacement
- chunking arg. */
-
-static void
-oacc_loop_xform_loop (gcall *end_marker, unsigned ifns,
- tree mask_arg, tree chunk_arg)
-{
- gimple_stmt_iterator gsi = gsi_for_stmt (end_marker);
-
- gcc_checking_assert (ifns);
- for (;;)
- {
- for (; !gsi_end_p (gsi); gsi_next (&gsi))
- {
- gimple *stmt = gsi_stmt (gsi);
-
- if (!is_gimple_call (stmt))
- continue;
-
- gcall *call = as_a <gcall *> (stmt);
-
- if (!gimple_call_internal_p (call))
- continue;
-
- if (gimple_call_internal_fn (call) != IFN_GOACC_LOOP)
- continue;
-
- *gimple_call_arg_ptr (call, 5) = mask_arg;
- *gimple_call_arg_ptr (call, 4) = chunk_arg;
- ifns--;
- if (!ifns)
- return;
- }
-
- /* The LOOP_BOUND ifn could be in the single successor
- block. */
- basic_block bb = single_succ (gsi_bb (gsi));
- gsi = gsi_start_bb (bb);
- }
-}
-
/* Process the discovered OpenACC loops, setting the correct
partitioning level etc. */
@@ -19342,13 +19564,34 @@ oacc_loop_process (oacc_loop *loop)
if (loop->mask && !loop->routine)
{
int ix;
- unsigned mask = loop->mask;
- unsigned dim = GOMP_DIM_GANG;
- tree mask_arg = build_int_cst (unsigned_type_node, mask);
+ tree mask_arg = build_int_cst (unsigned_type_node, loop->mask);
+ tree e_mask_arg = build_int_cst (unsigned_type_node, loop->e_mask);
tree chunk_arg = loop->chunk_size;
+ gcall *call;
- oacc_loop_xform_loop (loop->head_end, loop->ifns, mask_arg, chunk_arg);
+ for (ix = 0; loop->ifns.iterate (ix, &call); ix++)
+ switch (gimple_call_internal_fn (call))
+ {
+ case IFN_GOACC_LOOP:
+ {
+ bool is_e = gimple_call_arg (call, 5) == integer_minus_one_node;
+ *gimple_call_arg_ptr (call, 5) = is_e ? e_mask_arg : mask_arg;
+ if (!is_e)
+ *gimple_call_arg_ptr (call, 4) = chunk_arg;
+ }
+ break;
+ case IFN_GOACC_TILE:
+ *gimple_call_arg_ptr (call, 3) = mask_arg;
+ *gimple_call_arg_ptr (call, 4) = e_mask_arg;
+ break;
+
+ default:
+ gcc_unreachable ();
+ }
+
+ unsigned dim = GOMP_DIM_GANG;
+ unsigned mask = loop->mask | loop->e_mask;
for (ix = 0; ix != GOMP_DIM_MAX && mask; ix++)
{
while (!(GOMP_DIM_MASK (dim) & mask))
@@ -19388,10 +19631,16 @@ oacc_loop_fixed_partitions (oacc_loop *loop, unsig
{
bool auto_par = (loop->flags & OLF_AUTO) != 0;
bool seq_par = (loop->flags & OLF_SEQ) != 0;
+ bool tiling = (loop->flags & OLF_TILE) != 0;
this_mask = ((loop->flags >> OLF_DIM_BASE)
& (GOMP_DIM_MASK (GOMP_DIM_MAX) - 1));
+ /* Apply auto partitioning if this is a non-partitioned regular
+ loop, or (no more than) single axis tiled loop. */
+ bool maybe_auto = !seq_par
+ && this_mask == (tiling ? this_mask & -this_mask : 0);
+
if ((this_mask != 0) + auto_par + seq_par > 1)
{
if (noisy)
@@ -19399,7 +19648,7 @@ oacc_loop_fixed_partitions (oacc_loop *loop, unsig
seq_par
? "%<seq%> overrides other OpenACC loop specifiers"
: "%<auto%> conflicts with other OpenACC loop specifiers");
- auto_par = false;
+ maybe_auto = false;
loop->flags &= ~OLF_AUTO;
if (seq_par)
{
@@ -19408,15 +19657,18 @@ oacc_loop_fixed_partitions (oacc_loop *loop, unsig
this_mask = 0;
}
}
- if (auto_par && (loop->flags & OLF_INDEPENDENT))
- mask_all |= GOMP_DIM_MASK (GOMP_DIM_MAX);
+ if (maybe_auto && (loop->flags & OLF_INDEPENDENT))
+ {
+ loop->flags |= OLF_AUTO;
+ mask_all |= GOMP_DIM_MASK (GOMP_DIM_MAX);
+ }
}
if (this_mask & outer_mask)
{
const oacc_loop *outer;
for (outer = loop->parent; outer; outer = outer->parent)
- if (outer->mask & this_mask)
+ if ((outer->mask | outer->e_mask) & this_mask)
break;
if (noisy)
@@ -19462,13 +19714,33 @@ oacc_loop_fixed_partitions (oacc_loop *loop, unsig
}
}
+ mask_all |= this_mask;
+
+ if (loop->flags & OLF_TILE)
+ {
+ /* When tiling, vector goes to the element loop, and failing
+ that we put worker there. The std doesn't contemplate
+ specifying all three. We choose to put worker and vector on
+ the element loops in that case. */
+ unsigned this_e_mask = this_mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR);
+ if (!this_e_mask || this_mask & GOMP_DIM_MASK (GOMP_DIM_GANG))
+ this_e_mask |= this_mask & GOMP_DIM_MASK (GOMP_DIM_WORKER);
+
+ loop->e_mask = this_e_mask;
+ this_mask ^= this_e_mask;
+ }
+
loop->mask = this_mask;
- mask_all |= this_mask;
-
+
+ if (dump_file)
+ fprintf (dump_file, "Loop %s:%d user specified %d & %d\n",
+ LOCATION_FILE (loop->loc), LOCATION_LINE (loop->loc),
+ loop->mask, loop->e_mask);
+
if (loop->child)
{
- loop->inner = oacc_loop_fixed_partitions (loop->child,
- outer_mask | this_mask);
+ loop->inner = oacc_loop_fixed_partitions
+ (loop->child, outer_mask | this_mask | loop->e_mask);
mask_all |= loop->inner;
}
@@ -19480,14 +19752,17 @@ oacc_loop_fixed_partitions (oacc_loop *loop, unsig
/* Walk the OpenACC loop heirarchy to assign auto-partitioned loops.
OUTER_MASK is the partitioning this loop is contained within.
+ OUTER_ASSIGN is true if an outer loop is being auto-partitioned.
Return the cumulative partitioning used by this loop, siblings and
children. */
static unsigned
-oacc_loop_auto_partitions (oacc_loop *loop, unsigned outer_mask)
+oacc_loop_auto_partitions (oacc_loop *loop, unsigned outer_mask,
+ bool outer_assign)
{
bool assign = (loop->flags & OLF_AUTO) && (loop->flags & OLF_INDEPENDENT);
bool noisy = true;
+ bool tiling = loop->flags & OLF_TILE;
#ifdef ACCEL_COMPILER
/* When device_type is supported, we want the device compiler to be
@@ -19495,59 +19770,97 @@ static unsigned
noisy = false;
#endif
- if (assign && outer_mask < GOMP_DIM_MASK (GOMP_DIM_MAX - 1))
+ if (assign && (!outer_assign | loop->inner))
{
- /* Allocate the outermost loop at the outermost available
- level. */
- unsigned this_mask = outer_mask + 1;
+ /* Allocate outermost and non-innermost loops at the outermost
+ non-innermost available level. */
+ unsigned this_mask = GOMP_DIM_MASK (GOMP_DIM_GANG);
+
+ /* Find the first outermost available partition. */
+ while (this_mask <= outer_mask)
+ this_mask <<= 1;
- if (!(this_mask & loop->inner))
- loop->mask = this_mask;
- }
+ /* Grab two axes if tiling, and we've not assigned anything. */
+ if (tiling && !(loop->mask | loop->e_mask))
+ this_mask |= this_mask << 1;
- if (loop->child)
- {
- unsigned child_mask = outer_mask | loop->mask;
+ /* Prohibit the innermost partitioning at the moment. */
+ this_mask &= GOMP_DIM_MASK (GOMP_DIM_MAX - 1) - 1;
- if (loop->mask || assign)
- child_mask |= GOMP_DIM_MASK (GOMP_DIM_MAX);
+ /* Don't use any dimension explicitly claimed by an inner loop. */
+ this_mask &= ~loop->inner;
- loop->inner = oacc_loop_auto_partitions (loop->child, child_mask);
+ if (tiling && !loop->e_mask)
+ {
+ /* If we got two axes, allocate the inner one to the element
+ loop. */
+ loop->e_mask = this_mask & (this_mask << 1);
+ this_mask ^= loop->e_mask;
+ }
+
+ loop->mask |= this_mask;
}
- if (assign && !loop->mask)
+ if (loop->child)
+ loop->inner = oacc_loop_auto_partitions
+ (loop->child, outer_mask | loop->mask | loop->e_mask,
+ outer_assign | assign);
+
+ if (assign && (!loop->mask || (tiling && !loop->e_mask) || !outer_assign))
{
- /* Allocate the loop at the innermost available level. */
+ /* Allocate the loop at the innermost available level. Note
+ that we do this even if we already assigned this loop the
+ outermost available level above. That way we'll partition
+ this along 2 axes, if they are available. */
unsigned this_mask = 0;
-
+
/* Determine the outermost partitioning used within this loop. */
this_mask = loop->inner | GOMP_DIM_MASK (GOMP_DIM_MAX);
this_mask = least_bit_hwi (this_mask);
/* Pick the partitioning just inside that one. */
this_mask >>= 1;
-
/* And avoid picking one use by an outer loop. */
this_mask &= ~outer_mask;
- if (!this_mask && noisy)
+ /* If tiling and we failed completely above, grab the next one
+ too. Making sure it doesn't hit an outer loop. */
+ if (tiling)
+ {
+ this_mask &= ~(loop->e_mask | loop->mask);
+ unsigned tile_mask = ((this_mask >> 1)
+ & ~(outer_mask | loop->e_mask | loop->mask));
+
+ if (tile_mask || loop->mask)
+ {
+ loop->e_mask |= this_mask;
+ this_mask = tile_mask;
+ }
+ if (!loop->e_mask && noisy)
+ warning_at (loop->loc, 0,
+ "insufficient partitioning available"
+ " to parallelize element loop");
+ }
+
+ loop->mask |= this_mask;
+ if (!loop->mask && noisy)
warning_at (loop->loc, 0,
- "insufficient partitioning available to parallelize loop");
-
- loop->mask = this_mask;
+ "insufficient partitioning available"
+ " to parallelize%s loop", tiling ? " tile" : "");
}
if (assign && dump_file)
- fprintf (dump_file, "Auto loop %s:%d assigned %d\n",
+ fprintf (dump_file, "Auto loop %s:%d assigned %d & %d\n",
LOCATION_FILE (loop->loc), LOCATION_LINE (loop->loc),
- loop->mask);
+ loop->mask, loop->e_mask);
unsigned inner_mask = 0;
if (loop->sibling)
- inner_mask |= oacc_loop_auto_partitions (loop->sibling, outer_mask);
+ inner_mask |= oacc_loop_auto_partitions (loop->sibling,
+ outer_mask, outer_assign);
- inner_mask |= loop->inner | loop->mask;
+ inner_mask |= loop->inner | loop->mask | loop->e_mask;
return inner_mask;
}
@@ -19563,7 +19876,7 @@ oacc_loop_partition (oacc_loop *loop, unsigned out
if (mask_all & GOMP_DIM_MASK (GOMP_DIM_MAX))
{
mask_all ^= GOMP_DIM_MASK (GOMP_DIM_MAX);
- mask_all |= oacc_loop_auto_partitions (loop, outer_mask);
+ mask_all |= oacc_loop_auto_partitions (loop, outer_mask, false);
}
return mask_all;
}
@@ -19713,6 +20026,11 @@ execute_oacc_device_lower ()
{
default: break;
+ case IFN_GOACC_TILE:
+ oacc_xform_tile (call);
+ rescan = true;
+ break;
+
case IFN_GOACC_LOOP:
oacc_xform_loop (call);
rescan = true;
@@ -19740,7 +20058,7 @@ execute_oacc_device_lower ()
switch (kind)
{
default:
- gcc_unreachable ();
+ break;
case IFN_UNIQUE_OACC_FORK:
case IFN_UNIQUE_OACC_JOIN: