/* OpenACC worker partitioning via middle end neutering/broadcasting scheme
|
|
|
|
Copyright (C) 2015-2022 Free Software Foundation, Inc.
|
|
|
|
This file is part of GCC.
|
|
|
|
GCC is free software; you can redistribute it and/or modify it
|
|
under the terms of the GNU General Public License as published
|
|
by the Free Software Foundation; either version 3, or (at your
|
|
option) any later version.
|
|
|
|
GCC is distributed in the hope that it will be useful, but WITHOUT
|
|
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
|
|
or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public
|
|
License for more details.
|
|
|
|
You should have received a copy of the GNU General Public License
|
|
along with GCC; see the file COPYING3. If not see
|
|
<http://www.gnu.org/licenses/>. */
|
|
|
|
#include "config.h"
|
|
#include "system.h"
|
|
#include "coretypes.h"
|
|
#include "backend.h"
|
|
#include "rtl.h"
|
|
#include "tree.h"
|
|
#include "gimple.h"
|
|
#include "tree-pass.h"
|
|
#include "ssa.h"
|
|
#include "cgraph.h"
|
|
#include "pretty-print.h"
|
|
#include "fold-const.h"
|
|
#include "gimplify.h"
|
|
#include "gimple-iterator.h"
|
|
#include "gimple-walk.h"
|
|
#include "tree-inline.h"
|
|
#include "langhooks.h"
|
|
#include "omp-general.h"
|
|
#include "omp-low.h"
|
|
#include "gimple-pretty-print.h"
|
|
#include "cfghooks.h"
|
|
#include "insn-config.h"
|
|
#include "recog.h"
|
|
#include "internal-fn.h"
|
|
#include "bitmap.h"
|
|
#include "tree-nested.h"
|
|
#include "stor-layout.h"
|
|
#include "tree-ssa-threadupdate.h"
|
|
#include "tree-into-ssa.h"
|
|
#include "splay-tree.h"
|
|
#include "target.h"
|
|
#include "cfgloop.h"
|
|
#include "tree-cfg.h"
|
|
#include "omp-offload.h"
|
|
#include "attribs.h"
|
|
#include "targhooks.h"
|
|
#include "diagnostic-core.h"
|
|
|
|
/* Loop structure of the function.  The entire function is described as
   a NULL loop.  */
/* Adapted from 'gcc/config/nvptx/nvptx.cc:struct parallel'.  */

struct parallel_g
{
  /* Parent parallel.  */
  parallel_g *parent;

  /* Next sibling parallel.  */
  parallel_g *next;

  /* First child parallel.  */
  parallel_g *inner;

  /* Partitioning mask of the parallel (a combination of GOMP_DIM_MASK
     bits; 0 means "gang redundant", see mask_name).  */
  unsigned mask;

  /* Partitioning used within inner parallels.  */
  unsigned inner_mask;

  /* Location of parallel forked and join.  The forked is the first
     block in the parallel and the join is the first block after of
     the partition.  */
  basic_block forked_block;
  basic_block join_block;

  /* Statements marking the fork/join points; assigned by
     omp_sese_find_par while walking the CFG.  */
  gimple *forked_stmt;
  gimple *join_stmt;

  gimple *fork_stmt;
  gimple *joining_stmt;

  /* Basic blocks in this parallel, but not in child parallels.  The
     FORKED and JOINING blocks are in the partition.  The FORK and JOIN
     blocks are not.  */
  auto_vec<basic_block> blocks;

  /* Record type used to broadcast values from the single active worker
     to the others, plus the sending and receiving decls of that type.  */
  tree record_type;
  tree sender_decl;
  tree receiver_decl;

public:
  parallel_g (parallel_g *parent, unsigned mode);
  ~parallel_g ();
};
|
|
|
|
/* Constructor links the new parallel into it's parent's chain of
|
|
children. */
|
|
|
|
parallel_g::parallel_g (parallel_g *parent_, unsigned mask_)
|
|
:parent (parent_), next (0), inner (0), mask (mask_), inner_mask (0)
|
|
{
|
|
forked_block = join_block = 0;
|
|
forked_stmt = join_stmt = NULL;
|
|
fork_stmt = joining_stmt = NULL;
|
|
|
|
record_type = NULL_TREE;
|
|
sender_decl = NULL_TREE;
|
|
receiver_decl = NULL_TREE;
|
|
|
|
if (parent)
|
|
{
|
|
next = parent->inner;
|
|
parent->inner = this;
|
|
}
|
|
}
|
|
|
|
/* Destructor: deleting a parallel recursively deletes its first child
   and its next sibling, so deleting the root tears down the whole
   tree.  */

parallel_g::~parallel_g ()
{
  delete inner;
  delete next;
}
|
|
|
|
static bool
|
|
local_var_based_p (tree decl)
|
|
{
|
|
switch (TREE_CODE (decl))
|
|
{
|
|
case VAR_DECL:
|
|
return !is_global_var (decl);
|
|
|
|
case COMPONENT_REF:
|
|
case BIT_FIELD_REF:
|
|
case ARRAY_REF:
|
|
return local_var_based_p (TREE_OPERAND (decl, 0));
|
|
|
|
default:
|
|
return false;
|
|
}
|
|
}
|
|
|
|
/* Map of basic blocks to gimple stmts.  Populated by
   omp_sese_split_blocks with the "head" statement of each split-off
   block, and consumed by omp_sese_find_par.  */
typedef hash_map<basic_block, gimple *> bb_stmt_map_t;
|
|
|
|
/* Calls to OpenACC routines are made by all workers/wavefronts/warps, since
|
|
the routine likely contains partitioned loops (else will do its own
|
|
neutering and variable propagation). Return TRUE if a function call CALL
|
|
should be made in (worker) single mode instead, rather than redundant
|
|
mode. */
|
|
|
|
static bool
|
|
omp_sese_active_worker_call (gcall *call)
|
|
{
|
|
#define GOMP_DIM_SEQ GOMP_DIM_MAX
|
|
tree fndecl = gimple_call_fndecl (call);
|
|
|
|
if (!fndecl)
|
|
return true;
|
|
|
|
tree attrs = oacc_get_fn_attrib (fndecl);
|
|
|
|
if (!attrs)
|
|
return true;
|
|
|
|
int level = oacc_fn_attrib_level (attrs);
|
|
|
|
/* Neither regular functions nor "seq" routines should be run by all threads
|
|
in worker-single mode. */
|
|
return level == -1 || level == GOMP_DIM_SEQ;
|
|
#undef GOMP_DIM_SEQ
|
|
}
|
|
|
|
/* Split basic blocks such that each forked and join unspecs are at
   the start of their basic blocks.  Thus afterwards each block will
   have a single partitioning mode.  We also do the same for return
   insns, as they are executed by every thread.  Return the
   partitioning mode of the function as a whole.  Populate MAP with
   head and tail blocks.  We also clear the BB visited flag, which is
   used when finding partitions.  */
/* Adapted from 'gcc/config/nvptx/nvptx.cc:nvptx_split_blocks'.  */

static void
omp_sese_split_blocks (bb_stmt_map_t *map)
{
  auto_vec<gimple *> worklist;
  basic_block block;

  /* Locate all the reorg instructions of interest.  */
  FOR_ALL_BB_FN (block, cfun)
    {
      /* Clear visited flag, for use by parallel locator  */
      block->flags &= ~BB_VISITED;

      for (gimple_stmt_iterator gsi = gsi_start_bb (block);
	   !gsi_end_p (gsi);
	   gsi_next (&gsi))
	{
	  gimple *stmt = gsi_stmt (gsi);

	  if (gimple_call_internal_p (stmt, IFN_UNIQUE))
	    {
	      enum ifn_unique_kind k = ((enum ifn_unique_kind)
		TREE_INT_CST_LOW (gimple_call_arg (stmt, 0)));

	      if (k == IFN_UNIQUE_OACC_JOIN)
		worklist.safe_push (stmt);
	      else if (k == IFN_UNIQUE_OACC_FORK)
		{
		  /* The fork marker is expected to end its block; the
		     forked region starts in the single successor.  */
		  gcc_assert (gsi_one_before_end_p (gsi));
		  basic_block forked_block = single_succ (block);
		  gimple_stmt_iterator gsi2 = gsi_start_bb (forked_block);

		  /* We push a NOP as a placeholder for the "forked" stmt.
		     This is then recognized in omp_sese_find_par.  */
		  gimple *nop = gimple_build_nop ();
		  gsi_insert_before (&gsi2, nop, GSI_SAME_STMT);

		  worklist.safe_push (nop);
		}
	    }
	  else if (gimple_code (stmt) == GIMPLE_RETURN
		   || gimple_code (stmt) == GIMPLE_COND
		   || gimple_code (stmt) == GIMPLE_SWITCH
		   || (gimple_code (stmt) == GIMPLE_CALL
		       && !gimple_call_internal_p (stmt)
		       && !omp_sese_active_worker_call (as_a <gcall *> (stmt))))
	    worklist.safe_push (stmt);
	  else if (is_gimple_assign (stmt))
	    {
	      tree lhs = gimple_assign_lhs (stmt);

	      /* Force assignments to components/fields/elements of local
		 aggregates into fully-partitioned (redundant) mode.  This
		 avoids having to broadcast the whole aggregate.  The RHS of
		 the assignment will be propagated using the normal
		 mechanism.  */

	      switch (TREE_CODE (lhs))
		{
		case COMPONENT_REF:
		case BIT_FIELD_REF:
		case ARRAY_REF:
		  {
		    tree aggr = TREE_OPERAND (lhs, 0);

		    if (local_var_based_p (aggr))
		      worklist.safe_push (stmt);
		  }
		  break;

		default:
		  ;
		}
	    }
	}
    }

  /* Split blocks on the worklist.  */
  unsigned ix;
  gimple *stmt;

  for (ix = 0; worklist.iterate (ix, &stmt); ix++)
    {
      basic_block block = gimple_bb (stmt);

      if (gimple_code (stmt) == GIMPLE_COND)
	{
	  /* Rewrite "if (a OP b)" as "pred = a OP b; if (pred != false)"
	     so the comparison (which may need broadcast values) can be
	     separated from the branch by a block split.  */
	  gcond *orig_cond = as_a <gcond *> (stmt);
	  tree_code code = gimple_expr_code (orig_cond);
	  tree pred = make_ssa_name (boolean_type_node);
	  gimple *asgn = gimple_build_assign (pred, code,
					      gimple_cond_lhs (orig_cond),
					      gimple_cond_rhs (orig_cond));
	  gcond *new_cond
	    = gimple_build_cond (NE_EXPR, pred, boolean_false_node,
				 gimple_cond_true_label (orig_cond),
				 gimple_cond_false_label (orig_cond));

	  gimple_stmt_iterator gsi = gsi_for_stmt (stmt);
	  gsi_insert_before (&gsi, asgn, GSI_SAME_STMT);
	  gsi_replace (&gsi, new_cond, true);

	  edge e = split_block (block, asgn);
	  block = e->dest;
	  map->get_or_insert (block) = new_cond;
	}
      else if ((gimple_code (stmt) == GIMPLE_CALL
		&& !gimple_call_internal_p (stmt))
	       || is_gimple_assign (stmt))
	{
	  /* Isolate the call/assignment into a block of its own: split
	     before it, then split again right after it.  */
	  gimple_stmt_iterator gsi = gsi_for_stmt (stmt);
	  gsi_prev (&gsi);

	  edge call = split_block (block, gsi_stmt (gsi));

	  gimple *call_stmt = gsi_stmt (gsi_start_bb (call->dest));

	  edge call_to_ret = split_block (call->dest, call_stmt);

	  map->get_or_insert (call_to_ret->src) = call_stmt;
	}
      else
	{
	  gimple_stmt_iterator gsi = gsi_for_stmt (stmt);
	  gsi_prev (&gsi);

	  if (gsi_end_p (gsi))
	    /* Stmt is already at the start of its block.  */
	    map->get_or_insert (block) = stmt;
	  else
	    {
	      /* Split block before insn.  The insn is in the new block.  */
	      edge e = split_block (block, gsi_stmt (gsi));

	      block = e->dest;
	      map->get_or_insert (block) = stmt;
	    }
	}
    }
}
|
|
|
|
/* Return a human-readable description of partitioning MASK, a bitwise
   combination of gang (bit 0), worker (bit 1) and vector (bit 2)
   partitioning.  */

static const char *
mask_name (unsigned mask)
{
  static const char *const names[] =
    {
      "gang redundant",		  /* 0 */
      "gang partitioned",	  /* 1 */
      "worker partitioned",	  /* 2 */
      "gang+worker partitioned",  /* 3 */
      "vector partitioned",	  /* 4 */
      "gang+vector partitioned",  /* 5 */
      "worker+vector partitioned",/* 6 */
      "fully partitioned"	  /* 7 */
    };

  if (mask > 7)
    return "<illegal>";

  return names[mask];
}
|
|
|
|
/* Dump this parallel and all its inner parallels. */
|
|
/* Adapted from 'gcc/config/nvptx/nvptx.cc:nvptx_dump_pars'. */
|
|
|
|
static void
|
|
omp_sese_dump_pars (parallel_g *par, unsigned depth)
|
|
{
|
|
fprintf (dump_file, "%u: mask %d (%s) head=%d, tail=%d\n",
|
|
depth, par->mask, mask_name (par->mask),
|
|
par->forked_block ? par->forked_block->index : -1,
|
|
par->join_block ? par->join_block->index : -1);
|
|
|
|
fprintf (dump_file, " blocks:");
|
|
|
|
basic_block block;
|
|
for (unsigned ix = 0; par->blocks.iterate (ix, &block); ix++)
|
|
fprintf (dump_file, " %d", block->index);
|
|
fprintf (dump_file, "\n");
|
|
if (par->inner)
|
|
omp_sese_dump_pars (par->inner, depth + 1);
|
|
|
|
if (par->next)
|
|
omp_sese_dump_pars (par->next, depth);
|
|
}
|
|
|
|
/* If BLOCK contains a fork/join marker, process it to create or
   terminate a loop structure.  Add this block to the current loop,
   and then walk successor blocks.  */
/* Adapted from 'gcc/config/nvptx/nvptx.cc:nvptx_find_par'.  */

static parallel_g *
omp_sese_find_par (bb_stmt_map_t *map, parallel_g *par, basic_block block)
{
  if (block->flags & BB_VISITED)
    return par;
  block->flags |= BB_VISITED;

  /* MAP holds the "head" statement recorded for this block by
     omp_sese_split_blocks, if any.  */
  if (gimple **stmtp = map->get (block))
    {
      gimple *stmt = *stmtp;

      if (gimple_code (stmt) == GIMPLE_COND
	  || gimple_code (stmt) == GIMPLE_SWITCH
	  || gimple_code (stmt) == GIMPLE_RETURN
	  || (gimple_code (stmt) == GIMPLE_CALL
	      && !gimple_call_internal_p (stmt))
	  || is_gimple_assign (stmt))
	{
	  /* A single block that is forced to be at the maximum partition
	     level.  Make a singleton par for it.  */
	  par = new parallel_g (par, GOMP_DIM_MASK (GOMP_DIM_GANG)
				     | GOMP_DIM_MASK (GOMP_DIM_WORKER)
				     | GOMP_DIM_MASK (GOMP_DIM_VECTOR));
	  par->forked_block = block;
	  par->forked_stmt = stmt;
	  par->blocks.safe_push (block);
	  /* The singleton is complete; pop back to the enclosing par.  */
	  par = par->parent;
	  goto walk_successors;
	}
      else if (gimple_nop_p (stmt))
	{
	  /* The NOP placeholder inserted by omp_sese_split_blocks: the
	     real fork marker is the last statement of the single
	     predecessor block.  */
	  basic_block pred = single_pred (block);
	  gcc_assert (pred);
	  gimple_stmt_iterator gsi = gsi_last_bb (pred);
	  gimple *final_stmt = gsi_stmt (gsi);

	  if (gimple_call_internal_p (final_stmt, IFN_UNIQUE))
	    {
	      gcall *call = as_a <gcall *> (final_stmt);
	      enum ifn_unique_kind k = ((enum ifn_unique_kind)
		TREE_INT_CST_LOW (gimple_call_arg (call, 0)));

	      if (k == IFN_UNIQUE_OACC_FORK)
		{
		  /* Arg 2 is the partitioned dimension; -1 encodes an
		     empty (zero) mask.  */
		  HOST_WIDE_INT dim
		    = TREE_INT_CST_LOW (gimple_call_arg (call, 2));
		  unsigned mask = (dim >= 0) ? GOMP_DIM_MASK (dim) : 0;

		  /* Open a new (child) partition.  */
		  par = new parallel_g (par, mask);
		  par->forked_block = block;
		  par->forked_stmt = final_stmt;
		  par->fork_stmt = stmt;
		}
	      else
		gcc_unreachable ();
	    }
	  else
	    gcc_unreachable ();
	}
      else if (gimple_call_internal_p (stmt, IFN_UNIQUE))
	{
	  gcall *call = as_a <gcall *> (stmt);
	  enum ifn_unique_kind k = ((enum ifn_unique_kind)
	    TREE_INT_CST_LOW (gimple_call_arg (call, 0)));
	  if (k == IFN_UNIQUE_OACC_JOIN)
	    {
	      HOST_WIDE_INT dim = TREE_INT_CST_LOW (gimple_call_arg (stmt, 2));
	      unsigned mask = (dim >= 0) ? GOMP_DIM_MASK (dim) : 0;

	      /* The join must close the partition that was opened by the
		 matching fork.  */
	      gcc_assert (par->mask == mask);
	      par->join_block = block;
	      par->join_stmt = stmt;
	      par = par->parent;
	    }
	  else
	    gcc_unreachable ();
	}
      else
	gcc_unreachable ();
    }

  if (par)
    /* Add this block onto the current loop's list of blocks.  */
    par->blocks.safe_push (block);
  else
    /* This must be the entry block.  Create a NULL parallel.  */
    par = new parallel_g (0, 0);

 walk_successors:
  /* Walk successor blocks.  */
  edge e;
  edge_iterator ei;

  FOR_EACH_EDGE (e, ei, block->succs)
    omp_sese_find_par (map, par, e->dest);

  return par;
}
|
|
|
|
/* DFS walk the CFG looking for fork & join markers. Construct
|
|
loop structures as we go. MAP is a mapping of basic blocks
|
|
to head & tail markers, discovered when splitting blocks. This
|
|
speeds up the discovery. We rely on the BB visited flag having
|
|
been cleared when splitting blocks. */
|
|
/* Adapted from 'gcc/config/nvptx/nvptx.cc:nvptx_discover_pars'. */
|
|
|
|
static parallel_g *
|
|
omp_sese_discover_pars (bb_stmt_map_t *map)
|
|
{
|
|
basic_block block;
|
|
|
|
/* Mark exit blocks as visited. */
|
|
block = EXIT_BLOCK_PTR_FOR_FN (cfun);
|
|
block->flags |= BB_VISITED;
|
|
|
|
/* And entry block as not. */
|
|
block = ENTRY_BLOCK_PTR_FOR_FN (cfun);
|
|
block->flags &= ~BB_VISITED;
|
|
|
|
parallel_g *par = omp_sese_find_par (map, 0, block);
|
|
|
|
if (dump_file)
|
|
{
|
|
fprintf (dump_file, "\nLoops\n");
|
|
omp_sese_dump_pars (par, 0);
|
|
fprintf (dump_file, "\n");
|
|
}
|
|
|
|
return par;
|
|
}
|
|
|
|
static void
|
|
populate_single_mode_bitmaps (parallel_g *par, bitmap worker_single,
|
|
bitmap vector_single, unsigned outer_mask,
|
|
int depth)
|
|
{
|
|
unsigned mask = outer_mask | par->mask;
|
|
|
|
basic_block block;
|
|
|
|
for (unsigned i = 0; par->blocks.iterate (i, &block); i++)
|
|
{
|
|
if ((mask & GOMP_DIM_MASK (GOMP_DIM_WORKER)) == 0)
|
|
bitmap_set_bit (worker_single, block->index);
|
|
|
|
if ((mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR)) == 0)
|
|
bitmap_set_bit (vector_single, block->index);
|
|
}
|
|
|
|
if (par->inner)
|
|
populate_single_mode_bitmaps (par->inner, worker_single, vector_single,
|
|
mask, depth + 1);
|
|
if (par->next)
|
|
populate_single_mode_bitmaps (par->next, worker_single, vector_single,
|
|
outer_mask, depth);
|
|
}
|
|
|
|
/* A map from SSA names or var decls to record fields.  */

typedef hash_map<tree, tree> field_map_t;

/* For each propagation record type, this is a map from SSA names or var decls
   to propagate, to the field in the record type that should be used for
   transmission and reception.  */

typedef hash_map<tree, field_map_t> record_field_map_t;
|
|
|
|
/* Create a FIELD_DECL for VAR (an SSA name or a VAR_DECL) and append it
   to RECORD_TYPE (the broadcast record), recording the VAR -> field
   mapping in FIELDS.  Each VAR may be installed at most once.  */

static void
install_var_field (tree var, tree record_type, field_map_t *fields)
{
  tree name;
  char tmp[20];

  if (TREE_CODE (var) == SSA_NAME)
    {
      name = SSA_NAME_IDENTIFIER (var);
      /* Anonymous SSA names get a synthesized "_<version>" field name.  */
      if (!name)
	{
	  sprintf (tmp, "_%u", (unsigned) SSA_NAME_VERSION (var));
	  name = get_identifier (tmp);
	}
    }
  else if (TREE_CODE (var) == VAR_DECL)
    {
      name = DECL_NAME (var);
      /* Unnamed decls get a synthesized "D_<uid>" field name.  */
      if (!name)
	{
	  sprintf (tmp, "D_%u", (unsigned) DECL_UID (var));
	  name = get_identifier (tmp);
	}
    }
  else
    gcc_unreachable ();

  /* Installing the same variable twice would create duplicate fields.  */
  gcc_assert (!fields->get (var));

  tree type = TREE_TYPE (var);

  /* Strip the "restrict" qualifier from pointer types before building
     the field.  */
  if (POINTER_TYPE_P (type)
      && TYPE_RESTRICT (type))
    type = build_qualified_type (type, TYPE_QUALS (type) & ~TYPE_QUAL_RESTRICT);

  tree field = build_decl (BUILTINS_LOCATION, FIELD_DECL, name, type);

  if (TREE_CODE (var) == VAR_DECL && type == TREE_TYPE (var))
    {
      /* The type was not modified above: carry the decl's alignment and
	 volatility over to the field.  */
      SET_DECL_ALIGN (field, DECL_ALIGN (var));
      DECL_USER_ALIGN (field) = DECL_USER_ALIGN (var);
      TREE_THIS_VOLATILE (field) = TREE_THIS_VOLATILE (var);
    }
  else
    SET_DECL_ALIGN (field, TYPE_ALIGN (type));

  fields->put (var, field);

  insert_field_into_struct (record_type, field);
}
|
|
|
|
/* Sets of SSA_NAMES or VAR_DECLs to propagate from a worker-single block
   to the other workers.  */
typedef hash_set<tree> propagation_set;
|
|
|
|
static void
|
|
find_ssa_names_to_propagate (parallel_g *par, unsigned outer_mask,
|
|
bitmap worker_single, bitmap vector_single,
|
|
vec<propagation_set *> *prop_set)
|
|
{
|
|
unsigned mask = outer_mask | par->mask;
|
|
|
|
if (par->inner)
|
|
find_ssa_names_to_propagate (par->inner, mask, worker_single,
|
|
vector_single, prop_set);
|
|
if (par->next)
|
|
find_ssa_names_to_propagate (par->next, outer_mask, worker_single,
|
|
vector_single, prop_set);
|
|
|
|
if (mask & GOMP_DIM_MASK (GOMP_DIM_WORKER))
|
|
{
|
|
basic_block block;
|
|
int ix;
|
|
|
|
for (ix = 0; par->blocks.iterate (ix, &block); ix++)
|
|
{
|
|
for (gphi_iterator psi = gsi_start_phis (block);
|
|
!gsi_end_p (psi); gsi_next (&psi))
|
|
{
|
|
gphi *phi = psi.phi ();
|
|
use_operand_p use;
|
|
ssa_op_iter iter;
|
|
|
|
FOR_EACH_PHI_ARG (use, phi, iter, SSA_OP_USE)
|
|
{
|
|
tree var = USE_FROM_PTR (use);
|
|
|
|
if (TREE_CODE (var) != SSA_NAME)
|
|
continue;
|
|
|
|
gimple *def_stmt = SSA_NAME_DEF_STMT (var);
|
|
|
|
if (gimple_nop_p (def_stmt))
|
|
continue;
|
|
|
|
basic_block def_bb = gimple_bb (def_stmt);
|
|
|
|
if (bitmap_bit_p (worker_single, def_bb->index))
|
|
{
|
|
if (!(*prop_set)[def_bb->index])
|
|
(*prop_set)[def_bb->index] = new propagation_set;
|
|
|
|
propagation_set *ws_prop = (*prop_set)[def_bb->index];
|
|
|
|
ws_prop->add (var);
|
|
}
|
|
}
|
|
}
|
|
|
|
for (gimple_stmt_iterator gsi = gsi_start_bb (block);
|
|
!gsi_end_p (gsi); gsi_next (&gsi))
|
|
{
|
|
use_operand_p use;
|
|
ssa_op_iter iter;
|
|
gimple *stmt = gsi_stmt (gsi);
|
|
|
|
FOR_EACH_SSA_USE_OPERAND (use, stmt, iter, SSA_OP_USE)
|
|
{
|
|
tree var = USE_FROM_PTR (use);
|
|
|
|
gimple *def_stmt = SSA_NAME_DEF_STMT (var);
|
|
|
|
if (gimple_nop_p (def_stmt))
|
|
continue;
|
|
|
|
basic_block def_bb = gimple_bb (def_stmt);
|
|
|
|
if (bitmap_bit_p (worker_single, def_bb->index))
|
|
{
|
|
if (!(*prop_set)[def_bb->index])
|
|
(*prop_set)[def_bb->index] = new propagation_set;
|
|
|
|
propagation_set *ws_prop = (*prop_set)[def_bb->index];
|
|
|
|
ws_prop->add (var);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
/* Callback for walk_gimple_stmt to find RHS VAR_DECLs (uses) in a
|
|
statement. */
|
|
|
|
static tree
|
|
find_partitioned_var_uses_1 (tree *node, int *, void *data)
|
|
{
|
|
walk_stmt_info *wi = (walk_stmt_info *) data;
|
|
hash_set<tree> *partitioned_var_uses = (hash_set<tree> *) wi->info;
|
|
|
|
if (!wi->is_lhs && VAR_P (*node))
|
|
partitioned_var_uses->add (*node);
|
|
|
|
return NULL_TREE;
|
|
}
|
|
|
|
static void
|
|
find_partitioned_var_uses (parallel_g *par, unsigned outer_mask,
|
|
hash_set<tree> *partitioned_var_uses)
|
|
{
|
|
unsigned mask = outer_mask | par->mask;
|
|
|
|
if (par->inner)
|
|
find_partitioned_var_uses (par->inner, mask, partitioned_var_uses);
|
|
if (par->next)
|
|
find_partitioned_var_uses (par->next, outer_mask, partitioned_var_uses);
|
|
|
|
if (mask & GOMP_DIM_MASK (GOMP_DIM_WORKER))
|
|
{
|
|
basic_block block;
|
|
int ix;
|
|
|
|
for (ix = 0; par->blocks.iterate (ix, &block); ix++)
|
|
for (gimple_stmt_iterator gsi = gsi_start_bb (block);
|
|
!gsi_end_p (gsi); gsi_next (&gsi))
|
|
{
|
|
walk_stmt_info wi;
|
|
memset (&wi, 0, sizeof (wi));
|
|
wi.info = (void *) partitioned_var_uses;
|
|
walk_gimple_stmt (&gsi, NULL, find_partitioned_var_uses_1, &wi);
|
|
}
|
|
}
|
|
}
|
|
|
|
/* Gang-private variables (typically placed in a GPU's shared memory) do not
   need to be processed by the worker-propagation mechanism.  Populate the
   GANG_PRIVATE_VARS set with any such variables found in the current
   function.  */

static void
find_gang_private_vars (hash_set<tree> *gang_private_vars)
{
  basic_block block;

  FOR_EACH_BB_FN (block, cfun)
    {
      for (gimple_stmt_iterator gsi = gsi_start_bb (block);
	   !gsi_end_p (gsi);
	   gsi_next (&gsi))
	{
	  gimple *stmt = gsi_stmt (gsi);

	  if (gimple_call_internal_p (stmt, IFN_UNIQUE))
	    {
	      enum ifn_unique_kind k = ((enum ifn_unique_kind)
		TREE_INT_CST_LOW (gimple_call_arg (stmt, 0)));
	      if (k == IFN_UNIQUE_OACC_PRIVATE)
		{
		  /* Arg 2 of the PRIVATE marker is the partitioning level;
		     only gang-level privatization matters here.  */
		  HOST_WIDE_INT level
		    = TREE_INT_CST_LOW (gimple_call_arg (stmt, 2));
		  if (level != GOMP_DIM_GANG)
		    continue;
		  /* Args 3.. are the addresses of the privatized decls.  */
		  for (unsigned i = 3; i < gimple_call_num_args (stmt); i++)
		    {
		      tree arg = gimple_call_arg (stmt, i);
		      gcc_assert (TREE_CODE (arg) == ADDR_EXPR);
		      tree decl = TREE_OPERAND (arg, 0);
		      gang_private_vars->add (decl);
		    }
		}
	    }
	}
    }
}
|
|
|
|
/* Walk the parallel tree rooted at PAR (accumulated partitioning
   OUTER_MASK) and, in every block executed in worker-single mode, look
   for statements that may clobber a local, non-aggregate variable that
   is also used in worker-partitioned code (PARTITIONED_VAR_USES).  Such
   variables must be broadcast to the other workers: record them in the
   clobbering block's propagation set in PROP_SET.  Writes to
   gang-private variables (GANG_PRIVATE_VARS) need no broadcast, only a
   barrier: mark the block in WRITES_GANG_PRIVATE instead.  */

static void
find_local_vars_to_propagate (parallel_g *par, unsigned outer_mask,
			      hash_set<tree> *partitioned_var_uses,
			      hash_set<tree> *gang_private_vars,
			      bitmap writes_gang_private,
			      vec<propagation_set *> *prop_set)
{
  unsigned mask = outer_mask | par->mask;

  if (par->inner)
    find_local_vars_to_propagate (par->inner, mask, partitioned_var_uses,
				  gang_private_vars, writes_gang_private,
				  prop_set);
  if (par->next)
    find_local_vars_to_propagate (par->next, outer_mask, partitioned_var_uses,
				  gang_private_vars, writes_gang_private,
				  prop_set);

  /* Only worker-single blocks are of interest here.  */
  if (!(mask & GOMP_DIM_MASK (GOMP_DIM_WORKER)))
    {
      basic_block block;
      int ix;

      for (ix = 0; par->blocks.iterate (ix, &block); ix++)
	{
	  for (gimple_stmt_iterator gsi = gsi_start_bb (block);
	       !gsi_end_p (gsi); gsi_next (&gsi))
	    {
	      gimple *stmt = gsi_stmt (gsi);
	      tree var;
	      unsigned i;

	      /* NOTE(review): this tests every local decl against every
		 statement, i.e. O(decls * stmts) alias queries.  */
	      FOR_EACH_LOCAL_DECL (cfun, i, var)
		{
		  if (!VAR_P (var)
		      || is_global_var (var)
		      || AGGREGATE_TYPE_P (TREE_TYPE (var))
		      || !partitioned_var_uses->contains (var))
		    continue;

		  if (stmt_may_clobber_ref_p (stmt, var))
		    {
		      if (dump_file)
			{
			  fprintf (dump_file, "bb %u: local variable may be "
				   "clobbered in %s mode: ", block->index,
				   mask_name (mask));
			  print_generic_expr (dump_file, var, TDF_SLIM);
			  fprintf (dump_file, "\n");
			}

		      if (gang_private_vars->contains (var))
			{
			  /* If we write a gang-private variable, we want a
			     barrier at the end of the block.  */
			  bitmap_set_bit (writes_gang_private, block->index);
			  continue;
			}

		      /* Lazily allocate the block's propagation set.  */
		      if (!(*prop_set)[block->index])
			(*prop_set)[block->index] = new propagation_set;

		      propagation_set *ws_prop
			= (*prop_set)[block->index];

		      ws_prop->add (var);
		    }
		}
	    }
	}
    }
}
|
|
|
|
/* Transform basic blocks FROM, TO (which may be the same block) into:
   if (GOACC_single_start ())
     BLOCK;
   GOACC_barrier ();
			      \  |  /
			      +----+
			      |    |        (new) predicate block
			      +----+--
   \  |  /   \  |  /	        |t \
   +----+    +----+	      +----+  |
   |    |    |    |	===>  |    |  | f   (old) from block
   +----+    +----+	      +----+  |
     |       t/  \f	        |    /
			      +----+/
  (split  (split before       |    |        skip block
  at end)   condition)	      +----+
			        t/  \f

   Values defined in the neutered region that escape it are merged back
   via new PHI nodes in the skip block, with a zero placeholder on the
   neutered path.  */

static void
worker_single_simple (basic_block from, basic_block to,
		      hash_set<tree> *def_escapes_block)
{
  gimple *call, *cond;
  tree lhs, decl;
  basic_block skip_block;

  gimple_stmt_iterator gsi = gsi_last_bb (to);
  if (EDGE_COUNT (to->succs) > 1)
    {
      /* TO ends in a branch: split before the GIMPLE_COND so the branch
	 stays outside the neutered region.  */
      gcc_assert (gimple_code (gsi_stmt (gsi)) == GIMPLE_COND);
      gsi_prev (&gsi);
    }
  edge e = split_block (to, gsi_stmt (gsi));
  skip_block = e->dest;

  gimple_stmt_iterator start = gsi_after_labels (from);

  /* Emit "lhs = GOACC_single_start ()" at the start of FROM.  */
  decl = builtin_decl_explicit (BUILT_IN_GOACC_SINGLE_START);
  lhs = create_tmp_var (TREE_TYPE (TREE_TYPE (decl)));
  call = gimple_build_call (decl, 0);
  gimple_call_set_lhs (call, lhs);
  gsi_insert_before (&start, call, GSI_NEW_STMT);
  update_stmt (call);

  /* Branch on the result: only the single active worker takes the
     true edge through the region.  */
  cond = gimple_build_cond (EQ_EXPR, lhs,
			    fold_convert_loc (UNKNOWN_LOCATION,
					      TREE_TYPE (lhs),
					      boolean_true_node),
			    NULL_TREE, NULL_TREE);
  gsi_insert_after (&start, cond, GSI_NEW_STMT);
  update_stmt (cond);

  edge et = split_block (from, cond);
  et->flags &= ~EDGE_FALLTHRU;
  et->flags |= EDGE_TRUE_VALUE;
  /* Make the active worker the more probable path so we prefer fallthrough
     (letting the idle workers jump around more).  */
  et->probability = profile_probability::likely ();

  edge ef = make_edge (from, skip_block, EDGE_FALSE_VALUE);
  ef->probability = et->probability.invert ();

  /* Insert a block on the skip edge to host the placeholder
     assignments for escaping defs.  */
  basic_block neutered = split_edge (ef);
  gimple_stmt_iterator neut_gsi = gsi_last_bb (neutered);

  for (gsi = gsi_start_bb (et->dest); !gsi_end_p (gsi); gsi_next (&gsi))
    {
      gimple *stmt = gsi_stmt (gsi);
      ssa_op_iter iter;
      tree var;

      FOR_EACH_SSA_TREE_OPERAND (var, stmt, iter, SSA_OP_DEF)
	{
	  if (def_escapes_block->contains (var))
	    {
	      /* Merge the active-worker value and the neutered
		 placeholder in a PHI at the skip block.  */
	      gphi *join_phi = create_phi_node (NULL_TREE, skip_block);
	      create_new_def_for (var, join_phi,
				  gimple_phi_result_ptr (join_phi));
	      add_phi_arg (join_phi, var, e, UNKNOWN_LOCATION);

	      tree neutered_def = copy_ssa_name (var, NULL);
	      /* We really want "don't care" or some value representing
		 undefined here, but optimizers will probably get rid of the
		 zero-assignments anyway.  */
	      gassign *zero = gimple_build_assign (neutered_def,
				build_zero_cst (TREE_TYPE (neutered_def)));

	      gsi_insert_after (&neut_gsi, zero, GSI_CONTINUE_LINKING);
	      update_stmt (zero);

	      add_phi_arg (join_phi, neutered_def, single_succ_edge (neutered),
			   UNKNOWN_LOCATION);
	      update_stmt (join_phi);
	    }
	}
    }
}
|
|
|
|
/* Build "(*RECEIVER_DECL).field" where field is the record field mapped
   to VAR in FIELDS, for reading a broadcast value.  */

static tree
build_receiver_ref (tree var, tree receiver_decl, field_map_t *fields)
{
  tree ref = build_simple_mem_ref (receiver_decl);
  /* The receiver pointer is known valid; the dereference cannot trap.  */
  TREE_THIS_NOTRAP (ref) = 1;
  return omp_build_component_ref (ref, *fields->get (var));
}
|
|
|
|
/* Build a reference to the record field mapped to VAR in FIELDS within
   SENDER_DECL, dereferencing the latter first if it is a pointer.  */

static tree
build_sender_ref (tree var, tree sender_decl, field_map_t *fields)
{
  tree base = sender_decl;
  if (POINTER_TYPE_P (TREE_TYPE (base)))
    base = build_simple_mem_ref (base);

  tree field = *fields->get (var);
  return omp_build_component_ref (base, field);
}
|
|
|
|
static int
|
|
sort_by_ssa_version_or_uid (const void *p1, const void *p2)
|
|
{
|
|
const tree t1 = *(const tree *)p1;
|
|
const tree t2 = *(const tree *)p2;
|
|
|
|
if (TREE_CODE (t1) == SSA_NAME && TREE_CODE (t2) == SSA_NAME)
|
|
return SSA_NAME_VERSION (t1) - SSA_NAME_VERSION (t2);
|
|
else if (TREE_CODE (t1) == SSA_NAME && TREE_CODE (t2) != SSA_NAME)
|
|
return -1;
|
|
else if (TREE_CODE (t1) != SSA_NAME && TREE_CODE (t2) == SSA_NAME)
|
|
return 1;
|
|
else
|
|
return DECL_UID (t1) - DECL_UID (t2);
|
|
}
|
|
|
|
static int
|
|
sort_by_size_then_ssa_version_or_uid (const void *p1, const void *p2)
|
|
{
|
|
const tree t1 = *(const tree *)p1;
|
|
const tree t2 = *(const tree *)p2;
|
|
unsigned HOST_WIDE_INT s1 = tree_to_uhwi (TYPE_SIZE (TREE_TYPE (t1)));
|
|
unsigned HOST_WIDE_INT s2 = tree_to_uhwi (TYPE_SIZE (TREE_TYPE (t2)));
|
|
if (s1 != s2)
|
|
return s2 - s1;
|
|
else
|
|
return sort_by_ssa_version_or_uid (p1, p2);
|
|
}
|
|
|
|
/* Predicate the single-block region FROM..TO (the callers pass the same
   block for both) so that only the worker returned non-NULL by
   GOACC_single_copy_start executes it, and broadcast the region's escaping
   definitions to the idle workers through a shared-memory staging record.

   DEF_ESCAPES_BLOCK: definitions live out of the block.
   WORKER_PARTITIONED_USES: those also used in worker-partitioned code and
     hence needing broadcast.
   RECORD_TYPE: layout of the staging buffer (keys RECORD_FIELD_MAP); may be
     NULL when only virtual defs exist.
   PLACEMENT: byte offset of the staging buffer in shared memory.
   ISOLATE_BROADCASTS: the shared-memory range was not conflict-free, so
     bracket the broadcast with barriers.
   HAS_GANG_PRIVATE_WRITE: the block writes gang-private storage, forcing a
     barrier before idle workers proceed.  */

static void
worker_single_copy (basic_block from, basic_block to,
		    hash_set<tree> *def_escapes_block,
		    hash_set<tree> *worker_partitioned_uses,
		    tree record_type, record_field_map_t *record_field_map,
		    unsigned HOST_WIDE_INT placement,
		    bool isolate_broadcasts, bool has_gang_private_write)
{
  /* If we only have virtual defs, we'll have no record type, but we still want
     to emit single_copy_start and (particularly) single_copy_end to act as
     a vdef source on the neutered edge representing memory writes on the
     non-neutered edge.  */
  if (!record_type)
    record_type = char_type_node;

  tree sender_decl
    = targetm.goacc.create_worker_broadcast_record (record_type, true,
						    ".oacc_worker_o",
						    placement);
  tree receiver_decl
    = targetm.goacc.create_worker_broadcast_record (record_type, false,
						    ".oacc_worker_i",
						    placement);

  /* Split TO so its tail becomes the barrier block; if TO ends in a
     control-flow statement, split before that statement.  */
  gimple_stmt_iterator gsi = gsi_last_bb (to);
  if (EDGE_COUNT (to->succs) > 1)
    gsi_prev (&gsi);
  edge e = split_block (to, gsi_stmt (gsi));
  basic_block barrier_block = e->dest;

  gimple_stmt_iterator start = gsi_after_labels (from);

  tree decl = builtin_decl_explicit (BUILT_IN_GOACC_SINGLE_COPY_START);

  tree lhs = create_tmp_var (TREE_TYPE (TREE_TYPE (decl)));

  /* LHS is NULL for idle workers, non-NULL (the buffer) for the one active
     worker.  */
  gimple *call
    = gimple_build_call (decl, 1,
			 POINTER_TYPE_P (TREE_TYPE (sender_decl))
			 ? sender_decl : build_fold_addr_expr (sender_decl));
  gimple_call_set_lhs (call, lhs);
  gsi_insert_before (&start, call, GSI_NEW_STMT);
  update_stmt (call);

  /* The shared-memory range for this block overflowed.  Add a barrier before
     the GOACC_single_copy_start call.  */
  if (isolate_broadcasts)
    {
      decl = builtin_decl_explicit (BUILT_IN_GOACC_BARRIER);
      gimple *acc_bar = gimple_build_call (decl, 0);
      gsi_insert_before (&start, acc_bar, GSI_SAME_STMT);
    }

  tree conv_tmp = make_ssa_name (TREE_TYPE (receiver_decl));

  gimple *conv = gimple_build_assign (conv_tmp,
				      fold_convert (TREE_TYPE (receiver_decl),
						    lhs));
  update_stmt (conv);
  gsi_insert_after (&start, conv, GSI_NEW_STMT);
  gimple *asgn = gimple_build_assign (receiver_decl, conv_tmp);
  gsi_insert_after (&start, asgn, GSI_NEW_STMT);
  update_stmt (asgn);

  tree zero_ptr = build_int_cst (TREE_TYPE (receiver_decl), 0);

  tree recv_tmp = make_ssa_name (TREE_TYPE (receiver_decl));
  asgn = gimple_build_assign (recv_tmp, receiver_decl);
  gsi_insert_after (&start, asgn, GSI_NEW_STMT);
  update_stmt (asgn);

  /* Branch: NULL pointer means this worker is idle and skips the body.  */
  gimple *cond = gimple_build_cond (EQ_EXPR, recv_tmp, zero_ptr, NULL_TREE,
				    NULL_TREE);
  update_stmt (cond);

  gsi_insert_after (&start, cond, GSI_NEW_STMT);

  edge et = split_block (from, cond);
  et->flags &= ~EDGE_FALLTHRU;
  et->flags |= EDGE_TRUE_VALUE;
  /* Make the active worker the more probable path so we prefer fallthrough
     (letting the idle workers jump around more).  */
  et->probability = profile_probability::likely ();

  basic_block body = et->dest;

  /* Idle workers jump straight to the barrier block.  */
  edge ef = make_edge (from, barrier_block, EDGE_FALSE_VALUE);
  ef->probability = et->probability.invert ();

  gimple_stmt_iterator bar_gsi = gsi_start_bb (barrier_block);
  cond = gimple_build_cond (NE_EXPR, recv_tmp, zero_ptr, NULL_TREE, NULL_TREE);

  /* Only emit a barrier when something was actually broadcast (a real
     record type) or gang-private state was written.  */
  if (record_type != char_type_node || has_gang_private_write)
    {
      decl = builtin_decl_explicit (BUILT_IN_GOACC_BARRIER);
      gimple *acc_bar = gimple_build_call (decl, 0);

      gsi_insert_before (&bar_gsi, acc_bar, GSI_NEW_STMT);
      gsi_insert_after (&bar_gsi, cond, GSI_NEW_STMT);
    }
  else
    gsi_insert_before (&bar_gsi, cond, GSI_NEW_STMT);

  /* After the barrier, the active worker (non-NULL) skips the copy-out;
     idle workers fall through to it.  */
  edge et2 = split_block (barrier_block, cond);
  et2->flags &= ~EDGE_FALLTHRU;
  et2->flags |= EDGE_TRUE_VALUE;
  et2->probability = profile_probability::unlikely ();

  basic_block exit_block = et2->dest;

  basic_block copyout_block = split_edge (et2);
  edge ef2 = make_edge (barrier_block, exit_block, EDGE_FALSE_VALUE);
  ef2->probability = et2->probability.invert ();

  gimple_stmt_iterator copyout_gsi = gsi_start_bb (copyout_block);

  edge copyout_to_exit = single_succ_edge (copyout_block);

  gimple_seq sender_seq = NULL;

  /* Make sure we iterate over definitions in a stable order.  */
  auto_vec<tree> escape_vec (def_escapes_block->elements ());
  for (hash_set<tree>::iterator it = def_escapes_block->begin ();
       it != def_escapes_block->end (); ++it)
    escape_vec.quick_push (*it);
  escape_vec.qsort (sort_by_ssa_version_or_uid);

  for (unsigned i = 0; i < escape_vec.length (); i++)
    {
      tree var = escape_vec[i];

      if (TREE_CODE (var) == SSA_NAME && SSA_NAME_IS_VIRTUAL_OPERAND (var))
	continue;

      tree barrier_def = 0;

      if (TREE_CODE (var) == SSA_NAME)
	{
	  gimple *def_stmt = SSA_NAME_DEF_STMT (var);

	  if (gimple_nop_p (def_stmt))
	    continue;

	  /* The barrier phi takes one result from the actual work of the
	     block we're neutering, and the other result is constant zero of
	     the same type.  */

	  gphi *barrier_phi = create_phi_node (NULL_TREE, barrier_block);
	  barrier_def = create_new_def_for (var, barrier_phi,
					    gimple_phi_result_ptr (barrier_phi));

	  add_phi_arg (barrier_phi, var, e, UNKNOWN_LOCATION);
	  add_phi_arg (barrier_phi, build_zero_cst (TREE_TYPE (var)), ef,
		       UNKNOWN_LOCATION);

	  update_stmt (barrier_phi);
	}
      else
	gcc_assert (TREE_CODE (var) == VAR_DECL);

      /* If we had no record type, we will have no fields map.  */
      field_map_t *fields = record_field_map->get (record_type);

      if (worker_partitioned_uses->contains (var)
	  && fields
	  && fields->get (var))
	{
	  tree neutered_def = make_ssa_name (TREE_TYPE (var));

	  /* Receive definition from shared memory block.  */

	  tree receiver_ref = build_receiver_ref (var, receiver_decl, fields);
	  gassign *recv = gimple_build_assign (neutered_def,
					       receiver_ref);
	  gsi_insert_after (&copyout_gsi, recv, GSI_CONTINUE_LINKING);
	  update_stmt (recv);

	  if (TREE_CODE (var) == VAR_DECL)
	    {
	      /* If it's a VAR_DECL, we only copied to an SSA temporary.  Copy
		 to the final location now.  */
	      gassign *asgn = gimple_build_assign (var, neutered_def);
	      gsi_insert_after (&copyout_gsi, asgn, GSI_CONTINUE_LINKING);
	      update_stmt (asgn);
	    }
	  else
	    {
	      /* If it's an SSA name, create a new phi at the join node to
		 represent either the output from the active worker (the
		 barrier) or the inactive workers (the copyout block).  */
	      gphi *join_phi = create_phi_node (NULL_TREE, exit_block);
	      create_new_def_for (barrier_def, join_phi,
				  gimple_phi_result_ptr (join_phi));
	      add_phi_arg (join_phi, barrier_def, ef2, UNKNOWN_LOCATION);
	      add_phi_arg (join_phi, neutered_def, copyout_to_exit,
			   UNKNOWN_LOCATION);
	      update_stmt (join_phi);
	    }

	  /* Send definition to shared memory block.  */

	  tree sender_ref = build_sender_ref (var, sender_decl, fields);

	  if (TREE_CODE (var) == SSA_NAME)
	    {
	      gassign *send = gimple_build_assign (sender_ref, var);
	      gimple_seq_add_stmt (&sender_seq, send);
	      update_stmt (send);
	    }
	  else if (TREE_CODE (var) == VAR_DECL)
	    {
	      tree tmp = make_ssa_name (TREE_TYPE (var));
	      gassign *send = gimple_build_assign (tmp, var);
	      gimple_seq_add_stmt (&sender_seq, send);
	      update_stmt (send);
	      send = gimple_build_assign (sender_ref, tmp);
	      gimple_seq_add_stmt (&sender_seq, send);
	      update_stmt (send);
	    }
	  else
	    gcc_unreachable ();
	}
    }

  /* The shared-memory range for this block overflowed.  Add a barrier at the
     end.  */
  if (isolate_broadcasts)
    {
      gsi = gsi_start_bb (exit_block);
      decl = builtin_decl_explicit (BUILT_IN_GOACC_BARRIER);
      gimple *acc_bar = gimple_build_call (decl, 0);
      gsi_insert_before (&gsi, acc_bar, GSI_SAME_STMT);
    }

  /* It's possible for the ET->DEST block (the work done by the active thread)
     to finish with a control-flow insn, e.g. a UNIQUE function call.  Split
     the block and add SENDER_SEQ in the latter part to avoid having control
     flow in the middle of a BB.  */

  decl = builtin_decl_explicit (BUILT_IN_GOACC_SINGLE_COPY_END);
  call = gimple_build_call (decl, 1,
			    POINTER_TYPE_P (TREE_TYPE (sender_decl))
			    ? sender_decl
			    : build_fold_addr_expr (sender_decl));
  gimple_seq_add_stmt (&sender_seq, call);

  gsi = gsi_last_bb (body);
  gimple *last = gsi_stmt (gsi);
  basic_block sender_block = split_block (body, last)->dest;
  gsi = gsi_last_bb (sender_block);
  gsi_insert_seq_after (&gsi, sender_seq, GSI_CONTINUE_LINKING);
}
|
|
|
|
/* Per-block shared-memory placement: the byte offset assigned to the
   block's broadcast record, plus a flag that is true when a conflict-free
   range was actually allocated (false means the fallback base offset is
   used and the broadcast must be isolated with barriers).  */
typedef hash_map<basic_block, std::pair<unsigned HOST_WIDE_INT, bool> >
  blk_offset_map_t;
|
|
|
|
/* Walk the parallel-region tree PAR (siblings via ->next, nested regions
   via ->inner), predicating each worker-single block so only one worker
   executes it, and broadcasting escaping definitions to the others.

   OUTER_MASK: partitioning mask of the enclosing region; combined with
     PAR->mask to decide whether this region is worker-single.
   WORKER_SINGLE / VECTOR_SINGLE: bitmaps of block indices executing in
     worker-/vector-single mode.
   PROP_SET: per-block sets of decls to propagate; consumed (deleted and
     nulled) here.
   PARTITIONED_VAR_USES: decls used in partitioned code.
   RECORD_FIELD_MAP: record type -> field map for staging buffers.
   BLK_OFFSET_MAP: shared-memory placement per block (see blk_offset_map_t).
   WRITES_GANG_PRIVATE: blocks that write gang-private storage.  */

static void
neuter_worker_single (parallel_g *par, unsigned outer_mask,
		      bitmap worker_single, bitmap vector_single,
		      vec<propagation_set *> *prop_set,
		      hash_set<tree> *partitioned_var_uses,
		      record_field_map_t *record_field_map,
		      blk_offset_map_t *blk_offset_map,
		      bitmap writes_gang_private)
{
  unsigned mask = outer_mask | par->mask;

  if ((mask & GOMP_DIM_MASK (GOMP_DIM_WORKER)) == 0)
    {
      basic_block block;

      for (unsigned i = 0; par->blocks.iterate (i, &block); i++)
	{
	  bool has_defs = false;
	  hash_set<tree> def_escapes_block;
	  hash_set<tree> worker_partitioned_uses;
	  unsigned j;
	  tree var;

	  /* Collect SSA names defined in BLOCK that escape it or are used
	     in worker-partitioned code.  */
	  FOR_EACH_SSA_NAME (j, var, cfun)
	    {
	      if (SSA_NAME_IS_VIRTUAL_OPERAND (var))
		{
		  /* Virtual defs still force single_copy_start/end emission
		     so memory side effects are represented.  */
		  has_defs = true;
		  continue;
		}

	      gimple *def_stmt = SSA_NAME_DEF_STMT (var);

	      if (gimple_nop_p (def_stmt))
		continue;

	      if (gimple_bb (def_stmt)->index != block->index)
		continue;

	      gimple *use_stmt;
	      imm_use_iterator use_iter;
	      bool uses_outside_block = false;
	      bool worker_partitioned_use = false;

	      FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, var)
		{
		  int blocknum = gimple_bb (use_stmt)->index;

		  /* Don't propagate SSA names that are only used in the
		     current block, unless the usage is in a phi node: that
		     means the name left the block, then came back in at the
		     top.  */
		  if (blocknum != block->index
		      || gimple_code (use_stmt) == GIMPLE_PHI)
		    uses_outside_block = true;
		  if (!bitmap_bit_p (worker_single, blocknum))
		    worker_partitioned_use = true;
		}

	      if (uses_outside_block)
		def_escapes_block.add (var);

	      if (worker_partitioned_use)
		{
		  worker_partitioned_uses.add (var);
		  has_defs = true;
		}
	    }

	  /* Fold in the precomputed per-block propagation set of decls,
	     then release it (the caller asserts all sets are consumed).  */
	  propagation_set *ws_prop = (*prop_set)[block->index];

	  if (ws_prop)
	    {
	      for (propagation_set::iterator it = ws_prop->begin ();
		   it != ws_prop->end ();
		   ++it)
		{
		  tree var = *it;
		  if (TREE_CODE (var) == VAR_DECL)
		    {
		      def_escapes_block.add (var);
		      if (partitioned_var_uses->contains (var))
			{
			  worker_partitioned_uses.add (var);
			  has_defs = true;
			}
		    }
		}

	      delete ws_prop;
	      (*prop_set)[block->index] = 0;
	    }

	  /* Scan the block to see if it contains only marker functions
	     that will be removed later, in which case predication can be
	     skipped entirely.  */
	  bool only_marker_fns = true;
	  bool join_block = false;

	  for (gimple_stmt_iterator gsi = gsi_start_bb (block);
	       !gsi_end_p (gsi);
	       gsi_next (&gsi))
	    {
	      gimple *stmt = gsi_stmt (gsi);
	      if (gimple_code (stmt) == GIMPLE_CALL
		  && gimple_call_internal_p (stmt, IFN_UNIQUE))
		{
		  enum ifn_unique_kind k = ((enum ifn_unique_kind)
		    TREE_INT_CST_LOW (gimple_call_arg (stmt, 0)));
		  if (k != IFN_UNIQUE_OACC_PRIVATE
		      && k != IFN_UNIQUE_OACC_JOIN
		      && k != IFN_UNIQUE_OACC_FORK
		      && k != IFN_UNIQUE_OACC_HEAD_MARK
		      && k != IFN_UNIQUE_OACC_TAIL_MARK)
		    only_marker_fns = false;
		  else if (k == IFN_UNIQUE_OACC_JOIN)
		    /* The JOIN marker is special in that it *cannot* be
		       predicated for worker zero, because it may be lowered
		       to a barrier instruction and all workers must typically
		       execute that barrier.  We shouldn't be doing any
		       broadcasts from the join block anyway.  */
		    join_block = true;
		}
	      else if (gimple_code (stmt) == GIMPLE_CALL
		       && gimple_call_internal_p (stmt, IFN_GOACC_LOOP))
		/* Empty.  */;
	      else if (gimple_nop_p (stmt))
		/* Empty.  */;
	      else
		only_marker_fns = false;
	    }

	  /* We can skip predicating this block for worker zero if the only
	     thing it contains is marker functions that will be removed in the
	     oaccdevlow pass anyway.
	     Don't do this if the block has (any) phi nodes, because those
	     might define SSA names that need broadcasting.
	     TODO: We might be able to skip transforming blocks that only
	     contain some other trivial statements too.  */
	  if (only_marker_fns && !phi_nodes (block))
	    continue;

	  gcc_assert (!join_block);

	  if (has_defs)
	    {
	      /* Definitions escape: predicate the block and broadcast via
		 shared memory.  */
	      tree record_type = (tree) block->aux;
	      std::pair<unsigned HOST_WIDE_INT, bool> *off_rngalloc
		= blk_offset_map->get (block);
	      gcc_assert (!record_type || off_rngalloc);
	      unsigned HOST_WIDE_INT offset
		= off_rngalloc ? off_rngalloc->first : 0;
	      bool range_allocated
		= off_rngalloc ? off_rngalloc->second : true;
	      bool has_gang_private_write
		= bitmap_bit_p (writes_gang_private, block->index);
	      worker_single_copy (block, block, &def_escapes_block,
				  &worker_partitioned_uses, record_type,
				  record_field_map,
				  offset, !range_allocated,
				  has_gang_private_write);
	    }
	  else
	    /* Nothing to broadcast: simple predication suffices.  */
	    worker_single_simple (block, block, &def_escapes_block);
	}
    }

  if ((outer_mask & GOMP_DIM_MASK (GOMP_DIM_WORKER)) == 0)
    {
      basic_block block;

      for (unsigned i = 0; par->blocks.iterate (i, &block); i++)
	for (gimple_stmt_iterator gsi = gsi_start_bb (block);
	     !gsi_end_p (gsi);
	     gsi_next (&gsi))
	  {
	    gimple *stmt = gsi_stmt (gsi);

	    if (gimple_code (stmt) == GIMPLE_CALL
		&& !gimple_call_internal_p (stmt)
		&& !omp_sese_active_worker_call (as_a <gcall *> (stmt)))
	      {
		/* If we have an OpenACC routine call in worker-single mode,
		   place barriers before and afterwards to prevent
		   clobbering re-used shared memory regions (as are used
		   for AMDGCN at present, for example).  */
		tree decl = builtin_decl_explicit (BUILT_IN_GOACC_BARRIER);
		gsi_insert_before (&gsi, gimple_build_call (decl, 0),
				   GSI_SAME_STMT);
		gsi_insert_after (&gsi, gimple_build_call (decl, 0),
				  GSI_NEW_STMT);
	      }
	  }
    }

  /* Recurse into nested regions (with the combined mask) and iterate over
     sibling regions (with the caller's mask).  */
  if (par->inner)
    neuter_worker_single (par->inner, mask, worker_single, vector_single,
			  prop_set, partitioned_var_uses, record_field_map,
			  blk_offset_map, writes_gang_private);
  if (par->next)
    neuter_worker_single (par->next, outer_mask, worker_single, vector_single,
			  prop_set, partitioned_var_uses, record_field_map,
			  blk_offset_map, writes_gang_private);
}
|
|
|
|
static void
|
|
dfs_broadcast_reachable_1 (basic_block bb, sbitmap reachable)
|
|
{
|
|
if (bb->flags & BB_VISITED)
|
|
return;
|
|
|
|
bb->flags |= BB_VISITED;
|
|
|
|
if (bb->succs)
|
|
{
|
|
edge e;
|
|
edge_iterator ei;
|
|
FOR_EACH_EDGE (e, ei, bb->succs)
|
|
{
|
|
basic_block dest = e->dest;
|
|
if (dest->aux)
|
|
bitmap_set_bit (reachable, dest->index);
|
|
else
|
|
dfs_broadcast_reachable_1 (dest, reachable);
|
|
}
|
|
}
|
|
}
|
|
|
|
/* Pair of (basic-block index, broadcast record type).  */
typedef std::pair<int, tree> idx_decl_pair_t;

/* One splay tree of occupied shared-memory address ranges per block.  */
typedef auto_vec<splay_tree> used_range_vec_t;
|
|
|
|
static int
|
|
sort_size_descending (const void *a, const void *b)
|
|
{
|
|
const idx_decl_pair_t *pa = (const idx_decl_pair_t *) a;
|
|
const idx_decl_pair_t *pb = (const idx_decl_pair_t *) b;
|
|
unsigned HOST_WIDE_INT asize = tree_to_uhwi (TYPE_SIZE_UNIT (pa->second));
|
|
unsigned HOST_WIDE_INT bsize = tree_to_uhwi (TYPE_SIZE_UNIT (pb->second));
|
|
return bsize - asize;
|
|
}
|
|
|
|
class addr_range
|
|
{
|
|
public:
|
|
addr_range (unsigned HOST_WIDE_INT addr_lo, unsigned HOST_WIDE_INT addr_hi)
|
|
: lo (addr_lo), hi (addr_hi)
|
|
{ }
|
|
addr_range (const addr_range &ar) : lo (ar.lo), hi (ar.hi)
|
|
{ }
|
|
addr_range () : lo (0), hi (0)
|
|
{ }
|
|
|
|
bool invalid () { return lo == 0 && hi == 0; }
|
|
|
|
unsigned HOST_WIDE_INT lo;
|
|
unsigned HOST_WIDE_INT hi;
|
|
};
|
|
|
|
static int
|
|
splay_tree_compare_addr_range (splay_tree_key a, splay_tree_key b)
|
|
{
|
|
addr_range *ar = (addr_range *) a;
|
|
addr_range *br = (addr_range *) b;
|
|
if (ar->lo == br->lo && ar->hi == br->hi)
|
|
return 0;
|
|
if (ar->hi <= br->lo)
|
|
return -1;
|
|
else if (ar->lo >= br->hi)
|
|
return 1;
|
|
return 0;
|
|
}
|
|
|
|
static void
|
|
splay_tree_free_key (splay_tree_key k)
|
|
{
|
|
addr_range *ar = (addr_range *) k;
|
|
delete ar;
|
|
}
|
|
|
|
/* First-fit allocator: find the lowest-addressed free gap of at least SIZE
   bytes, aligned to ALIGN (assumed a power of two), among the occupied
   ranges recorded in splay tree S, constrained to *BOUNDS.  Return the
   chosen range, or the invalid (0, 0) range when nothing fits.  */

static addr_range
first_fit_range (splay_tree s, unsigned HOST_WIDE_INT size,
		 unsigned HOST_WIDE_INT align, addr_range *bounds)
{
  splay_tree_node min = splay_tree_min (s);
  if (min)
    {
      splay_tree_node next;
      /* Walk the gaps between consecutive occupied ranges, lowest
	 address first.  */
      while ((next = splay_tree_successor (s, min->key)))
	{
	  unsigned HOST_WIDE_INT lo = ((addr_range *) min->key)->hi;
	  unsigned HOST_WIDE_INT hi = ((addr_range *) next->key)->lo;
	  /* Round the gap's start up to the requested alignment.  */
	  unsigned HOST_WIDE_INT base = (lo + align - 1) & ~(align - 1);
	  if (base + size <= hi)
	    return addr_range (base, base + size);
	  min = next;
	}

      /* No interior gap fit; try above the highest occupied range.  */
      unsigned HOST_WIDE_INT base = ((addr_range *)min->key)->hi;
      base = (base + align - 1) & ~(align - 1);
      if (base + size <= bounds->hi)
	return addr_range (base, base + size);
      else
	return addr_range ();
    }
  else
    {
      /* Tree is empty: place at the (aligned) bottom of BOUNDS.  */
      unsigned HOST_WIDE_INT lo = bounds->lo;
      lo = (lo + align - 1) & ~(align - 1);
      if (lo + size <= bounds->hi)
	return addr_range (lo, lo + size);
      else
	return addr_range ();
    }
}
|
|
|
|
/* Callback for splay_tree_foreach: insert the range keyed at node N into
   the accumulator tree PTR, coalescing with an existing overlapping range
   when the comparator (which treats overlap as equality) reports a
   match.  Always returns 0 so the traversal continues.  */

static int
merge_ranges_1 (splay_tree_node n, void *ptr)
{
  splay_tree accum = (splay_tree) ptr;
  addr_range ar = *(addr_range *) n->key;

  splay_tree_node old = splay_tree_lookup (accum, n->key);

  /* We might have an overlap.  Create a new range covering the
     overlapping parts.  */
  if (old)
    {
      addr_range *old_ar = (addr_range *) old->key;
      ar.lo = MIN (old_ar->lo, ar.lo);
      ar.hi = MAX (old_ar->hi, ar.hi);
      /* The removed key is released by the tree's delete_key callback
	 (splay_tree_free_key).  */
      splay_tree_remove (accum, old->key);
    }

  /* ACCUM owns the freshly allocated key.  */
  addr_range *new_ar = new addr_range (ar);

  splay_tree_insert (accum, (splay_tree_key) new_ar, n->value);

  return 0;
}
|
|
|
|
/* Merge every occupied range recorded in SP into ACCUM, coalescing
   overlapping ranges (see merge_ranges_1).  */

static void
merge_ranges (splay_tree accum, splay_tree sp)
{
  splay_tree_foreach (sp, merge_ranges_1, (void *) accum);
}
|
|
|
|
/* Driver for worker neutering/broadcasting of the current function.
   [BOUNDS_LO, BOUNDS_HI) is the shared-memory address window available
   for broadcast staging buffers (from targetm.goacc.shared_mem_layout).

   Overall flow: split blocks at partitioning boundaries, discover the
   parallel-region structure, compute which decls/SSA names each
   worker-single block must broadcast, lay out a staging record per block,
   allocate non-conflicting shared-memory ranges for those records, and
   finally predicate/broadcast via neuter_worker_single.  */

static void
oacc_do_neutering (unsigned HOST_WIDE_INT bounds_lo,
		   unsigned HOST_WIDE_INT bounds_hi)
{
  bb_stmt_map_t bb_stmt_map;
  auto_bitmap worker_single, vector_single;

  omp_sese_split_blocks (&bb_stmt_map);

  if (dump_file)
    {
      fprintf (dump_file, "\n\nAfter splitting:\n\n");
      dump_function_to_file (current_function_decl, dump_file, dump_flags);
    }

  unsigned mask = 0;

  /* If this is a routine, calculate MASK as if the outer levels are already
     partitioned.  */
  {
    tree attr = oacc_get_fn_attrib (current_function_decl);
    tree dims = TREE_VALUE (attr);
    unsigned ix;
    for (ix = 0; ix != GOMP_DIM_MAX; ix++, dims = TREE_CHAIN (dims))
      {
	tree allowed = TREE_PURPOSE (dims);
	if (allowed && integer_zerop (allowed))
	  mask |= GOMP_DIM_MASK (ix);
      }
  }

  parallel_g *par = omp_sese_discover_pars (&bb_stmt_map);
  populate_single_mode_bitmaps (par, worker_single, vector_single, mask, 0);

  basic_block bb;
  FOR_ALL_BB_FN (bb, cfun)
    bb->aux = NULL;

  /* Per-block sets of variables to propagate to other workers.  */
  vec<propagation_set *> prop_set (vNULL);
  prop_set.safe_grow_cleared (last_basic_block_for_fn (cfun), true);

  find_ssa_names_to_propagate (par, mask, worker_single, vector_single,
			       &prop_set);

  hash_set<tree> partitioned_var_uses;
  hash_set<tree> gang_private_vars;
  auto_bitmap writes_gang_private;

  find_gang_private_vars (&gang_private_vars);
  find_partitioned_var_uses (par, mask, &partitioned_var_uses);
  find_local_vars_to_propagate (par, mask, &partitioned_var_uses,
				&gang_private_vars, writes_gang_private,
				&prop_set);

  record_field_map_t record_field_map;

  /* Build one staging RECORD_TYPE per broadcasting block and stash it in
     the block's AUX field.  */
  FOR_ALL_BB_FN (bb, cfun)
    {
      propagation_set *ws_prop = prop_set[bb->index];
      if (ws_prop)
	{
	  tree record_type = lang_hooks.types.make_type (RECORD_TYPE);
	  tree name = create_tmp_var_name (".oacc_ws_data_s");
	  name = build_decl (UNKNOWN_LOCATION, TYPE_DECL, name, record_type);
	  DECL_ARTIFICIAL (name) = 1;
	  DECL_NAMELESS (name) = 1;
	  TYPE_NAME (record_type) = name;
	  TYPE_ARTIFICIAL (record_type) = 1;

	  /* Sort fields for a deterministic, size-ordered layout.  */
	  auto_vec<tree> field_vec (ws_prop->elements ());
	  for (hash_set<tree>::iterator it = ws_prop->begin ();
	       it != ws_prop->end (); ++it)
	    field_vec.quick_push (*it);

	  field_vec.qsort (sort_by_size_then_ssa_version_or_uid);

	  bool existed;
	  field_map_t *fields
	    = &record_field_map.get_or_insert (record_type, &existed);
	  gcc_checking_assert (!existed);

	  /* Insert var fields in reverse order, so the last inserted element
	     is the first in the structure.  */
	  for (int i = field_vec.length () - 1; i >= 0; i--)
	    install_var_field (field_vec[i], record_type, fields);

	  layout_type (record_type);

	  bb->aux = (tree) record_type;
	}
    }

  /* For each broadcasting block, compute which other broadcasting blocks
     are reachable from it — their buffer ranges may be live
     simultaneously and must not overlap.  */
  sbitmap *reachable
    = sbitmap_vector_alloc (last_basic_block_for_fn (cfun),
			    last_basic_block_for_fn (cfun));

  bitmap_vector_clear (reachable, last_basic_block_for_fn (cfun));

  auto_vec<std::pair<int, tree> > priority;

  FOR_ALL_BB_FN (bb, cfun)
    {
      if (bb->aux)
	{
	  tree record_type = (tree) bb->aux;

	  basic_block bb2;
	  FOR_ALL_BB_FN (bb2, cfun)
	    bb2->flags &= ~BB_VISITED;

	  priority.safe_push (std::make_pair (bb->index, record_type));
	  dfs_broadcast_reachable_1 (bb, reachable[bb->index]);
	}
    }

  /* Symmetrize the reachability relation: if I can reach J, the conflict
     is mutual, so OR in the transpose.  */
  sbitmap *inverted
    = sbitmap_vector_alloc (last_basic_block_for_fn (cfun),
			    last_basic_block_for_fn (cfun));

  bitmap_vector_clear (inverted, last_basic_block_for_fn (cfun));

  for (int i = 0; i < last_basic_block_for_fn (cfun); i++)
    {
      sbitmap_iterator bi;
      unsigned int j;
      EXECUTE_IF_SET_IN_BITMAP (reachable[i], 0, j, bi)
	bitmap_set_bit (inverted[j], i);
    }

  for (int i = 0; i < last_basic_block_for_fn (cfun); i++)
    bitmap_ior (reachable[i], reachable[i], inverted[i]);

  sbitmap_vector_free (inverted);

  used_range_vec_t used_ranges;

  used_ranges.safe_grow_cleared (last_basic_block_for_fn (cfun));

  blk_offset_map_t blk_offset_map;

  addr_range worker_shm_bounds (bounds_lo, bounds_hi);

  /* Allocate ranges largest-first so big records get first pick.  */
  priority.qsort (sort_size_descending);
  for (unsigned int i = 0; i < priority.length (); i++)
    {
      idx_decl_pair_t p = priority[i];
      int blkno = p.first;
      tree record_type = p.second;
      HOST_WIDE_INT size = tree_to_uhwi (TYPE_SIZE_UNIT (record_type));
      HOST_WIDE_INT align = TYPE_ALIGN_UNIT (record_type);

      /* CONFLICTS accumulates every range used by this block or any
	 mutually-reachable block.  */
      splay_tree conflicts = splay_tree_new (splay_tree_compare_addr_range,
					     splay_tree_free_key, NULL);

      if (!used_ranges[blkno])
	used_ranges[blkno] = splay_tree_new (splay_tree_compare_addr_range,
					     splay_tree_free_key, NULL);
      else
	merge_ranges (conflicts, used_ranges[blkno]);

      sbitmap_iterator bi;
      unsigned int j;
      EXECUTE_IF_SET_IN_BITMAP (reachable[blkno], 0, j, bi)
	if (used_ranges[j])
	  merge_ranges (conflicts, used_ranges[j]);

      addr_range ar
	= first_fit_range (conflicts, size, align, &worker_shm_bounds);

      splay_tree_delete (conflicts);

      if (ar.invalid ())
	{
	  /* No conflict-free range: fall back to the aligned base of the
	     window and mark the allocation as failed so barriers will
	     isolate the broadcast.  */
	  unsigned HOST_WIDE_INT base
	    = (bounds_lo + align - 1) & ~(align - 1);
	  if (base + size > bounds_hi)
	    error_at (UNKNOWN_LOCATION, "shared-memory region overflow");
	  std::pair<unsigned HOST_WIDE_INT, bool> base_inrng
	    = std::make_pair (base, false);
	  blk_offset_map.put (BASIC_BLOCK_FOR_FN (cfun, blkno), base_inrng);
	}
      else
	{
	  splay_tree_node old = splay_tree_lookup (used_ranges[blkno],
						   (splay_tree_key) &ar);
	  if (old)
	    {
	      fprintf (stderr, "trying to map [%d..%d] but [%d..%d] is "
		       "already mapped in block %d\n", (int) ar.lo,
		       (int) ar.hi, (int) ((addr_range *) old->key)->lo,
		       (int) ((addr_range *) old->key)->hi, blkno);
	      abort ();
	    }

	  addr_range *arp = new addr_range (ar);
	  splay_tree_insert (used_ranges[blkno], (splay_tree_key) arp,
			     (splay_tree_value) blkno);
	  std::pair<unsigned HOST_WIDE_INT, bool> base_inrng
	    = std::make_pair (ar.lo, true);
	  blk_offset_map.put (BASIC_BLOCK_FOR_FN (cfun, blkno), base_inrng);
	}
    }

  sbitmap_vector_free (reachable);

  neuter_worker_single (par, mask, worker_single, vector_single, &prop_set,
			&partitioned_var_uses, &record_field_map,
			&blk_offset_map, writes_gang_private);

  record_field_map.empty ();

  /* These are supposed to have been 'delete'd by 'neuter_worker_single'.  */
  for (auto it : prop_set)
    gcc_checking_assert (!it);
  prop_set.release ();

  delete par;

  /* This doesn't seem to make a difference.  */
  loops_state_clear (LOOP_CLOSED_SSA);

  /* Neutering worker-single neutered blocks will invalidate dominance info.
     It may be possible to incrementally update just the affected blocks, but
     obliterate everything for now.  */
  free_dominance_info (CDI_DOMINATORS);
  free_dominance_info (CDI_POST_DOMINATORS);

  if (dump_file)
    {
      fprintf (dump_file, "\n\nAfter neutering:\n\n");
      dump_function_to_file (current_function_decl, dump_file, dump_flags);
    }
}
|
|
|
|
/* Pass execution entry point.  Scans the function for IFN_GOACC_REDUCTION
   and IFN_UNIQUE (OACC_PRIVATE) internal calls to compute, per parallelism
   level, the shared-memory bytes consumed by reductions and gang-private
   variables; asks the target for the remaining shared-memory window; and
   runs oacc_do_neutering unless the function is known to use a single
   worker.  Always returns 0 (no extra TODO flags).  */

static int
execute_omp_oacc_neuter_broadcast ()
{
  unsigned HOST_WIDE_INT reduction_size[GOMP_DIM_MAX];
  unsigned HOST_WIDE_INT private_size[GOMP_DIM_MAX];

  for (unsigned i = 0; i < GOMP_DIM_MAX; i++)
    {
      reduction_size[i] = 0;
      private_size[i] = 0;
    }

  /* Calculate shared memory size required for reduction variables and
     gang-private memory for this offloaded function.  */
  basic_block bb;
  FOR_ALL_BB_FN (bb, cfun)
    {
      for (gimple_stmt_iterator gsi = gsi_start_bb (bb);
	   !gsi_end_p (gsi);
	   gsi_next (&gsi))
	{
	  gimple *stmt = gsi_stmt (gsi);
	  if (!is_gimple_call (stmt))
	    continue;
	  gcall *call = as_a <gcall *> (stmt);
	  if (!gimple_call_internal_p (call))
	    continue;
	  enum internal_fn ifn_code = gimple_call_internal_fn (call);
	  switch (ifn_code)
	    {
	    default: break;
	    case IFN_GOACC_REDUCTION:
	      /* Level -1 means no shared-memory usage to account for.  */
	      if (integer_minus_onep (gimple_call_arg (call, 3)))
		continue;
	      else
		{
		  unsigned code = TREE_INT_CST_LOW (gimple_call_arg (call, 0));
		  /* Only count reduction variables once: the choice to pick
		     the setup call is fairly arbitrary.  */
		  if (code == IFN_GOACC_REDUCTION_SETUP)
		    {
		      int level = TREE_INT_CST_LOW (gimple_call_arg (call, 3));
		      tree var = gimple_call_arg (call, 2);
		      tree offset = gimple_call_arg (call, 5);
		      tree var_type = TREE_TYPE (var);
		      /* High-water mark: offset of this reduction var plus
			 its size.  */
		      unsigned HOST_WIDE_INT limit
			= (tree_to_uhwi (offset)
			   + tree_to_uhwi (TYPE_SIZE_UNIT (var_type)));
		      reduction_size[level]
			= MAX (reduction_size[level], limit);
		    }
		}
	      break;
	    case IFN_UNIQUE:
	      {
		enum ifn_unique_kind kind
		  = ((enum ifn_unique_kind)
		     TREE_INT_CST_LOW (gimple_call_arg (call, 0)));

		if (kind == IFN_UNIQUE_OACC_PRIVATE)
		  {
		    HOST_WIDE_INT level
		      = TREE_INT_CST_LOW (gimple_call_arg (call, 2));
		    if (level == -1)
		      break;
		    /* Accumulate each privatized decl's size, aligning the
		       running total to the decl's alignment first.  */
		    for (unsigned i = 3;
			 i < gimple_call_num_args (call);
			 i++)
		      {
			tree arg = gimple_call_arg (call, i);
			gcc_assert (TREE_CODE (arg) == ADDR_EXPR);
			tree decl = TREE_OPERAND (arg, 0);
			unsigned HOST_WIDE_INT align = DECL_ALIGN_UNIT (decl);
			private_size[level] = ((private_size[level] + align - 1)
					       & ~(align - 1));
			unsigned HOST_WIDE_INT decl_size
			  = tree_to_uhwi (TYPE_SIZE_UNIT (TREE_TYPE (decl)));
			private_size[level] += decl_size;
		      }
		  }
	      }
	      break;
	    }
	}
    }

  int dims[GOMP_DIM_MAX];
  for (unsigned i = 0; i < GOMP_DIM_MAX; i++)
    dims[i] = oacc_get_fn_dim_size (current_function_decl, i);

  /* Find bounds of shared-memory buffer space we can use.  */
  unsigned HOST_WIDE_INT bounds_lo = 0, bounds_hi = 0;
  if (targetm.goacc.shared_mem_layout)
    targetm.goacc.shared_mem_layout (&bounds_lo, &bounds_hi, dims,
				     private_size, reduction_size);

  /* Perform worker partitioning unless we know 'num_workers(1)'.  */
  if (dims[GOMP_DIM_WORKER] != 1)
    oacc_do_neutering (bounds_lo, bounds_hi);

  return 0;
}
|
|
|
|
namespace {

/* Pass descriptor for the OpenACC worker neutering/broadcasting pass.  */

const pass_data pass_data_omp_oacc_neuter_broadcast =
{
  GIMPLE_PASS, /* type */
  "omp_oacc_neuter_broadcast", /* name */
  OPTGROUP_OMP, /* optinfo_flags */
  TV_NONE, /* tv_id */
  PROP_cfg, /* properties_required */
  0, /* properties_provided */
  0, /* properties_destroyed */
  0, /* todo_flags_start */
  TODO_update_ssa | TODO_cleanup_cfg, /* todo_flags_finish */
};

class pass_omp_oacc_neuter_broadcast : public gimple_opt_pass
{
public:
  pass_omp_oacc_neuter_broadcast (gcc::context *ctxt)
    : gimple_opt_pass (pass_data_omp_oacc_neuter_broadcast, ctxt)
  {}

  /* opt_pass methods: */

  /* Run only for OpenACC offloaded functions, and only when the target
     provides the worker-broadcast-record hook this pass depends on.  */
  virtual bool gate (function *fun)
  {
    if (!flag_openacc)
      return false;

    if (!targetm.goacc.create_worker_broadcast_record)
      return false;

    /* Only relevant for OpenACC offloaded functions.  */
    tree attr = oacc_get_fn_attrib (fun->decl);
    if (!attr)
      return false;

    return true;
  }

  virtual unsigned int execute (function *)
  {
    return execute_omp_oacc_neuter_broadcast ();
  }

}; // class pass_omp_oacc_neuter_broadcast

} // anon namespace
|
|
|
|
/* Factory function for the pass manager: instantiate the OpenACC worker
   neutering/broadcasting pass in context CTXT.  Caller owns the result.  */

gimple_opt_pass *
make_pass_omp_oacc_neuter_broadcast (gcc::context *ctxt)
{
  return new pass_omp_oacc_neuter_broadcast (ctxt);
}
|