1 /* OpenACC worker partitioning via middle end neutering/broadcasting scheme
2 
3    Copyright (C) 2015-2021 Free Software Foundation, Inc.
4 
5    This file is part of GCC.
6 
7    GCC is free software; you can redistribute it and/or modify it
8    under the terms of the GNU General Public License as published
9    by the Free Software Foundation; either version 3, or (at your
10    option) any later version.
11 
12    GCC is distributed in the hope that it will be useful, but WITHOUT
13    ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
14    or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public
15    License for more details.
16 
17    You should have received a copy of the GNU General Public License
18    along with GCC; see the file COPYING3.  If not see
19    <http://www.gnu.org/licenses/>.  */
20 
21 #include "config.h"
22 #include "system.h"
23 #include "coretypes.h"
24 #include "backend.h"
25 #include "rtl.h"
26 #include "tree.h"
27 #include "gimple.h"
28 #include "tree-pass.h"
29 #include "ssa.h"
30 #include "cgraph.h"
31 #include "pretty-print.h"
32 #include "fold-const.h"
33 #include "gimplify.h"
34 #include "gimple-iterator.h"
35 #include "gimple-walk.h"
36 #include "tree-inline.h"
37 #include "langhooks.h"
38 #include "omp-general.h"
39 #include "omp-low.h"
40 #include "gimple-pretty-print.h"
41 #include "cfghooks.h"
42 #include "insn-config.h"
43 #include "recog.h"
44 #include "internal-fn.h"
45 #include "bitmap.h"
46 #include "tree-nested.h"
47 #include "stor-layout.h"
48 #include "tree-ssa-threadupdate.h"
49 #include "tree-into-ssa.h"
50 #include "splay-tree.h"
51 #include "target.h"
52 #include "cfgloop.h"
53 #include "tree-cfg.h"
54 #include "omp-offload.h"
55 #include "attribs.h"
56 #include "targhooks.h"
57 #include "diagnostic-core.h"
58 
59 /* Loop structure of the function.  The entire function is described as
60    a NULL loop.  */
61 /* Adapted from 'gcc/config/nvptx/nvptx.c:struct parallel'.  */
62 
63 struct parallel_g
64 {
65   /* Parent parallel.  */
66   parallel_g *parent;
67 
68   /* Next sibling parallel.  */
69   parallel_g *next;
70 
71   /* First child parallel.  */
72   parallel_g *inner;
73 
74   /* Partitioning mask of the parallel.  */
75   unsigned mask;
76 
77   /* Partitioning used within inner parallels. */
78   unsigned inner_mask;
79 
80   /* Location of parallel forked and join.  The forked is the first
81      block in the parallel and the join is the first block after
82      the partition.  */
83   basic_block forked_block;
84   basic_block join_block;
85 
86   gimple *forked_stmt;
87   gimple *join_stmt;
88 
89   gimple *fork_stmt;
90   gimple *joining_stmt;
91 
92   /* Basic blocks in this parallel, but not in child parallels.  The
93      FORKED and JOINING blocks are in the partition.  The FORK and JOIN
94      blocks are not.  */
95   auto_vec<basic_block> blocks;
96 
97   tree record_type;
98   tree sender_decl;
99   tree receiver_decl;
100 
101 public:
102   parallel_g (parallel_g *parent, unsigned mode);
103   ~parallel_g ();
104 };
105 
106 /* Constructor links the new parallel into its parent's chain of
107    children.  */
108 
109 parallel_g::parallel_g (parallel_g *parent_, unsigned mask_)
110   :parent (parent_), next (0), inner (0), mask (mask_), inner_mask (0)
111 {
112   forked_block = join_block = 0;
113   forked_stmt = join_stmt = NULL;
114   fork_stmt = joining_stmt = NULL;
115 
116   record_type = NULL_TREE;
117   sender_decl = NULL_TREE;
118   receiver_decl = NULL_TREE;
119 
120   if (parent)
121     {
122       next = parent->inner;
123       parent->inner = this;
124     }
125 }
126 
127 parallel_g::~parallel_g ()
128 {
129   delete inner;
130   delete next;
131 }
132 
133 static bool
134 local_var_based_p (tree decl)
135 {
136   switch (TREE_CODE (decl))
137     {
138     case VAR_DECL:
139       return !is_global_var (decl);
140 
141     case COMPONENT_REF:
142     case BIT_FIELD_REF:
143     case ARRAY_REF:
144       return local_var_based_p (TREE_OPERAND (decl, 0));
145 
146     default:
147       return false;
148     }
149 }
150 
151 /* Map of basic blocks to gimple stmts.  */
152 typedef hash_map<basic_block, gimple *> bb_stmt_map_t;
153 
154 /* Calls to OpenACC routines are made by all workers/wavefronts/warps, since
155    the routine likely contains partitioned loops (otherwise it will do its
156    own neutering and variable propagation).  Return TRUE if a function call
157    CALL should be made in (worker) single mode rather than redundant
158    mode.  */
159 
160 static bool
161 omp_sese_active_worker_call (gcall *call)
162 {
163 #define GOMP_DIM_SEQ GOMP_DIM_MAX
164   tree fndecl = gimple_call_fndecl (call);
165 
166   if (!fndecl)
167     return true;
168 
169   tree attrs = oacc_get_fn_attrib (fndecl);
170 
171   if (!attrs)
172     return true;
173 
174   int level = oacc_fn_attrib_level (attrs);
175 
176   /* Neither regular functions nor "seq" routines should be run by all threads
177      in worker-single mode.  */
178   return level == -1 || level == GOMP_DIM_SEQ;
179 #undef GOMP_DIM_SEQ
180 }
181 
182 /* Split basic blocks such that each forked and join marker is at
183    the start of its basic block.  Thus afterwards each block will
184    have a single partitioning mode.  We do the same for returns
185    (executed by every thread), conditions, calls and certain
186    assignments that need special handling.  Populate MAP, mapping
187    each split-off block to the statement that forced the split.  We
188    also clear the BB visited flag, used when finding partitions.  */
189 /* Adapted from 'gcc/config/nvptx/nvptx.c:nvptx_split_blocks'.  */
190 
191 static void
192 omp_sese_split_blocks (bb_stmt_map_t *map)
193 {
194   auto_vec<gimple *> worklist;
195   basic_block block;
196 
197   /* Locate all the reorg instructions of interest.  */
198   FOR_ALL_BB_FN (block, cfun)
199     {
200       /* Clear visited flag, for use by parallel locator.  */
201       block->flags &= ~BB_VISITED;
202 
203       for (gimple_stmt_iterator gsi = gsi_start_bb (block);
204 	   !gsi_end_p (gsi);
205 	   gsi_next (&gsi))
206 	{
207 	  gimple *stmt = gsi_stmt (gsi);
208 
209 	  if (gimple_call_internal_p (stmt, IFN_UNIQUE))
210 	    {
211 	      enum ifn_unique_kind k = ((enum ifn_unique_kind)
212 		TREE_INT_CST_LOW (gimple_call_arg (stmt, 0)));
213 
214 	      if (k == IFN_UNIQUE_OACC_JOIN)
215 		worklist.safe_push (stmt);
216 	      else if (k == IFN_UNIQUE_OACC_FORK)
217 		{
218 		  gcc_assert (gsi_one_before_end_p (gsi));
219 		  basic_block forked_block = single_succ (block);
220 		  gimple_stmt_iterator gsi2 = gsi_start_bb (forked_block);
221 
222 		  /* We push a NOP as a placeholder for the "forked" stmt.
223 		     This is then recognized in omp_sese_find_par.  */
224 		  gimple *nop = gimple_build_nop ();
225 		  gsi_insert_before (&gsi2, nop, GSI_SAME_STMT);
226 
227 		  worklist.safe_push (nop);
228 		}
229 	    }
230 	  else if (gimple_code (stmt) == GIMPLE_RETURN
231 		   || gimple_code (stmt) == GIMPLE_COND
232 		   || gimple_code (stmt) == GIMPLE_SWITCH
233 		   || (gimple_code (stmt) == GIMPLE_CALL
234 		       && !gimple_call_internal_p (stmt)
235 		       && !omp_sese_active_worker_call (as_a <gcall *> (stmt))))
236 	    worklist.safe_push (stmt);
237 	  else if (is_gimple_assign (stmt))
238 	    {
239 	      tree lhs = gimple_assign_lhs (stmt);
240 
241 	      /* Force assignments to components/fields/elements of local
242 		 aggregates into fully-partitioned (redundant) mode.  This
243 		 avoids having to broadcast the whole aggregate.  The RHS of
244 		 the assignment will be propagated using the normal
245 		 mechanism.  */
246 
247 	      switch (TREE_CODE (lhs))
248 		{
249 		case COMPONENT_REF:
250 		case BIT_FIELD_REF:
251 		case ARRAY_REF:
252 		  {
253 		    tree aggr = TREE_OPERAND (lhs, 0);
254 
255 		    if (local_var_based_p (aggr))
256 		      worklist.safe_push (stmt);
257 		  }
258 		  break;
259 
260 		default:
261 		  ;
262 		}
263 	    }
264 	}
265     }
266 
267   /* Split blocks on the worklist.  */
268   unsigned ix;
269   gimple *stmt;
270 
271   for (ix = 0; worklist.iterate (ix, &stmt); ix++)
272     {
273       basic_block block = gimple_bb (stmt);
274 
275       if (gimple_code (stmt) == GIMPLE_COND)
276 	{
277 	  gcond *orig_cond = as_a <gcond *> (stmt);
278 	  tree_code code = gimple_expr_code (orig_cond);
279 	  tree pred = make_ssa_name (boolean_type_node);
280 	  gimple *asgn = gimple_build_assign (pred, code,
281 			   gimple_cond_lhs (orig_cond),
282 			   gimple_cond_rhs (orig_cond));
283 	  gcond *new_cond
284 	    = gimple_build_cond (NE_EXPR, pred, boolean_false_node,
285 				 gimple_cond_true_label (orig_cond),
286 				 gimple_cond_false_label (orig_cond));
287 
288 	  gimple_stmt_iterator gsi = gsi_for_stmt (stmt);
289 	  gsi_insert_before (&gsi, asgn, GSI_SAME_STMT);
290 	  gsi_replace (&gsi, new_cond, true);
291 
292 	  edge e = split_block (block, asgn);
293 	  block = e->dest;
294 	  map->get_or_insert (block) = new_cond;
295 	}
296       else if ((gimple_code (stmt) == GIMPLE_CALL
297 		&& !gimple_call_internal_p (stmt))
298 	       || is_gimple_assign (stmt))
299 	{
300 	  gimple_stmt_iterator gsi = gsi_for_stmt (stmt);
301 	  gsi_prev (&gsi);
302 
303 	  edge call = split_block (block, gsi_stmt (gsi));
304 
305 	  gimple *call_stmt = gsi_stmt (gsi_start_bb (call->dest));
306 
307 	  edge call_to_ret = split_block (call->dest, call_stmt);
308 
309 	  map->get_or_insert (call_to_ret->src) = call_stmt;
310 	}
311       else
312 	{
313 	  gimple_stmt_iterator gsi = gsi_for_stmt (stmt);
314 	  gsi_prev (&gsi);
315 
316 	  if (gsi_end_p (gsi))
317 	    map->get_or_insert (block) = stmt;
318 	  else
319 	    {
320 	      /* Split block before the stmt.  The stmt is in the new block.  */
321 	      edge e = split_block (block, gsi_stmt (gsi));
322 
323 	      block = e->dest;
324 	      map->get_or_insert (block) = stmt;
325 	    }
326 	}
327     }
328 }
329 
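/* Return a descriptive name for partitioning mask MASK.  The mask is a
   bitwise OR of GOMP_DIM_MASK values: bit 0 is gang, bit 1 is worker and
   bit 2 is vector, so e.g. a mask of 3 means gang+worker partitioned.  */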
330 static const char *
331 mask_name (unsigned mask)
332 {
333   switch (mask)
334     {
335     case 0: return "gang redundant";
336     case 1: return "gang partitioned";
337     case 2: return "worker partitioned";
338     case 3: return "gang+worker partitioned";
339     case 4: return "vector partitioned";
340     case 5: return "gang+vector partitioned";
341     case 6: return "worker+vector partitioned";
342     case 7: return "fully partitioned";
343     default: return "<illegal>";
344     }
345 }
346 
347 /* Dump this parallel and all its inner parallels.  */
348 /* Adapted from 'gcc/config/nvptx/nvptx.c:nvptx_dump_pars'.  */
349 
350 static void
351 omp_sese_dump_pars (parallel_g *par, unsigned depth)
352 {
353   fprintf (dump_file, "%u: mask %d (%s) head=%d, tail=%d\n",
354 	   depth, par->mask, mask_name (par->mask),
355 	   par->forked_block ? par->forked_block->index : -1,
356 	   par->join_block ? par->join_block->index : -1);
357 
358   fprintf (dump_file, "    blocks:");
359 
360   basic_block block;
361   for (unsigned ix = 0; par->blocks.iterate (ix, &block); ix++)
362     fprintf (dump_file, " %d", block->index);
363   fprintf (dump_file, "\n");
364   if (par->inner)
365     omp_sese_dump_pars (par->inner, depth + 1);
366 
367   if (par->next)
368     omp_sese_dump_pars (par->next, depth);
369 }
370 
371 /* If BLOCK contains a fork/join marker, process it to create or
372    terminate a loop structure.  Add this block to the current loop,
373    and then walk successor blocks.   */
374 /* Adapted from 'gcc/config/nvptx/nvptx.c:nvptx_find_par'.  */
375 
376 static parallel_g *
377 omp_sese_find_par (bb_stmt_map_t *map, parallel_g *par, basic_block block)
378 {
379   if (block->flags & BB_VISITED)
380     return par;
381   block->flags |= BB_VISITED;
382 
383   if (gimple **stmtp = map->get (block))
384     {
385       gimple *stmt = *stmtp;
386 
387       if (gimple_code (stmt) == GIMPLE_COND
388 	  || gimple_code (stmt) == GIMPLE_SWITCH
389 	  || gimple_code (stmt) == GIMPLE_RETURN
390 	  || (gimple_code (stmt) == GIMPLE_CALL
391 	      && !gimple_call_internal_p (stmt))
392 	  || is_gimple_assign (stmt))
393 	{
394 	  /* A single block that is forced to be at the maximum partition
395 	     level.  Make a singleton par for it.  */
396 	  par = new parallel_g (par, GOMP_DIM_MASK (GOMP_DIM_GANG)
397 				   | GOMP_DIM_MASK (GOMP_DIM_WORKER)
398 				   | GOMP_DIM_MASK (GOMP_DIM_VECTOR));
399 	  par->forked_block = block;
400 	  par->forked_stmt = stmt;
401 	  par->blocks.safe_push (block);
402 	  par = par->parent;
403 	  goto walk_successors;
404 	}
405       else if (gimple_nop_p (stmt))
406 	{
407 	  basic_block pred = single_pred (block);
408 	  gcc_assert (pred);
409 	  gimple_stmt_iterator gsi = gsi_last_bb (pred);
410 	  gimple *final_stmt = gsi_stmt (gsi);
411 
412 	  if (gimple_call_internal_p (final_stmt, IFN_UNIQUE))
413 	    {
414 	      gcall *call = as_a <gcall *> (final_stmt);
415 	      enum ifn_unique_kind k = ((enum ifn_unique_kind)
416 		TREE_INT_CST_LOW (gimple_call_arg (call, 0)));
417 
418 	      if (k == IFN_UNIQUE_OACC_FORK)
419 		{
420 		  HOST_WIDE_INT dim
421 		    = TREE_INT_CST_LOW (gimple_call_arg (call, 2));
422 		  unsigned mask = (dim >= 0) ? GOMP_DIM_MASK (dim) : 0;
423 
424 		  par = new parallel_g (par, mask);
425 		  par->forked_block = block;
426 		  par->forked_stmt = final_stmt;
427 		  par->fork_stmt = stmt;
428 		}
429 	      else
430 		gcc_unreachable ();
431 	    }
432 	  else
433 	    gcc_unreachable ();
434 	}
435       else if (gimple_call_internal_p (stmt, IFN_UNIQUE))
436 	{
437 	  gcall *call = as_a <gcall *> (stmt);
438 	  enum ifn_unique_kind k = ((enum ifn_unique_kind)
439 	    TREE_INT_CST_LOW (gimple_call_arg (call, 0)));
440 	  if (k == IFN_UNIQUE_OACC_JOIN)
441 	    {
442 	      HOST_WIDE_INT dim = TREE_INT_CST_LOW (gimple_call_arg (stmt, 2));
443 	      unsigned mask = (dim >= 0) ? GOMP_DIM_MASK (dim) : 0;
444 
445 	      gcc_assert (par->mask == mask);
446 	      par->join_block = block;
447 	      par->join_stmt = stmt;
448 	      par = par->parent;
449 	    }
450 	  else
451 	    gcc_unreachable ();
452 	}
453       else
454 	gcc_unreachable ();
455     }
456 
457   if (par)
458     /* Add this block onto the current loop's list of blocks.  */
459     par->blocks.safe_push (block);
460   else
461     /* This must be the entry block.  Create a NULL parallel.  */
462     par = new parallel_g (0, 0);
463 
464 walk_successors:
465   /* Walk successor blocks.  */
466   edge e;
467   edge_iterator ei;
468 
469   FOR_EACH_EDGE (e, ei, block->succs)
470     omp_sese_find_par (map, par, e->dest);
471 
472   return par;
473 }
474 
475 /* DFS walk the CFG looking for fork & join markers.  Construct
476    loop structures as we go.  MAP is a mapping of basic blocks
477    to head & tail markers, discovered when splitting blocks.  This
478    speeds up the discovery.  We rely on the BB visited flag having
479    been cleared when splitting blocks.  */
480 /* Adapted from 'gcc/config/nvptx/nvptx.c:nvptx_discover_pars'.  */
481 
482 static parallel_g *
483 omp_sese_discover_pars (bb_stmt_map_t *map)
484 {
485   basic_block block;
486 
487   /* Mark the exit block as visited.  */
488   block = EXIT_BLOCK_PTR_FOR_FN (cfun);
489   block->flags |= BB_VISITED;
490 
491   /* And the entry block as not.  */
492   block = ENTRY_BLOCK_PTR_FOR_FN (cfun);
493   block->flags &= ~BB_VISITED;
494 
495   parallel_g *par = omp_sese_find_par (map, 0, block);
496 
497   if (dump_file)
498     {
499       fprintf (dump_file, "\nLoops\n");
500       omp_sese_dump_pars (par, 0);
501       fprintf (dump_file, "\n");
502     }
503 
504   return par;
505 }
506 
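/* Walk the parallel structure PAR (and its siblings and children) and record
   in WORKER_SINGLE and VECTOR_SINGLE the indices of the basic blocks whose
   accumulated partitioning mask lacks the worker resp. vector bit, i.e. the
   blocks that execute in worker-single resp. vector-single mode.  */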
507 static void
508 populate_single_mode_bitmaps (parallel_g *par, bitmap worker_single,
509 			      bitmap vector_single, unsigned outer_mask,
510 			      int depth)
511 {
512   unsigned mask = outer_mask | par->mask;
513 
514   basic_block block;
515 
516   for (unsigned i = 0; par->blocks.iterate (i, &block); i++)
517     {
518       if ((mask & GOMP_DIM_MASK (GOMP_DIM_WORKER)) == 0)
519 	bitmap_set_bit (worker_single, block->index);
520 
521       if ((mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR)) == 0)
522 	bitmap_set_bit (vector_single, block->index);
523     }
524 
525   if (par->inner)
526     populate_single_mode_bitmaps (par->inner, worker_single, vector_single,
527 				  mask, depth + 1);
528   if (par->next)
529     populate_single_mode_bitmaps (par->next, worker_single, vector_single,
530 				  outer_mask, depth);
531 }
532 
533 /* A map from SSA names or var decls to record fields.  */
534 
535 typedef hash_map<tree, tree> field_map_t;
536 
537 /* For each propagation record type, this is a map from SSA names or var decls
538    to propagate, to the field in the record type that should be used for
539    transmission and reception.  */
540 
541 typedef hash_map<tree, field_map_t *> record_field_map_t;
542 
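/* Create a FIELD_DECL in RECORD_TYPE for VAR (an SSA name or VAR_DECL) and
   record the mapping in FIELDS.  For illustration (types shown are only
   conceptual), propagating a local 'sum' and an SSA name with version 12
   yields a record along the lines of
     struct .oacc_ws_data_s { <type> sum; <type> _12; };
   with one field per value broadcast from a worker-single block.  */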
543 static void
544 install_var_field (tree var, tree record_type, field_map_t *fields)
545 {
546   tree name;
547   char tmp[20];
548 
549   if (TREE_CODE (var) == SSA_NAME)
550     {
551       name = SSA_NAME_IDENTIFIER (var);
552       if (!name)
553 	{
554 	  sprintf (tmp, "_%u", (unsigned) SSA_NAME_VERSION (var));
555 	  name = get_identifier (tmp);
556 	}
557     }
558   else if (TREE_CODE (var) == VAR_DECL)
559     {
560       name = DECL_NAME (var);
561       if (!name)
562 	{
563 	  sprintf (tmp, "D_%u", (unsigned) DECL_UID (var));
564 	  name = get_identifier (tmp);
565 	}
566     }
567   else
568     gcc_unreachable ();
569 
570   gcc_assert (!fields->get (var));
571 
572   tree type = TREE_TYPE (var);
573 
574   if (POINTER_TYPE_P (type)
575       && TYPE_RESTRICT (type))
576     type = build_qualified_type (type, TYPE_QUALS (type) & ~TYPE_QUAL_RESTRICT);
577 
578   tree field = build_decl (BUILTINS_LOCATION, FIELD_DECL, name, type);
579 
580   if (TREE_CODE (var) == VAR_DECL && type == TREE_TYPE (var))
581     {
582       SET_DECL_ALIGN (field, DECL_ALIGN (var));
583       DECL_USER_ALIGN (field) = DECL_USER_ALIGN (var);
584       TREE_THIS_VOLATILE (field) = TREE_THIS_VOLATILE (var);
585     }
586   else
587     SET_DECL_ALIGN (field, TYPE_ALIGN (type));
588 
589   fields->put (var, field);
590 
591   insert_field_into_struct (record_type, field);
592 }
593 
594 /* Sets of SSA_NAMES or VAR_DECLs to propagate.  */
595 typedef hash_set<tree> propagation_set;
596 
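/* For each block of PAR (and its siblings/children) that runs in
   worker-partitioned mode, look at the SSA names used by its statements and
   PHI arguments.  Any such name whose definition lives in a worker-single
   block (per WORKER_SINGLE) must be broadcast, so add it to the propagation
   set of its defining block in PROP_SET.  */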
597 static void
598 find_ssa_names_to_propagate (parallel_g *par, unsigned outer_mask,
599 			     bitmap worker_single, bitmap vector_single,
600 			     vec<propagation_set *> *prop_set)
601 {
602   unsigned mask = outer_mask | par->mask;
603 
604   if (par->inner)
605     find_ssa_names_to_propagate (par->inner, mask, worker_single,
606 				 vector_single, prop_set);
607   if (par->next)
608     find_ssa_names_to_propagate (par->next, outer_mask, worker_single,
609 				 vector_single, prop_set);
610 
611   if (mask & GOMP_DIM_MASK (GOMP_DIM_WORKER))
612     {
613       basic_block block;
614       int ix;
615 
616       for (ix = 0; par->blocks.iterate (ix, &block); ix++)
617 	{
618 	  for (gphi_iterator psi = gsi_start_phis (block);
619 	       !gsi_end_p (psi); gsi_next (&psi))
620 	    {
621 	      gphi *phi = psi.phi ();
622 	      use_operand_p use;
623 	      ssa_op_iter iter;
624 
625 	      FOR_EACH_PHI_ARG (use, phi, iter, SSA_OP_USE)
626 		{
627 		  tree var = USE_FROM_PTR (use);
628 
629 		  if (TREE_CODE (var) != SSA_NAME)
630 		    continue;
631 
632 		  gimple *def_stmt = SSA_NAME_DEF_STMT (var);
633 
634 		  if (gimple_nop_p (def_stmt))
635 		    continue;
636 
637 		  basic_block def_bb = gimple_bb (def_stmt);
638 
639 		  if (bitmap_bit_p (worker_single, def_bb->index))
640 		    {
641 		      if (!(*prop_set)[def_bb->index])
642 			(*prop_set)[def_bb->index] = new propagation_set;
643 
644 		      propagation_set *ws_prop = (*prop_set)[def_bb->index];
645 
646 		      ws_prop->add (var);
647 		    }
648 		}
649 	    }
650 
651 	  for (gimple_stmt_iterator gsi = gsi_start_bb (block);
652 	       !gsi_end_p (gsi); gsi_next (&gsi))
653 	    {
654 	      use_operand_p use;
655 	      ssa_op_iter iter;
656 	      gimple *stmt = gsi_stmt (gsi);
657 
658 	      FOR_EACH_SSA_USE_OPERAND (use, stmt, iter, SSA_OP_USE)
659 		{
660 		  tree var = USE_FROM_PTR (use);
661 
662 		  gimple *def_stmt = SSA_NAME_DEF_STMT (var);
663 
664 		  if (gimple_nop_p (def_stmt))
665 		    continue;
666 
667 		  basic_block def_bb = gimple_bb (def_stmt);
668 
669 		  if (bitmap_bit_p (worker_single, def_bb->index))
670 		    {
671 		      if (!(*prop_set)[def_bb->index])
672 			(*prop_set)[def_bb->index] = new propagation_set;
673 
674 		      propagation_set *ws_prop = (*prop_set)[def_bb->index];
675 
676 		      ws_prop->add (var);
677 		    }
678 		}
679 	    }
680 	}
681     }
682 }
683 
684 /* Callback for walk_gimple_stmt to find RHS VAR_DECLs (uses) in a
685    statement.  */
686 
687 static tree
688 find_partitioned_var_uses_1 (tree *node, int *, void *data)
689 {
690   walk_stmt_info *wi = (walk_stmt_info *) data;
691   hash_set<tree> *partitioned_var_uses = (hash_set<tree> *) wi->info;
692 
693   if (!wi->is_lhs && VAR_P (*node))
694     partitioned_var_uses->add (*node);
695 
696   return NULL_TREE;
697 }
698 
699 static void
700 find_partitioned_var_uses (parallel_g *par, unsigned outer_mask,
701 			   hash_set<tree> *partitioned_var_uses)
702 {
703   unsigned mask = outer_mask | par->mask;
704 
705   if (par->inner)
706     find_partitioned_var_uses (par->inner, mask, partitioned_var_uses);
707   if (par->next)
708     find_partitioned_var_uses (par->next, outer_mask, partitioned_var_uses);
709 
710   if (mask & GOMP_DIM_MASK (GOMP_DIM_WORKER))
711     {
712       basic_block block;
713       int ix;
714 
715       for (ix = 0; par->blocks.iterate (ix, &block); ix++)
716 	for (gimple_stmt_iterator gsi = gsi_start_bb (block);
717 	     !gsi_end_p (gsi); gsi_next (&gsi))
718 	  {
719 	    walk_stmt_info wi;
720 	    memset (&wi, 0, sizeof (wi));
721 	    wi.info = (void *) partitioned_var_uses;
722 	    walk_gimple_stmt (&gsi, NULL, find_partitioned_var_uses_1, &wi);
723 	  }
724     }
725 }
726 
727 /* Gang-private variables (typically placed in a GPU's shared memory) do not
728    need to be processed by the worker-propagation mechanism.  Populate the
729    GANG_PRIVATE_VARS set with any such variables found in the current
730    function.  */
731 
732 static void
733 find_gang_private_vars (hash_set<tree> *gang_private_vars)
734 {
735   basic_block block;
736 
737   FOR_EACH_BB_FN (block, cfun)
738     {
739       for (gimple_stmt_iterator gsi = gsi_start_bb (block);
740 	   !gsi_end_p (gsi);
741 	   gsi_next (&gsi))
742 	{
743 	  gimple *stmt = gsi_stmt (gsi);
744 
745 	  if (gimple_call_internal_p (stmt, IFN_UNIQUE))
746 	    {
747 	      enum ifn_unique_kind k = ((enum ifn_unique_kind)
748 		TREE_INT_CST_LOW (gimple_call_arg (stmt, 0)));
749 	      if (k == IFN_UNIQUE_OACC_PRIVATE)
750 		{
751 		  HOST_WIDE_INT level
752 		    = TREE_INT_CST_LOW (gimple_call_arg (stmt, 2));
753 		  if (level != GOMP_DIM_GANG)
754 		    continue;
755 		  for (unsigned i = 3; i < gimple_call_num_args (stmt); i++)
756 		    {
757 		      tree arg = gimple_call_arg (stmt, i);
758 		      gcc_assert (TREE_CODE (arg) == ADDR_EXPR);
759 		      tree decl = TREE_OPERAND (arg, 0);
760 		      gang_private_vars->add (decl);
761 		    }
762 		}
763 	    }
764 	}
765     }
766 }
767 
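/* For each block of PAR that runs in worker-single mode, find local
   (non-aggregate) VAR_DECLs in PARTITIONED_VAR_USES that a statement of the
   block may clobber.  Writes to gang-private variables only need a barrier,
   so record those blocks in WRITES_GANG_PRIVATE; everything else is added to
   the block's propagation set in PROP_SET for broadcasting.  */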
768 static void
769 find_local_vars_to_propagate (parallel_g *par, unsigned outer_mask,
770 			      hash_set<tree> *partitioned_var_uses,
771 			      hash_set<tree> *gang_private_vars,
772 			      bitmap writes_gang_private,
773 			      vec<propagation_set *> *prop_set)
774 {
775   unsigned mask = outer_mask | par->mask;
776 
777   if (par->inner)
778     find_local_vars_to_propagate (par->inner, mask, partitioned_var_uses,
779 				  gang_private_vars, writes_gang_private,
780 				  prop_set);
781   if (par->next)
782     find_local_vars_to_propagate (par->next, outer_mask, partitioned_var_uses,
783 				  gang_private_vars, writes_gang_private,
784 				  prop_set);
785 
786   if (!(mask & GOMP_DIM_MASK (GOMP_DIM_WORKER)))
787     {
788       basic_block block;
789       int ix;
790 
791       for (ix = 0; par->blocks.iterate (ix, &block); ix++)
792 	{
793 	  for (gimple_stmt_iterator gsi = gsi_start_bb (block);
794 	       !gsi_end_p (gsi); gsi_next (&gsi))
795 	    {
796 	      gimple *stmt = gsi_stmt (gsi);
797 	      tree var;
798 	      unsigned i;
799 
800 	      FOR_EACH_LOCAL_DECL (cfun, i, var)
801 		{
802 		  if (!VAR_P (var)
803 		      || is_global_var (var)
804 		      || AGGREGATE_TYPE_P (TREE_TYPE (var))
805 		      || !partitioned_var_uses->contains (var))
806 		    continue;
807 
808 		  if (stmt_may_clobber_ref_p (stmt, var))
809 		    {
810 		      if (dump_file)
811 			{
812 			  fprintf (dump_file, "bb %u: local variable may be "
813 				   "clobbered in %s mode: ", block->index,
814 				   mask_name (mask));
815 			  print_generic_expr (dump_file, var, TDF_SLIM);
816 			  fprintf (dump_file, "\n");
817 			}
818 
819 		      if (gang_private_vars->contains (var))
820 			{
821 			  /* If we write a gang-private variable, we want a
822 			     barrier at the end of the block.  */
823 			  bitmap_set_bit (writes_gang_private, block->index);
824 			  continue;
825 			}
826 
827 		      if (!(*prop_set)[block->index])
828 			(*prop_set)[block->index] = new propagation_set;
829 
830 		      propagation_set *ws_prop
831 			= (*prop_set)[block->index];
832 
833 		      ws_prop->add (var);
834 		    }
835 		}
836 	    }
837 	}
838     }
839 }
840 
841 /* Transform basic blocks FROM, TO (which may be the same block) into:
842    if (GOACC_single_start ())
843      BLOCK;
844    GOACC_barrier ();
845 			      \  |  /
846 			      +----+
847 			      |    |        (new) predicate block
848 			      +----+--
849    \  |  /   \  |  /	        |t    \
850    +----+    +----+	      +----+  |
851    |	|    |    |	===>  |    |  | f   (old) from block
852    +----+    +----+	      +----+  |
853      |       t/  \f	        |    /
854 			      +----+/
855   (split  (split before       |    |        skip block
856   at end)   condition)	      +----+
857 			      t/  \f
858 */
859 
860 static void
861 worker_single_simple (basic_block from, basic_block to,
862 		      hash_set<tree> *def_escapes_block)
863 {
864   gimple *call, *cond;
865   tree lhs, decl;
866   basic_block skip_block;
867 
868   gimple_stmt_iterator gsi = gsi_last_bb (to);
869   if (EDGE_COUNT (to->succs) > 1)
870     {
871       gcc_assert (gimple_code (gsi_stmt (gsi)) == GIMPLE_COND);
872       gsi_prev (&gsi);
873     }
874   edge e = split_block (to, gsi_stmt (gsi));
875   skip_block = e->dest;
876 
877   gimple_stmt_iterator start = gsi_after_labels (from);
878 
879   decl = builtin_decl_explicit (BUILT_IN_GOACC_SINGLE_START);
880   lhs = create_tmp_var (TREE_TYPE (TREE_TYPE (decl)));
881   call = gimple_build_call (decl, 0);
882   gimple_call_set_lhs (call, lhs);
883   gsi_insert_before (&start, call, GSI_NEW_STMT);
884   update_stmt (call);
885 
886   cond = gimple_build_cond (EQ_EXPR, lhs,
887 			    fold_convert_loc (UNKNOWN_LOCATION,
888 					      TREE_TYPE (lhs),
889 					      boolean_true_node),
890 			    NULL_TREE, NULL_TREE);
891   gsi_insert_after (&start, cond, GSI_NEW_STMT);
892   update_stmt (cond);
893 
894   edge et = split_block (from, cond);
895   et->flags &= ~EDGE_FALLTHRU;
896   et->flags |= EDGE_TRUE_VALUE;
897   /* Make the active worker the more probable path so we prefer fallthrough
898      (letting the idle workers jump around more).  */
899   et->probability = profile_probability::likely ();
900 
901   edge ef = make_edge (from, skip_block, EDGE_FALSE_VALUE);
902   ef->probability = et->probability.invert ();
903 
904   basic_block neutered = split_edge (ef);
905   gimple_stmt_iterator neut_gsi = gsi_last_bb (neutered);
906 
907   for (gsi = gsi_start_bb (et->dest); !gsi_end_p (gsi); gsi_next (&gsi))
908     {
909       gimple *stmt = gsi_stmt (gsi);
910       ssa_op_iter iter;
911       tree var;
912 
913       FOR_EACH_SSA_TREE_OPERAND (var, stmt, iter, SSA_OP_DEF)
914 	{
915 	  if (def_escapes_block->contains (var))
916 	    {
917 	      gphi *join_phi = create_phi_node (NULL_TREE, skip_block);
918 	      create_new_def_for (var, join_phi,
919 				  gimple_phi_result_ptr (join_phi));
920 	      add_phi_arg (join_phi, var, e, UNKNOWN_LOCATION);
921 
922 	      tree neutered_def = copy_ssa_name (var, NULL);
923 	      /* We really want "don't care" or some value representing
924 		 undefined here, but optimizers will probably get rid of the
925 		 zero-assignments anyway.  */
926 	      gassign *zero = gimple_build_assign (neutered_def,
927 				build_zero_cst (TREE_TYPE (neutered_def)));
928 
929 	      gsi_insert_after (&neut_gsi, zero, GSI_CONTINUE_LINKING);
930 	      update_stmt (zero);
931 
932 	      add_phi_arg (join_phi, neutered_def, single_succ_edge (neutered),
933 			   UNKNOWN_LOCATION);
934 	      update_stmt (join_phi);
935 	    }
936 	}
937     }
938 }
939 
940 /* Build COMPONENT_REF and set TREE_THIS_VOLATILE and TREE_READONLY on it
941    as appropriate.  */
942 /* Adapted from 'gcc/omp-low.c:omp_build_component_ref'.  */
943 
944 static tree
945 oacc_build_component_ref (tree obj, tree field)
946 {
947   tree field_type = TREE_TYPE (field);
948   tree obj_type = TREE_TYPE (obj);
949   if (!ADDR_SPACE_GENERIC_P (TYPE_ADDR_SPACE (obj_type)))
950     field_type = build_qualified_type
951 			(field_type,
952 			 KEEP_QUAL_ADDR_SPACE (TYPE_QUALS (obj_type)));
953 
954   tree ret = build3 (COMPONENT_REF, field_type, obj, field, NULL);
955   if (TREE_THIS_VOLATILE (field))
956     TREE_THIS_VOLATILE (ret) |= 1;
957   if (TREE_READONLY (field))
958     TREE_READONLY (ret) |= 1;
959   return ret;
960 }
961 
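/* Build a reference to VAR's field in the broadcast record that
   RECEIVER_DECL points to, using the FIELDS map set up by
   install_var_field.  */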
962 static tree
963 build_receiver_ref (tree var, tree receiver_decl, field_map_t *fields)
964 {
965   tree x = build_simple_mem_ref (receiver_decl);
966   tree field = *fields->get (var);
967   TREE_THIS_NOTRAP (x) = 1;
968   x = oacc_build_component_ref (x, field);
969   return x;
970 }
971 
972 static tree
973 build_sender_ref (tree var, tree sender_decl, field_map_t *fields)
974 {
975   if (POINTER_TYPE_P (TREE_TYPE (sender_decl)))
976     sender_decl = build_simple_mem_ref (sender_decl);
977   tree field = *fields->get (var);
978   return oacc_build_component_ref (sender_decl, field);
979 }
980 
981 static int
982 sort_by_ssa_version_or_uid (const void *p1, const void *p2)
983 {
984   const tree t1 = *(const tree *)p1;
985   const tree t2 = *(const tree *)p2;
986 
987   if (TREE_CODE (t1) == SSA_NAME && TREE_CODE (t2) == SSA_NAME)
988     return SSA_NAME_VERSION (t1) - SSA_NAME_VERSION (t2);
989   else if (TREE_CODE (t1) == SSA_NAME && TREE_CODE (t2) != SSA_NAME)
990     return -1;
991   else if (TREE_CODE (t1) != SSA_NAME && TREE_CODE (t2) == SSA_NAME)
992     return 1;
993   else
994     return DECL_UID (t1) - DECL_UID (t2);
995 }
996 
997 static int
998 sort_by_size_then_ssa_version_or_uid (const void *p1, const void *p2)
999 {
1000   const tree t1 = *(const tree *)p1;
1001   const tree t2 = *(const tree *)p2;
1002   unsigned HOST_WIDE_INT s1 = tree_to_uhwi (TYPE_SIZE (TREE_TYPE (t1)));
1003   unsigned HOST_WIDE_INT s2 = tree_to_uhwi (TYPE_SIZE (TREE_TYPE (t2)));
1004   if (s1 != s2)
1005     return s2 - s1;
1006   else
1007     return sort_by_ssa_version_or_uid (p1, p2);
1008 }
1009 
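/* Like worker_single_simple, but for blocks whose definitions escape to
   worker-partitioned code.  The single active worker (the one for which
   GOACC_single_copy_start returns NULL) executes the block and stores the
   escaping values into a shared-memory record (SENDER_DECL, created by the
   target's create_worker_broadcast_record hook); after a barrier the
   neutered workers read them back through RECEIVER_DECL in a copy-out
   block.  PLACEMENT is the record's offset in the shared buffer, and
   ISOLATE_BROADCASTS requests extra barriers around the broadcast when no
   shared-memory range was allocated for this block.  */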
1010 static void
1011 worker_single_copy (basic_block from, basic_block to,
1012 		    hash_set<tree> *def_escapes_block,
1013 		    hash_set<tree> *worker_partitioned_uses,
1014 		    tree record_type, record_field_map_t *record_field_map,
1015 		    unsigned HOST_WIDE_INT placement,
1016 		    bool isolate_broadcasts, bool has_gang_private_write)
1017 {
1018   /* If we only have virtual defs, we'll have no record type, but we still want
1019      to emit single_copy_start and (particularly) single_copy_end to act as
1020      a vdef source on the neutered edge representing memory writes on the
1021      non-neutered edge.  */
1022   if (!record_type)
1023     record_type = char_type_node;
1024 
1025   tree sender_decl
1026     = targetm.goacc.create_worker_broadcast_record (record_type, true,
1027 						    ".oacc_worker_o",
1028 						    placement);
1029   tree receiver_decl
1030     = targetm.goacc.create_worker_broadcast_record (record_type, false,
1031 						    ".oacc_worker_i",
1032 						    placement);
1033 
1034   gimple_stmt_iterator gsi = gsi_last_bb (to);
1035   if (EDGE_COUNT (to->succs) > 1)
1036     gsi_prev (&gsi);
1037   edge e = split_block (to, gsi_stmt (gsi));
1038   basic_block barrier_block = e->dest;
1039 
1040   gimple_stmt_iterator start = gsi_after_labels (from);
1041 
1042   tree decl = builtin_decl_explicit (BUILT_IN_GOACC_SINGLE_COPY_START);
1043 
1044   tree lhs = create_tmp_var (TREE_TYPE (TREE_TYPE (decl)));
1045 
1046   gimple *call
1047     = gimple_build_call (decl, 1,
1048 			 POINTER_TYPE_P (TREE_TYPE (sender_decl))
1049 			 ? sender_decl : build_fold_addr_expr (sender_decl));
1050   gimple_call_set_lhs (call, lhs);
1051   gsi_insert_before (&start, call, GSI_NEW_STMT);
1052   update_stmt (call);
1053 
1054   /* The shared-memory range for this block overflowed.  Add a barrier before
1055      the GOACC_single_copy_start call.  */
1056   if (isolate_broadcasts)
1057     {
1058       decl = builtin_decl_explicit (BUILT_IN_GOACC_BARRIER);
1059       gimple *acc_bar = gimple_build_call (decl, 0);
1060       gsi_insert_before (&start, acc_bar, GSI_SAME_STMT);
1061     }
1062 
1063   tree conv_tmp = make_ssa_name (TREE_TYPE (receiver_decl));
1064 
1065   gimple *conv = gimple_build_assign (conv_tmp,
1066 				      fold_convert (TREE_TYPE (receiver_decl),
1067 						    lhs));
1068   update_stmt (conv);
1069   gsi_insert_after (&start, conv, GSI_NEW_STMT);
1070   gimple *asgn = gimple_build_assign (receiver_decl, conv_tmp);
1071   gsi_insert_after (&start, asgn, GSI_NEW_STMT);
1072   update_stmt (asgn);
1073 
1074   tree zero_ptr = build_int_cst (TREE_TYPE (receiver_decl), 0);
1075 
1076   tree recv_tmp = make_ssa_name (TREE_TYPE (receiver_decl));
1077   asgn = gimple_build_assign (recv_tmp, receiver_decl);
1078   gsi_insert_after (&start, asgn, GSI_NEW_STMT);
1079   update_stmt (asgn);
1080 
1081   gimple *cond = gimple_build_cond (EQ_EXPR, recv_tmp, zero_ptr, NULL_TREE,
1082 				    NULL_TREE);
1083   update_stmt (cond);
1084 
1085   gsi_insert_after (&start, cond, GSI_NEW_STMT);
1086 
1087   edge et = split_block (from, cond);
1088   et->flags &= ~EDGE_FALLTHRU;
1089   et->flags |= EDGE_TRUE_VALUE;
1090   /* Make the active worker the more probable path so we prefer fallthrough
1091      (letting the idle workers jump around more).  */
1092   et->probability = profile_probability::likely ();
1093 
1094   basic_block body = et->dest;
1095 
1096   edge ef = make_edge (from, barrier_block, EDGE_FALSE_VALUE);
1097   ef->probability = et->probability.invert ();
1098 
1099   gimple_stmt_iterator bar_gsi = gsi_start_bb (barrier_block);
1100   cond = gimple_build_cond (NE_EXPR, recv_tmp, zero_ptr, NULL_TREE, NULL_TREE);
1101 
1102   if (record_type != char_type_node || has_gang_private_write)
1103     {
1104       decl = builtin_decl_explicit (BUILT_IN_GOACC_BARRIER);
1105       gimple *acc_bar = gimple_build_call (decl, 0);
1106 
1107       gsi_insert_before (&bar_gsi, acc_bar, GSI_NEW_STMT);
1108       gsi_insert_after (&bar_gsi, cond, GSI_NEW_STMT);
1109     }
1110   else
1111     gsi_insert_before (&bar_gsi, cond, GSI_NEW_STMT);
1112 
1113   edge et2 = split_block (barrier_block, cond);
1114   et2->flags &= ~EDGE_FALLTHRU;
1115   et2->flags |= EDGE_TRUE_VALUE;
1116   et2->probability = profile_probability::unlikely ();
1117 
1118   basic_block exit_block = et2->dest;
1119 
1120   basic_block copyout_block = split_edge (et2);
1121   edge ef2 = make_edge (barrier_block, exit_block, EDGE_FALSE_VALUE);
1122   ef2->probability = et2->probability.invert ();
1123 
1124   gimple_stmt_iterator copyout_gsi = gsi_start_bb (copyout_block);
1125 
1126   edge copyout_to_exit = single_succ_edge (copyout_block);
1127 
1128   gimple_seq sender_seq = NULL;
1129 
1130   /* Make sure we iterate over definitions in a stable order.  */
1131   auto_vec<tree> escape_vec (def_escapes_block->elements ());
1132   for (hash_set<tree>::iterator it = def_escapes_block->begin ();
1133        it != def_escapes_block->end (); ++it)
1134     escape_vec.quick_push (*it);
1135   escape_vec.qsort (sort_by_ssa_version_or_uid);
1136 
1137   for (unsigned i = 0; i < escape_vec.length (); i++)
1138     {
1139       tree var = escape_vec[i];
1140 
1141       if (TREE_CODE (var) == SSA_NAME && SSA_NAME_IS_VIRTUAL_OPERAND (var))
1142 	continue;
1143 
1144       tree barrier_def = 0;
1145 
1146       if (TREE_CODE (var) == SSA_NAME)
1147 	{
1148 	  gimple *def_stmt = SSA_NAME_DEF_STMT (var);
1149 
1150 	  if (gimple_nop_p (def_stmt))
1151 	    continue;
1152 
1153 	  /* The barrier phi takes one result from the actual work of the
1154 	     block we're neutering, and the other result is constant zero of
1155 	     the same type.  */
1156 
1157 	  gphi *barrier_phi = create_phi_node (NULL_TREE, barrier_block);
1158 	  barrier_def = create_new_def_for (var, barrier_phi,
1159 			  gimple_phi_result_ptr (barrier_phi));
1160 
1161 	  add_phi_arg (barrier_phi, var, e, UNKNOWN_LOCATION);
1162 	  add_phi_arg (barrier_phi, build_zero_cst (TREE_TYPE (var)), ef,
1163 		       UNKNOWN_LOCATION);
1164 
1165 	  update_stmt (barrier_phi);
1166 	}
1167       else
1168 	gcc_assert (TREE_CODE (var) == VAR_DECL);
1169 
1170       /* If we had no record type, we will have no fields map.  */
1171       field_map_t **fields_p = record_field_map->get (record_type);
1172       field_map_t *fields = fields_p ? *fields_p : NULL;
1173 
1174       if (worker_partitioned_uses->contains (var)
1175 	  && fields
1176 	  && fields->get (var))
1177 	{
1178 	  tree neutered_def = make_ssa_name (TREE_TYPE (var));
1179 
1180 	  /* Receive definition from shared memory block.  */
1181 
1182 	  tree receiver_ref = build_receiver_ref (var, receiver_decl, fields);
1183 	  gassign *recv = gimple_build_assign (neutered_def,
1184 					       receiver_ref);
1185 	  gsi_insert_after (&copyout_gsi, recv, GSI_CONTINUE_LINKING);
1186 	  update_stmt (recv);
1187 
1188 	  if (TREE_CODE (var) == VAR_DECL)
1189 	    {
1190 	      /* If it's a VAR_DECL, we only copied to an SSA temporary.  Copy
1191 		 to the final location now.  */
1192 	      gassign *asgn = gimple_build_assign (var, neutered_def);
1193 	      gsi_insert_after (&copyout_gsi, asgn, GSI_CONTINUE_LINKING);
1194 	      update_stmt (asgn);
1195 	    }
1196 	  else
1197 	    {
1198 	      /* If it's an SSA name, create a new phi at the join node to
1199 		 represent either the output from the active worker (the
1200 		 barrier) or the inactive workers (the copyout block).  */
1201 	      gphi *join_phi = create_phi_node (NULL_TREE, exit_block);
1202 	      create_new_def_for (barrier_def, join_phi,
1203 				  gimple_phi_result_ptr (join_phi));
1204 	      add_phi_arg (join_phi, barrier_def, ef2, UNKNOWN_LOCATION);
1205 	      add_phi_arg (join_phi, neutered_def, copyout_to_exit,
1206 			   UNKNOWN_LOCATION);
1207 	      update_stmt (join_phi);
1208 	    }
1209 
1210 	  /* Send definition to shared memory block.  */
1211 
1212 	  tree sender_ref = build_sender_ref (var, sender_decl, fields);
1213 
1214 	  if (TREE_CODE (var) == SSA_NAME)
1215 	    {
1216 	      gassign *send = gimple_build_assign (sender_ref, var);
1217 	      gimple_seq_add_stmt (&sender_seq, send);
1218 	      update_stmt (send);
1219 	    }
1220 	  else if (TREE_CODE (var) == VAR_DECL)
1221 	    {
1222 	      tree tmp = make_ssa_name (TREE_TYPE (var));
1223 	      gassign *send = gimple_build_assign (tmp, var);
1224 	      gimple_seq_add_stmt (&sender_seq, send);
1225 	      update_stmt (send);
1226 	      send = gimple_build_assign (sender_ref, tmp);
1227 	      gimple_seq_add_stmt (&sender_seq, send);
1228 	      update_stmt (send);
1229 	    }
1230 	  else
1231 	    gcc_unreachable ();
1232 	}
1233     }
1234 
1235   /* The shared-memory range for this block overflowed.  Add a barrier at the
1236      end.  */
1237   if (isolate_broadcasts)
1238     {
1239       gsi = gsi_start_bb (exit_block);
1240       decl = builtin_decl_explicit (BUILT_IN_GOACC_BARRIER);
1241       gimple *acc_bar = gimple_build_call (decl, 0);
1242       gsi_insert_before (&gsi, acc_bar, GSI_SAME_STMT);
1243     }
1244 
1245   /* It's possible for the ET->DEST block (the work done by the active thread)
1246      to finish with a control-flow insn, e.g. a UNIQUE function call.  Split
1247      the block and add SENDER_SEQ in the latter part to avoid having control
1248      flow in the middle of a BB.  */
1249 
1250   decl = builtin_decl_explicit (BUILT_IN_GOACC_SINGLE_COPY_END);
1251   call = gimple_build_call (decl, 1,
1252 			    POINTER_TYPE_P (TREE_TYPE (sender_decl))
1253 			    ? sender_decl
1254 			    : build_fold_addr_expr (sender_decl));
1255   gimple_seq_add_stmt (&sender_seq, call);
1256 
1257   gsi = gsi_last_bb (body);
1258   gimple *last = gsi_stmt (gsi);
1259   basic_block sender_block = split_block (body, last)->dest;
1260   gsi = gsi_last_bb (sender_block);
1261   gsi_insert_seq_after (&gsi, sender_seq, GSI_CONTINUE_LINKING);
1262 }
1263 
1264 typedef hash_map<basic_block, std::pair<unsigned HOST_WIDE_INT, bool> >
1265   blk_offset_map_t;
1266 
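/* Main worker-neutering walk over the parallel structure PAR.  Each block
   that executes in worker-single mode is rewritten so that only one worker
   runs it: blocks with definitions that escape to worker-partitioned code
   go through worker_single_copy, the rest through worker_single_simple.
   Calls to OpenACC routines made in worker-single mode additionally get
   barriers placed before and after them.  */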
1267 static void
1268 neuter_worker_single (parallel_g *par, unsigned outer_mask,
1269 		      bitmap worker_single, bitmap vector_single,
1270 		      vec<propagation_set *> *prop_set,
1271 		      hash_set<tree> *partitioned_var_uses,
1272 		      record_field_map_t *record_field_map,
1273 		      blk_offset_map_t *blk_offset_map,
1274 		      bitmap writes_gang_private)
1275 {
1276   unsigned mask = outer_mask | par->mask;
1277 
1278   if ((mask & GOMP_DIM_MASK (GOMP_DIM_WORKER)) == 0)
1279     {
1280       basic_block block;
1281 
1282       for (unsigned i = 0; par->blocks.iterate (i, &block); i++)
1283 	{
1284 	  bool has_defs = false;
1285 	  hash_set<tree> def_escapes_block;
1286 	  hash_set<tree> worker_partitioned_uses;
1287 	  unsigned j;
1288 	  tree var;
1289 
1290 	  FOR_EACH_SSA_NAME (j, var, cfun)
1291 	    {
1292 	      if (SSA_NAME_IS_VIRTUAL_OPERAND (var))
1293 		{
1294 		  has_defs = true;
1295 		  continue;
1296 		}
1297 
1298 	      gimple *def_stmt = SSA_NAME_DEF_STMT (var);
1299 
1300 	      if (gimple_nop_p (def_stmt))
1301 		continue;
1302 
1303 	      if (gimple_bb (def_stmt)->index != block->index)
1304 		continue;
1305 
1306 	      gimple *use_stmt;
1307 	      imm_use_iterator use_iter;
1308 	      bool uses_outside_block = false;
1309 	      bool worker_partitioned_use = false;
1310 
1311 	      FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, var)
1312 		{
1313 		  int blocknum = gimple_bb (use_stmt)->index;
1314 
1315 		  /* Don't propagate SSA names that are only used in the
1316 		     current block, unless the usage is in a phi node: that
1317 		     means the name left the block, then came back in at the
1318 		     top.  */
1319 		  if (blocknum != block->index
1320 		      || gimple_code (use_stmt) == GIMPLE_PHI)
1321 		    uses_outside_block = true;
1322 		  if (!bitmap_bit_p (worker_single, blocknum))
1323 		    worker_partitioned_use = true;
1324 		}
1325 
1326 	      if (uses_outside_block)
1327 		def_escapes_block.add (var);
1328 
1329 	      if (worker_partitioned_use)
1330 		{
1331 		  worker_partitioned_uses.add (var);
1332 		  has_defs = true;
1333 		}
1334 	    }
1335 
1336 	  propagation_set *ws_prop = (*prop_set)[block->index];
1337 
1338 	  if (ws_prop)
1339 	    {
1340 	      for (propagation_set::iterator it = ws_prop->begin ();
1341 		   it != ws_prop->end ();
1342 		   ++it)
1343 		{
1344 		  tree var = *it;
1345 		  if (TREE_CODE (var) == VAR_DECL)
1346 		    {
1347 		      def_escapes_block.add (var);
1348 		      if (partitioned_var_uses->contains (var))
1349 			{
1350 			  worker_partitioned_uses.add (var);
1351 			  has_defs = true;
1352 			}
1353 		    }
1354 		}
1355 
1356 	      delete ws_prop;
1357 	      (*prop_set)[block->index] = 0;
1358 	    }
1359 
1360 	  bool only_marker_fns = true;
1361 	  bool join_block = false;
1362 
1363 	  for (gimple_stmt_iterator gsi = gsi_start_bb (block);
1364 	       !gsi_end_p (gsi);
1365 	       gsi_next (&gsi))
1366 	    {
1367 	      gimple *stmt = gsi_stmt (gsi);
1368 	      if (gimple_code (stmt) == GIMPLE_CALL
1369 		  && gimple_call_internal_p (stmt, IFN_UNIQUE))
1370 		{
1371 		  enum ifn_unique_kind k = ((enum ifn_unique_kind)
1372 		    TREE_INT_CST_LOW (gimple_call_arg (stmt, 0)));
1373 		  if (k != IFN_UNIQUE_OACC_PRIVATE
1374 		      && k != IFN_UNIQUE_OACC_JOIN
1375 		      && k != IFN_UNIQUE_OACC_FORK
1376 		      && k != IFN_UNIQUE_OACC_HEAD_MARK
1377 		      && k != IFN_UNIQUE_OACC_TAIL_MARK)
1378 		    only_marker_fns = false;
1379 		  else if (k == IFN_UNIQUE_OACC_JOIN)
1380 		    /* The JOIN marker is special in that it *cannot* be
1381 		       predicated for worker zero, because it may be lowered
1382 		       to a barrier instruction and all workers must typically
1383 		       execute that barrier.  We shouldn't be doing any
1384 		       broadcasts from the join block anyway.  */
1385 		    join_block = true;
1386 		}
1387 	      else if (gimple_code (stmt) == GIMPLE_CALL
1388 		       && gimple_call_internal_p (stmt, IFN_GOACC_LOOP))
1389 		/* Empty.  */;
1390 	      else if (gimple_nop_p (stmt))
1391 		/* Empty.  */;
1392 	      else
1393 		only_marker_fns = false;
1394 	    }
1395 
1396 	  /* We can skip predicating this block for worker zero if the only
1397 	     thing it contains is marker functions that will be removed in the
1398 	     oaccdevlow pass anyway.
1399 	     Don't do this if the block has (any) phi nodes, because those
1400 	     might define SSA names that need broadcasting.
1401 	     TODO: We might be able to skip transforming blocks that only
1402 	     contain some other trivial statements too.  */
1403 	  if (only_marker_fns && !phi_nodes (block))
1404 	    continue;
1405 
1406 	  gcc_assert (!join_block);
1407 
1408 	  if (has_defs)
1409 	    {
1410 	      tree record_type = (tree) block->aux;
1411 	      std::pair<unsigned HOST_WIDE_INT, bool> *off_rngalloc
1412 		= blk_offset_map->get (block);
1413 	      gcc_assert (!record_type || off_rngalloc);
1414 	      unsigned HOST_WIDE_INT offset
1415 		= off_rngalloc ? off_rngalloc->first : 0;
1416 	      bool range_allocated
1417 		= off_rngalloc ? off_rngalloc->second : true;
1418 	      bool has_gang_private_write
1419 		= bitmap_bit_p (writes_gang_private, block->index);
1420 	      worker_single_copy (block, block, &def_escapes_block,
1421 				  &worker_partitioned_uses, record_type,
1422 				  record_field_map,
1423 				  offset, !range_allocated,
1424 				  has_gang_private_write);
1425 	    }
1426 	  else
1427 	    worker_single_simple (block, block, &def_escapes_block);
1428 	}
1429     }
1430 
1431   if ((outer_mask & GOMP_DIM_MASK (GOMP_DIM_WORKER)) == 0)
1432     {
1433       basic_block block;
1434 
1435       for (unsigned i = 0; par->blocks.iterate (i, &block); i++)
1436 	for (gimple_stmt_iterator gsi = gsi_start_bb (block);
1437 	     !gsi_end_p (gsi);
1438 	     gsi_next (&gsi))
1439 	  {
1440 	    gimple *stmt = gsi_stmt (gsi);
1441 
1442 	    if (gimple_code (stmt) == GIMPLE_CALL
1443 		&& !gimple_call_internal_p (stmt)
1444 		&& !omp_sese_active_worker_call (as_a <gcall *> (stmt)))
1445 	      {
1446 		/* If we have an OpenACC routine call in worker-single mode,
1447 		   place barriers before and afterwards to prevent
1448 		   clobbering re-used shared memory regions (as are used
1449 		   for AMDGCN at present, for example).  */
1450 		tree decl = builtin_decl_explicit (BUILT_IN_GOACC_BARRIER);
1451 		gsi_insert_before (&gsi, gimple_build_call (decl, 0),
1452 				   GSI_SAME_STMT);
1453 		gsi_insert_after (&gsi, gimple_build_call (decl, 0),
1454 				  GSI_NEW_STMT);
1455 	      }
1456 	  }
1457     }
1458 
1459   if (par->inner)
1460     neuter_worker_single (par->inner, mask, worker_single, vector_single,
1461 			  prop_set, partitioned_var_uses, record_field_map,
1462 			  blk_offset_map, writes_gang_private);
1463   if (par->next)
1464     neuter_worker_single (par->next, outer_mask, worker_single, vector_single,
1465 			  prop_set, partitioned_var_uses, record_field_map,
1466 			  blk_offset_map, writes_gang_private);
1467 }
1468 
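/* Depth-first walk of the CFG starting at BB.  For every block reached that
   has a broadcast record attached (its aux field is non-null), set its index
   in REACHABLE and do not descend through it; blocks without a record are
   walked through.  Uses and sets the BB_VISITED flag.  */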
1469 static void
1470 dfs_broadcast_reachable_1 (basic_block bb, sbitmap reachable)
1471 {
1472   if (bb->flags & BB_VISITED)
1473     return;
1474 
1475   bb->flags |= BB_VISITED;
1476 
1477   if (bb->succs)
1478     {
1479       edge e;
1480       edge_iterator ei;
1481       FOR_EACH_EDGE (e, ei, bb->succs)
1482 	{
1483 	  basic_block dest = e->dest;
1484 	  if (dest->aux)
1485 	    bitmap_set_bit (reachable, dest->index);
1486 	  else
1487 	    dfs_broadcast_reachable_1 (dest, reachable);
1488 	}
1489     }
1490 }
1491 
1492 typedef std::pair<int, tree> idx_decl_pair_t;
1493 
1494 typedef auto_vec<splay_tree> used_range_vec_t;
1495 
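/* qsort comparator for (block index, record type) pairs, ordering them by
   decreasing size of the record type.  */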
1496 static int
1497 sort_size_descending (const void *a, const void *b)
1498 {
1499   const idx_decl_pair_t *pa = (const idx_decl_pair_t *) a;
1500   const idx_decl_pair_t *pb = (const idx_decl_pair_t *) b;
1501   unsigned HOST_WIDE_INT asize = tree_to_uhwi (TYPE_SIZE_UNIT (pa->second));
1502   unsigned HOST_WIDE_INT bsize = tree_to_uhwi (TYPE_SIZE_UNIT (pb->second));
1503   return bsize - asize;
1504 }
1505 
1506 class addr_range
1507 {
1508 public:
1509   addr_range (unsigned HOST_WIDE_INT addr_lo, unsigned HOST_WIDE_INT addr_hi)
1510     : lo (addr_lo), hi (addr_hi)
1511     { }
1512   addr_range (const addr_range &ar) : lo (ar.lo), hi (ar.hi)
1513     { }
1514   addr_range () : lo (0), hi (0)
1515     { }
1516 
1517   bool invalid () { return lo == 0 && hi == 0; }
1518 
1519   unsigned HOST_WIDE_INT lo;
1520   unsigned HOST_WIDE_INT hi;
1521 };
1522 
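/* Comparison function for the splay trees of address ranges: two ranges
   compare equal if they overlap, otherwise they are ordered by address.  */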
1523 static int
1524 splay_tree_compare_addr_range (splay_tree_key a, splay_tree_key b)
1525 {
1526   addr_range *ar = (addr_range *) a;
1527   addr_range *br = (addr_range *) b;
1528   if (ar->lo == br->lo && ar->hi == br->hi)
1529     return 0;
1530   if (ar->hi <= br->lo)
1531     return -1;
1532   else if (ar->lo >= br->hi)
1533     return 1;
1534   return 0;
1535 }
1536 
1537 static void
1538 splay_tree_free_key (splay_tree_key k)
1539 {
1540   addr_range *ar = (addr_range *) k;
1541   delete ar;
1542 }
1543 
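/* First-fit allocator over the splay tree S of already-used address ranges:
   find the lowest gap (within BOUNDS) that can hold SIZE bytes aligned to
   ALIGN and return it as an addr_range, or return an invalid (empty) range
   if nothing fits.  */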
1544 static addr_range
1545 first_fit_range (splay_tree s, unsigned HOST_WIDE_INT size,
1546 		 unsigned HOST_WIDE_INT align, addr_range *bounds)
1547 {
1548   splay_tree_node min = splay_tree_min (s);
1549   if (min)
1550     {
1551       splay_tree_node next;
1552       while ((next = splay_tree_successor (s, min->key)))
1553 	{
1554 	  unsigned HOST_WIDE_INT lo = ((addr_range *) min->key)->hi;
1555 	  unsigned HOST_WIDE_INT hi = ((addr_range *) next->key)->lo;
1556 	  unsigned HOST_WIDE_INT base = (lo + align - 1) & ~(align - 1);
1557 	  if (base + size <= hi)
1558 	    return addr_range (base, base + size);
1559 	  min = next;
1560 	}
1561 
1562       unsigned HOST_WIDE_INT base = ((addr_range *)min->key)->hi;
1563       base = (base + align - 1) & ~(align - 1);
1564       if (base + size <= bounds->hi)
1565 	return addr_range (base, base + size);
1566       else
1567 	return addr_range ();
1568     }
1569   else
1570     {
1571       unsigned HOST_WIDE_INT lo = bounds->lo;
1572       lo = (lo + align - 1) & ~(align - 1);
1573       if (lo + size <= bounds->hi)
1574 	return addr_range (lo, lo + size);
1575       else
1576 	return addr_range ();
1577     }
1578 }
1579 
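/* splay_tree_foreach callback: insert the address range keyed at node N into
   the accumulator tree passed in PTR, unioning it with any overlapping range
   already present.  Used by merge_ranges below.  */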
1580 static int
1581 merge_ranges_1 (splay_tree_node n, void *ptr)
1582 {
1583   splay_tree accum = (splay_tree) ptr;
1584   addr_range ar = *(addr_range *) n->key;
1585 
1586   splay_tree_node old = splay_tree_lookup (accum, n->key);
1587 
1588   /* We might have an overlap.  Create a new range covering both
1589      ranges.  */
1590   if (old)
1591     {
1592       addr_range *old_ar = (addr_range *) old->key;
1593       ar.lo = MIN (old_ar->lo, ar.lo);
1594       ar.hi = MAX (old_ar->hi, ar.hi);
1595       splay_tree_remove (accum, old->key);
1596     }
1597 
1598   addr_range *new_ar = new addr_range (ar);
1599 
1600   splay_tree_insert (accum, (splay_tree_key) new_ar, n->value);
1601 
1602   return 0;
1603 }
1604 
1605 static void
1606 merge_ranges (splay_tree accum, splay_tree sp)
1607 {
1608   splay_tree_foreach (sp, merge_ranges_1, (void *) accum);
1609 }
1610 
1611 static void
1612 oacc_do_neutering (unsigned HOST_WIDE_INT bounds_lo,
1613 		   unsigned HOST_WIDE_INT bounds_hi)
1614 {
1615   bb_stmt_map_t bb_stmt_map;
1616   auto_bitmap worker_single, vector_single;
1617 
1618   omp_sese_split_blocks (&bb_stmt_map);
1619 
1620   if (dump_file)
1621     {
1622       fprintf (dump_file, "\n\nAfter splitting:\n\n");
1623       dump_function_to_file (current_function_decl, dump_file, dump_flags);
1624     }
1625 
1626   unsigned mask = 0;
1627 
1628   /* If this is a routine, calculate MASK as if the outer levels are already
1629      partitioned.  */
1630   {
1631     tree attr = oacc_get_fn_attrib (current_function_decl);
1632     tree dims = TREE_VALUE (attr);
1633     unsigned ix;
1634     for (ix = 0; ix != GOMP_DIM_MAX; ix++, dims = TREE_CHAIN (dims))
1635       {
1636 	tree allowed = TREE_PURPOSE (dims);
1637 	if (allowed && integer_zerop (allowed))
1638 	  mask |= GOMP_DIM_MASK (ix);
1639       }
1640   }
1641 
1642   parallel_g *par = omp_sese_discover_pars (&bb_stmt_map);
1643   populate_single_mode_bitmaps (par, worker_single, vector_single, mask, 0);
1644 
1645   basic_block bb;
1646   FOR_ALL_BB_FN (bb, cfun)
1647     bb->aux = NULL;
1648 
  vec<propagation_set *> prop_set (vNULL);
  prop_set.safe_grow_cleared (last_basic_block_for_fn (cfun), true);

  find_ssa_names_to_propagate (par, mask, worker_single, vector_single,
			       &prop_set);

  hash_set<tree> partitioned_var_uses;
  hash_set<tree> gang_private_vars;
  auto_bitmap writes_gang_private;

  find_gang_private_vars (&gang_private_vars);
  find_partitioned_var_uses (par, mask, &partitioned_var_uses);
  find_local_vars_to_propagate (par, mask, &partitioned_var_uses,
				&gang_private_vars, writes_gang_private,
				&prop_set);

  record_field_map_t record_field_map;

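  /* For each block that has values to propagate, build a RECORD_TYPE with
     one field per propagated variable; an instance of this record is what
     will be staged through shared memory for the block.  */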
  FOR_ALL_BB_FN (bb, cfun)
    {
      propagation_set *ws_prop = prop_set[bb->index];
      if (ws_prop)
	{
	  tree record_type = lang_hooks.types.make_type (RECORD_TYPE);
	  tree name = create_tmp_var_name (".oacc_ws_data_s");
	  name = build_decl (UNKNOWN_LOCATION, TYPE_DECL, name, record_type);
	  DECL_ARTIFICIAL (name) = 1;
	  DECL_NAMELESS (name) = 1;
	  TYPE_NAME (record_type) = name;
	  TYPE_ARTIFICIAL (record_type) = 1;

	  auto_vec<tree> field_vec (ws_prop->elements ());
	  for (hash_set<tree>::iterator it = ws_prop->begin ();
	       it != ws_prop->end (); ++it)
	    field_vec.quick_push (*it);

	  field_vec.qsort (sort_by_size_then_ssa_version_or_uid);

	  field_map_t *fields = new field_map_t;

	  bool existed;
	  existed = record_field_map.put (record_type, fields);
	  gcc_checking_assert (!existed);

	  /* Insert var fields in reverse order, so the last inserted element
	     is the first in the structure.  */
	  for (int i = field_vec.length () - 1; i >= 0; i--)
	    install_var_field (field_vec[i], record_type, fields);

	  layout_type (record_type);

	  bb->aux = (tree) record_type;
	}
    }

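  /* Compute, for every block with a broadcast record, the set of blocks
     reachable from it, then fold in the inverse relation so REACHABLE[I]
     also contains the blocks from which block I can be reached.  The
     allocation loop below avoids placing the buffers of such related blocks
     in overlapping shared-memory ranges.  */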
  sbitmap *reachable
    = sbitmap_vector_alloc (last_basic_block_for_fn (cfun),
			    last_basic_block_for_fn (cfun));

  bitmap_vector_clear (reachable, last_basic_block_for_fn (cfun));

  auto_vec<std::pair<int, tree> > priority;

  FOR_ALL_BB_FN (bb, cfun)
    {
      if (bb->aux)
	{
	  tree record_type = (tree) bb->aux;

	  basic_block bb2;
	  FOR_ALL_BB_FN (bb2, cfun)
	    bb2->flags &= ~BB_VISITED;

	  priority.safe_push (std::make_pair (bb->index, record_type));
	  dfs_broadcast_reachable_1 (bb, reachable[bb->index]);
	}
    }

  sbitmap *inverted
    = sbitmap_vector_alloc (last_basic_block_for_fn (cfun),
			    last_basic_block_for_fn (cfun));

  bitmap_vector_clear (inverted, last_basic_block_for_fn (cfun));

  for (int i = 0; i < last_basic_block_for_fn (cfun); i++)
    {
      sbitmap_iterator bi;
      unsigned int j;
      EXECUTE_IF_SET_IN_BITMAP (reachable[i], 0, j, bi)
	bitmap_set_bit (inverted[j], i);
    }

  for (int i = 0; i < last_basic_block_for_fn (cfun); i++)
    bitmap_ior (reachable[i], reachable[i], inverted[i]);

  sbitmap_vector_free (inverted);

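  /* Process blocks in order of decreasing record size, assigning each one a
     first-fit shared-memory range that does not conflict with the ranges
     already assigned to related blocks.  If no range fits, fall back to the
     start of the region and diagnose an overflow if even that does not
     fit.  */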
  used_range_vec_t used_ranges;

  used_ranges.safe_grow_cleared (last_basic_block_for_fn (cfun));

  blk_offset_map_t blk_offset_map;

  addr_range worker_shm_bounds (bounds_lo, bounds_hi);

  priority.qsort (sort_size_descending);
  for (unsigned int i = 0; i < priority.length (); i++)
    {
      idx_decl_pair_t p = priority[i];
      int blkno = p.first;
      tree record_type = p.second;
      HOST_WIDE_INT size = tree_to_uhwi (TYPE_SIZE_UNIT (record_type));
      HOST_WIDE_INT align = TYPE_ALIGN_UNIT (record_type);

      splay_tree conflicts = splay_tree_new (splay_tree_compare_addr_range,
					     splay_tree_free_key, NULL);

      if (!used_ranges[blkno])
	used_ranges[blkno] = splay_tree_new (splay_tree_compare_addr_range,
					     splay_tree_free_key, NULL);
      else
	merge_ranges (conflicts, used_ranges[blkno]);

      sbitmap_iterator bi;
      unsigned int j;
      EXECUTE_IF_SET_IN_BITMAP (reachable[blkno], 0, j, bi)
	if (used_ranges[j])
	  merge_ranges (conflicts, used_ranges[j]);

      addr_range ar
	= first_fit_range (conflicts, size, align, &worker_shm_bounds);

      splay_tree_delete (conflicts);

      if (ar.invalid ())
	{
	  unsigned HOST_WIDE_INT base
	    = (bounds_lo + align - 1) & ~(align - 1);
	  if (base + size > bounds_hi)
	    error_at (UNKNOWN_LOCATION, "shared-memory region overflow");
	  std::pair<unsigned HOST_WIDE_INT, bool> base_inrng
	    = std::make_pair (base, false);
	  blk_offset_map.put (BASIC_BLOCK_FOR_FN (cfun, blkno), base_inrng);
	}
      else
	{
	  splay_tree_node old = splay_tree_lookup (used_ranges[blkno],
						   (splay_tree_key) &ar);
	  if (old)
	    {
	      fprintf (stderr, "trying to map [%d..%d] but [%d..%d] is "
		       "already mapped in block %d\n", (int) ar.lo,
		       (int) ar.hi, (int) ((addr_range *) old->key)->lo,
		       (int) ((addr_range *) old->key)->hi, blkno);
	      abort ();
	    }

	  addr_range *arp = new addr_range (ar);
	  splay_tree_insert (used_ranges[blkno], (splay_tree_key) arp,
			     (splay_tree_value) blkno);
	  std::pair<unsigned HOST_WIDE_INT, bool> base_inrng
	    = std::make_pair (ar.lo, true);
	  blk_offset_map.put (BASIC_BLOCK_FOR_FN (cfun, blkno), base_inrng);
	}
    }

  sbitmap_vector_free (reachable);

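  /* Now neuter worker-single code, broadcasting the values that partitioned
     code needs via the shared-memory offsets assigned above.  */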
  neuter_worker_single (par, mask, worker_single, vector_single, &prop_set,
			&partitioned_var_uses, &record_field_map,
			&blk_offset_map, writes_gang_private);

  for (auto it : record_field_map)
    delete it.second;
  record_field_map.empty ();

  /* These are supposed to have been 'delete'd by 'neuter_worker_single'.  */
  for (auto it : prop_set)
    gcc_checking_assert (!it);
  prop_set.release ();

  delete par;

  /* This doesn't seem to make a difference.  */
  loops_state_clear (LOOP_CLOSED_SSA);

  /* Neutering the worker-single blocks will invalidate dominance info.
     It may be possible to incrementally update just the affected blocks, but
     obliterate everything for now.  */
  free_dominance_info (CDI_DOMINATORS);
  free_dominance_info (CDI_POST_DOMINATORS);

  if (dump_file)
    {
      fprintf (dump_file, "\n\nAfter neutering:\n\n");
      dump_function_to_file (current_function_decl, dump_file, dump_flags);
    }
}

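/* Main entry point for the pass.  Compute the per-level shared-memory
   requirements of reduction variables and gang-private variables, ask the
   target where the usable shared-memory region for broadcasting lies, and
   perform worker neutering/broadcasting unless the function is known to use
   only a single worker.  */
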
static int
execute_omp_oacc_neuter_broadcast ()
{
  unsigned HOST_WIDE_INT reduction_size[GOMP_DIM_MAX];
  unsigned HOST_WIDE_INT private_size[GOMP_DIM_MAX];

  for (unsigned i = 0; i < GOMP_DIM_MAX; i++)
    {
      reduction_size[i] = 0;
      private_size[i] = 0;
    }

  /* Calculate shared memory size required for reduction variables and
     gang-private memory for this offloaded function.  */
  basic_block bb;
  FOR_ALL_BB_FN (bb, cfun)
    {
      for (gimple_stmt_iterator gsi = gsi_start_bb (bb);
	   !gsi_end_p (gsi);
	   gsi_next (&gsi))
	{
	  gimple *stmt = gsi_stmt (gsi);
	  if (!is_gimple_call (stmt))
	    continue;
	  gcall *call = as_a <gcall *> (stmt);
	  if (!gimple_call_internal_p (call))
	    continue;
	  enum internal_fn ifn_code = gimple_call_internal_fn (call);
	  switch (ifn_code)
	    {
	    default: break;
	    case IFN_GOACC_REDUCTION:
	      if (integer_minus_onep (gimple_call_arg (call, 3)))
		continue;
	      else
		{
		  unsigned code = TREE_INT_CST_LOW (gimple_call_arg (call, 0));
		  /* Only count reduction variables once: the choice to pick
		     the setup call is fairly arbitrary.  */
		  if (code == IFN_GOACC_REDUCTION_SETUP)
		    {
		      int level = TREE_INT_CST_LOW (gimple_call_arg (call, 3));
		      tree var = gimple_call_arg (call, 2);
		      tree offset = gimple_call_arg (call, 5);
		      tree var_type = TREE_TYPE (var);
		      unsigned HOST_WIDE_INT limit
			= (tree_to_uhwi (offset)
			   + tree_to_uhwi (TYPE_SIZE_UNIT (var_type)));
		      reduction_size[level]
			= MAX (reduction_size[level], limit);
		    }
		}
	      break;
	    case IFN_UNIQUE:
	      {
		enum ifn_unique_kind kind
		  = ((enum ifn_unique_kind)
		     TREE_INT_CST_LOW (gimple_call_arg (call, 0)));

		if (kind == IFN_UNIQUE_OACC_PRIVATE)
		  {
		    HOST_WIDE_INT level
		      = TREE_INT_CST_LOW (gimple_call_arg (call, 2));
		    if (level == -1)
		      break;
		    for (unsigned i = 3;
			 i < gimple_call_num_args (call);
			 i++)
		      {
			tree arg = gimple_call_arg (call, i);
			gcc_assert (TREE_CODE (arg) == ADDR_EXPR);
			tree decl = TREE_OPERAND (arg, 0);
			unsigned HOST_WIDE_INT align = DECL_ALIGN_UNIT (decl);
			private_size[level] = ((private_size[level] + align - 1)
					       & ~(align - 1));
			unsigned HOST_WIDE_INT decl_size
			  = tree_to_uhwi (TYPE_SIZE_UNIT (TREE_TYPE (decl)));
			private_size[level] += decl_size;
		      }
		  }
	      }
	      break;
	    }
	}
    }

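  /* Fetch the launch dimensions of this offloaded function for each
     partitioning level.  */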
  int dims[GOMP_DIM_MAX];
  for (unsigned i = 0; i < GOMP_DIM_MAX; i++)
    dims[i] = oacc_get_fn_dim_size (current_function_decl, i);

  /* Find bounds of shared-memory buffer space we can use.  */
  unsigned HOST_WIDE_INT bounds_lo = 0, bounds_hi = 0;
  if (targetm.goacc.shared_mem_layout)
    targetm.goacc.shared_mem_layout (&bounds_lo, &bounds_hi, dims,
				     private_size, reduction_size);

  /* Perform worker partitioning unless we know 'num_workers(1)'.  */
  if (dims[GOMP_DIM_WORKER] != 1)
    oacc_do_neutering (bounds_lo, bounds_hi);

  return 0;
}

namespace {

const pass_data pass_data_omp_oacc_neuter_broadcast =
{
  GIMPLE_PASS, /* type */
  "omp_oacc_neuter_broadcast", /* name */
  OPTGROUP_OMP, /* optinfo_flags */
  TV_NONE, /* tv_id */
  PROP_cfg, /* properties_required */
  0, /* properties_provided */
  0, /* properties_destroyed */
  0, /* todo_flags_start */
  TODO_update_ssa | TODO_cleanup_cfg, /* todo_flags_finish */
};

class pass_omp_oacc_neuter_broadcast : public gimple_opt_pass
{
public:
  pass_omp_oacc_neuter_broadcast (gcc::context *ctxt)
    : gimple_opt_pass (pass_data_omp_oacc_neuter_broadcast, ctxt)
  {}

  /* opt_pass methods: */
  virtual bool gate (function *fun)
  {
    if (!flag_openacc)
      return false;

    if (!targetm.goacc.create_worker_broadcast_record)
      return false;

    /* Only relevant for OpenACC offloaded functions.  */
    tree attr = oacc_get_fn_attrib (fun->decl);
    if (!attr)
      return false;

    return true;
  }

  virtual unsigned int execute (function *)
    {
      return execute_omp_oacc_neuter_broadcast ();
    }

}; // class pass_omp_oacc_neuter_broadcast

} // anon namespace

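/* Create an instance of the omp_oacc_neuter_broadcast pass.  */
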
gimple_opt_pass *
make_pass_omp_oacc_neuter_broadcast (gcc::context *ctxt)
{
  return new pass_omp_oacc_neuter_broadcast (ctxt);
}