1 /* OpenACC worker partitioning via middle end neutering/broadcasting scheme
2
3 Copyright (C) 2015-2021 Free Software Foundation, Inc.
4
5 This file is part of GCC.
6
7 GCC is free software; you can redistribute it and/or modify it
8 under the terms of the GNU General Public License as published
9 by the Free Software Foundation; either version 3, or (at your
10 option) any later version.
11
12 GCC is distributed in the hope that it will be useful, but WITHOUT
13 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
14 or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public
15 License for more details.
16
17 You should have received a copy of the GNU General Public License
18 along with GCC; see the file COPYING3. If not see
19 <http://www.gnu.org/licenses/>. */
20
21 #include "config.h"
22 #include "system.h"
23 #include "coretypes.h"
24 #include "backend.h"
25 #include "rtl.h"
26 #include "tree.h"
27 #include "gimple.h"
28 #include "tree-pass.h"
29 #include "ssa.h"
30 #include "cgraph.h"
31 #include "pretty-print.h"
32 #include "fold-const.h"
33 #include "gimplify.h"
34 #include "gimple-iterator.h"
35 #include "gimple-walk.h"
36 #include "tree-inline.h"
37 #include "langhooks.h"
38 #include "omp-general.h"
39 #include "omp-low.h"
40 #include "gimple-pretty-print.h"
41 #include "cfghooks.h"
42 #include "insn-config.h"
43 #include "recog.h"
44 #include "internal-fn.h"
45 #include "bitmap.h"
46 #include "tree-nested.h"
47 #include "stor-layout.h"
48 #include "tree-ssa-threadupdate.h"
49 #include "tree-into-ssa.h"
50 #include "splay-tree.h"
51 #include "target.h"
52 #include "cfgloop.h"
53 #include "tree-cfg.h"
54 #include "omp-offload.h"
55 #include "attribs.h"
56 #include "targhooks.h"
57 #include "diagnostic-core.h"
58
59 /* Loop structure of the function. The entire function is described as
60 a NULL loop. */
61 /* Adapted from 'gcc/config/nvptx/nvptx.c:struct parallel'. */
62
63 struct parallel_g
64 {
65 /* Parent parallel. */
66 parallel_g *parent;
67
68 /* Next sibling parallel. */
69 parallel_g *next;
70
71 /* First child parallel. */
72 parallel_g *inner;
73
74 /* Partitioning mask of the parallel. */
75 unsigned mask;
76
77 /* Partitioning used within inner parallels. */
78 unsigned inner_mask;
79
80 /* Location of parallel forked and join. The forked is the first
81 block in the parallel and the join is the first block after
82 the partition. */
83 basic_block forked_block;
84 basic_block join_block;
85
86 gimple *forked_stmt;
87 gimple *join_stmt;
88
89 gimple *fork_stmt;
90 gimple *joining_stmt;
91
92 /* Basic blocks in this parallel, but not in child parallels. The
93 FORKED and JOINING blocks are in the partition. The FORK and JOIN
94 blocks are not. */
95 auto_vec<basic_block> blocks;
96
97 tree record_type;
98 tree sender_decl;
99 tree receiver_decl;
100
101 public:
102 parallel_g (parallel_g *parent, unsigned mode);
103 ~parallel_g ();
104 };
105
106 /* Constructor links the new parallel into its parent's chain of
107 children. */
108
109 parallel_g::parallel_g (parallel_g *parent_, unsigned mask_)
110 :parent (parent_), next (0), inner (0), mask (mask_), inner_mask (0)
111 {
112 forked_block = join_block = 0;
113 forked_stmt = join_stmt = NULL;
114 fork_stmt = joining_stmt = NULL;
115
116 record_type = NULL_TREE;
117 sender_decl = NULL_TREE;
118 receiver_decl = NULL_TREE;
119
120 if (parent)
121 {
122 next = parent->inner;
123 parent->inner = this;
124 }
125 }
126
127 parallel_g::~parallel_g ()
128 {
129 delete inner;
130 delete next;
131 }
132
133 static bool
134 local_var_based_p (tree decl)
135 {
136 switch (TREE_CODE (decl))
137 {
138 case VAR_DECL:
139 return !is_global_var (decl);
140
141 case COMPONENT_REF:
142 case BIT_FIELD_REF:
143 case ARRAY_REF:
144 return local_var_based_p (TREE_OPERAND (decl, 0));
145
146 default:
147 return false;
148 }
149 }
150
151 /* Map of basic blocks to gimple stmts. */
152 typedef hash_map<basic_block, gimple *> bb_stmt_map_t;
153
154 /* Calls to OpenACC routines are made by all workers/wavefronts/warps, since
155 the routine likely contains partitioned loops (otherwise it will do its own
156 neutering and variable propagation). Return TRUE if a function call CALL
157 should instead be made in (worker-)single mode, rather than redundant
158 mode. */
159
160 static bool
161 omp_sese_active_worker_call (gcall *call)
162 {
163 #define GOMP_DIM_SEQ GOMP_DIM_MAX
164 tree fndecl = gimple_call_fndecl (call);
165
166 if (!fndecl)
167 return true;
168
169 tree attrs = oacc_get_fn_attrib (fndecl);
170
171 if (!attrs)
172 return true;
173
174 int level = oacc_fn_attrib_level (attrs);
175
176 /* Neither regular functions nor "seq" routines should be run by all threads
177 in worker-single mode. */
178 return level == -1 || level == GOMP_DIM_SEQ;
179 #undef GOMP_DIM_SEQ
180 }
181
182 /* Split basic blocks such that each forked and join unspec is at the
183 start of its basic block. Thus afterwards each block will have a
184 single partitioning mode. We also do the same for return
185 statements, as they are executed by every thread. Populate MAP
186 with head and tail blocks. We also clear the BB visited flag,
187 which is used when finding partitions. */
189 /* Adapted from 'gcc/config/nvptx/nvptx.c:nvptx_split_blocks'. */
190
191 static void
192 omp_sese_split_blocks (bb_stmt_map_t *map)
193 {
194 auto_vec<gimple *> worklist;
195 basic_block block;
196
197 /* Locate all the reorg instructions of interest. */
198 FOR_ALL_BB_FN (block, cfun)
199 {
200 /* Clear visited flag, for use by parallel locator */
201 block->flags &= ~BB_VISITED;
202
203 for (gimple_stmt_iterator gsi = gsi_start_bb (block);
204 !gsi_end_p (gsi);
205 gsi_next (&gsi))
206 {
207 gimple *stmt = gsi_stmt (gsi);
208
209 if (gimple_call_internal_p (stmt, IFN_UNIQUE))
210 {
211 enum ifn_unique_kind k = ((enum ifn_unique_kind)
212 TREE_INT_CST_LOW (gimple_call_arg (stmt, 0)));
213
214 if (k == IFN_UNIQUE_OACC_JOIN)
215 worklist.safe_push (stmt);
216 else if (k == IFN_UNIQUE_OACC_FORK)
217 {
218 gcc_assert (gsi_one_before_end_p (gsi));
219 basic_block forked_block = single_succ (block);
220 gimple_stmt_iterator gsi2 = gsi_start_bb (forked_block);
221
222 /* We push a NOP as a placeholder for the "forked" stmt.
223 This is then recognized in omp_sese_find_par. */
224 gimple *nop = gimple_build_nop ();
225 gsi_insert_before (&gsi2, nop, GSI_SAME_STMT);
226
227 worklist.safe_push (nop);
228 }
229 }
230 else if (gimple_code (stmt) == GIMPLE_RETURN
231 || gimple_code (stmt) == GIMPLE_COND
232 || gimple_code (stmt) == GIMPLE_SWITCH
233 || (gimple_code (stmt) == GIMPLE_CALL
234 && !gimple_call_internal_p (stmt)
235 && !omp_sese_active_worker_call (as_a <gcall *> (stmt))))
236 worklist.safe_push (stmt);
237 else if (is_gimple_assign (stmt))
238 {
239 tree lhs = gimple_assign_lhs (stmt);
240
241 /* Force assignments to components/fields/elements of local
242 aggregates into fully-partitioned (redundant) mode. This
243 avoids having to broadcast the whole aggregate. The RHS of
244 the assignment will be propagated using the normal
245 mechanism. */
246
247 switch (TREE_CODE (lhs))
248 {
249 case COMPONENT_REF:
250 case BIT_FIELD_REF:
251 case ARRAY_REF:
252 {
253 tree aggr = TREE_OPERAND (lhs, 0);
254
255 if (local_var_based_p (aggr))
256 worklist.safe_push (stmt);
257 }
258 break;
259
260 default:
261 ;
262 }
263 }
264 }
265 }
266
267 /* Split blocks on the worklist. */
268 unsigned ix;
269 gimple *stmt;
270
271 for (ix = 0; worklist.iterate (ix, &stmt); ix++)
272 {
273 basic_block block = gimple_bb (stmt);
274
275 if (gimple_code (stmt) == GIMPLE_COND)
276 {
277 gcond *orig_cond = as_a <gcond *> (stmt);
278 tree_code code = gimple_expr_code (orig_cond);
279 tree pred = make_ssa_name (boolean_type_node);
280 gimple *asgn = gimple_build_assign (pred, code,
281 gimple_cond_lhs (orig_cond),
282 gimple_cond_rhs (orig_cond));
283 gcond *new_cond
284 = gimple_build_cond (NE_EXPR, pred, boolean_false_node,
285 gimple_cond_true_label (orig_cond),
286 gimple_cond_false_label (orig_cond));
287
288 gimple_stmt_iterator gsi = gsi_for_stmt (stmt);
289 gsi_insert_before (&gsi, asgn, GSI_SAME_STMT);
290 gsi_replace (&gsi, new_cond, true);
291
292 edge e = split_block (block, asgn);
293 block = e->dest;
294 map->get_or_insert (block) = new_cond;
295 }
296 else if ((gimple_code (stmt) == GIMPLE_CALL
297 && !gimple_call_internal_p (stmt))
298 || is_gimple_assign (stmt))
299 {
300 gimple_stmt_iterator gsi = gsi_for_stmt (stmt);
301 gsi_prev (&gsi);
302
303 edge call = split_block (block, gsi_stmt (gsi));
304
305 gimple *call_stmt = gsi_stmt (gsi_start_bb (call->dest));
306
307 edge call_to_ret = split_block (call->dest, call_stmt);
308
309 map->get_or_insert (call_to_ret->src) = call_stmt;
310 }
311 else
312 {
313 gimple_stmt_iterator gsi = gsi_for_stmt (stmt);
314 gsi_prev (&gsi);
315
316 if (gsi_end_p (gsi))
317 map->get_or_insert (block) = stmt;
318 else
319 {
320 /* Split block before insn. The insn is in the new block. */
321 edge e = split_block (block, gsi_stmt (gsi));
322
323 block = e->dest;
324 map->get_or_insert (block) = stmt;
325 }
326 }
327 }
328 }
329
330 static const char *
331 mask_name (unsigned mask)
332 {
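/* MASK is a bitset of GOMP_DIM_MASK (GOMP_DIM_{GANG,WORKER,VECTOR}) bits,
i.e. 1, 2 and 4 respectively, hence the eight combinations below. */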
333 switch (mask)
334 {
335 case 0: return "gang redundant";
336 case 1: return "gang partitioned";
337 case 2: return "worker partitioned";
338 case 3: return "gang+worker partitioned";
339 case 4: return "vector partitioned";
340 case 5: return "gang+vector partitioned";
341 case 6: return "worker+vector partitioned";
342 case 7: return "fully partitioned";
343 default: return "<illegal>";
344 }
345 }
346
347 /* Dump this parallel and all its inner parallels. */
348 /* Adapted from 'gcc/config/nvptx/nvptx.c:nvptx_dump_pars'. */
349
350 static void
351 omp_sese_dump_pars (parallel_g *par, unsigned depth)
352 {
353 fprintf (dump_file, "%u: mask %d (%s) head=%d, tail=%d\n",
354 depth, par->mask, mask_name (par->mask),
355 par->forked_block ? par->forked_block->index : -1,
356 par->join_block ? par->join_block->index : -1);
357
358 fprintf (dump_file, " blocks:");
359
360 basic_block block;
361 for (unsigned ix = 0; par->blocks.iterate (ix, &block); ix++)
362 fprintf (dump_file, " %d", block->index);
363 fprintf (dump_file, "\n");
364 if (par->inner)
365 omp_sese_dump_pars (par->inner, depth + 1);
366
367 if (par->next)
368 omp_sese_dump_pars (par->next, depth);
369 }
370
371 /* If BLOCK contains a fork/join marker, process it to create or
372 terminate a loop structure. Add this block to the current loop,
373 and then walk successor blocks. */
374 /* Adapted from 'gcc/config/nvptx/nvptx.c:nvptx_find_par'. */
375
376 static parallel_g *
377 omp_sese_find_par (bb_stmt_map_t *map, parallel_g *par, basic_block block)
378 {
379 if (block->flags & BB_VISITED)
380 return par;
381 block->flags |= BB_VISITED;
382
383 if (gimple **stmtp = map->get (block))
384 {
385 gimple *stmt = *stmtp;
386
387 if (gimple_code (stmt) == GIMPLE_COND
388 || gimple_code (stmt) == GIMPLE_SWITCH
389 || gimple_code (stmt) == GIMPLE_RETURN
390 || (gimple_code (stmt) == GIMPLE_CALL
391 && !gimple_call_internal_p (stmt))
392 || is_gimple_assign (stmt))
393 {
394 /* A single block that is forced to be at the maximum partition
395 level. Make a singleton par for it. */
396 par = new parallel_g (par, GOMP_DIM_MASK (GOMP_DIM_GANG)
397 | GOMP_DIM_MASK (GOMP_DIM_WORKER)
398 | GOMP_DIM_MASK (GOMP_DIM_VECTOR));
399 par->forked_block = block;
400 par->forked_stmt = stmt;
401 par->blocks.safe_push (block);
402 par = par->parent;
403 goto walk_successors;
404 }
405 else if (gimple_nop_p (stmt))
406 {
407 basic_block pred = single_pred (block);
408 gcc_assert (pred);
409 gimple_stmt_iterator gsi = gsi_last_bb (pred);
410 gimple *final_stmt = gsi_stmt (gsi);
411
412 if (gimple_call_internal_p (final_stmt, IFN_UNIQUE))
413 {
414 gcall *call = as_a <gcall *> (final_stmt);
415 enum ifn_unique_kind k = ((enum ifn_unique_kind)
416 TREE_INT_CST_LOW (gimple_call_arg (call, 0)));
417
418 if (k == IFN_UNIQUE_OACC_FORK)
419 {
420 HOST_WIDE_INT dim
421 = TREE_INT_CST_LOW (gimple_call_arg (call, 2));
422 unsigned mask = (dim >= 0) ? GOMP_DIM_MASK (dim) : 0;
423
424 par = new parallel_g (par, mask);
425 par->forked_block = block;
426 par->forked_stmt = final_stmt;
427 par->fork_stmt = stmt;
428 }
429 else
430 gcc_unreachable ();
431 }
432 else
433 gcc_unreachable ();
434 }
435 else if (gimple_call_internal_p (stmt, IFN_UNIQUE))
436 {
437 gcall *call = as_a <gcall *> (stmt);
438 enum ifn_unique_kind k = ((enum ifn_unique_kind)
439 TREE_INT_CST_LOW (gimple_call_arg (call, 0)));
440 if (k == IFN_UNIQUE_OACC_JOIN)
441 {
442 HOST_WIDE_INT dim = TREE_INT_CST_LOW (gimple_call_arg (stmt, 2));
443 unsigned mask = (dim >= 0) ? GOMP_DIM_MASK (dim) : 0;
444
445 gcc_assert (par->mask == mask);
446 par->join_block = block;
447 par->join_stmt = stmt;
448 par = par->parent;
449 }
450 else
451 gcc_unreachable ();
452 }
453 else
454 gcc_unreachable ();
455 }
456
457 if (par)
458 /* Add this block onto the current loop's list of blocks. */
459 par->blocks.safe_push (block);
460 else
461 /* This must be the entry block. Create a NULL parallel. */
462 par = new parallel_g (0, 0);
463
464 walk_successors:
465 /* Walk successor blocks. */
466 edge e;
467 edge_iterator ei;
468
469 FOR_EACH_EDGE (e, ei, block->succs)
470 omp_sese_find_par (map, par, e->dest);
471
472 return par;
473 }
474
475 /* DFS walk the CFG looking for fork & join markers. Construct
476 loop structures as we go. MAP is a mapping of basic blocks
477 to head & tail markers, discovered when splitting blocks. This
478 speeds up the discovery. We rely on the BB visited flag having
479 been cleared when splitting blocks. */
480 /* Adapted from 'gcc/config/nvptx/nvptx.c:nvptx_discover_pars'. */
481
482 static parallel_g *
483 omp_sese_discover_pars (bb_stmt_map_t *map)
484 {
485 basic_block block;
486
487 /* Mark exit blocks as visited. */
488 block = EXIT_BLOCK_PTR_FOR_FN (cfun);
489 block->flags |= BB_VISITED;
490
491 /* And entry block as not. */
492 block = ENTRY_BLOCK_PTR_FOR_FN (cfun);
493 block->flags &= ~BB_VISITED;
494
495 parallel_g *par = omp_sese_find_par (map, 0, block);
496
497 if (dump_file)
498 {
499 fprintf (dump_file, "\nLoops\n");
500 omp_sese_dump_pars (par, 0);
501 fprintf (dump_file, "\n");
502 }
503
504 return par;
505 }
506
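/* Walk the parallel structure PAR and, for every block that is not
worker-partitioned (resp. not vector-partitioned) at its nesting level,
set the corresponding bit in WORKER_SINGLE (resp. VECTOR_SINGLE). */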
507 static void
508 populate_single_mode_bitmaps (parallel_g *par, bitmap worker_single,
509 bitmap vector_single, unsigned outer_mask,
510 int depth)
511 {
512 unsigned mask = outer_mask | par->mask;
513
514 basic_block block;
515
516 for (unsigned i = 0; par->blocks.iterate (i, &block); i++)
517 {
518 if ((mask & GOMP_DIM_MASK (GOMP_DIM_WORKER)) == 0)
519 bitmap_set_bit (worker_single, block->index);
520
521 if ((mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR)) == 0)
522 bitmap_set_bit (vector_single, block->index);
523 }
524
525 if (par->inner)
526 populate_single_mode_bitmaps (par->inner, worker_single, vector_single,
527 mask, depth + 1);
528 if (par->next)
529 populate_single_mode_bitmaps (par->next, worker_single, vector_single,
530 outer_mask, depth);
531 }
532
533 /* A map from SSA names or var decls to record fields. */
534
535 typedef hash_map<tree, tree> field_map_t;
536
537 /* For each propagation record type, this is a map from SSA names or var decls
538 to propagate, to the field in the record type that should be used for
539 transmission and reception. */
540
541 typedef hash_map<tree, field_map_t *> record_field_map_t;
542
543 static void
544 install_var_field (tree var, tree record_type, field_map_t *fields)
545 {
546 tree name;
547 char tmp[20];
548
549 if (TREE_CODE (var) == SSA_NAME)
550 {
551 name = SSA_NAME_IDENTIFIER (var);
552 if (!name)
553 {
554 sprintf (tmp, "_%u", (unsigned) SSA_NAME_VERSION (var));
555 name = get_identifier (tmp);
556 }
557 }
558 else if (TREE_CODE (var) == VAR_DECL)
559 {
560 name = DECL_NAME (var);
561 if (!name)
562 {
563 sprintf (tmp, "D_%u", (unsigned) DECL_UID (var));
564 name = get_identifier (tmp);
565 }
566 }
567 else
568 gcc_unreachable ();
569
570 gcc_assert (!fields->get (var));
571
572 tree type = TREE_TYPE (var);
573
574 if (POINTER_TYPE_P (type)
575 && TYPE_RESTRICT (type))
576 type = build_qualified_type (type, TYPE_QUALS (type) & ~TYPE_QUAL_RESTRICT);
577
578 tree field = build_decl (BUILTINS_LOCATION, FIELD_DECL, name, type);
579
580 if (TREE_CODE (var) == VAR_DECL && type == TREE_TYPE (var))
581 {
582 SET_DECL_ALIGN (field, DECL_ALIGN (var));
583 DECL_USER_ALIGN (field) = DECL_USER_ALIGN (var);
584 TREE_THIS_VOLATILE (field) = TREE_THIS_VOLATILE (var);
585 }
586 else
587 SET_DECL_ALIGN (field, TYPE_ALIGN (type));
588
589 fields->put (var, field);
590
591 insert_field_into_struct (record_type, field);
592 }
593
594 /* Sets of SSA_NAMES or VAR_DECLs to propagate. */
595 typedef hash_set<tree> propagation_set;
596
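/* Walk the parallel structure PAR looking at blocks that execute in
worker-partitioned mode. Any SSA name used there (directly or as a PHI
argument) whose definition lives in a worker-single block must be
broadcast, so record it in the defining block's entry of PROP_SET. */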
597 static void
598 find_ssa_names_to_propagate (parallel_g *par, unsigned outer_mask,
599 bitmap worker_single, bitmap vector_single,
600 vec<propagation_set *> *prop_set)
601 {
602 unsigned mask = outer_mask | par->mask;
603
604 if (par->inner)
605 find_ssa_names_to_propagate (par->inner, mask, worker_single,
606 vector_single, prop_set);
607 if (par->next)
608 find_ssa_names_to_propagate (par->next, outer_mask, worker_single,
609 vector_single, prop_set);
610
611 if (mask & GOMP_DIM_MASK (GOMP_DIM_WORKER))
612 {
613 basic_block block;
614 int ix;
615
616 for (ix = 0; par->blocks.iterate (ix, &block); ix++)
617 {
618 for (gphi_iterator psi = gsi_start_phis (block);
619 !gsi_end_p (psi); gsi_next (&psi))
620 {
621 gphi *phi = psi.phi ();
622 use_operand_p use;
623 ssa_op_iter iter;
624
625 FOR_EACH_PHI_ARG (use, phi, iter, SSA_OP_USE)
626 {
627 tree var = USE_FROM_PTR (use);
628
629 if (TREE_CODE (var) != SSA_NAME)
630 continue;
631
632 gimple *def_stmt = SSA_NAME_DEF_STMT (var);
633
634 if (gimple_nop_p (def_stmt))
635 continue;
636
637 basic_block def_bb = gimple_bb (def_stmt);
638
639 if (bitmap_bit_p (worker_single, def_bb->index))
640 {
641 if (!(*prop_set)[def_bb->index])
642 (*prop_set)[def_bb->index] = new propagation_set;
643
644 propagation_set *ws_prop = (*prop_set)[def_bb->index];
645
646 ws_prop->add (var);
647 }
648 }
649 }
650
651 for (gimple_stmt_iterator gsi = gsi_start_bb (block);
652 !gsi_end_p (gsi); gsi_next (&gsi))
653 {
654 use_operand_p use;
655 ssa_op_iter iter;
656 gimple *stmt = gsi_stmt (gsi);
657
658 FOR_EACH_SSA_USE_OPERAND (use, stmt, iter, SSA_OP_USE)
659 {
660 tree var = USE_FROM_PTR (use);
661
662 gimple *def_stmt = SSA_NAME_DEF_STMT (var);
663
664 if (gimple_nop_p (def_stmt))
665 continue;
666
667 basic_block def_bb = gimple_bb (def_stmt);
668
669 if (bitmap_bit_p (worker_single, def_bb->index))
670 {
671 if (!(*prop_set)[def_bb->index])
672 (*prop_set)[def_bb->index] = new propagation_set;
673
674 propagation_set *ws_prop = (*prop_set)[def_bb->index];
675
676 ws_prop->add (var);
677 }
678 }
679 }
680 }
681 }
682 }
683
684 /* Callback for walk_gimple_stmt to find RHS VAR_DECLs (uses) in a
685 statement. */
686
687 static tree
688 find_partitioned_var_uses_1 (tree *node, int *, void *data)
689 {
690 walk_stmt_info *wi = (walk_stmt_info *) data;
691 hash_set<tree> *partitioned_var_uses = (hash_set<tree> *) wi->info;
692
693 if (!wi->is_lhs && VAR_P (*node))
694 partitioned_var_uses->add (*node);
695
696 return NULL_TREE;
697 }
698
699 static void
700 find_partitioned_var_uses (parallel_g *par, unsigned outer_mask,
701 hash_set<tree> *partitioned_var_uses)
702 {
703 unsigned mask = outer_mask | par->mask;
704
705 if (par->inner)
706 find_partitioned_var_uses (par->inner, mask, partitioned_var_uses);
707 if (par->next)
708 find_partitioned_var_uses (par->next, outer_mask, partitioned_var_uses);
709
710 if (mask & GOMP_DIM_MASK (GOMP_DIM_WORKER))
711 {
712 basic_block block;
713 int ix;
714
715 for (ix = 0; par->blocks.iterate (ix, &block); ix++)
716 for (gimple_stmt_iterator gsi = gsi_start_bb (block);
717 !gsi_end_p (gsi); gsi_next (&gsi))
718 {
719 walk_stmt_info wi;
720 memset (&wi, 0, sizeof (wi));
721 wi.info = (void *) partitioned_var_uses;
722 walk_gimple_stmt (&gsi, NULL, find_partitioned_var_uses_1, &wi);
723 }
724 }
725 }
726
727 /* Gang-private variables (typically placed in a GPU's shared memory) do not
728 need to be processed by the worker-propagation mechanism. Populate the
729 GANG_PRIVATE_VARS set with any such variables found in the current
730 function. */
731
732 static void
733 find_gang_private_vars (hash_set<tree> *gang_private_vars)
734 {
735 basic_block block;
736
737 FOR_EACH_BB_FN (block, cfun)
738 {
739 for (gimple_stmt_iterator gsi = gsi_start_bb (block);
740 !gsi_end_p (gsi);
741 gsi_next (&gsi))
742 {
743 gimple *stmt = gsi_stmt (gsi);
744
745 if (gimple_call_internal_p (stmt, IFN_UNIQUE))
746 {
747 enum ifn_unique_kind k = ((enum ifn_unique_kind)
748 TREE_INT_CST_LOW (gimple_call_arg (stmt, 0)));
749 if (k == IFN_UNIQUE_OACC_PRIVATE)
750 {
751 HOST_WIDE_INT level
752 = TREE_INT_CST_LOW (gimple_call_arg (stmt, 2));
753 if (level != GOMP_DIM_GANG)
754 continue;
755 for (unsigned i = 3; i < gimple_call_num_args (stmt); i++)
756 {
757 tree arg = gimple_call_arg (stmt, i);
758 gcc_assert (TREE_CODE (arg) == ADDR_EXPR);
759 tree decl = TREE_OPERAND (arg, 0);
760 gang_private_vars->add (decl);
761 }
762 }
763 }
764 }
765 }
766 }
767
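/* Walk the parallel structure PAR looking at blocks that execute in
worker-single mode. A write to a function-local, non-aggregate variable
that is also used from worker-partitioned code must be propagated, so
record it in the block's entry of PROP_SET; a write to a gang-private
variable instead only requests a barrier at the end of the block, noted
in WRITES_GANG_PRIVATE. */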
768 static void
769 find_local_vars_to_propagate (parallel_g *par, unsigned outer_mask,
770 hash_set<tree> *partitioned_var_uses,
771 hash_set<tree> *gang_private_vars,
772 bitmap writes_gang_private,
773 vec<propagation_set *> *prop_set)
774 {
775 unsigned mask = outer_mask | par->mask;
776
777 if (par->inner)
778 find_local_vars_to_propagate (par->inner, mask, partitioned_var_uses,
779 gang_private_vars, writes_gang_private,
780 prop_set);
781 if (par->next)
782 find_local_vars_to_propagate (par->next, outer_mask, partitioned_var_uses,
783 gang_private_vars, writes_gang_private,
784 prop_set);
785
786 if (!(mask & GOMP_DIM_MASK (GOMP_DIM_WORKER)))
787 {
788 basic_block block;
789 int ix;
790
791 for (ix = 0; par->blocks.iterate (ix, &block); ix++)
792 {
793 for (gimple_stmt_iterator gsi = gsi_start_bb (block);
794 !gsi_end_p (gsi); gsi_next (&gsi))
795 {
796 gimple *stmt = gsi_stmt (gsi);
797 tree var;
798 unsigned i;
799
800 FOR_EACH_LOCAL_DECL (cfun, i, var)
801 {
802 if (!VAR_P (var)
803 || is_global_var (var)
804 || AGGREGATE_TYPE_P (TREE_TYPE (var))
805 || !partitioned_var_uses->contains (var))
806 continue;
807
808 if (stmt_may_clobber_ref_p (stmt, var))
809 {
810 if (dump_file)
811 {
812 fprintf (dump_file, "bb %u: local variable may be "
813 "clobbered in %s mode: ", block->index,
814 mask_name (mask));
815 print_generic_expr (dump_file, var, TDF_SLIM);
816 fprintf (dump_file, "\n");
817 }
818
819 if (gang_private_vars->contains (var))
820 {
821 /* If we write a gang-private variable, we want a
822 barrier at the end of the block. */
823 bitmap_set_bit (writes_gang_private, block->index);
824 continue;
825 }
826
827 if (!(*prop_set)[block->index])
828 (*prop_set)[block->index] = new propagation_set;
829
830 propagation_set *ws_prop
831 = (*prop_set)[block->index];
832
833 ws_prop->add (var);
834 }
835 }
836 }
837 }
838 }
839 }
840
841 /* Transform basic blocks FROM, TO (which may be the same block) into:
842 if (GOACC_single_start ())
843 BLOCK;
844 GOACC_barrier ();
845 \ | /
846 +----+
847 | | (new) predicate block
848 +----+--
849 \ | / \ | / |t \
850 +----+ +----+ +----+ |
851 | | | | ===> | | | f (old) from block
852 +----+ +----+ +----+ |
853 | t/ \f | /
854 +----+/
855 (split (split before | | skip block
856 at end) condition) +----+
857 t/ \f
858 */
859
860 static void
861 worker_single_simple (basic_block from, basic_block to,
862 hash_set<tree> *def_escapes_block)
863 {
864 gimple *call, *cond;
865 tree lhs, decl;
866 basic_block skip_block;
867
868 gimple_stmt_iterator gsi = gsi_last_bb (to);
869 if (EDGE_COUNT (to->succs) > 1)
870 {
871 gcc_assert (gimple_code (gsi_stmt (gsi)) == GIMPLE_COND);
872 gsi_prev (&gsi);
873 }
874 edge e = split_block (to, gsi_stmt (gsi));
875 skip_block = e->dest;
876
877 gimple_stmt_iterator start = gsi_after_labels (from);
878
879 decl = builtin_decl_explicit (BUILT_IN_GOACC_SINGLE_START);
880 lhs = create_tmp_var (TREE_TYPE (TREE_TYPE (decl)));
881 call = gimple_build_call (decl, 0);
882 gimple_call_set_lhs (call, lhs);
883 gsi_insert_before (&start, call, GSI_NEW_STMT);
884 update_stmt (call);
885
886 cond = gimple_build_cond (EQ_EXPR, lhs,
887 fold_convert_loc (UNKNOWN_LOCATION,
888 TREE_TYPE (lhs),
889 boolean_true_node),
890 NULL_TREE, NULL_TREE);
891 gsi_insert_after (&start, cond, GSI_NEW_STMT);
892 update_stmt (cond);
893
894 edge et = split_block (from, cond);
895 et->flags &= ~EDGE_FALLTHRU;
896 et->flags |= EDGE_TRUE_VALUE;
897 /* Make the active worker the more probable path so we prefer fallthrough
898 (letting the idle workers jump around more). */
899 et->probability = profile_probability::likely ();
900
901 edge ef = make_edge (from, skip_block, EDGE_FALSE_VALUE);
902 ef->probability = et->probability.invert ();
903
904 basic_block neutered = split_edge (ef);
905 gimple_stmt_iterator neut_gsi = gsi_last_bb (neutered);
906
907 for (gsi = gsi_start_bb (et->dest); !gsi_end_p (gsi); gsi_next (&gsi))
908 {
909 gimple *stmt = gsi_stmt (gsi);
910 ssa_op_iter iter;
911 tree var;
912
913 FOR_EACH_SSA_TREE_OPERAND (var, stmt, iter, SSA_OP_DEF)
914 {
915 if (def_escapes_block->contains (var))
916 {
917 gphi *join_phi = create_phi_node (NULL_TREE, skip_block);
918 create_new_def_for (var, join_phi,
919 gimple_phi_result_ptr (join_phi));
920 add_phi_arg (join_phi, var, e, UNKNOWN_LOCATION);
921
922 tree neutered_def = copy_ssa_name (var, NULL);
923 /* We really want "don't care" or some value representing
924 undefined here, but optimizers will probably get rid of the
925 zero-assignments anyway. */
926 gassign *zero = gimple_build_assign (neutered_def,
927 build_zero_cst (TREE_TYPE (neutered_def)));
928
929 gsi_insert_after (&neut_gsi, zero, GSI_CONTINUE_LINKING);
930 update_stmt (zero);
931
932 add_phi_arg (join_phi, neutered_def, single_succ_edge (neutered),
933 UNKNOWN_LOCATION);
934 update_stmt (join_phi);
935 }
936 }
937 }
938 }
939
940 /* Build COMPONENT_REF and set TREE_THIS_VOLATILE and TREE_READONLY on it
941 as appropriate. */
942 /* Adapted from 'gcc/omp-low.c:omp_build_component_ref'. */
943
944 static tree
945 oacc_build_component_ref (tree obj, tree field)
946 {
947 tree field_type = TREE_TYPE (field);
948 tree obj_type = TREE_TYPE (obj);
949 if (!ADDR_SPACE_GENERIC_P (TYPE_ADDR_SPACE (obj_type)))
950 field_type = build_qualified_type
951 (field_type,
952 KEEP_QUAL_ADDR_SPACE (TYPE_QUALS (obj_type)));
953
954 tree ret = build3 (COMPONENT_REF, field_type, obj, field, NULL);
955 if (TREE_THIS_VOLATILE (field))
956 TREE_THIS_VOLATILE (ret) |= 1;
957 if (TREE_READONLY (field))
958 TREE_READONLY (ret) |= 1;
959 return ret;
960 }
961
962 static tree
963 build_receiver_ref (tree var, tree receiver_decl, field_map_t *fields)
964 {
965 tree x = build_simple_mem_ref (receiver_decl);
966 tree field = *fields->get (var);
967 TREE_THIS_NOTRAP (x) = 1;
968 x = oacc_build_component_ref (x, field);
969 return x;
970 }
971
972 static tree
973 build_sender_ref (tree var, tree sender_decl, field_map_t *fields)
974 {
975 if (POINTER_TYPE_P (TREE_TYPE (sender_decl)))
976 sender_decl = build_simple_mem_ref (sender_decl);
977 tree field = *fields->get (var);
978 return oacc_build_component_ref (sender_decl, field);
979 }
980
981 static int
982 sort_by_ssa_version_or_uid (const void *p1, const void *p2)
983 {
984 const tree t1 = *(const tree *)p1;
985 const tree t2 = *(const tree *)p2;
986
987 if (TREE_CODE (t1) == SSA_NAME && TREE_CODE (t2) == SSA_NAME)
988 return SSA_NAME_VERSION (t1) - SSA_NAME_VERSION (t2);
989 else if (TREE_CODE (t1) == SSA_NAME && TREE_CODE (t2) != SSA_NAME)
990 return -1;
991 else if (TREE_CODE (t1) != SSA_NAME && TREE_CODE (t2) == SSA_NAME)
992 return 1;
993 else
994 return DECL_UID (t1) - DECL_UID (t2);
995 }
996
997 static int
998 sort_by_size_then_ssa_version_or_uid (const void *p1, const void *p2)
999 {
1000 const tree t1 = *(const tree *)p1;
1001 const tree t2 = *(const tree *)p2;
1002 unsigned HOST_WIDE_INT s1 = tree_to_uhwi (TYPE_SIZE (TREE_TYPE (t1)));
1003 unsigned HOST_WIDE_INT s2 = tree_to_uhwi (TYPE_SIZE (TREE_TYPE (t2)));
1004 if (s1 != s2)
1005 return s2 - s1;
1006 else
1007 return sort_by_ssa_version_or_uid (p1, p2);
1008 }
1009
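/* Neuter the worker-single block FROM..TO using the copy/broadcast scheme:
a single worker executes the block and stores the escaping definitions
into a shared-memory record of RECORD_TYPE placed at offset PLACEMENT;
after a barrier the other workers read those values back out. When
ISOLATE_BROADCASTS is set the shared-memory range may be reused by other
blocks, so extra barriers bracket the broadcast. */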
1010 static void
1011 worker_single_copy (basic_block from, basic_block to,
1012 hash_set<tree> *def_escapes_block,
1013 hash_set<tree> *worker_partitioned_uses,
1014 tree record_type, record_field_map_t *record_field_map,
1015 unsigned HOST_WIDE_INT placement,
1016 bool isolate_broadcasts, bool has_gang_private_write)
1017 {
1018 /* If we only have virtual defs, we'll have no record type, but we still want
1019 to emit single_copy_start and (particularly) single_copy_end to act as
1020 a vdef source on the neutered edge representing memory writes on the
1021 non-neutered edge. */
1022 if (!record_type)
1023 record_type = char_type_node;
1024
1025 tree sender_decl
1026 = targetm.goacc.create_worker_broadcast_record (record_type, true,
1027 ".oacc_worker_o",
1028 placement);
1029 tree receiver_decl
1030 = targetm.goacc.create_worker_broadcast_record (record_type, false,
1031 ".oacc_worker_i",
1032 placement);
1033
1034 gimple_stmt_iterator gsi = gsi_last_bb (to);
1035 if (EDGE_COUNT (to->succs) > 1)
1036 gsi_prev (&gsi);
1037 edge e = split_block (to, gsi_stmt (gsi));
1038 basic_block barrier_block = e->dest;
1039
1040 gimple_stmt_iterator start = gsi_after_labels (from);
1041
1042 tree decl = builtin_decl_explicit (BUILT_IN_GOACC_SINGLE_COPY_START);
1043
1044 tree lhs = create_tmp_var (TREE_TYPE (TREE_TYPE (decl)));
1045
1046 gimple *call
1047 = gimple_build_call (decl, 1,
1048 POINTER_TYPE_P (TREE_TYPE (sender_decl))
1049 ? sender_decl : build_fold_addr_expr (sender_decl));
1050 gimple_call_set_lhs (call, lhs);
1051 gsi_insert_before (&start, call, GSI_NEW_STMT);
1052 update_stmt (call);
1053
1054 /* The shared-memory range for this block overflowed. Add a barrier before
1055 the GOACC_single_copy_start call. */
1056 if (isolate_broadcasts)
1057 {
1058 decl = builtin_decl_explicit (BUILT_IN_GOACC_BARRIER);
1059 gimple *acc_bar = gimple_build_call (decl, 0);
1060 gsi_insert_before (&start, acc_bar, GSI_SAME_STMT);
1061 }
1062
1063 tree conv_tmp = make_ssa_name (TREE_TYPE (receiver_decl));
1064
1065 gimple *conv = gimple_build_assign (conv_tmp,
1066 fold_convert (TREE_TYPE (receiver_decl),
1067 lhs));
1068 update_stmt (conv);
1069 gsi_insert_after (&start, conv, GSI_NEW_STMT);
1070 gimple *asgn = gimple_build_assign (receiver_decl, conv_tmp);
1071 gsi_insert_after (&start, asgn, GSI_NEW_STMT);
1072 update_stmt (asgn);
1073
1074 tree zero_ptr = build_int_cst (TREE_TYPE (receiver_decl), 0);
1075
1076 tree recv_tmp = make_ssa_name (TREE_TYPE (receiver_decl));
1077 asgn = gimple_build_assign (recv_tmp, receiver_decl);
1078 gsi_insert_after (&start, asgn, GSI_NEW_STMT);
1079 update_stmt (asgn);
1080
1081 gimple *cond = gimple_build_cond (EQ_EXPR, recv_tmp, zero_ptr, NULL_TREE,
1082 NULL_TREE);
1083 update_stmt (cond);
1084
1085 gsi_insert_after (&start, cond, GSI_NEW_STMT);
1086
1087 edge et = split_block (from, cond);
1088 et->flags &= ~EDGE_FALLTHRU;
1089 et->flags |= EDGE_TRUE_VALUE;
1090 /* Make the active worker the more probable path so we prefer fallthrough
1091 (letting the idle workers jump around more). */
1092 et->probability = profile_probability::likely ();
1093
1094 basic_block body = et->dest;
1095
1096 edge ef = make_edge (from, barrier_block, EDGE_FALSE_VALUE);
1097 ef->probability = et->probability.invert ();
1098
1099 gimple_stmt_iterator bar_gsi = gsi_start_bb (barrier_block);
1100 cond = gimple_build_cond (NE_EXPR, recv_tmp, zero_ptr, NULL_TREE, NULL_TREE);
1101
1102 if (record_type != char_type_node || has_gang_private_write)
1103 {
1104 decl = builtin_decl_explicit (BUILT_IN_GOACC_BARRIER);
1105 gimple *acc_bar = gimple_build_call (decl, 0);
1106
1107 gsi_insert_before (&bar_gsi, acc_bar, GSI_NEW_STMT);
1108 gsi_insert_after (&bar_gsi, cond, GSI_NEW_STMT);
1109 }
1110 else
1111 gsi_insert_before (&bar_gsi, cond, GSI_NEW_STMT);
1112
1113 edge et2 = split_block (barrier_block, cond);
1114 et2->flags &= ~EDGE_FALLTHRU;
1115 et2->flags |= EDGE_TRUE_VALUE;
1116 et2->probability = profile_probability::unlikely ();
1117
1118 basic_block exit_block = et2->dest;
1119
1120 basic_block copyout_block = split_edge (et2);
1121 edge ef2 = make_edge (barrier_block, exit_block, EDGE_FALSE_VALUE);
1122 ef2->probability = et2->probability.invert ();
1123
1124 gimple_stmt_iterator copyout_gsi = gsi_start_bb (copyout_block);
1125
1126 edge copyout_to_exit = single_succ_edge (copyout_block);
1127
1128 gimple_seq sender_seq = NULL;
1129
1130 /* Make sure we iterate over definitions in a stable order. */
1131 auto_vec<tree> escape_vec (def_escapes_block->elements ());
1132 for (hash_set<tree>::iterator it = def_escapes_block->begin ();
1133 it != def_escapes_block->end (); ++it)
1134 escape_vec.quick_push (*it);
1135 escape_vec.qsort (sort_by_ssa_version_or_uid);
1136
1137 for (unsigned i = 0; i < escape_vec.length (); i++)
1138 {
1139 tree var = escape_vec[i];
1140
1141 if (TREE_CODE (var) == SSA_NAME && SSA_NAME_IS_VIRTUAL_OPERAND (var))
1142 continue;
1143
1144 tree barrier_def = 0;
1145
1146 if (TREE_CODE (var) == SSA_NAME)
1147 {
1148 gimple *def_stmt = SSA_NAME_DEF_STMT (var);
1149
1150 if (gimple_nop_p (def_stmt))
1151 continue;
1152
1153 /* The barrier phi takes one result from the actual work of the
1154 block we're neutering, and the other result is constant zero of
1155 the same type. */
1156
1157 gphi *barrier_phi = create_phi_node (NULL_TREE, barrier_block);
1158 barrier_def = create_new_def_for (var, barrier_phi,
1159 gimple_phi_result_ptr (barrier_phi));
1160
1161 add_phi_arg (barrier_phi, var, e, UNKNOWN_LOCATION);
1162 add_phi_arg (barrier_phi, build_zero_cst (TREE_TYPE (var)), ef,
1163 UNKNOWN_LOCATION);
1164
1165 update_stmt (barrier_phi);
1166 }
1167 else
1168 gcc_assert (TREE_CODE (var) == VAR_DECL);
1169
1170 /* If we had no record type, we will have no fields map. */
1171 field_map_t **fields_p = record_field_map->get (record_type);
1172 field_map_t *fields = fields_p ? *fields_p : NULL;
1173
1174 if (worker_partitioned_uses->contains (var)
1175 && fields
1176 && fields->get (var))
1177 {
1178 tree neutered_def = make_ssa_name (TREE_TYPE (var));
1179
1180 /* Receive definition from shared memory block. */
1181
1182 tree receiver_ref = build_receiver_ref (var, receiver_decl, fields);
1183 gassign *recv = gimple_build_assign (neutered_def,
1184 receiver_ref);
1185 gsi_insert_after (&copyout_gsi, recv, GSI_CONTINUE_LINKING);
1186 update_stmt (recv);
1187
1188 if (TREE_CODE (var) == VAR_DECL)
1189 {
1190 /* If it's a VAR_DECL, we only copied to an SSA temporary. Copy
1191 to the final location now. */
1192 gassign *asgn = gimple_build_assign (var, neutered_def);
1193 gsi_insert_after (&copyout_gsi, asgn, GSI_CONTINUE_LINKING);
1194 update_stmt (asgn);
1195 }
1196 else
1197 {
1198 /* If it's an SSA name, create a new phi at the join node to
1199 represent either the output from the active worker (the
1200 barrier) or the inactive workers (the copyout block). */
1201 gphi *join_phi = create_phi_node (NULL_TREE, exit_block);
1202 create_new_def_for (barrier_def, join_phi,
1203 gimple_phi_result_ptr (join_phi));
1204 add_phi_arg (join_phi, barrier_def, ef2, UNKNOWN_LOCATION);
1205 add_phi_arg (join_phi, neutered_def, copyout_to_exit,
1206 UNKNOWN_LOCATION);
1207 update_stmt (join_phi);
1208 }
1209
1210 /* Send definition to shared memory block. */
1211
1212 tree sender_ref = build_sender_ref (var, sender_decl, fields);
1213
1214 if (TREE_CODE (var) == SSA_NAME)
1215 {
1216 gassign *send = gimple_build_assign (sender_ref, var);
1217 gimple_seq_add_stmt (&sender_seq, send);
1218 update_stmt (send);
1219 }
1220 else if (TREE_CODE (var) == VAR_DECL)
1221 {
1222 tree tmp = make_ssa_name (TREE_TYPE (var));
1223 gassign *send = gimple_build_assign (tmp, var);
1224 gimple_seq_add_stmt (&sender_seq, send);
1225 update_stmt (send);
1226 send = gimple_build_assign (sender_ref, tmp);
1227 gimple_seq_add_stmt (&sender_seq, send);
1228 update_stmt (send);
1229 }
1230 else
1231 gcc_unreachable ();
1232 }
1233 }
1234
1235 /* The shared-memory range for this block overflowed. Add a barrier at the
1236 end. */
1237 if (isolate_broadcasts)
1238 {
1239 gsi = gsi_start_bb (exit_block);
1240 decl = builtin_decl_explicit (BUILT_IN_GOACC_BARRIER);
1241 gimple *acc_bar = gimple_build_call (decl, 0);
1242 gsi_insert_before (&gsi, acc_bar, GSI_SAME_STMT);
1243 }
1244
1245 /* It's possible for the ET->DEST block (the work done by the active thread)
1246 to finish with a control-flow insn, e.g. a UNIQUE function call. Split
1247 the block and add SENDER_SEQ in the latter part to avoid having control
1248 flow in the middle of a BB. */
1249
1250 decl = builtin_decl_explicit (BUILT_IN_GOACC_SINGLE_COPY_END);
1251 call = gimple_build_call (decl, 1,
1252 POINTER_TYPE_P (TREE_TYPE (sender_decl))
1253 ? sender_decl
1254 : build_fold_addr_expr (sender_decl));
1255 gimple_seq_add_stmt (&sender_seq, call);
1256
1257 gsi = gsi_last_bb (body);
1258 gimple *last = gsi_stmt (gsi);
1259 basic_block sender_block = split_block (body, last)->dest;
1260 gsi = gsi_last_bb (sender_block);
1261 gsi_insert_seq_after (&gsi, sender_seq, GSI_CONTINUE_LINKING);
1262 }
1263
1264 typedef hash_map<basic_block, std::pair<unsigned HOST_WIDE_INT, bool> >
1265 blk_offset_map_t;
1266
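/* Apply worker-single neutering to every block of PAR (and of its inner and
sibling parallels) that does not execute in worker-partitioned mode: blocks
whose definitions must be broadcast go through worker_single_copy, the
others through the simpler predication of worker_single_simple. */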
1267 static void
1268 neuter_worker_single (parallel_g *par, unsigned outer_mask,
1269 bitmap worker_single, bitmap vector_single,
1270 vec<propagation_set *> *prop_set,
1271 hash_set<tree> *partitioned_var_uses,
1272 record_field_map_t *record_field_map,
1273 blk_offset_map_t *blk_offset_map,
1274 bitmap writes_gang_private)
1275 {
1276 unsigned mask = outer_mask | par->mask;
1277
1278 if ((mask & GOMP_DIM_MASK (GOMP_DIM_WORKER)) == 0)
1279 {
1280 basic_block block;
1281
1282 for (unsigned i = 0; par->blocks.iterate (i, &block); i++)
1283 {
1284 bool has_defs = false;
1285 hash_set<tree> def_escapes_block;
1286 hash_set<tree> worker_partitioned_uses;
1287 unsigned j;
1288 tree var;
1289
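/* Collect the SSA names defined in this block whose values are needed
outside it (DEF_ESCAPES_BLOCK) or consumed by worker-partitioned code
(WORKER_PARTITIONED_USES). */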
1290 FOR_EACH_SSA_NAME (j, var, cfun)
1291 {
1292 if (SSA_NAME_IS_VIRTUAL_OPERAND (var))
1293 {
1294 has_defs = true;
1295 continue;
1296 }
1297
1298 gimple *def_stmt = SSA_NAME_DEF_STMT (var);
1299
1300 if (gimple_nop_p (def_stmt))
1301 continue;
1302
1303 if (gimple_bb (def_stmt)->index != block->index)
1304 continue;
1305
1306 gimple *use_stmt;
1307 imm_use_iterator use_iter;
1308 bool uses_outside_block = false;
1309 bool worker_partitioned_use = false;
1310
1311 FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, var)
1312 {
1313 int blocknum = gimple_bb (use_stmt)->index;
1314
1315 /* Don't propagate SSA names that are only used in the
1316 current block, unless the usage is in a phi node: that
1317 means the name left the block, then came back in at the
1318 top. */
1319 if (blocknum != block->index
1320 || gimple_code (use_stmt) == GIMPLE_PHI)
1321 uses_outside_block = true;
1322 if (!bitmap_bit_p (worker_single, blocknum))
1323 worker_partitioned_use = true;
1324 }
1325
1326 if (uses_outside_block)
1327 def_escapes_block.add (var);
1328
1329 if (worker_partitioned_use)
1330 {
1331 worker_partitioned_uses.add (var);
1332 has_defs = true;
1333 }
1334 }
1335
1336 propagation_set *ws_prop = (*prop_set)[block->index];
1337
1338 if (ws_prop)
1339 {
1340 for (propagation_set::iterator it = ws_prop->begin ();
1341 it != ws_prop->end ();
1342 ++it)
1343 {
1344 tree var = *it;
1345 if (TREE_CODE (var) == VAR_DECL)
1346 {
1347 def_escapes_block.add (var);
1348 if (partitioned_var_uses->contains (var))
1349 {
1350 worker_partitioned_uses.add (var);
1351 has_defs = true;
1352 }
1353 }
1354 }
1355
1356 delete ws_prop;
1357 (*prop_set)[block->index] = 0;
1358 }
1359
1360 bool only_marker_fns = true;
1361 bool join_block = false;
1362
1363 for (gimple_stmt_iterator gsi = gsi_start_bb (block);
1364 !gsi_end_p (gsi);
1365 gsi_next (&gsi))
1366 {
1367 gimple *stmt = gsi_stmt (gsi);
1368 if (gimple_code (stmt) == GIMPLE_CALL
1369 && gimple_call_internal_p (stmt, IFN_UNIQUE))
1370 {
1371 enum ifn_unique_kind k = ((enum ifn_unique_kind)
1372 TREE_INT_CST_LOW (gimple_call_arg (stmt, 0)));
1373 if (k != IFN_UNIQUE_OACC_PRIVATE
1374 && k != IFN_UNIQUE_OACC_JOIN
1375 && k != IFN_UNIQUE_OACC_FORK
1376 && k != IFN_UNIQUE_OACC_HEAD_MARK
1377 && k != IFN_UNIQUE_OACC_TAIL_MARK)
1378 only_marker_fns = false;
1379 else if (k == IFN_UNIQUE_OACC_JOIN)
1380 /* The JOIN marker is special in that it *cannot* be
1381 predicated for worker zero, because it may be lowered
1382 to a barrier instruction and all workers must typically
1383 execute that barrier. We shouldn't be doing any
1384 broadcasts from the join block anyway. */
1385 join_block = true;
1386 }
1387 else if (gimple_code (stmt) == GIMPLE_CALL
1388 && gimple_call_internal_p (stmt, IFN_GOACC_LOOP))
1389 /* Empty. */;
1390 else if (gimple_nop_p (stmt))
1391 /* Empty. */;
1392 else
1393 only_marker_fns = false;
1394 }
1395
1396 /* We can skip predicating this block for worker zero if the only
1397 thing it contains is marker functions that will be removed in the
1398 oaccdevlow pass anyway.
1399 Don't do this if the block has (any) phi nodes, because those
1400 might define SSA names that need broadcasting.
1401 TODO: We might be able to skip transforming blocks that only
1402 contain some other trivial statements too. */
1403 if (only_marker_fns && !phi_nodes (block))
1404 continue;
1405
1406 gcc_assert (!join_block);
1407
1408 if (has_defs)
1409 {
1410 tree record_type = (tree) block->aux;
1411 std::pair<unsigned HOST_WIDE_INT, bool> *off_rngalloc
1412 = blk_offset_map->get (block);
1413 gcc_assert (!record_type || off_rngalloc);
1414 unsigned HOST_WIDE_INT offset
1415 = off_rngalloc ? off_rngalloc->first : 0;
1416 bool range_allocated
1417 = off_rngalloc ? off_rngalloc->second : true;
1418 bool has_gang_private_write
1419 = bitmap_bit_p (writes_gang_private, block->index);
1420 worker_single_copy (block, block, &def_escapes_block,
1421 &worker_partitioned_uses, record_type,
1422 record_field_map,
1423 offset, !range_allocated,
1424 has_gang_private_write);
1425 }
1426 else
1427 worker_single_simple (block, block, &def_escapes_block);
1428 }
1429 }
1430
1431 if ((outer_mask & GOMP_DIM_MASK (GOMP_DIM_WORKER)) == 0)
1432 {
1433 basic_block block;
1434
1435 for (unsigned i = 0; par->blocks.iterate (i, &block); i++)
1436 for (gimple_stmt_iterator gsi = gsi_start_bb (block);
1437 !gsi_end_p (gsi);
1438 gsi_next (&gsi))
1439 {
1440 gimple *stmt = gsi_stmt (gsi);
1441
1442 if (gimple_code (stmt) == GIMPLE_CALL
1443 && !gimple_call_internal_p (stmt)
1444 && !omp_sese_active_worker_call (as_a <gcall *> (stmt)))
1445 {
1446 /* If we have an OpenACC routine call in worker-single mode,
1447 place barriers before and afterwards to prevent
1448 clobbering re-used shared memory regions (as are used
1449 for AMDGCN at present, for example). */
1450 tree decl = builtin_decl_explicit (BUILT_IN_GOACC_BARRIER);
1451 gsi_insert_before (&gsi, gimple_build_call (decl, 0),
1452 GSI_SAME_STMT);
1453 gsi_insert_after (&gsi, gimple_build_call (decl, 0),
1454 GSI_NEW_STMT);
1455 }
1456 }
1457 }
1458
1459 if (par->inner)
1460 neuter_worker_single (par->inner, mask, worker_single, vector_single,
1461 prop_set, partitioned_var_uses, record_field_map,
1462 blk_offset_map, writes_gang_private);
1463 if (par->next)
1464 neuter_worker_single (par->next, outer_mask, worker_single, vector_single,
1465 prop_set, partitioned_var_uses, record_field_map,
1466 blk_offset_map, writes_gang_private);
1467 }
1468
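/* Depth-first walk of the successors of BB, recording in REACHABLE every
broadcast block (one with a propagation record in its AUX field) that can
be reached from BB without passing through another broadcast block. */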
1469 static void
1470 dfs_broadcast_reachable_1 (basic_block bb, sbitmap reachable)
1471 {
1472 if (bb->flags & BB_VISITED)
1473 return;
1474
1475 bb->flags |= BB_VISITED;
1476
1477 if (bb->succs)
1478 {
1479 edge e;
1480 edge_iterator ei;
1481 FOR_EACH_EDGE (e, ei, bb->succs)
1482 {
1483 basic_block dest = e->dest;
1484 if (dest->aux)
1485 bitmap_set_bit (reachable, dest->index);
1486 else
1487 dfs_broadcast_reachable_1 (dest, reachable);
1488 }
1489 }
1490 }
1491
1492 typedef std::pair<int, tree> idx_decl_pair_t;
1493
1494 typedef auto_vec<splay_tree> used_range_vec_t;
1495
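/* qsort comparator: order (block index, record type) pairs by decreasing
record size, so larger broadcast buffers get first pick of the
shared-memory space. */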
1496 static int
1497 sort_size_descending (const void *a, const void *b)
1498 {
1499 const idx_decl_pair_t *pa = (const idx_decl_pair_t *) a;
1500 const idx_decl_pair_t *pb = (const idx_decl_pair_t *) b;
1501 unsigned HOST_WIDE_INT asize = tree_to_uhwi (TYPE_SIZE_UNIT (pa->second));
1502 unsigned HOST_WIDE_INT bsize = tree_to_uhwi (TYPE_SIZE_UNIT (pb->second));
1503 return bsize - asize;
1504 }
1505
1506 class addr_range
1507 {
1508 public:
1509 addr_range (unsigned HOST_WIDE_INT addr_lo, unsigned HOST_WIDE_INT addr_hi)
1510 : lo (addr_lo), hi (addr_hi)
1511 { }
1512 addr_range (const addr_range &ar) : lo (ar.lo), hi (ar.hi)
1513 { }
1514 addr_range () : lo (0), hi (0)
1515 { }
1516
1517 bool invalid () { return lo == 0 && hi == 0; }
1518
1519 unsigned HOST_WIDE_INT lo;
1520 unsigned HOST_WIDE_INT hi;
1521 };
1522
1523 static int
1524 splay_tree_compare_addr_range (splay_tree_key a, splay_tree_key b)
1525 {
1526 addr_range *ar = (addr_range *) a;
1527 addr_range *br = (addr_range *) b;
1528 if (ar->lo == br->lo && ar->hi == br->hi)
1529 return 0;
1530 if (ar->hi <= br->lo)
1531 return -1;
1532 else if (ar->lo >= br->hi)
1533 return 1;
1534 return 0;
1535 }
1536
1537 static void
1538 splay_tree_free_key (splay_tree_key k)
1539 {
1540 addr_range *ar = (addr_range *) k;
1541 delete ar;
1542 }
1543
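/* Find a sub-range of SIZE bytes, aligned to ALIGN, that fits between or
after the address ranges already recorded in the splay tree S and stays
within BOUNDS; return an invalid (all-zero) range if nothing fits. */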
1544 static addr_range
1545 first_fit_range (splay_tree s, unsigned HOST_WIDE_INT size,
1546 unsigned HOST_WIDE_INT align, addr_range *bounds)
1547 {
1548 splay_tree_node min = splay_tree_min (s);
1549 if (min)
1550 {
1551 splay_tree_node next;
1552 while ((next = splay_tree_successor (s, min->key)))
1553 {
1554 unsigned HOST_WIDE_INT lo = ((addr_range *) min->key)->hi;
1555 unsigned HOST_WIDE_INT hi = ((addr_range *) next->key)->lo;
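/* Round the start of the gap up to the requested alignment (assumed to be
a power of two). */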
1556 unsigned HOST_WIDE_INT base = (lo + align - 1) & ~(align - 1);
1557 if (base + size <= hi)
1558 return addr_range (base, base + size);
1559 min = next;
1560 }
1561
1562 unsigned HOST_WIDE_INT base = ((addr_range *)min->key)->hi;
1563 base = (base + align - 1) & ~(align - 1);
1564 if (base + size <= bounds->hi)
1565 return addr_range (base, base + size);
1566 else
1567 return addr_range ();
1568 }
1569 else
1570 {
1571 unsigned HOST_WIDE_INT lo = bounds->lo;
1572 lo = (lo + align - 1) & ~(align - 1);
1573 if (lo + size <= bounds->hi)
1574 return addr_range (lo, lo + size);
1575 else
1576 return addr_range ();
1577 }
1578 }
1579
1580 static int
1581 merge_ranges_1 (splay_tree_node n, void *ptr)
1582 {
1583 splay_tree accum = (splay_tree) ptr;
1584 addr_range ar = *(addr_range *) n->key;
1585
1586 splay_tree_node old = splay_tree_lookup (accum, n->key);
1587
1588 /* We might have an overlap. Create a new range covering the
1589 overlapping parts. */
1590 if (old)
1591 {
1592 addr_range *old_ar = (addr_range *) old->key;
1593 ar.lo = MIN (old_ar->lo, ar.lo);
1594 ar.hi = MAX (old_ar->hi, ar.hi);
1595 splay_tree_remove (accum, old->key);
1596 }
1597
1598 addr_range *new_ar = new addr_range (ar);
1599
1600 splay_tree_insert (accum, (splay_tree_key) new_ar, n->value);
1601
1602 return 0;
1603 }
1604
1605 static void
1606 merge_ranges (splay_tree accum, splay_tree sp)
1607 {
1608 splay_tree_foreach (sp, merge_ranges_1, (void *) accum);
1609 }
1610
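/* Perform the worker-neutering/broadcasting transformation on the current
function. BOUNDS_LO and BOUNDS_HI delimit the shared-memory region
available for the per-block broadcast records. */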
1611 static void
1612 oacc_do_neutering (unsigned HOST_WIDE_INT bounds_lo,
1613 unsigned HOST_WIDE_INT bounds_hi)
1614 {
1615 bb_stmt_map_t bb_stmt_map;
1616 auto_bitmap worker_single, vector_single;
1617
1618 omp_sese_split_blocks (&bb_stmt_map);
1619
1620 if (dump_file)
1621 {
1622 fprintf (dump_file, "\n\nAfter splitting:\n\n");
1623 dump_function_to_file (current_function_decl, dump_file, dump_flags);
1624 }
1625
1626 unsigned mask = 0;
1627
1628 /* If this is a routine, calculate MASK as if the outer levels are already
1629 partitioned. */
1630 {
1631 tree attr = oacc_get_fn_attrib (current_function_decl);
1632 tree dims = TREE_VALUE (attr);
1633 unsigned ix;
1634 for (ix = 0; ix != GOMP_DIM_MAX; ix++, dims = TREE_CHAIN (dims))
1635 {
1636 tree allowed = TREE_PURPOSE (dims);
1637 if (allowed && integer_zerop (allowed))
1638 mask |= GOMP_DIM_MASK (ix);
1639 }
1640 }
1641
1642 parallel_g *par = omp_sese_discover_pars (&bb_stmt_map);
1643 populate_single_mode_bitmaps (par, worker_single, vector_single, mask, 0);
1644
1645 basic_block bb;
1646 FOR_ALL_BB_FN (bb, cfun)
1647 bb->aux = NULL;
1648
1649 vec<propagation_set *> prop_set (vNULL);
1650 prop_set.safe_grow_cleared (last_basic_block_for_fn (cfun), true);
1651
1652 find_ssa_names_to_propagate (par, mask, worker_single, vector_single,
1653 &prop_set);
1654
1655 hash_set<tree> partitioned_var_uses;
1656 hash_set<tree> gang_private_vars;
1657 auto_bitmap writes_gang_private;
1658
1659 find_gang_private_vars (&gang_private_vars);
1660 find_partitioned_var_uses (par, mask, &partitioned_var_uses);
1661 find_local_vars_to_propagate (par, mask, &partitioned_var_uses,
1662 &gang_private_vars, writes_gang_private,
1663 &prop_set);
1664
1665 record_field_map_t record_field_map;
1666
1667 FOR_ALL_BB_FN (bb, cfun)
1668 {
1669 propagation_set *ws_prop = prop_set[bb->index];
1670 if (ws_prop)
1671 {
1672 tree record_type = lang_hooks.types.make_type (RECORD_TYPE);
1673 tree name = create_tmp_var_name (".oacc_ws_data_s");
1674 name = build_decl (UNKNOWN_LOCATION, TYPE_DECL, name, record_type);
1675 DECL_ARTIFICIAL (name) = 1;
1676 DECL_NAMELESS (name) = 1;
1677 TYPE_NAME (record_type) = name;
1678 TYPE_ARTIFICIAL (record_type) = 1;
1679
1680 auto_vec<tree> field_vec (ws_prop->elements ());
1681 for (hash_set<tree>::iterator it = ws_prop->begin ();
1682 it != ws_prop->end (); ++it)
1683 field_vec.quick_push (*it);
1684
1685 field_vec.qsort (sort_by_size_then_ssa_version_or_uid);
1686
1687 field_map_t *fields = new field_map_t;
1688
1689 bool existed;
1690 existed = record_field_map.put (record_type, fields);
1691 gcc_checking_assert (!existed);
1692
1693 /* Insert var fields in reverse order, so the last inserted element
1694 is the first in the structure. */
1695 for (int i = field_vec.length () - 1; i >= 0; i--)
1696 install_var_field (field_vec[i], record_type, fields);
1697
1698 layout_type (record_type);
1699
1700 bb->aux = (tree) record_type;
1701 }
1702 }
1703
1704 sbitmap *reachable
1705 = sbitmap_vector_alloc (last_basic_block_for_fn (cfun),
1706 last_basic_block_for_fn (cfun));
1707
1708 bitmap_vector_clear (reachable, last_basic_block_for_fn (cfun));
1709
1710 auto_vec<std::pair<int, tree> > priority;
1711
1712 FOR_ALL_BB_FN (bb, cfun)
1713 {
1714 if (bb->aux)
1715 {
1716 tree record_type = (tree) bb->aux;
1717
1718 basic_block bb2;
1719 FOR_ALL_BB_FN (bb2, cfun)
1720 bb2->flags &= ~BB_VISITED;
1721
1722 priority.safe_push (std::make_pair (bb->index, record_type));
1723 dfs_broadcast_reachable_1 (bb, reachable[bb->index]);
1724 }
1725 }
1726
1727 sbitmap *inverted
1728 = sbitmap_vector_alloc (last_basic_block_for_fn (cfun),
1729 last_basic_block_for_fn (cfun));
1730
1731 bitmap_vector_clear (inverted, last_basic_block_for_fn (cfun));
1732
1733 for (int i = 0; i < last_basic_block_for_fn (cfun); i++)
1734 {
1735 sbitmap_iterator bi;
1736 unsigned int j;
1737 EXECUTE_IF_SET_IN_BITMAP (reachable[i], 0, j, bi)
1738 bitmap_set_bit (inverted[j], i);
1739 }
1740
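/* Make the reachability relation symmetric, so REACHABLE[i] holds both the
broadcast blocks reachable from block I and those from which I is
reachable; blocks related this way must not share a shared-memory range. */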
1741 for (int i = 0; i < last_basic_block_for_fn (cfun); i++)
1742 bitmap_ior (reachable[i], reachable[i], inverted[i]);
1743
1744 sbitmap_vector_free (inverted);
1745
1746 used_range_vec_t used_ranges;
1747
1748 used_ranges.safe_grow_cleared (last_basic_block_for_fn (cfun));
1749
1750 blk_offset_map_t blk_offset_map;
1751
1752 addr_range worker_shm_bounds (bounds_lo, bounds_hi);
1753
1754 priority.qsort (sort_size_descending);
1755 for (unsigned int i = 0; i < priority.length (); i++)
1756 {
1757 idx_decl_pair_t p = priority[i];
1758 int blkno = p.first;
1759 tree record_type = p.second;
1760 HOST_WIDE_INT size = tree_to_uhwi (TYPE_SIZE_UNIT (record_type));
1761 HOST_WIDE_INT align = TYPE_ALIGN_UNIT (record_type);
1762
1763 splay_tree conflicts = splay_tree_new (splay_tree_compare_addr_range,
1764 splay_tree_free_key, NULL);
1765
1766 if (!used_ranges[blkno])
1767 used_ranges[blkno] = splay_tree_new (splay_tree_compare_addr_range,
1768 splay_tree_free_key, NULL);
1769 else
1770 merge_ranges (conflicts, used_ranges[blkno]);
1771
1772 sbitmap_iterator bi;
1773 unsigned int j;
1774 EXECUTE_IF_SET_IN_BITMAP (reachable[blkno], 0, j, bi)
1775 if (used_ranges[j])
1776 merge_ranges (conflicts, used_ranges[j]);
1777
1778 addr_range ar
1779 = first_fit_range (conflicts, size, align, &worker_shm_bounds);
1780
1781 splay_tree_delete (conflicts);
1782
1783 if (ar.invalid ())
1784 {
1785 unsigned HOST_WIDE_INT base
1786 = (bounds_lo + align - 1) & ~(align - 1);
1787 if (base + size > bounds_hi)
1788 error_at (UNKNOWN_LOCATION, "shared-memory region overflow");
1789 std::pair<unsigned HOST_WIDE_INT, bool> base_inrng
1790 = std::make_pair (base, false);
1791 blk_offset_map.put (BASIC_BLOCK_FOR_FN (cfun, blkno), base_inrng);
1792 }
1793 else
1794 {
1795 splay_tree_node old = splay_tree_lookup (used_ranges[blkno],
1796 (splay_tree_key) &ar);
1797 if (old)
1798 {
1799 fprintf (stderr, "trying to map [%d..%d] but [%d..%d] is "
1800 "already mapped in block %d\n", (int) ar.lo,
1801 (int) ar.hi, (int) ((addr_range *) old->key)->lo,
1802 (int) ((addr_range *) old->key)->hi, blkno);
1803 abort ();
1804 }
1805
1806 addr_range *arp = new addr_range (ar);
1807 splay_tree_insert (used_ranges[blkno], (splay_tree_key) arp,
1808 (splay_tree_value) blkno);
1809 std::pair<unsigned HOST_WIDE_INT, bool> base_inrng
1810 = std::make_pair (ar.lo, true);
1811 blk_offset_map.put (BASIC_BLOCK_FOR_FN (cfun, blkno), base_inrng);
1812 }
1813 }
1814
1815 sbitmap_vector_free (reachable);
1816
1817 neuter_worker_single (par, mask, worker_single, vector_single, &prop_set,
1818 &partitioned_var_uses, &record_field_map,
1819 &blk_offset_map, writes_gang_private);
1820
1821 for (auto it : record_field_map)
1822 delete it.second;
1823 record_field_map.empty ();
1824
1825 /* These are supposed to have been 'delete'd by 'neuter_worker_single'. */
1826 for (auto it : prop_set)
1827 gcc_checking_assert (!it);
1828 prop_set.release ();
1829
1830 delete par;
1831
1832 /* This doesn't seem to make a difference. */
1833 loops_state_clear (LOOP_CLOSED_SSA);
1834
1835 /* Neutering worker-single neutered blocks will invalidate dominance info.
1836 It may be possible to incrementally update just the affected blocks, but
1837 obliterate everything for now. */
1838 free_dominance_info (CDI_DOMINATORS);
1839 free_dominance_info (CDI_POST_DOMINATORS);
1840
1841 if (dump_file)
1842 {
1843 fprintf (dump_file, "\n\nAfter neutering:\n\n");
1844 dump_function_to_file (current_function_decl, dump_file, dump_flags);
1845 }
1846 }
1847
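/* Main entry point of the pass: measure the shared memory needed for
reduction variables and gang-private data, ask the target for the usable
shared-memory bounds, and then neuter/broadcast worker-single code unless
the function is known to use only a single worker. */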
1848 static int
1849 execute_omp_oacc_neuter_broadcast ()
1850 {
1851 unsigned HOST_WIDE_INT reduction_size[GOMP_DIM_MAX];
1852 unsigned HOST_WIDE_INT private_size[GOMP_DIM_MAX];
1853
1854 for (unsigned i = 0; i < GOMP_DIM_MAX; i++)
1855 {
1856 reduction_size[i] = 0;
1857 private_size[i] = 0;
1858 }
1859
1860 /* Calculate shared memory size required for reduction variables and
1861 gang-private memory for this offloaded function. */
1862 basic_block bb;
1863 FOR_ALL_BB_FN (bb, cfun)
1864 {
1865 for (gimple_stmt_iterator gsi = gsi_start_bb (bb);
1866 !gsi_end_p (gsi);
1867 gsi_next (&gsi))
1868 {
1869 gimple *stmt = gsi_stmt (gsi);
1870 if (!is_gimple_call (stmt))
1871 continue;
1872 gcall *call = as_a <gcall *> (stmt);
1873 if (!gimple_call_internal_p (call))
1874 continue;
1875 enum internal_fn ifn_code = gimple_call_internal_fn (call);
1876 switch (ifn_code)
1877 {
1878 default: break;
1879 case IFN_GOACC_REDUCTION:
1880 if (integer_minus_onep (gimple_call_arg (call, 3)))
1881 continue;
1882 else
1883 {
1884 unsigned code = TREE_INT_CST_LOW (gimple_call_arg (call, 0));
1885 /* Only count reduction variables once: the choice to pick
1886 the setup call is fairly arbitrary. */
1887 if (code == IFN_GOACC_REDUCTION_SETUP)
1888 {
1889 int level = TREE_INT_CST_LOW (gimple_call_arg (call, 3));
1890 tree var = gimple_call_arg (call, 2);
1891 tree offset = gimple_call_arg (call, 5);
1892 tree var_type = TREE_TYPE (var);
1893 unsigned HOST_WIDE_INT limit
1894 = (tree_to_uhwi (offset)
1895 + tree_to_uhwi (TYPE_SIZE_UNIT (var_type)));
1896 reduction_size[level]
1897 = MAX (reduction_size[level], limit);
1898 }
1899 }
1900 break;
1901 case IFN_UNIQUE:
1902 {
1903 enum ifn_unique_kind kind
1904 = ((enum ifn_unique_kind)
1905 TREE_INT_CST_LOW (gimple_call_arg (call, 0)));
1906
1907 if (kind == IFN_UNIQUE_OACC_PRIVATE)
1908 {
1909 HOST_WIDE_INT level
1910 = TREE_INT_CST_LOW (gimple_call_arg (call, 2));
1911 if (level == -1)
1912 break;
1913 for (unsigned i = 3;
1914 i < gimple_call_num_args (call);
1915 i++)
1916 {
1917 tree arg = gimple_call_arg (call, i);
1918 gcc_assert (TREE_CODE (arg) == ADDR_EXPR);
1919 tree decl = TREE_OPERAND (arg, 0);
1920 unsigned HOST_WIDE_INT align = DECL_ALIGN_UNIT (decl);
1921 private_size[level] = ((private_size[level] + align - 1)
1922 & ~(align - 1));
1923 unsigned HOST_WIDE_INT decl_size
1924 = tree_to_uhwi (TYPE_SIZE_UNIT (TREE_TYPE (decl)));
1925 private_size[level] += decl_size;
1926 }
1927 }
1928 }
1929 break;
1930 }
1931 }
1932 }
1933
1934 int dims[GOMP_DIM_MAX];
1935 for (unsigned i = 0; i < GOMP_DIM_MAX; i++)
1936 dims[i] = oacc_get_fn_dim_size (current_function_decl, i);
1937
1938 /* Find bounds of shared-memory buffer space we can use. */
1939 unsigned HOST_WIDE_INT bounds_lo = 0, bounds_hi = 0;
1940 if (targetm.goacc.shared_mem_layout)
1941 targetm.goacc.shared_mem_layout (&bounds_lo, &bounds_hi, dims,
1942 private_size, reduction_size);
1943
1944 /* Perform worker partitioning unless we know 'num_workers(1)'. */
1945 if (dims[GOMP_DIM_WORKER] != 1)
1946 oacc_do_neutering (bounds_lo, bounds_hi);
1947
1948 return 0;
1949 }
1950
1951 namespace {
1952
1953 const pass_data pass_data_omp_oacc_neuter_broadcast =
1954 {
1955 GIMPLE_PASS, /* type */
1956 "omp_oacc_neuter_broadcast", /* name */
1957 OPTGROUP_OMP, /* optinfo_flags */
1958 TV_NONE, /* tv_id */
1959 PROP_cfg, /* properties_required */
1960 0, /* properties_provided */
1961 0, /* properties_destroyed */
1962 0, /* todo_flags_start */
1963 TODO_update_ssa | TODO_cleanup_cfg, /* todo_flags_finish */
1964 };
1965
1966 class pass_omp_oacc_neuter_broadcast : public gimple_opt_pass
1967 {
1968 public:
1969 pass_omp_oacc_neuter_broadcast (gcc::context *ctxt)
1970 : gimple_opt_pass (pass_data_omp_oacc_neuter_broadcast, ctxt)
1971 {}
1972
1973 /* opt_pass methods: */
1974 virtual bool gate (function *fun)
1975 {
1976 if (!flag_openacc)
1977 return false;
1978
1979 if (!targetm.goacc.create_worker_broadcast_record)
1980 return false;
1981
1982 /* Only relevant for OpenACC offloaded functions. */
1983 tree attr = oacc_get_fn_attrib (fun->decl);
1984 if (!attr)
1985 return false;
1986
1987 return true;
1988 }
1989
1990 virtual unsigned int execute (function *)
1991 {
1992 return execute_omp_oacc_neuter_broadcast ();
1993 }
1994
1995 }; // class pass_omp_oacc_neuter_broadcast
1996
1997 } // anon namespace
1998
1999 gimple_opt_pass *
2000 make_pass_omp_oacc_neuter_broadcast (gcc::context *ctxt)
2001 {
2002 return new pass_omp_oacc_neuter_broadcast (ctxt);
2003 }
2004