1 /* Bits of OpenMP and OpenACC handling that is specific to device offloading
2 and a lowering pass for OpenACC device directives.
3
4 Copyright (C) 2005-2022 Free Software Foundation, Inc.
5
6 This file is part of GCC.
7
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
12
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
21
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "tree.h"
28 #include "gimple.h"
29 #include "tree-pass.h"
30 #include "ssa.h"
31 #include "cgraph.h"
32 #include "pretty-print.h"
33 #include "diagnostic-core.h"
34 #include "fold-const.h"
35 #include "internal-fn.h"
36 #include "langhooks.h"
37 #include "gimplify.h"
38 #include "gimple-iterator.h"
39 #include "gimplify-me.h"
40 #include "gimple-walk.h"
41 #include "tree-cfg.h"
42 #include "tree-into-ssa.h"
43 #include "tree-nested.h"
44 #include "stor-layout.h"
45 #include "common/common-target.h"
46 #include "omp-general.h"
47 #include "omp-offload.h"
48 #include "lto-section-names.h"
49 #include "gomp-constants.h"
50 #include "gimple-pretty-print.h"
51 #include "intl.h"
52 #include "stringpool.h"
53 #include "attribs.h"
54 #include "cfgloop.h"
55 #include "context.h"
56 #include "convert.h"
57 #include "opts.h"
58
/* Describe the OpenACC looping structure of a function.  The entire
   function is held in a 'NULL' loop.  Loops form a tree via the
   parent/child/sibling links; the head/tail marker arrays are indexed
   by the GOMP_DIM_* partitioning axes.  */

struct oacc_loop
{
  oacc_loop *parent; /* Containing loop.  */

  oacc_loop *child; /* First inner loop.  */

  oacc_loop *sibling; /* Next loop within same parent.  */

  location_t loc; /* Location of the loop start.  */

  gcall *marker; /* Initial head marker.  */

  gcall *heads[GOMP_DIM_MAX]; /* Head marker functions.  */
  gcall *tails[GOMP_DIM_MAX]; /* Tail marker functions.  */

  tree routine; /* Pseudo-loop enclosing a routine.  */

  unsigned mask; /* Partitioning mask.  */
  unsigned e_mask; /* Partitioning of element loops (when tiling).  */
  unsigned inner; /* Partitioning of inner loops.  */
  unsigned flags; /* Partitioning flags.  */
  vec<gcall *> ifns; /* Contained loop abstraction functions.  */
  tree chunk_size; /* Chunk size.  */
  gcall *head_end; /* Final marker of head sequence.  */
};
87
/* Holds offload tables with decls: FUNCTION_DECLs of outlined offload
   regions and "omp declare target" VAR_DECLs, emitted into the offload
   tables by omp_finish_file.  */
vec<tree, va_gc> *offload_funcs, *offload_vars;
90
91 /* Return level at which oacc routine may spawn a partitioned loop, or
92 -1 if it is not a routine (i.e. is an offload fn). */
93
94 int
oacc_fn_attrib_level(tree attr)95 oacc_fn_attrib_level (tree attr)
96 {
97 tree pos = TREE_VALUE (attr);
98
99 if (!TREE_PURPOSE (pos))
100 return -1;
101
102 int ix = 0;
103 for (ix = 0; ix != GOMP_DIM_MAX;
104 ix++, pos = TREE_CHAIN (pos))
105 if (!integer_zerop (TREE_PURPOSE (pos)))
106 break;
107
108 return ix;
109 }
110
/* Helper function for omp_finish_file routine.  Takes decls from V_DECLS and
   adds their addresses and sizes to constructor-vector V_CTOR.  Functions
   contribute one slot (the address); variables contribute two (address and
   size).  */

static void
add_decls_addresses_to_decl_constructor (vec<tree, va_gc> *v_decls,
					 vec<constructor_elt, va_gc> *v_ctor)
{
  unsigned len = vec_safe_length (v_decls);
  for (unsigned i = 0; i < len; i++)
    {
      tree it = (*v_decls)[i];
      bool is_var = VAR_P (it);
      /* On the accelerator compiler a link var must additionally have
	 a value-expr pointing at its link pointer.  */
      bool is_link_var
	= is_var
#ifdef ACCEL_COMPILER
	  && DECL_HAS_VALUE_EXPR_P (it)
#endif
	  && lookup_attribute ("omp declare target link", DECL_ATTRIBUTES (it));

      /* See also omp_finish_file and output_offload_tables in lto-cgraph.cc.  */
      if (!in_lto_p && !symtab_node::get (it))
	continue;

      tree size = NULL_TREE;
      if (is_var)
	size = fold_convert (const_ptr_type_node, DECL_SIZE_UNIT (it));

      tree addr;
      if (!is_link_var)
	addr = build_fold_addr_expr (it);
      else
	{
#ifdef ACCEL_COMPILER
	  /* For "omp declare target link" vars add address of the pointer to
	     the target table, instead of address of the var.  */
	  tree value_expr = DECL_VALUE_EXPR (it);
	  tree link_ptr_decl = TREE_OPERAND (value_expr, 0);
	  varpool_node::finalize_decl (link_ptr_decl);
	  addr = build_fold_addr_expr (link_ptr_decl);
#else
	  addr = build_fold_addr_expr (it);
#endif

	  /* Most significant bit of the size marks "omp declare target link"
	     vars in host and target tables.  */
	  unsigned HOST_WIDE_INT isize = tree_to_uhwi (size);
	  isize |= 1ULL << (int_size_in_bytes (const_ptr_type_node)
			    * BITS_PER_UNIT - 1);
	  size = wide_int_to_tree (const_ptr_type_node, isize);
	}

      CONSTRUCTOR_APPEND_ELT (v_ctor, NULL_TREE, addr);
      if (is_var)
	CONSTRUCTOR_APPEND_ELT (v_ctor, NULL_TREE, size);
    }
}
167
168 /* Return true if DECL is a function for which its references should be
169 analyzed. */
170
171 static bool
omp_declare_target_fn_p(tree decl)172 omp_declare_target_fn_p (tree decl)
173 {
174 return (TREE_CODE (decl) == FUNCTION_DECL
175 && lookup_attribute ("omp declare target", DECL_ATTRIBUTES (decl))
176 && !lookup_attribute ("omp declare target host",
177 DECL_ATTRIBUTES (decl))
178 && (!flag_openacc
179 || oacc_get_fn_attrib (decl) == NULL_TREE));
180 }
181
182 /* Return true if DECL Is a variable for which its initializer references
183 should be analyzed. */
184
185 static bool
omp_declare_target_var_p(tree decl)186 omp_declare_target_var_p (tree decl)
187 {
188 return (VAR_P (decl)
189 && lookup_attribute ("omp declare target", DECL_ATTRIBUTES (decl))
190 && !lookup_attribute ("omp declare target link",
191 DECL_ATTRIBUTES (decl)));
192 }
193
/* Helper function for omp_discover_implicit_declare_target, called through
   walk_tree.  Mark referenced FUNCTION_DECLs implicitly as
   declare target to.  DATA points to a vec<tree> worklist; decls with a
   body that become declare-target here are pushed onto it so their own
   references get scanned in turn.  */

static tree
omp_discover_declare_target_tgt_fn_r (tree *tp, int *walk_subtrees, void *data)
{
  if (TREE_CODE (*tp) == CALL_EXPR
      && CALL_EXPR_FN (*tp)
      && TREE_CODE (CALL_EXPR_FN (*tp)) == ADDR_EXPR
      && TREE_CODE (TREE_OPERAND (CALL_EXPR_FN (*tp), 0)) == FUNCTION_DECL
      && lookup_attribute ("omp declare variant base",
			   DECL_ATTRIBUTES (TREE_OPERAND (CALL_EXPR_FN (*tp),
							  0))))
    {
      /* Call to a function with "declare variant" variants: recurse on
	 each variant decl recorded in the attribute list.  */
      tree fn = TREE_OPERAND (CALL_EXPR_FN (*tp), 0);
      for (tree attr = DECL_ATTRIBUTES (fn); attr; attr = TREE_CHAIN (attr))
	{
	  attr = lookup_attribute ("omp declare variant base", attr);
	  if (attr == NULL_TREE)
	    break;
	  tree purpose = TREE_PURPOSE (TREE_VALUE (attr));
	  if (TREE_CODE (purpose) == FUNCTION_DECL)
	    omp_discover_declare_target_tgt_fn_r (&purpose, walk_subtrees, data);
	}
    }
  else if (TREE_CODE (*tp) == FUNCTION_DECL)
    {
      tree decl = *tp;
      tree id = get_identifier ("omp declare target");
      symtab_node *node = symtab_node::get (*tp);
      if (node != NULL)
	{
	  /* First walk explicit alias_target links, marking each decl on
	     the way as offloadable declare-target.  */
	  while (node->alias_target
		 && TREE_CODE (node->alias_target) == FUNCTION_DECL)
	    {
	      if (!omp_declare_target_fn_p (node->decl)
		  && !lookup_attribute ("omp declare target host",
					DECL_ATTRIBUTES (node->decl)))
		{
		  node->offloadable = 1;
		  DECL_ATTRIBUTES (node->decl)
		    = tree_cons (id, NULL_TREE, DECL_ATTRIBUTES (node->decl));
		}
	      node = symtab_node::get (node->alias_target);
	    }
	  /* Then walk the remaining (analyzed) alias chain down to the
	     ultimate target, marking those as well.  */
	  symtab_node *new_node = node->ultimate_alias_target ();
	  decl = new_node->decl;
	  while (node != new_node)
	    {
	      if (!omp_declare_target_fn_p (node->decl)
		  && !lookup_attribute ("omp declare target host",
					DECL_ATTRIBUTES (node->decl)))
		{
		  node->offloadable = 1;
		  DECL_ATTRIBUTES (node->decl)
		    = tree_cons (id, NULL_TREE, DECL_ATTRIBUTES (node->decl));
		}
	      gcc_assert (node->alias && node->analyzed);
	      node = node->get_alias_target ();
	    }
	  node->offloadable = 1;
	  if (ENABLE_OFFLOADING)
	    g->have_offload = true;
	}
      /* Already declare target (or host-only) -- nothing more to do.  */
      if (omp_declare_target_fn_p (decl)
	  || lookup_attribute ("omp declare target host",
			       DECL_ATTRIBUTES (decl)))
	return NULL_TREE;

      /* Queue the body for scanning so transitively referenced decls are
	 discovered too.  */
      if (!DECL_EXTERNAL (decl) && DECL_SAVED_TREE (decl))
	((vec<tree> *) data)->safe_push (decl);
      DECL_ATTRIBUTES (decl) = tree_cons (id, NULL_TREE,
					  DECL_ATTRIBUTES (decl));
    }
  else if (TYPE_P (*tp))
    *walk_subtrees = 0;
  /* else if (TREE_CODE (*tp) == OMP_TARGET)
       {
	 if (tree dev = omp_find_clause (OMP_TARGET_CLAUSES (*tp)))
	   if (OMP_DEVICE_ANCESTOR (dev))
	     *walk_subtrees = 0;
       } */
  return NULL_TREE;
}
279
280 /* Similarly, but ignore references outside of OMP_TARGET regions. */
281
282 static tree
omp_discover_declare_target_fn_r(tree * tp,int * walk_subtrees,void * data)283 omp_discover_declare_target_fn_r (tree *tp, int *walk_subtrees, void *data)
284 {
285 if (TREE_CODE (*tp) == OMP_TARGET)
286 {
287 /* And not OMP_DEVICE_ANCESTOR. */
288 walk_tree_without_duplicates (&OMP_TARGET_BODY (*tp),
289 omp_discover_declare_target_tgt_fn_r,
290 data);
291 *walk_subtrees = 0;
292 }
293 else if (TYPE_P (*tp))
294 *walk_subtrees = 0;
295 return NULL_TREE;
296 }
297
/* Helper function for omp_discover_implicit_declare_target, called through
   walk_tree.  Mark referenced FUNCTION_DECLs implicitly as
   declare target to, and global VAR_DECLs likewise.  DATA points to a
   vec<tree> worklist of decls whose initializers still need scanning.  */

static tree
omp_discover_declare_target_var_r (tree *tp, int *walk_subtrees, void *data)
{
  if (TREE_CODE (*tp) == FUNCTION_DECL)
    return omp_discover_declare_target_tgt_fn_r (tp, walk_subtrees, data);
  else if (VAR_P (*tp)
	   && is_global_var (*tp)
	   && !omp_declare_target_var_p (*tp))
    {
      tree id = get_identifier ("omp declare target");
      /* A var cannot be both "link" and (implicit) "to"; diagnose and
	 drop the link attribute.  */
      if (lookup_attribute ("omp declare target link", DECL_ATTRIBUTES (*tp)))
	{
	  error_at (DECL_SOURCE_LOCATION (*tp),
		    "%qD specified both in declare target %<link%> and "
		    "implicitly in %<to%> clauses", *tp);
	  DECL_ATTRIBUTES (*tp)
	    = remove_attribute ("omp declare target link", DECL_ATTRIBUTES (*tp));
	}
      /* Queue the initializer for scanning too.  */
      if (TREE_STATIC (*tp) && lang_hooks.decls.omp_get_decl_init (*tp))
	((vec<tree> *) data)->safe_push (*tp);
      DECL_ATTRIBUTES (*tp) = tree_cons (id, NULL_TREE, DECL_ATTRIBUTES (*tp));
      symtab_node *node = symtab_node::get (*tp);
      if (node != NULL && !node->offloadable)
	{
	  node->offloadable = 1;
	  if (ENABLE_OFFLOADING)
	    {
	      g->have_offload = true;
	      if (is_a <varpool_node *> (node))
		vec_safe_push (offload_vars, node->decl);
	    }
	}
    }
  else if (TYPE_P (*tp))
    *walk_subtrees = 0;
  return NULL_TREE;
}
339
/* Perform the OpenMP implicit declare target to discovery.  Seeds a
   worklist with already-marked functions/variables (and functions that
   contain target regions), then scans each body or initializer,
   transitively marking everything referenced.  */

void
omp_discover_implicit_declare_target (void)
{
  cgraph_node *node;
  varpool_node *vnode;
  auto_vec<tree> worklist;

  /* Seed: declare-target functions and functions containing omp target
     regions, including nested functions.  */
  FOR_EACH_DEFINED_FUNCTION (node)
    if (DECL_SAVED_TREE (node->decl))
      {
	struct cgraph_node *cgn;
	if (omp_declare_target_fn_p (node->decl))
	  worklist.safe_push (node->decl);
	else if (DECL_STRUCT_FUNCTION (node->decl)
		 && DECL_STRUCT_FUNCTION (node->decl)->has_omp_target)
	  worklist.safe_push (node->decl);
	for (cgn = first_nested_function (node);
	     cgn; cgn = next_nested_function (cgn))
	  if (omp_declare_target_fn_p (cgn->decl))
	    worklist.safe_push (cgn->decl);
	  else if (DECL_STRUCT_FUNCTION (cgn->decl)
		   && DECL_STRUCT_FUNCTION (cgn->decl)->has_omp_target)
	    worklist.safe_push (cgn->decl);
      }
  /* Seed: declare-target variables with initializers.  */
  FOR_EACH_VARIABLE (vnode)
    if (lang_hooks.decls.omp_get_decl_init (vnode->decl)
	&& omp_declare_target_var_p (vnode->decl))
      worklist.safe_push (vnode->decl);
  /* Drain the worklist; the walkers may push further decls.  */
  while (!worklist.is_empty ())
    {
      tree decl = worklist.pop ();
      if (VAR_P (decl))
	walk_tree_without_duplicates (lang_hooks.decls.omp_get_decl_init (decl),
				      omp_discover_declare_target_var_r,
				      &worklist);
      else if (omp_declare_target_fn_p (decl))
	walk_tree_without_duplicates (&DECL_SAVED_TREE (decl),
				      omp_discover_declare_target_tgt_fn_r,
				      &worklist);
      else
	/* Not declare target itself: only references inside its target
	   regions count.  */
	walk_tree_without_duplicates (&DECL_SAVED_TREE (decl),
				      omp_discover_declare_target_fn_r,
				      &worklist);
    }

  lang_hooks.decls.omp_finish_decl_inits ();
}
389
390
/* Create new symbols containing (address, size) pairs for global variables,
   marked with "omp declare target" attribute, as well as addresses for the
   functions, which are outlined offloading regions.  With named-section
   support these become the .offload_func_table/.offload_var_table arrays;
   otherwise each symbol is handed to the target hook individually.  */
void
omp_finish_file (void)
{
  unsigned num_funcs = vec_safe_length (offload_funcs);
  unsigned num_vars = vec_safe_length (offload_vars);

  if (num_funcs == 0 && num_vars == 0)
    return;

  if (targetm_common.have_named_sections)
    {
      vec<constructor_elt, va_gc> *v_f, *v_v;
      vec_alloc (v_f, num_funcs);
      /* Each var contributes two slots: address and size.  */
      vec_alloc (v_v, num_vars * 2);

      add_decls_addresses_to_decl_constructor (offload_funcs, v_f);
      add_decls_addresses_to_decl_constructor (offload_vars, v_v);

      tree vars_decl_type = build_array_type_nelts (pointer_sized_int_node,
						    vec_safe_length (v_v));
      tree funcs_decl_type = build_array_type_nelts (pointer_sized_int_node,
						     num_funcs);
      SET_TYPE_ALIGN (vars_decl_type, TYPE_ALIGN (pointer_sized_int_node));
      SET_TYPE_ALIGN (funcs_decl_type, TYPE_ALIGN (pointer_sized_int_node));
      tree ctor_v = build_constructor (vars_decl_type, v_v);
      tree ctor_f = build_constructor (funcs_decl_type, v_f);
      TREE_CONSTANT (ctor_v) = TREE_CONSTANT (ctor_f) = 1;
      TREE_STATIC (ctor_v) = TREE_STATIC (ctor_f) = 1;
      tree funcs_decl = build_decl (UNKNOWN_LOCATION, VAR_DECL,
				    get_identifier (".offload_func_table"),
				    funcs_decl_type);
      tree vars_decl = build_decl (UNKNOWN_LOCATION, VAR_DECL,
				   get_identifier (".offload_var_table"),
				   vars_decl_type);
      TREE_STATIC (funcs_decl) = TREE_STATIC (vars_decl) = 1;
      /* Do not align tables more than TYPE_ALIGN (pointer_sized_int_node),
	 otherwise a joint table in a binary will contain padding between
	 tables from multiple object files.  */
      DECL_USER_ALIGN (funcs_decl) = DECL_USER_ALIGN (vars_decl) = 1;
      SET_DECL_ALIGN (funcs_decl, TYPE_ALIGN (funcs_decl_type));
      SET_DECL_ALIGN (vars_decl, TYPE_ALIGN (vars_decl_type));
      DECL_INITIAL (funcs_decl) = ctor_f;
      DECL_INITIAL (vars_decl) = ctor_v;
      set_decl_section_name (funcs_decl, OFFLOAD_FUNC_TABLE_SECTION_NAME);
      set_decl_section_name (vars_decl, OFFLOAD_VAR_TABLE_SECTION_NAME);

      varpool_node::finalize_decl (vars_decl);
      varpool_node::finalize_decl (funcs_decl);
    }
  else
    {
      /* No named sections: record each symbol via the target hook.  */
      for (unsigned i = 0; i < num_funcs; i++)
	{
	  tree it = (*offload_funcs)[i];
	  /* See also add_decls_addresses_to_decl_constructor
	     and output_offload_tables in lto-cgraph.cc.  */
	  if (!in_lto_p && !symtab_node::get (it))
	    continue;
	  targetm.record_offload_symbol (it);
	}
      for (unsigned i = 0; i < num_vars; i++)
	{
	  tree it = (*offload_vars)[i];
	  if (!in_lto_p && !symtab_node::get (it))
	    continue;
#ifdef ACCEL_COMPILER
	  /* For "declare target link" vars record the link pointer decl
	     instead of the var itself.  */
	  if (DECL_HAS_VALUE_EXPR_P (it)
	      && lookup_attribute ("omp declare target link",
				   DECL_ATTRIBUTES (it)))
	    {
	      tree value_expr = DECL_VALUE_EXPR (it);
	      tree link_ptr_decl = TREE_OPERAND (value_expr, 0);
	      targetm.record_offload_symbol (link_ptr_decl);
	      varpool_node::finalize_decl (link_ptr_decl);
	    }
	  else
#endif
	    targetm.record_offload_symbol (it);
	}
    }
}
475
476 /* Call dim_pos (POS == true) or dim_size (POS == false) builtins for
477 axis DIM. Return a tmp var holding the result. */
478
479 static tree
oacc_dim_call(bool pos,int dim,gimple_seq * seq)480 oacc_dim_call (bool pos, int dim, gimple_seq *seq)
481 {
482 tree arg = build_int_cst (unsigned_type_node, dim);
483 tree size = create_tmp_var (integer_type_node);
484 enum internal_fn fn = pos ? IFN_GOACC_DIM_POS : IFN_GOACC_DIM_SIZE;
485 gimple *call = gimple_build_call_internal (fn, 1, arg);
486
487 gimple_call_set_lhs (call, size);
488 gimple_seq_add_stmt (seq, call);
489
490 return size;
491 }
492
493 /* Find the number of threads (POS = false), or thread number (POS =
494 true) for an OpenACC region partitioned as MASK. Setup code
495 required for the calculation is added to SEQ. */
496
497 static tree
oacc_thread_numbers(bool pos,int mask,gimple_seq * seq)498 oacc_thread_numbers (bool pos, int mask, gimple_seq *seq)
499 {
500 tree res = pos ? NULL_TREE : build_int_cst (unsigned_type_node, 1);
501 unsigned ix;
502
503 /* Start at gang level, and examine relevant dimension indices. */
504 for (ix = GOMP_DIM_GANG; ix != GOMP_DIM_MAX; ix++)
505 if (GOMP_DIM_MASK (ix) & mask)
506 {
507 if (res)
508 {
509 /* We had an outer index, so scale that by the size of
510 this dimension. */
511 tree n = oacc_dim_call (false, ix, seq);
512 res = fold_build2 (MULT_EXPR, integer_type_node, res, n);
513 }
514 if (pos)
515 {
516 /* Determine index in this dimension. */
517 tree id = oacc_dim_call (true, ix, seq);
518 if (res)
519 res = fold_build2 (PLUS_EXPR, integer_type_node, res, id);
520 else
521 res = id;
522 }
523 }
524
525 if (res == NULL_TREE)
526 res = integer_zero_node;
527
528 return res;
529 }
530
/* Transform IFN_GOACC_LOOP calls to actual code.  See
   expand_oacc_for for where these are generated.  At the vector
   level, we stride loops, such that each member of a warp will
   operate on adjacent iterations.  At the worker and gang level,
   each gang/warp executes a set of contiguous iterations.  Chunking
   can override this such that each iteration engine executes a
   contiguous chunk, and then moves on to stride to the next chunk.

   CALL's arguments are: 0 = loop-kind code, 1 = direction, 2 = range,
   3 = step, 4 = chunk size, 5 = partitioning mask, 6 = (for OFFSET /
   BOUND) the chunk number / offset.  */

static void
oacc_xform_loop (gcall *call)
{
  gimple_stmt_iterator gsi = gsi_for_stmt (call);
  enum ifn_goacc_loop_kind code
    = (enum ifn_goacc_loop_kind) TREE_INT_CST_LOW (gimple_call_arg (call, 0));
  tree dir = gimple_call_arg (call, 1);
  tree range = gimple_call_arg (call, 2);
  tree step = gimple_call_arg (call, 3);
  tree chunk_size = NULL_TREE;
  unsigned mask = (unsigned) TREE_INT_CST_LOW (gimple_call_arg (call, 5));
  tree lhs = gimple_call_lhs (call);
  tree type = NULL_TREE;
  tree diff_type = TREE_TYPE (range);
  tree r = NULL_TREE;
  gimple_seq seq = NULL;
  bool chunking = false, striding = true;
  unsigned outer_mask = mask & (~mask + 1); // Outermost partitioning
  unsigned inner_mask = mask & ~outer_mask; // Inner partitioning (if any)

  /* Skip lowering if return value of IFN_GOACC_LOOP call is not used.  */
  if (!lhs)
    {
      gsi_replace_with_seq (&gsi, seq, true);
      return;
    }

  type = TREE_TYPE (lhs);

#ifdef ACCEL_COMPILER
  chunk_size = gimple_call_arg (call, 4);
  if (integer_minus_onep (chunk_size)  /* Force static allocation.  */
      || integer_zerop (chunk_size))   /* Default (also static).  */
    {
      /* If we're at the gang level, we want each to execute a
	 contiguous run of iterations.  Otherwise we want each element
	 to stride.  */
      striding = !(outer_mask & GOMP_DIM_MASK (GOMP_DIM_GANG));
      chunking = false;
    }
  else
    {
      /* Chunk of size 1 is striding.  */
      striding = integer_onep (chunk_size);
      chunking = !striding;
    }
#endif

  /* striding=true, chunking=true
       -> invalid.
     striding=true, chunking=false
       -> chunks=1
     striding=false,chunking=true
       -> chunks=ceil (range/(chunksize*threads*step))
     striding=false,chunking=false
       -> chunk_size=ceil(range/(threads*step)),chunks=1  */
  push_gimplify_context (true);

  switch (code)
    {
    default: gcc_unreachable ();

    case IFN_GOACC_LOOP_CHUNKS:
      if (!chunking)
	r = build_int_cst (type, 1);
      else
	{
	  /* chunk_max
	     = (range - dir) / (chunks * step * num_threads) + dir  */
	  tree per = oacc_thread_numbers (false, mask, &seq);
	  per = fold_convert (type, per);
	  chunk_size = fold_convert (type, chunk_size);
	  per = fold_build2 (MULT_EXPR, type, per, chunk_size);
	  per = fold_build2 (MULT_EXPR, type, per, step);
	  r = build2 (MINUS_EXPR, type, range, dir);
	  r = build2 (PLUS_EXPR, type, r, per);
	  r = build2 (TRUNC_DIV_EXPR, type, r, per);
	}
      break;

    case IFN_GOACC_LOOP_STEP:
      {
	/* If striding, step by the entire compute volume, otherwise
	   step by the inner volume.  */
	unsigned volume = striding ? mask : inner_mask;

	r = oacc_thread_numbers (false, volume, &seq);
	r = build2 (MULT_EXPR, type, fold_convert (type, r), step);
      }
      break;

    case IFN_GOACC_LOOP_OFFSET:
      /* Enable vectorization on non-SIMT targets.  */
      if (!targetm.simt.vf
	  && outer_mask == GOMP_DIM_MASK (GOMP_DIM_VECTOR)
	  /* If not -fno-tree-loop-vectorize, hint that we want to vectorize
	     the loop.  */
	  && (flag_tree_loop_vectorize
	      || !OPTION_SET_P (flag_tree_loop_vectorize)))
	{
	  basic_block bb = gsi_bb (gsi);
	  class loop *parent = bb->loop_father;
	  class loop *body = parent->inner;

	  parent->force_vectorize = true;
	  parent->safelen = INT_MAX;

	  /* "Chunking loops" may have inner loops.  */
	  if (parent->inner)
	    {
	      body->force_vectorize = true;
	      body->safelen = INT_MAX;
	    }

	  cfun->has_force_vectorize_loops = true;
	}
      if (striding)
	{
	  /* Striding: the offset is simply this thread's position.  */
	  r = oacc_thread_numbers (true, mask, &seq);
	  r = fold_convert (diff_type, r);
	}
      else
	{
	  /* Contiguous/chunked: compute span = chunk_size * inner
	     threads, then offset = outer-id * span + inner-id
	     (+ chunk displacement when chunking).  */
	  tree inner_size = oacc_thread_numbers (false, inner_mask, &seq);
	  tree outer_size = oacc_thread_numbers (false, outer_mask, &seq);
	  tree volume = fold_build2 (MULT_EXPR, TREE_TYPE (inner_size),
				     inner_size, outer_size);

	  volume = fold_convert (diff_type, volume);
	  if (chunking)
	    chunk_size = fold_convert (diff_type, chunk_size);
	  else
	    {
	      /* chunk_size = ceil (range / (threads * step)).  */
	      tree per = fold_build2 (MULT_EXPR, diff_type, volume, step);

	      chunk_size = build2 (MINUS_EXPR, diff_type, range, dir);
	      chunk_size = build2 (PLUS_EXPR, diff_type, chunk_size, per);
	      chunk_size = build2 (TRUNC_DIV_EXPR, diff_type, chunk_size, per);
	    }

	  tree span = build2 (MULT_EXPR, diff_type, chunk_size,
			      fold_convert (diff_type, inner_size));
	  r = oacc_thread_numbers (true, outer_mask, &seq);
	  r = fold_convert (diff_type, r);
	  r = build2 (MULT_EXPR, diff_type, r, span);

	  tree inner = oacc_thread_numbers (true, inner_mask, &seq);
	  inner = fold_convert (diff_type, inner);
	  r = fold_build2 (PLUS_EXPR, diff_type, r, inner);

	  if (chunking)
	    {
	      /* Displace by the chunk number (arg 6).  */
	      tree chunk = fold_convert (diff_type, gimple_call_arg (call, 6));
	      tree per
		= fold_build2 (MULT_EXPR, diff_type, volume, chunk_size);
	      per = build2 (MULT_EXPR, diff_type, per, chunk);

	      r = build2 (PLUS_EXPR, diff_type, r, per);
	    }
	}
      r = fold_build2 (MULT_EXPR, diff_type, r, step);
      if (type != diff_type)
	r = fold_convert (type, r);
      break;

    case IFN_GOACC_LOOP_BOUND:
      if (striding)
	r = range;
      else
	{
	  /* bound = min/max (offset + span * step, range), depending on
	     loop direction.  */
	  tree inner_size = oacc_thread_numbers (false, inner_mask, &seq);
	  tree outer_size = oacc_thread_numbers (false, outer_mask, &seq);
	  tree volume = fold_build2 (MULT_EXPR, TREE_TYPE (inner_size),
				     inner_size, outer_size);

	  volume = fold_convert (diff_type, volume);
	  if (chunking)
	    chunk_size = fold_convert (diff_type, chunk_size);
	  else
	    {
	      tree per = fold_build2 (MULT_EXPR, diff_type, volume, step);

	      chunk_size = build2 (MINUS_EXPR, diff_type, range, dir);
	      chunk_size = build2 (PLUS_EXPR, diff_type, chunk_size, per);
	      chunk_size = build2 (TRUNC_DIV_EXPR, diff_type, chunk_size, per);
	    }

	  tree span = build2 (MULT_EXPR, diff_type, chunk_size,
			      fold_convert (diff_type, inner_size));

	  r = fold_build2 (MULT_EXPR, diff_type, span, step);

	  tree offset = gimple_call_arg (call, 6);
	  r = build2 (PLUS_EXPR, diff_type, r,
		      fold_convert (diff_type, offset));
	  r = build2 (integer_onep (dir) ? MIN_EXPR : MAX_EXPR,
		      diff_type, r, range);
	}
      if (diff_type != type)
	r = fold_convert (type, r);
      break;
    }

  gimplify_assign (lhs, r, &seq);

  pop_gimplify_context (NULL);

  gsi_replace_with_seq (&gsi, seq, true);
}
748
/* Transform a GOACC_TILE call.  Determines the element loop span for
   the specified loop of the nest.  This is 1 if we're not tiling.

   GOACC_TILE (collapse_count, loop_no, tile_arg, gwv_tile, gwv_element);  */

static void
oacc_xform_tile (gcall *call)
{
  gimple_stmt_iterator gsi = gsi_for_stmt (call);
  unsigned collapse = tree_to_uhwi (gimple_call_arg (call, 0));
  /* Inner loops have higher loop_nos.  */
  unsigned loop_no = tree_to_uhwi (gimple_call_arg (call, 1));
  tree tile_size = gimple_call_arg (call, 2);
  unsigned e_mask = tree_to_uhwi (gimple_call_arg (call, 4));
  tree lhs = gimple_call_lhs (call);
  tree type = TREE_TYPE (lhs);
  gimple_seq seq = NULL;
  tree span = build_int_cst (type, 1);

  /* Element loops may only be vector and/or worker partitioned.  */
  gcc_assert (!(e_mask
		& ~(GOMP_DIM_MASK (GOMP_DIM_VECTOR)
		    | GOMP_DIM_MASK (GOMP_DIM_WORKER))));
  push_gimplify_context (!seen_error ());

#ifndef ACCEL_COMPILER
  /* Partitioning disabled on host compilers.  */
  e_mask = 0;
#endif
  if (!e_mask)
    /* Not partitioning.  */
    span = integer_one_node;
  else if (!integer_zerop (tile_size))
    /* User explicitly specified size.  */
    span = tile_size;
  else
    {
      /* Pick a size based on the partitioning of the element loop and
	 the number of loop nests.  */
      tree first_size = NULL_TREE;
      tree second_size = NULL_TREE;

      if (e_mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR))
	first_size = oacc_dim_call (false, GOMP_DIM_VECTOR, &seq);
      if (e_mask & GOMP_DIM_MASK (GOMP_DIM_WORKER))
	second_size = oacc_dim_call (false, GOMP_DIM_WORKER, &seq);

      /* Normalize so first_size is always set.  */
      if (!first_size)
	{
	  first_size = second_size;
	  second_size = NULL_TREE;
	}

      if (loop_no + 1 == collapse)
	{
	  /* Innermost loop: give it the whole element volume when it is
	     the only loop.  */
	  span = first_size;
	  if (!loop_no && second_size)
	    span = fold_build2 (MULT_EXPR, TREE_TYPE (span),
				span, second_size);
	}
      else if (loop_no + 2 == collapse)
	span = second_size;
      else
	span = NULL_TREE;

      if (!span)
	/* There's no obvious element size for this loop.  Options
	   are 1, first_size or some non-unity constant (32 is my
	   favourite).  We should gather some statistics.  */
	span = first_size;
    }

  span = fold_convert (type, span);
  gimplify_assign (lhs, span, &seq);

  pop_gimplify_context (NULL);

  gsi_replace_with_seq (&gsi, seq, true);
}
827
/* Default partitioned and minimum partitioned dimensions, indexed by
   GOMP_DIM_*.  Filled in by oacc_parse_default_dims and queried via
   oacc_get_default_dim / oacc_get_min_dim.  */

static int oacc_default_dims[GOMP_DIM_MAX];
static int oacc_min_dims[GOMP_DIM_MAX];
832
833 int
oacc_get_default_dim(int dim)834 oacc_get_default_dim (int dim)
835 {
836 gcc_assert (0 <= dim && dim < GOMP_DIM_MAX);
837 return oacc_default_dims[dim];
838 }
839
840 int
oacc_get_min_dim(int dim)841 oacc_get_min_dim (int dim)
842 {
843 gcc_assert (0 <= dim && dim < GOMP_DIM_MAX);
844 return oacc_min_dims[dim];
845 }
846
/* Parse the default dimension parameter.  This is a set of
   :-separated optional compute dimensions.  Each specified dimension
   is a positive integer.  When device type support is added, it is
   planned to be a comma separated list of such compute dimensions,
   with all but the first prefixed by the colon-terminated device
   type.  */

static void
oacc_parse_default_dims (const char *dims)
{
  int ix;

  /* Reset: -1 means "no default given", minimum is 1.  */
  for (ix = GOMP_DIM_MAX; ix--;)
    {
      oacc_default_dims[ix] = -1;
      oacc_min_dims[ix] = 1;
    }

#ifndef ACCEL_COMPILER
  /* Cannot be overridden on the host.  */
  dims = NULL;
#endif
  if (dims)
    {
      const char *pos = dims;

      for (ix = 0; *pos && ix != GOMP_DIM_MAX; ix++)
	{
	  if (ix)
	    {
	      /* Each dimension after the first is preceded by ':'.  */
	      if (*pos != ':')
		goto malformed;
	      pos++;
	    }

	  /* An empty slot (":" immediately follows) leaves that
	     dimension at its default.  */
	  if (*pos != ':')
	    {
	      long val;
	      const char *eptr;

	      errno = 0;
	      val = strtol (pos, CONST_CAST (char **, &eptr), 10);
	      /* Reject parse errors, non-positive values, and values
		 that do not fit in an int.  */
	      if (errno || val <= 0 || (int) val != val)
		goto malformed;
	      pos = eptr;
	      oacc_default_dims[ix] = (int) val;
	    }
	}
      if (*pos)
	{
	malformed:
	  error_at (UNKNOWN_LOCATION,
		    "%<-fopenacc-dim%> operand is malformed at %qs", pos);
	}
    }

  /* Allow the backend to validate the dimensions.  */
  targetm.goacc.validate_dims (NULL_TREE, oacc_default_dims, -1, 0);
  targetm.goacc.validate_dims (NULL_TREE, oacc_min_dims, -2, 0);
}
907
908 /* Validate and update the dimensions for offloaded FN. ATTRS is the
909 raw attribute. DIMS is an array of dimensions, which is filled in.
910 LEVEL is the partitioning level of a routine, or -1 for an offload
911 region itself. USED is the mask of partitioned execution in the
912 function. */
913
914 static void
oacc_validate_dims(tree fn,tree attrs,int * dims,int level,unsigned used)915 oacc_validate_dims (tree fn, tree attrs, int *dims, int level, unsigned used)
916 {
917 tree purpose[GOMP_DIM_MAX];
918 unsigned ix;
919 tree pos = TREE_VALUE (attrs);
920
921 /* Make sure the attribute creator attached the dimension
922 information. */
923 gcc_assert (pos);
924
925 for (ix = 0; ix != GOMP_DIM_MAX; ix++)
926 {
927 purpose[ix] = TREE_PURPOSE (pos);
928 tree val = TREE_VALUE (pos);
929 dims[ix] = val ? TREE_INT_CST_LOW (val) : -1;
930 pos = TREE_CHAIN (pos);
931 }
932
933 bool check = true;
934 #ifdef ACCEL_COMPILER
935 check = false;
936 #endif
937 if (check
938 && warn_openacc_parallelism
939 && !lookup_attribute ("oacc kernels", DECL_ATTRIBUTES (fn)))
940 {
941 static char const *const axes[] =
942 /* Must be kept in sync with GOMP_DIM enumeration. */
943 { "gang", "worker", "vector" };
944 for (ix = level >= 0 ? level : 0; ix != GOMP_DIM_MAX; ix++)
945 if (dims[ix] < 0)
946 ; /* Defaulting axis. */
947 else if ((used & GOMP_DIM_MASK (ix)) && dims[ix] == 1)
948 /* There is partitioned execution, but the user requested a
949 dimension size of 1. They're probably confused. */
950 warning_at (DECL_SOURCE_LOCATION (fn), OPT_Wopenacc_parallelism,
951 "region contains %s partitioned code but"
952 " is not %s partitioned", axes[ix], axes[ix]);
953 else if (!(used & GOMP_DIM_MASK (ix)) && dims[ix] != 1)
954 /* The dimension is explicitly partitioned to non-unity, but
955 no use is made within the region. */
956 warning_at (DECL_SOURCE_LOCATION (fn), OPT_Wopenacc_parallelism,
957 "region is %s partitioned but"
958 " does not contain %s partitioned code",
959 axes[ix], axes[ix]);
960 }
961
962 bool changed = targetm.goacc.validate_dims (fn, dims, level, used);
963
964 /* Default anything left to 1 or a partitioned default. */
965 for (ix = 0; ix != GOMP_DIM_MAX; ix++)
966 if (dims[ix] < 0)
967 {
968 /* The OpenACC spec says 'If the [num_gangs] clause is not
969 specified, an implementation-defined default will be used;
970 the default may depend on the code within the construct.'
971 (2.5.6). Thus an implementation is free to choose
972 non-unity default for a parallel region that doesn't have
973 any gang-partitioned loops. However, it appears that there
974 is a sufficient body of user code that expects non-gang
975 partitioned regions to not execute in gang-redundant mode.
976 So we (a) don't warn about the non-portability and (b) pick
977 the minimum permissible dimension size when there is no
978 partitioned execution. Otherwise we pick the global
979 default for the dimension, which the user can control. The
980 same wording and logic applies to num_workers and
981 vector_length, however the worker- or vector- single
982 execution doesn't have the same impact as gang-redundant
	   execution.  (If the minimum gang-level partitioning is not 1,
	   the target is probably too confusing.)  */
985 dims[ix] = (used & GOMP_DIM_MASK (ix)
986 ? oacc_default_dims[ix] : oacc_min_dims[ix]);
987 changed = true;
988 }
989
990 if (changed)
991 {
992 /* Replace the attribute with new values. */
993 pos = NULL_TREE;
994 for (ix = GOMP_DIM_MAX; ix--;)
995 pos = tree_cons (purpose[ix],
996 build_int_cst (integer_type_node, dims[ix]), pos);
997 oacc_replace_fn_attrib (fn, pos);
998 }
999 }
1000
1001 /* Create an empty OpenACC loop structure at LOC. */
1002
1003 static oacc_loop *
new_oacc_loop_raw(oacc_loop * parent,location_t loc)1004 new_oacc_loop_raw (oacc_loop *parent, location_t loc)
1005 {
1006 oacc_loop *loop = XCNEW (oacc_loop);
1007
1008 loop->parent = parent;
1009
1010 if (parent)
1011 {
1012 loop->sibling = parent->child;
1013 parent->child = loop;
1014 }
1015
1016 loop->loc = loc;
1017 return loop;
1018 }
1019
1020 /* Create an outermost, dummy OpenACC loop for offloaded function
1021 DECL. */
1022
1023 static oacc_loop *
new_oacc_loop_outer(tree decl)1024 new_oacc_loop_outer (tree decl)
1025 {
1026 return new_oacc_loop_raw (NULL, DECL_SOURCE_LOCATION (decl));
1027 }
1028
1029 /* Start a new OpenACC loop structure beginning at head marker HEAD.
1030 Link into PARENT loop. Return the new loop. */
1031
1032 static oacc_loop *
new_oacc_loop(oacc_loop * parent,gcall * marker)1033 new_oacc_loop (oacc_loop *parent, gcall *marker)
1034 {
1035 oacc_loop *loop = new_oacc_loop_raw (parent, gimple_location (marker));
1036
1037 loop->marker = marker;
1038
1039 /* TODO: This is where device_type flattening would occur for the loop
1040 flags. */
1041
1042 loop->flags = TREE_INT_CST_LOW (gimple_call_arg (marker, 3));
1043
1044 tree chunk_size = integer_zero_node;
1045 if (loop->flags & OLF_GANG_STATIC)
1046 chunk_size = gimple_call_arg (marker, 4);
1047 loop->chunk_size = chunk_size;
1048
1049 return loop;
1050 }
1051
1052 /* Create a dummy loop encompassing a call to a openACC routine.
1053 Extract the routine's partitioning requirements. */
1054
1055 static void
new_oacc_loop_routine(oacc_loop * parent,gcall * call,tree decl,tree attrs)1056 new_oacc_loop_routine (oacc_loop *parent, gcall *call, tree decl, tree attrs)
1057 {
1058 oacc_loop *loop = new_oacc_loop_raw (parent, gimple_location (call));
1059 int level = oacc_fn_attrib_level (attrs);
1060
1061 gcc_assert (level >= 0);
1062
1063 loop->marker = call;
1064 loop->routine = decl;
1065 loop->mask = ((GOMP_DIM_MASK (GOMP_DIM_MAX) - 1)
1066 ^ (GOMP_DIM_MASK (level) - 1));
1067 }
1068
1069 /* Finish off the current OpenACC loop ending at tail marker TAIL.
1070 Return the parent loop. */
1071
1072 static oacc_loop *
finish_oacc_loop(oacc_loop * loop)1073 finish_oacc_loop (oacc_loop *loop)
1074 {
1075 /* If the loop has been collapsed, don't partition it. */
1076 if (loop->ifns.is_empty ())
1077 loop->mask = loop->flags = 0;
1078 return loop->parent;
1079 }
1080
1081 /* Free all OpenACC loop structures within LOOP (inclusive). */
1082
1083 static void
free_oacc_loop(oacc_loop * loop)1084 free_oacc_loop (oacc_loop *loop)
1085 {
1086 if (loop->sibling)
1087 free_oacc_loop (loop->sibling);
1088 if (loop->child)
1089 free_oacc_loop (loop->child);
1090
1091 loop->ifns.release ();
1092 free (loop);
1093 }
1094
/* Dump out the OpenACC loop head or tail beginning at FROM.  */

static void
dump_oacc_loop_part (FILE *file, gcall *from, int depth,
		     const char *title, int level)
{
  /* FROM is a head or tail marker; print every statement up to (but not
     including) the next marker of the same kind.  */
  enum ifn_unique_kind kind
    = (enum ifn_unique_kind) TREE_INT_CST_LOW (gimple_call_arg (from, 0));

  fprintf (file, "%*s%s-%d:\n", depth * 2, "", title, level);
  for (gimple_stmt_iterator gsi = gsi_for_stmt (from);;)
    {
      gimple *stmt = gsi_stmt (gsi);

      if (gimple_call_internal_p (stmt, IFN_UNIQUE))
	{
	  enum ifn_unique_kind k
	    = ((enum ifn_unique_kind) TREE_INT_CST_LOW
	       (gimple_call_arg (stmt, 0)));

	  /* Another marker of the same kind terminates the sequence.  */
	  if (k == kind && stmt != from)
	    break;
	}
      print_gimple_stmt (file, stmt, depth * 2 + 2);

      /* Advance, crossing into the single successor block whenever we
	 run off the end of the current one.  */
      gsi_next (&gsi);
      while (gsi_end_p (gsi))
	gsi = gsi_start_bb (single_succ (gsi_bb (gsi)));
    }
}
1125
1126 /* Dump OpenACC loop LOOP, its children, and its siblings. */
1127
1128 static void
dump_oacc_loop(FILE * file,oacc_loop * loop,int depth)1129 dump_oacc_loop (FILE *file, oacc_loop *loop, int depth)
1130 {
1131 int ix;
1132
1133 fprintf (file, "%*sLoop %x(%x) %s:%u\n", depth * 2, "",
1134 loop->flags, loop->mask,
1135 LOCATION_FILE (loop->loc), LOCATION_LINE (loop->loc));
1136
1137 if (loop->marker)
1138 print_gimple_stmt (file, loop->marker, depth * 2);
1139
1140 if (loop->routine)
1141 fprintf (file, "%*sRoutine %s:%u:%s\n",
1142 depth * 2, "", DECL_SOURCE_FILE (loop->routine),
1143 DECL_SOURCE_LINE (loop->routine),
1144 IDENTIFIER_POINTER (DECL_NAME (loop->routine)));
1145
1146 for (ix = GOMP_DIM_GANG; ix != GOMP_DIM_MAX; ix++)
1147 if (loop->heads[ix])
1148 dump_oacc_loop_part (file, loop->heads[ix], depth, "Head", ix);
1149 for (ix = GOMP_DIM_MAX; ix--;)
1150 if (loop->tails[ix])
1151 dump_oacc_loop_part (file, loop->tails[ix], depth, "Tail", ix);
1152
1153 if (loop->child)
1154 dump_oacc_loop (file, loop->child, depth + 1);
1155 if (loop->sibling)
1156 dump_oacc_loop (file, loop->sibling, depth);
1157 }
1158
1159 void debug_oacc_loop (oacc_loop *);
1160
1161 /* Dump loops to stderr. */
1162
1163 DEBUG_FUNCTION void
debug_oacc_loop(oacc_loop * loop)1164 debug_oacc_loop (oacc_loop *loop)
1165 {
1166 dump_oacc_loop (stderr, loop, 0);
1167 }
1168
1169 /* Provide diagnostics on OpenACC loop LOOP, its children, and its
1170 siblings. */
1171
1172 static void
inform_oacc_loop(const oacc_loop * loop)1173 inform_oacc_loop (const oacc_loop *loop)
1174 {
1175 const char *gang
1176 = loop->mask & GOMP_DIM_MASK (GOMP_DIM_GANG) ? " gang" : "";
1177 const char *worker
1178 = loop->mask & GOMP_DIM_MASK (GOMP_DIM_WORKER) ? " worker" : "";
1179 const char *vector
1180 = loop->mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR) ? " vector" : "";
1181 const char *seq = loop->mask == 0 ? " seq" : "";
1182 const dump_user_location_t loc
1183 = dump_user_location_t::from_location_t (loop->loc);
1184 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, loc,
1185 "assigned OpenACC%s%s%s%s loop parallelism\n", gang, worker,
1186 vector, seq);
1187
1188 if (loop->child)
1189 inform_oacc_loop (loop->child);
1190 if (loop->sibling)
1191 inform_oacc_loop (loop->sibling);
1192 }
1193
/* DFS walk of basic blocks BB onwards, creating OpenACC loop
   structures as we go.  By construction these loops are properly
   nested.  */

static void
oacc_loop_discover_walk (oacc_loop *loop, basic_block bb)
{
  /* MARKER counts head/tail markers already seen in the current
     sequence; REMAINING counts those still expected.  */
  int marker = 0;
  int remaining = 0;

  if (bb->flags & BB_VISITED)
    return;

 follow:
  bb->flags |= BB_VISITED;

  /* Scan for loop markers.  */
  for (gimple_stmt_iterator gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
       gsi_next (&gsi))
    {
      gimple *stmt = gsi_stmt (gsi);

      if (!is_gimple_call (stmt))
	continue;

      gcall *call = as_a <gcall *> (stmt);

      /* If this is a routine, make a dummy loop for it.  */
      if (tree decl = gimple_call_fndecl (call))
	if (tree attrs = oacc_get_fn_attrib (decl))
	  {
	    gcc_assert (!marker);
	    new_oacc_loop_routine (loop, call, decl, attrs);
	  }

      if (!gimple_call_internal_p (call))
	continue;

      switch (gimple_call_internal_fn (call))
	{
	default:
	  break;

	case IFN_GOACC_LOOP:
	case IFN_GOACC_TILE:
	  /* Record the abstraction function, so we can manipulate it
	     later.  */
	  loop->ifns.safe_push (call);
	  break;

	case IFN_UNIQUE:
	  enum ifn_unique_kind kind
	    = (enum ifn_unique_kind) (TREE_INT_CST_LOW
				      (gimple_call_arg (call, 0)));
	  if (kind == IFN_UNIQUE_OACC_HEAD_MARK
	      || kind == IFN_UNIQUE_OACC_TAIL_MARK)
	    {
	      /* A marker with only two arguments terminates the
		 sequence: a tail closes the loop, a head records where
		 the head sequence ends.  */
	      if (gimple_call_num_args (call) == 2)
		{
		  gcc_assert (marker && !remaining);
		  marker = 0;
		  if (kind == IFN_UNIQUE_OACC_TAIL_MARK)
		    loop = finish_oacc_loop (loop);
		  else
		    loop->head_end = call;
		}
	      else
		{
		  /* Argument 2 carries the total number of markers in
		     this sequence.  */
		  int count = TREE_INT_CST_LOW (gimple_call_arg (call, 2));

		  if (!marker)
		    {
		      /* First marker of a sequence: a head mark opens a
			 new loop.  */
		      if (kind == IFN_UNIQUE_OACC_HEAD_MARK)
			loop = new_oacc_loop (loop, call);
		      remaining = count;
		    }
		  gcc_assert (count == remaining);
		  if (remaining)
		    {
		      remaining--;
		      /* Heads are recorded outermost-first, tails
			 innermost-first.  */
		      if (kind == IFN_UNIQUE_OACC_HEAD_MARK)
			loop->heads[marker] = call;
		      else
			loop->tails[remaining] = call;
		    }
		  marker++;
		}
	    }
	}
    }
  /* An unfinished marker sequence must continue in the sole successor
     block; follow it directly.  */
  if (remaining || marker)
    {
      bb = single_succ (bb);
      gcc_assert (single_pred_p (bb) && !(bb->flags & BB_VISITED));
      goto follow;
    }

  /* Walk successor blocks.  */
  edge e;
  edge_iterator ei;

  FOR_EACH_EDGE (e, ei, bb->succs)
    oacc_loop_discover_walk (loop, e->dest);
}
1298
1299 /* LOOP is the first sibling. Reverse the order in place and return
1300 the new first sibling. Recurse to child loops. */
1301
1302 static oacc_loop *
oacc_loop_sibling_nreverse(oacc_loop * loop)1303 oacc_loop_sibling_nreverse (oacc_loop *loop)
1304 {
1305 oacc_loop *last = NULL;
1306 do
1307 {
1308 if (loop->child)
1309 loop->child = oacc_loop_sibling_nreverse (loop->child);
1310
1311 oacc_loop *next = loop->sibling;
1312 loop->sibling = last;
1313 last = loop;
1314 loop = next;
1315 }
1316 while (loop);
1317
1318 return last;
1319 }
1320
1321 /* Discover the OpenACC loops marked up by HEAD and TAIL markers for
1322 the current function. */
1323
1324 static oacc_loop *
oacc_loop_discovery()1325 oacc_loop_discovery ()
1326 {
1327 /* Clear basic block flags, in particular BB_VISITED which we're going to use
1328 in the following. */
1329 clear_bb_flags ();
1330
1331 oacc_loop *top = new_oacc_loop_outer (current_function_decl);
1332 oacc_loop_discover_walk (top, ENTRY_BLOCK_PTR_FOR_FN (cfun));
1333
1334 /* The siblings were constructed in reverse order, reverse them so
1335 that diagnostics come out in an unsurprising order. */
1336 top = oacc_loop_sibling_nreverse (top);
1337
1338 return top;
1339 }
1340
/* Transform the abstract internal function markers starting at FROM
   to be for partitioning level LEVEL.  Stop when we meet another HEAD
   or TAIL marker.  */

static void
oacc_loop_xform_head_tail (gcall *from, int level)
{
  enum ifn_unique_kind kind
    = (enum ifn_unique_kind) TREE_INT_CST_LOW (gimple_call_arg (from, 0));
  /* The concrete partitioning level to substitute into each marker.  */
  tree replacement = build_int_cst (unsigned_type_node, level);

  for (gimple_stmt_iterator gsi = gsi_for_stmt (from);;)
    {
      gimple *stmt = gsi_stmt (gsi);

      if (gimple_call_internal_p (stmt, IFN_UNIQUE))
	{
	  enum ifn_unique_kind k
	    = ((enum ifn_unique_kind)
	       TREE_INT_CST_LOW (gimple_call_arg (stmt, 0)));

	  /* FORK/JOIN/PRIVATE markers take the level in argument 2.  */
	  if (k == IFN_UNIQUE_OACC_FORK
	      || k == IFN_UNIQUE_OACC_JOIN
	      || k == IFN_UNIQUE_OACC_PRIVATE)
	    *gimple_call_arg_ptr (stmt, 2) = replacement;
	  /* Another marker of FROM's kind ends the sequence.  */
	  else if (k == kind && stmt != from)
	    break;
	}
      /* Reductions take the level in argument 3.  */
      else if (gimple_call_internal_p (stmt, IFN_GOACC_REDUCTION))
	*gimple_call_arg_ptr (stmt, 3) = replacement;
      update_stmt (stmt);

      /* Advance, crossing into the single successor block whenever we
	 run off the end of the current one.  */
      gsi_next (&gsi);
      while (gsi_end_p (gsi))
	gsi = gsi_start_bb (single_succ (gsi_bb (gsi)));
    }
}
1378
/* Process the discovered OpenACC loops, setting the correct
   partitioning level etc.  */

static void
oacc_loop_process (oacc_loop *loop, int fn_level)
{
  if (loop->child)
    oacc_loop_process (loop->child, fn_level);

  if (loop->mask && !loop->routine)
    {
      int ix;
      tree mask_arg = build_int_cst (unsigned_type_node, loop->mask);
      tree e_mask_arg = build_int_cst (unsigned_type_node, loop->e_mask);
      tree chunk_arg = loop->chunk_size;
      gcall *call;

      /* Substitute the now-known masks (and chunk size) into the
	 recorded IFN_GOACC_LOOP/TILE calls.  */
      for (ix = 0; loop->ifns.iterate (ix, &call); ix++)
	{
	  switch (gimple_call_internal_fn (call))
	    {
	    case IFN_GOACC_LOOP:
	      {
		/* Argument 5 preset to -1 marks an element-loop
		   variant; it receives the element mask and no chunk
		   size.  */
		bool is_e = gimple_call_arg (call, 5) == integer_minus_one_node;
		gimple_call_set_arg (call, 5, is_e ? e_mask_arg : mask_arg);
		if (!is_e)
		  gimple_call_set_arg (call, 4, chunk_arg);
	      }
	      break;

	    case IFN_GOACC_TILE:
	      gimple_call_set_arg (call, 3, mask_arg);
	      gimple_call_set_arg (call, 4, e_mask_arg);
	      break;

	    default:
	      gcc_unreachable ();
	    }
	  update_stmt (call);
	}

      /* Map the IXth recorded head/tail marker to the IXth dimension
	 actually used by this loop, outermost first.  */
      unsigned dim = GOMP_DIM_GANG;
      unsigned mask = loop->mask | loop->e_mask;
      for (ix = 0; ix != GOMP_DIM_MAX && mask; ix++)
	{
	  while (!(GOMP_DIM_MASK (dim) & mask))
	    dim++;

	  oacc_loop_xform_head_tail (loop->heads[ix], dim);
	  oacc_loop_xform_head_tail (loop->tails[ix], dim);

	  mask ^= GOMP_DIM_MASK (dim);
	}
    }

  if (loop->sibling)
    oacc_loop_process (loop->sibling, fn_level);


  /* OpenACC 2.6, 2.9.11. "reduction clause" places a restriction such that
     "The 'reduction' clause may not be specified on an orphaned 'loop'
     construct with the 'gang' clause, or on an orphaned 'loop' construct that
     will generate gang parallelism in a procedure that is compiled with the
     'routine gang' clause."  */
  if (fn_level == GOMP_DIM_GANG
      && (loop->mask & GOMP_DIM_MASK (GOMP_DIM_GANG))
      && (loop->flags & OLF_REDUCTION))
    error_at (loop->loc,
	      "gang reduction on an orphan loop");
}
1449
/* Walk the OpenACC loop hierarchy checking and assigning the
   programmer-specified partitionings.  OUTER_MASK is the partitioning
   this loop is contained within.  Return mask of partitioning
   encountered.  If any auto loops are discovered, set GOMP_DIM_MAX
   bit.  */

static unsigned
oacc_loop_fixed_partitions (oacc_loop *loop, unsigned outer_mask)
{
  unsigned this_mask = loop->mask;
  unsigned mask_all = 0;
  bool noisy = true;

#ifdef ACCEL_COMPILER
  /* When device_type is supported, we want the device compiler to be
     noisy, if the loop parameters are device_type-specific.  */
  noisy = false;
#endif

  if (!loop->routine)
    {
      bool auto_par = (loop->flags & OLF_AUTO) != 0;
      bool seq_par = (loop->flags & OLF_SEQ) != 0;
      bool tiling = (loop->flags & OLF_TILE) != 0;

      /* The user-requested axes live in the OLF_DIM_BASE bits of the
	 flags.  */
      this_mask = ((loop->flags >> OLF_DIM_BASE)
		   & (GOMP_DIM_MASK (GOMP_DIM_MAX) - 1));

      /* Apply auto partitioning if this is a non-partitioned regular
	 loop, or (no more than) single axis tiled loop.  */
      bool maybe_auto
	= !seq_par && this_mask == (tiling ? this_mask & -this_mask : 0);

      /* At most one of explicit axes, 'auto' and 'seq' may be given.  */
      if ((this_mask != 0) + auto_par + seq_par > 1)
	{
	  if (noisy)
	    error_at (loop->loc,
		      seq_par
		      ? G_("%<seq%> overrides other OpenACC loop specifiers")
		      : G_("%<auto%> conflicts with other OpenACC loop "
			   "specifiers"));
	  maybe_auto = false;
	  loop->flags &= ~OLF_AUTO;
	  if (seq_par)
	    {
	      loop->flags
		&= ~((GOMP_DIM_MASK (GOMP_DIM_MAX) - 1) << OLF_DIM_BASE);
	      this_mask = 0;
	    }
	}

      if (maybe_auto && (loop->flags & OLF_INDEPENDENT))
	{
	  /* Flag the presence of an auto-partitionable loop via the
	     GOMP_DIM_MAX bit; the caller runs the auto pass.  */
	  loop->flags |= OLF_AUTO;
	  mask_all |= GOMP_DIM_MASK (GOMP_DIM_MAX);
	}
    }

  if (this_mask & outer_mask)
    {
      /* This loop requests an axis already used outside it: either by
	 a containing loop, or disallowed by the containing routine.  */
      const oacc_loop *outer;
      for (outer = loop->parent; outer; outer = outer->parent)
	if ((outer->mask | outer->e_mask) & this_mask)
	  break;

      if (noisy)
	{
	  if (outer)
	    {
	      error_at (loop->loc,
			loop->routine
			? G_("routine call uses same OpenACC parallelism"
			     " as containing loop")
			: G_("inner loop uses same OpenACC parallelism"
			     " as containing loop"));
	      inform (outer->loc, "containing loop here");
	    }
	  else
	    error_at (loop->loc,
		      loop->routine
		      ? G_("routine call uses OpenACC parallelism disallowed"
			   " by containing routine")
		      : G_("loop uses OpenACC parallelism disallowed"
			   " by containing routine"));

	  if (loop->routine)
	    inform (DECL_SOURCE_LOCATION (loop->routine),
		    "routine %qD declared here", loop->routine);
	}
      this_mask &= ~outer_mask;
    }
  else
    {
      unsigned outermost = least_bit_hwi (this_mask);

      /* The outermost axis requested here must nest strictly inside
	 the containing partitioning.  */
      if (outermost && outermost <= outer_mask)
	{
	  if (noisy)
	    {
	      error_at (loop->loc,
			"incorrectly nested OpenACC loop parallelism");

	      const oacc_loop *outer;
	      for (outer = loop->parent;
		   outer->flags && outer->flags < outermost;
		   outer = outer->parent)
		continue;
	      inform (outer->loc, "containing loop here");
	    }

	  this_mask &= ~outermost;
	}
    }

  mask_all |= this_mask;

  if (loop->flags & OLF_TILE)
    {
      /* When tiling, vector goes to the element loop, and failing
	 that we put worker there.  The std doesn't contemplate
	 specifying all three.  We choose to put worker and vector on
	 the element loops in that case.  */
      unsigned this_e_mask = this_mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR);
      if (!this_e_mask || this_mask & GOMP_DIM_MASK (GOMP_DIM_GANG))
	this_e_mask |= this_mask & GOMP_DIM_MASK (GOMP_DIM_WORKER);

      loop->e_mask = this_e_mask;
      this_mask ^= this_e_mask;
    }

  loop->mask = this_mask;

  if (dump_file)
    fprintf (dump_file, "Loop %s:%d user specified %d & %d\n",
	     LOCATION_FILE (loop->loc), LOCATION_LINE (loop->loc),
	     loop->mask, loop->e_mask);

  if (loop->child)
    {
      /* Children nest inside both our masks and everything outside.  */
      unsigned tmp_mask = outer_mask | this_mask | loop->e_mask;
      loop->inner = oacc_loop_fixed_partitions (loop->child, tmp_mask);
      mask_all |= loop->inner;
    }

  if (loop->sibling)
    mask_all |= oacc_loop_fixed_partitions (loop->sibling, outer_mask);

  return mask_all;
}
1599
/* Walk the OpenACC loop hierarchy to assign auto-partitioned loops.
   OUTER_MASK is the partitioning this loop is contained within.
   OUTER_ASSIGN is true if an outer loop is being auto-partitioned.
   Return the cumulative partitioning used by this loop, siblings and
   children.  */

static unsigned
oacc_loop_auto_partitions (oacc_loop *loop, unsigned outer_mask,
			   bool outer_assign)
{
  bool assign = (loop->flags & OLF_AUTO) && (loop->flags & OLF_INDEPENDENT);
  bool noisy = true;
  bool tiling = loop->flags & OLF_TILE;

#ifdef ACCEL_COMPILER
  /* When device_type is supported, we want the device compiler to be
     noisy, if the loop parameters are device_type-specific.  */
  noisy = false;
#endif

  /* First (outermost-assignment) phase: only for loops that are not
     themselves inside another auto-assigned loop, or that have inner
     partitioning to stay outside of.  */
  if (assign && (!outer_assign || loop->inner))
    {
      /* Allocate outermost and non-innermost loops at the outermost
	 non-innermost available level.  */
      unsigned this_mask = GOMP_DIM_MASK (GOMP_DIM_GANG);

      /* Find the first outermost available partition.  */
      while (this_mask <= outer_mask)
	this_mask <<= 1;

      /* Grab two axes if tiling, and we've not assigned anything */
      if (tiling && !(loop->mask | loop->e_mask))
	this_mask |= this_mask << 1;

      /* Prohibit the innermost partitioning at the moment.  */
      this_mask &= GOMP_DIM_MASK (GOMP_DIM_MAX - 1) - 1;

      /* Don't use any dimension explicitly claimed by an inner loop.  */
      this_mask &= ~loop->inner;

      if (tiling && !loop->e_mask)
	{
	  /* If we got two axes, allocate the inner one to the element
	     loop.  */
	  loop->e_mask = this_mask & (this_mask << 1);
	  this_mask ^= loop->e_mask;
	}

      loop->mask |= this_mask;
    }

  if (loop->child)
    {
      unsigned tmp_mask = outer_mask | loop->mask | loop->e_mask;
      loop->inner = oacc_loop_auto_partitions (loop->child, tmp_mask,
					       outer_assign | assign);
    }

  /* Second (innermost-assignment) phase, after the children have been
     processed and LOOP->inner is known.  */
  if (assign && (!loop->mask || (tiling && !loop->e_mask) || !outer_assign))
    {
      /* Allocate the loop at the innermost available level.  Note
	 that we do this even if we already assigned this loop the
	 outermost available level above.  That way we'll partition
	 this along 2 axes, if they are available.  */
      unsigned this_mask = 0;

      /* Determine the outermost partitioning used within this loop.  */
      this_mask = loop->inner | GOMP_DIM_MASK (GOMP_DIM_MAX);
      this_mask = least_bit_hwi (this_mask);

      /* Pick the partitioning just inside that one.  */
      this_mask >>= 1;

      /* And avoid picking one use by an outer loop.  */
      this_mask &= ~outer_mask;

      /* If tiling and we failed completely above, grab the next one
	 too.  Making sure it doesn't hit an outer loop.  */
      if (tiling)
	{
	  this_mask &= ~(loop->e_mask | loop->mask);
	  unsigned tile_mask = ((this_mask >> 1)
				& ~(outer_mask | loop->e_mask | loop->mask));

	  if (tile_mask || loop->mask)
	    {
	      loop->e_mask |= this_mask;
	      this_mask = tile_mask;
	    }
	  if (!loop->e_mask && noisy)
	    warning_at (loop->loc, 0,
			"insufficient partitioning available"
			" to parallelize element loop");
	}

      loop->mask |= this_mask;
      if (!loop->mask && noisy)
	warning_at (loop->loc, 0,
		    tiling
		    ? G_("insufficient partitioning available"
			 " to parallelize tile loop")
		    : G_("insufficient partitioning available"
			 " to parallelize loop"));
    }

  if (assign && dump_file)
    fprintf (dump_file, "Auto loop %s:%d assigned %d & %d\n",
	     LOCATION_FILE (loop->loc), LOCATION_LINE (loop->loc),
	     loop->mask, loop->e_mask);

  unsigned inner_mask = 0;

  if (loop->sibling)
    inner_mask |= oacc_loop_auto_partitions (loop->sibling,
					     outer_mask, outer_assign);

  inner_mask |= loop->inner | loop->mask | loop->e_mask;

  return inner_mask;
}
1720
1721 /* Walk the OpenACC loop heirarchy to check and assign partitioning
1722 axes. Return mask of partitioning. */
1723
1724 static unsigned
oacc_loop_partition(oacc_loop * loop,unsigned outer_mask)1725 oacc_loop_partition (oacc_loop *loop, unsigned outer_mask)
1726 {
1727 unsigned mask_all = oacc_loop_fixed_partitions (loop, outer_mask);
1728
1729 if (mask_all & GOMP_DIM_MASK (GOMP_DIM_MAX))
1730 {
1731 mask_all ^= GOMP_DIM_MASK (GOMP_DIM_MAX);
1732 mask_all |= oacc_loop_auto_partitions (loop, outer_mask, false);
1733 }
1734 return mask_all;
1735 }
1736
1737 /* Default fork/join early expander. Delete the function calls if
1738 there is no RTL expander. */
1739
1740 bool
default_goacc_fork_join(gcall * ARG_UNUSED (call),const int * ARG_UNUSED (dims),bool is_fork)1741 default_goacc_fork_join (gcall *ARG_UNUSED (call),
1742 const int *ARG_UNUSED (dims), bool is_fork)
1743 {
1744 if (is_fork)
1745 return targetm.have_oacc_fork ();
1746 else
1747 return targetm.have_oacc_join ();
1748 }
1749
1750 /* Default goacc.reduction early expander.
1751
1752 LHS-opt = IFN_REDUCTION (KIND, RES_PTR, VAR, LEVEL, OP, OFFSET)
1753 If RES_PTR is not integer-zerop:
1754 SETUP - emit 'LHS = *RES_PTR', LHS = NULL
1755 TEARDOWN - emit '*RES_PTR = VAR'
1756 If LHS is not NULL
1757 emit 'LHS = VAR' */
1758
1759 void
default_goacc_reduction(gcall * call)1760 default_goacc_reduction (gcall *call)
1761 {
1762 unsigned code = (unsigned)TREE_INT_CST_LOW (gimple_call_arg (call, 0));
1763 gimple_stmt_iterator gsi = gsi_for_stmt (call);
1764 tree lhs = gimple_call_lhs (call);
1765 tree var = gimple_call_arg (call, 2);
1766 gimple_seq seq = NULL;
1767
1768 if (code == IFN_GOACC_REDUCTION_SETUP
1769 || code == IFN_GOACC_REDUCTION_TEARDOWN)
1770 {
1771 /* Setup and Teardown need to copy from/to the receiver object,
1772 if there is one. */
1773 tree ref_to_res = gimple_call_arg (call, 1);
1774
1775 if (!integer_zerop (ref_to_res))
1776 {
1777 tree dst = build_simple_mem_ref (ref_to_res);
1778 tree src = var;
1779
1780 if (code == IFN_GOACC_REDUCTION_SETUP)
1781 {
1782 src = dst;
1783 dst = lhs;
1784 lhs = NULL;
1785 }
1786 gimple_seq_add_stmt (&seq, gimple_build_assign (dst, src));
1787 }
1788 }
1789
1790 /* Copy VAR to LHS, if there is an LHS. */
1791 if (lhs)
1792 gimple_seq_add_stmt (&seq, gimple_build_assign (lhs, var));
1793
1794 gsi_replace_with_seq (&gsi, seq, true);
1795 }
1796
/* State shared with the oacc_rewrite_var_decl tree walker.  */

struct var_decl_rewrite_info
{
  gimple *stmt;		/* Statement currently being rewritten.  */
  hash_map<tree, tree> *adjusted_vars;	/* Maps original decls to their
					   replacements.  */
  bool avoid_pointer_conversion;	/* Rewrite ADDR_EXPRs in place rather
					   than via inserted pointer-conversion
					   statements.  */
  bool modified;	/* Set when any rewrite was performed.  */
};
1804
/* Helper function for execute_oacc_device_lower.  Rewrite VAR_DECLs (by
   themselves or wrapped in various other nodes) according to ADJUSTED_VARS in
   the var_decl_rewrite_info pointed to via DATA.  Used as part of coercing
   gang-private variables in OpenACC offload regions to reside in GPU shared
   memory.  */

static tree
oacc_rewrite_var_decl (tree *tp, int *walk_subtrees, void *data)
{
  walk_stmt_info *wi = (walk_stmt_info *) data;
  var_decl_rewrite_info *info = (var_decl_rewrite_info *) wi->info;

  if (TREE_CODE (*tp) == ADDR_EXPR)
    {
      tree arg = TREE_OPERAND (*tp, 0);
      tree *new_arg = info->adjusted_vars->get (arg);

      if (new_arg)
	{
	  if (info->avoid_pointer_conversion)
	    {
	      /* Substitute the replacement decl's address directly.  */
	      *tp = build_fold_addr_expr (*new_arg);
	      info->modified = true;
	      *walk_subtrees = 0;
	    }
	  else
	    {
	      /* Take the replacement decl's address into an SSA name,
		 then convert it to the original pointer type in a
		 second statement; both are inserted before the
		 statement being rewritten.  */
	      gimple_stmt_iterator gsi = gsi_for_stmt (info->stmt);
	      tree repl = build_fold_addr_expr (*new_arg);
	      gimple *stmt1
		= gimple_build_assign (make_ssa_name (TREE_TYPE (repl)), repl);
	      tree conv = convert_to_pointer (TREE_TYPE (*tp),
					      gimple_assign_lhs (stmt1));
	      gimple *stmt2
		= gimple_build_assign (make_ssa_name (TREE_TYPE (*tp)), conv);
	      gsi_insert_before (&gsi, stmt1, GSI_SAME_STMT);
	      gsi_insert_before (&gsi, stmt2, GSI_SAME_STMT);
	      *tp = gimple_assign_lhs (stmt2);
	      info->modified = true;
	      *walk_subtrees = 0;
	    }
	}
    }
  else if (TREE_CODE (*tp) == COMPONENT_REF || TREE_CODE (*tp) == ARRAY_REF)
    {
      /* Find the innermost base of a (possibly nested) component or
	 array reference.  */
      tree *base = &TREE_OPERAND (*tp, 0);

      while (TREE_CODE (*base) == COMPONENT_REF
	     || TREE_CODE (*base) == ARRAY_REF)
	base = &TREE_OPERAND (*base, 0);

      if (TREE_CODE (*base) != VAR_DECL)
	return NULL;

      tree *new_decl = info->adjusted_vars->get (*base);
      if (!new_decl)
	return NULL;

      /* The replacement decl's type may carry additional qualifiers;
	 propagate them into the types along the reference chain.  */
      int base_quals = TYPE_QUALS (TREE_TYPE (*new_decl));
      tree field = TREE_OPERAND (*tp, 1);

      /* Adjust the type of the field.  */
      int field_quals = TYPE_QUALS (TREE_TYPE (field));
      if (TREE_CODE (field) == FIELD_DECL && field_quals != base_quals)
	{
	  /* For array fields, qualify the ultimate element type.  */
	  tree *field_type = &TREE_TYPE (field);
	  while (TREE_CODE (*field_type) == ARRAY_TYPE)
	    field_type = &TREE_TYPE (*field_type);
	  field_quals |= base_quals;
	  *field_type = build_qualified_type (*field_type, field_quals);
	}

      /* Adjust the type of the component ref itself.  */
      tree comp_type = TREE_TYPE (*tp);
      int comp_quals = TYPE_QUALS (comp_type);
      if (TREE_CODE (*tp) == COMPONENT_REF && comp_quals != base_quals)
	{
	  comp_quals |= base_quals;
	  TREE_TYPE (*tp)
	    = build_qualified_type (comp_type, comp_quals);
	}

      *base = *new_decl;
      info->modified = true;
    }
  else if (TREE_CODE (*tp) == VAR_DECL)
    {
      /* A bare VAR_DECL: substitute its replacement, if any.  */
      tree *new_decl = info->adjusted_vars->get (*tp);
      if (new_decl)
	{
	  *tp = *new_decl;
	  info->modified = true;
	}
    }

  return NULL_TREE;
}
1902
/* Return TRUE if CALL is a call to a builtin atomic/sync operation.  */

static bool
is_sync_builtin_call (gcall *call)
{
  tree callee = gimple_call_fndecl (call);

  if (callee != NULL_TREE
      && gimple_call_builtin_p (call, BUILT_IN_NORMAL))
    switch (DECL_FUNCTION_CODE (callee))
      {
	/* Expand a case label for every sync/atomic builtin listed in
	   sync-builtins.def.  */
#undef DEF_SYNC_BUILTIN
#define DEF_SYNC_BUILTIN(ENUM, NAME, TYPE, ATTRS) case ENUM:
#include "sync-builtins.def"
#undef DEF_SYNC_BUILTIN
	return true;

      default:
	;
      }

  return false;
}
1926
1927 /* Main entry point for oacc transformations which run on the device
1928 compiler after LTO, so we know what the target device is at this
1929 point (including the host fallback). */
1930
1931 static unsigned int
execute_oacc_loop_designation()1932 execute_oacc_loop_designation ()
1933 {
1934 tree attrs = oacc_get_fn_attrib (current_function_decl);
1935
1936 if (!attrs)
1937 /* Not an offloaded function. */
1938 return 0;
1939
1940 /* Parse the default dim argument exactly once. */
1941 if ((const void *)flag_openacc_dims != &flag_openacc_dims)
1942 {
1943 oacc_parse_default_dims (flag_openacc_dims);
1944 flag_openacc_dims = (char *)&flag_openacc_dims;
1945 }
1946
1947 bool is_oacc_parallel
1948 = (lookup_attribute ("oacc parallel",
1949 DECL_ATTRIBUTES (current_function_decl)) != NULL);
1950 bool is_oacc_kernels
1951 = (lookup_attribute ("oacc kernels",
1952 DECL_ATTRIBUTES (current_function_decl)) != NULL);
1953 bool is_oacc_serial
1954 = (lookup_attribute ("oacc serial",
1955 DECL_ATTRIBUTES (current_function_decl)) != NULL);
1956 bool is_oacc_parallel_kernels_parallelized
1957 = (lookup_attribute ("oacc parallel_kernels_parallelized",
1958 DECL_ATTRIBUTES (current_function_decl)) != NULL);
1959 bool is_oacc_parallel_kernels_gang_single
1960 = (lookup_attribute ("oacc parallel_kernels_gang_single",
1961 DECL_ATTRIBUTES (current_function_decl)) != NULL);
1962 int fn_level = oacc_fn_attrib_level (attrs);
1963 bool is_oacc_routine = (fn_level >= 0);
1964 gcc_checking_assert (is_oacc_parallel
1965 + is_oacc_kernels
1966 + is_oacc_serial
1967 + is_oacc_parallel_kernels_parallelized
1968 + is_oacc_parallel_kernels_gang_single
1969 + is_oacc_routine
1970 == 1);
1971
1972 bool is_oacc_kernels_parallelized
1973 = (lookup_attribute ("oacc kernels parallelized",
1974 DECL_ATTRIBUTES (current_function_decl)) != NULL);
1975 if (is_oacc_kernels_parallelized)
1976 gcc_checking_assert (is_oacc_kernels);
1977
1978 if (dump_file)
1979 {
1980 if (is_oacc_parallel)
1981 fprintf (dump_file, "Function is OpenACC parallel offload\n");
1982 else if (is_oacc_kernels)
1983 fprintf (dump_file, "Function is %s OpenACC kernels offload\n",
1984 (is_oacc_kernels_parallelized
1985 ? "parallelized" : "unparallelized"));
1986 else if (is_oacc_serial)
1987 fprintf (dump_file, "Function is OpenACC serial offload\n");
1988 else if (is_oacc_parallel_kernels_parallelized)
1989 fprintf (dump_file, "Function is %s OpenACC kernels offload\n",
1990 "parallel_kernels_parallelized");
1991 else if (is_oacc_parallel_kernels_gang_single)
1992 fprintf (dump_file, "Function is %s OpenACC kernels offload\n",
1993 "parallel_kernels_gang_single");
1994 else if (is_oacc_routine)
1995 fprintf (dump_file, "Function is OpenACC routine level %d\n",
1996 fn_level);
1997 else
1998 gcc_unreachable ();
1999 }
2000
2001 /* This doesn't belong into 'pass_oacc_loop_designation' conceptually, but
2002 it's a convenient place, so... */
2003 if (is_oacc_routine)
2004 {
2005 tree attr = lookup_attribute ("omp declare target",
2006 DECL_ATTRIBUTES (current_function_decl));
2007 gcc_checking_assert (attr);
2008 tree clauses = TREE_VALUE (attr);
2009 gcc_checking_assert (clauses);
2010
2011 /* Should this OpenACC routine be discarded? */
2012 bool discard = false;
2013
2014 tree clause_nohost = omp_find_clause (clauses, OMP_CLAUSE_NOHOST);
2015 if (dump_file)
2016 fprintf (dump_file,
2017 "OpenACC routine '%s' %s '%s' clause.\n",
2018 lang_hooks.decl_printable_name (current_function_decl, 2),
2019 clause_nohost ? "has" : "doesn't have",
2020 omp_clause_code_name[OMP_CLAUSE_NOHOST]);
2021 /* Host compiler, 'nohost' clause? */
2022 #ifndef ACCEL_COMPILER
2023 if (clause_nohost)
2024 discard = true;
2025 #endif
2026
2027 if (dump_file)
2028 fprintf (dump_file,
2029 "OpenACC routine '%s' %sdiscarded.\n",
2030 lang_hooks.decl_printable_name (current_function_decl, 2),
2031 discard ? "" : "not ");
2032 if (discard)
2033 {
2034 TREE_ASM_WRITTEN (current_function_decl) = 1;
2035 return TODO_discard_function;
2036 }
2037 }
2038
2039 /* Unparallelized OpenACC kernels constructs must get launched as 1 x 1 x 1
2040 kernels, so remove the parallelism dimensions function attributes
2041 potentially set earlier on. */
2042 if (is_oacc_kernels && !is_oacc_kernels_parallelized)
2043 {
2044 oacc_set_fn_attrib (current_function_decl, NULL, NULL);
2045 attrs = oacc_get_fn_attrib (current_function_decl);
2046 }
2047
2048 /* Discover, partition and process the loops. */
2049 oacc_loop *loops = oacc_loop_discovery ();
2050
2051 unsigned outer_mask = 0;
2052 if (is_oacc_routine)
2053 outer_mask = GOMP_DIM_MASK (fn_level) - 1;
2054 unsigned used_mask = oacc_loop_partition (loops, outer_mask);
2055 /* OpenACC kernels constructs are special: they currently don't use the
2056 generic oacc_loop infrastructure and attribute/dimension processing. */
2057 if (is_oacc_kernels && is_oacc_kernels_parallelized)
2058 {
2059 /* Parallelized OpenACC kernels constructs use gang parallelism. See
2060 also tree-parloops.cc:create_parallel_loop. */
2061 used_mask |= GOMP_DIM_MASK (GOMP_DIM_GANG);
2062 }
2063
2064 int dims[GOMP_DIM_MAX];
2065 oacc_validate_dims (current_function_decl, attrs, dims, fn_level, used_mask);
2066
2067 if (dump_file)
2068 {
2069 const char *comma = "Compute dimensions [";
2070 for (int ix = 0; ix != GOMP_DIM_MAX; ix++, comma = ", ")
2071 fprintf (dump_file, "%s%d", comma, dims[ix]);
2072 fprintf (dump_file, "]\n");
2073 }
2074
2075 /* Verify that for OpenACC 'kernels' decomposed "gang-single" parts we launch
2076 a single gang only. */
2077 if (is_oacc_parallel_kernels_gang_single)
2078 gcc_checking_assert (dims[GOMP_DIM_GANG] == 1);
2079
2080 oacc_loop_process (loops, fn_level);
2081 if (dump_file)
2082 {
2083 fprintf (dump_file, "OpenACC loops\n");
2084 dump_oacc_loop (dump_file, loops, 0);
2085 fprintf (dump_file, "\n");
2086 }
2087 if (dump_enabled_p ())
2088 {
2089 oacc_loop *l = loops;
2090 /* OpenACC kernels constructs are special: they currently don't use the
2091 generic oacc_loop infrastructure. */
2092 if (is_oacc_kernels)
2093 {
2094 /* Create a fake oacc_loop for diagnostic purposes. */
2095 l = new_oacc_loop_raw (NULL,
2096 DECL_SOURCE_LOCATION (current_function_decl));
2097 l->mask = used_mask;
2098 }
2099 else
2100 {
2101 /* Skip the outermost, dummy OpenACC loop */
2102 l = l->child;
2103 }
2104 if (l)
2105 inform_oacc_loop (l);
2106 if (is_oacc_kernels)
2107 free_oacc_loop (l);
2108 }
2109
2110 free_oacc_loop (loops);
2111
2112 return 0;
2113 }
2114
/* Main entry point of the 'oaccdevlow' pass: lower the OpenACC internal
   functions (IFN_GOACC_TILE, IFN_GOACC_LOOP, IFN_GOACC_REDUCTION, and the
   IFN_UNIQUE markers) left by earlier OMP lowering to target-specific code
   sequences, and apply target adjustments to gang-private variables.
   Returns a TODO_* set for the pass manager (currently always 0).  */

static unsigned int
execute_oacc_device_lower ()
{
  tree attrs = oacc_get_fn_attrib (current_function_decl);

  if (!attrs)
    /* Not an offloaded function.  */
    return 0;

  /* Fetch the per-axis launch dimensions recorded on the function by the
     earlier 'oaccloops' pass.  */
  int dims[GOMP_DIM_MAX];
  for (unsigned i = 0; i < GOMP_DIM_MAX; i++)
    dims[i] = oacc_get_fn_dim_size (current_function_decl, i);

  /* Map from privatized variable to its target-adjusted replacement decl;
     filled while handling IFN_UNIQUE_OACC_PRIVATE markers below, consumed
     by the rewrite loop at the end of this function.  */
  hash_map<tree, tree> adjusted_vars;

  /* Now lower internal loop functions to target-specific code
     sequences.  */
  basic_block bb;
  FOR_ALL_BB_FN (bb, cfun)
    for (gimple_stmt_iterator gsi = gsi_start_bb (bb); !gsi_end_p (gsi);)
      {
	gimple *stmt = gsi_stmt (gsi);
	if (!is_gimple_call (stmt))
	  {
	    gsi_next (&gsi);
	    continue;
	  }

	gcall *call = as_a <gcall *> (stmt);
	if (!gimple_call_internal_p (call))
	  {
	    gsi_next (&gsi);
	    continue;
	  }

	/* Rewind to allow rescan.  */
	gsi_prev (&gsi);
	bool rescan = false, remove = false;
	enum internal_fn ifn_code = gimple_call_internal_fn (call);

	switch (ifn_code)
	  {
	  default: break;

	  case IFN_GOACC_TILE:
	    oacc_xform_tile (call);
	    rescan = true;
	    break;

	  case IFN_GOACC_LOOP:
	    oacc_xform_loop (call);
	    rescan = true;
	    break;

	  case IFN_GOACC_REDUCTION:
	    /* Mark the function for SSA renaming.  */
	    mark_virtual_operands_for_renaming (cfun);

	    /* If the level is -1, this ended up being an unused
	       axis.  Handle as a default.  */
	    if (integer_minus_onep (gimple_call_arg (call, 3)))
	      default_goacc_reduction (call);
	    else
	      targetm.goacc.reduction (call);
	    rescan = true;
	    break;

	  case IFN_UNIQUE:
	    {
	      enum ifn_unique_kind kind
		= ((enum ifn_unique_kind)
		   TREE_INT_CST_LOW (gimple_call_arg (call, 0)));

	      switch (kind)
		{
		default:
		  break;

		case IFN_UNIQUE_OACC_FORK:
		case IFN_UNIQUE_OACC_JOIN:
		  /* A level of -1 denotes an unused axis; otherwise let the
		     target decide whether it emits fork/join code.  */
		  if (integer_minus_onep (gimple_call_arg (call, 2)))
		    remove = true;
		  else if (!targetm.goacc.fork_join
			   (call, dims, kind == IFN_UNIQUE_OACC_FORK))
		    remove = true;
		  break;

		case IFN_UNIQUE_OACC_HEAD_MARK:
		case IFN_UNIQUE_OACC_TAIL_MARK:
		  /* Pure markers; no code to generate.  */
		  remove = true;
		  break;

		case IFN_UNIQUE_OACC_PRIVATE:
		  {
		    dump_flags_t l_dump_flags
		      = get_openacc_privatization_dump_flags ();

		    location_t loc = gimple_location (stmt);
		    if (LOCATION_LOCUS (loc) == UNKNOWN_LOCATION)
		      loc = DECL_SOURCE_LOCATION (current_function_decl);
		    const dump_user_location_t d_u_loc
		      = dump_user_location_t::from_location_t (loc);

		    HOST_WIDE_INT level
		      = TREE_INT_CST_LOW (gimple_call_arg (call, 2));
		    gcc_checking_assert (level == -1
					 || (level >= 0
					     && level < GOMP_DIM_MAX));
		    /* Arguments 3.. are the ADDR_EXPRs of the variables
		       privatized at LEVEL.  */
		    for (unsigned i = 3;
			 i < gimple_call_num_args (call);
			 i++)
		      {
			static char const *const axes[] =
			/* Must be kept in sync with GOMP_DIM enumeration.  */
			  { "gang", "worker", "vector" };

			tree arg = gimple_call_arg (call, i);
			gcc_checking_assert (TREE_CODE (arg) == ADDR_EXPR);
			tree decl = TREE_OPERAND (arg, 0);
			if (dump_enabled_p ())
		/* PR100695 "Format decoder, quoting in 'dump_printf' etc." */
#if __GNUC__ >= 10
# pragma GCC diagnostic push
# pragma GCC diagnostic ignored "-Wformat"
#endif
			  dump_printf_loc (l_dump_flags, d_u_loc,
					   "variable %<%T%> ought to be"
					   " adjusted for OpenACC"
					   " privatization level: %qs\n",
					   decl,
					   (level == -1
					    ? "UNKNOWN" : axes[level]));
#if __GNUC__ >= 10
# pragma GCC diagnostic pop
#endif
			bool adjusted;
			if (level == -1)
			  adjusted = false;
			else if (!targetm.goacc.adjust_private_decl)
			  adjusted = false;
			else if (level == GOMP_DIM_VECTOR)
			  {
			    /* That's the default behavior.  */
			    adjusted = true;
			  }
			else
			  {
			    tree oldtype = TREE_TYPE (decl);
			    tree newdecl
			      = targetm.goacc.adjust_private_decl (loc, decl,
								   level);
			    /* Record for the rewrite loop below whether the
			       target actually changed the decl or its type.  */
			    adjusted = (TREE_TYPE (newdecl) != oldtype
					|| newdecl != decl);
			    if (adjusted)
			      adjusted_vars.put (decl, newdecl);
			  }
			if (adjusted
			    && dump_enabled_p ())
		/* PR100695 "Format decoder, quoting in 'dump_printf' etc." */
#if __GNUC__ >= 10
# pragma GCC diagnostic push
# pragma GCC diagnostic ignored "-Wformat"
#endif
			  dump_printf_loc (l_dump_flags, d_u_loc,
					   "variable %<%T%> adjusted for"
					   " OpenACC privatization level:"
					   " %qs\n",
					   decl, axes[level]);
#if __GNUC__ >= 10
# pragma GCC diagnostic pop
#endif
		      }
		    remove = true;
		  }
		  break;
		}
	      break;
	    }
	  }

	if (gsi_end_p (gsi))
	  /* We rewound past the beginning of the BB.  */
	  gsi = gsi_start_bb (bb);
	else
	  /* Undo the rewind.  */
	  gsi_next (&gsi);

	if (remove)
	  {
	    if (gimple_vdef (call))
	      replace_uses_by (gimple_vdef (call), gimple_vuse (call));
	    if (gimple_call_lhs (call))
	      {
		/* Propagate the data dependency var.  */
		gimple *ass = gimple_build_assign (gimple_call_lhs (call),
						   gimple_call_arg (call, 1));
		gsi_replace (&gsi, ass, false);
	      }
	    else
	      gsi_remove (&gsi, true);
	  }
	else if (!rescan)
	  /* If not rescanning, advance over the call.  */
	  gsi_next (&gsi);
      }

  /* Regarding the OpenACC privatization level, we're currently only looking at
     making the gang-private level work.  Regarding that, we have the following
     configurations:

       - GCN offloading: 'targetm.goacc.adjust_private_decl' does the work (in
	 particular, change 'TREE_TYPE', etc.) and there is no
	 'targetm.goacc.expand_var_decl'.

       - nvptx offloading: 'targetm.goacc.adjust_private_decl' only sets a
	 marker and then 'targetm.goacc.expand_var_decl' does the work.

     Eventually (in particular, for worker-private level?), both
     'targetm.goacc.adjust_private_decl' and 'targetm.goacc.expand_var_decl'
     may need to do things, but that's currently not meant to be addressed, and
     thus not fully worked out and implemented, and thus untested.  Hence,
     'assert' what currently is implemented/tested, only.  */

  if (targetm.goacc.expand_var_decl)
    gcc_assert (adjusted_vars.is_empty ());

  /* Make adjustments to gang-private local variables if required by the
     target, e.g. forcing them into a particular address space.  Afterwards,
     ADDR_EXPR nodes which have adjusted variables as their argument need to
     be modified in one of two ways:

       1. They can be recreated, making a pointer to the variable in the new
	  address space, or

       2. The address of the variable in the new address space can be taken,
	  converted to the default (original) address space, and the result of
	  that conversion substituted in place of the original ADDR_EXPR node.

     Which of these is done depends on the gimple statement being processed.
     At present atomic operations and inline asms use (1), and everything else
     uses (2).  At least on AMD GCN, there are atomic operations that work
     directly in the LDS address space.

     COMPONENT_REFS, ARRAY_REFS and plain VAR_DECLs are also rewritten to use
     the new decl, adjusting types of appropriate tree nodes as necessary.  */

  if (targetm.goacc.adjust_private_decl
      && !adjusted_vars.is_empty ())
    {
      FOR_ALL_BB_FN (bb, cfun)
	for (gimple_stmt_iterator gsi = gsi_start_bb (bb);
	     !gsi_end_p (gsi);
	     gsi_next (&gsi))
	  {
	    gimple *stmt = gsi_stmt (gsi);
	    walk_stmt_info wi;
	    var_decl_rewrite_info info;

	    info.avoid_pointer_conversion
	      = (is_gimple_call (stmt)
		 && is_sync_builtin_call (as_a <gcall *> (stmt)))
		|| gimple_code (stmt) == GIMPLE_ASM;
	    info.stmt = stmt;
	    info.modified = false;
	    info.adjusted_vars = &adjusted_vars;

	    memset (&wi, 0, sizeof (wi));
	    wi.info = &info;

	    walk_gimple_op (stmt, oacc_rewrite_var_decl, &wi);

	    if (info.modified)
	      update_stmt (stmt);
	  }
    }

  return 0;
}
2393
2394 /* Default launch dimension validator. Force everything to 1. A
2395 backend that wants to provide larger dimensions must override this
2396 hook. */
2397
2398 bool
default_goacc_validate_dims(tree ARG_UNUSED (decl),int * dims,int ARG_UNUSED (fn_level),unsigned ARG_UNUSED (used))2399 default_goacc_validate_dims (tree ARG_UNUSED (decl), int *dims,
2400 int ARG_UNUSED (fn_level),
2401 unsigned ARG_UNUSED (used))
2402 {
2403 bool changed = false;
2404
2405 for (unsigned ix = 0; ix != GOMP_DIM_MAX; ix++)
2406 {
2407 if (dims[ix] != 1)
2408 {
2409 dims[ix] = 1;
2410 changed = true;
2411 }
2412 }
2413
2414 return changed;
2415 }
2416
2417 /* Default dimension bound is unknown on accelerator and 1 on host. */
2418
int
default_goacc_dim_limit (int ARG_UNUSED (axis))
{
#ifdef ACCEL_COMPILER
  /* Device compiler: 0 means the bound is unknown/unbounded.  */
  return 0;
#else
  /* Host fallback executes everything single-threaded.  */
  return 1;
#endif
}
2428
2429 namespace {
2430
/* Pass descriptor for the OpenACC loop-designation pass ('oaccloops'):
   a GIMPLE pass requiring a CFG, updating SSA and cleaning up the CFG
   afterwards.  */

const pass_data pass_data_oacc_loop_designation =
{
  GIMPLE_PASS, /* type */
  "oaccloops", /* name */
  OPTGROUP_OMP, /* optinfo_flags */
  TV_NONE, /* tv_id */
  PROP_cfg, /* properties_required */
  0 /* Possibly PROP_gimple_eomp.  */, /* properties_provided */
  0, /* properties_destroyed */
  0, /* todo_flags_start */
  TODO_update_ssa | TODO_cleanup_cfg, /* todo_flags_finish */
};
2443
/* Pass wrapper dispatching to 'execute_oacc_loop_designation'.  */

class pass_oacc_loop_designation : public gimple_opt_pass
{
public:
  pass_oacc_loop_designation (gcc::context *ctxt)
    : gimple_opt_pass (pass_data_oacc_loop_designation, ctxt)
  {}

  /* opt_pass methods: */
  /* Only relevant for OpenACC compilations.  */
  virtual bool gate (function *) { return flag_openacc; };

  virtual unsigned int execute (function *)
  {
    return execute_oacc_loop_designation ();
  }

}; // class pass_oacc_loop_designation
2460
/* Pass descriptor for the OpenACC device-lowering pass ('oaccdevlow'):
   a GIMPLE pass requiring a CFG, updating SSA and cleaning up the CFG
   afterwards.  */

const pass_data pass_data_oacc_device_lower =
{
  GIMPLE_PASS, /* type */
  "oaccdevlow", /* name */
  OPTGROUP_OMP, /* optinfo_flags */
  TV_NONE, /* tv_id */
  PROP_cfg, /* properties_required */
  0 /* Possibly PROP_gimple_eomp.  */, /* properties_provided */
  0, /* properties_destroyed */
  0, /* todo_flags_start */
  TODO_update_ssa | TODO_cleanup_cfg, /* todo_flags_finish */
};
2473
/* Pass wrapper dispatching to 'execute_oacc_device_lower'.  */

class pass_oacc_device_lower : public gimple_opt_pass
{
public:
  pass_oacc_device_lower (gcc::context *ctxt)
    : gimple_opt_pass (pass_data_oacc_device_lower, ctxt)
  {}

  /* opt_pass methods: */
  /* Only relevant for OpenACC compilations.  */
  virtual bool gate (function *) { return flag_openacc; };

  virtual unsigned int execute (function *)
  {
    return execute_oacc_device_lower ();
  }

}; // class pass_oacc_device_lower
2490
2491 } // anon namespace
2492
/* Instantiate the 'oaccloops' pass for the pass manager.  */

gimple_opt_pass *
make_pass_oacc_loop_designation (gcc::context *ctxt)
{
  return new pass_oacc_loop_designation (ctxt);
}
2498
/* Instantiate the 'oaccdevlow' pass for the pass manager.  */

gimple_opt_pass *
make_pass_oacc_device_lower (gcc::context *ctxt)
{
  return new pass_oacc_device_lower (ctxt);
}
2504
2505
2506 /* Rewrite GOMP_SIMT_ENTER_ALLOC call given by GSI and remove the preceding
2507 GOMP_SIMT_ENTER call identifying the privatized variables, which are
2508 turned to structure fields and receive a DECL_VALUE_EXPR accordingly.
2509 Set *REGIMPLIFY to true, except if no privatized variables were seen. */
2510
static void
ompdevlow_adjust_simt_enter (gimple_stmt_iterator *gsi, bool *regimplify)
{
  gimple *alloc_stmt = gsi_stmt (*gsi);
  tree simtrec = gimple_call_lhs (alloc_stmt);
  tree simduid = gimple_call_arg (alloc_stmt, 0);
  gimple *enter_stmt = SSA_NAME_DEF_STMT (simduid);
  gcc_assert (gimple_call_internal_p (enter_stmt, IFN_GOMP_SIMT_ENTER));
  /* Build an artificial record type collecting all SIMT-privatized
     variables; SIMTREC will point at per-lane storage of this type.  */
  tree rectype = lang_hooks.types.make_type (RECORD_TYPE);
  TYPE_ARTIFICIAL (rectype) = TYPE_NAMELESS (rectype) = 1;
  TREE_ADDRESSABLE (rectype) = 1;
  TREE_TYPE (simtrec) = build_pointer_type (rectype);
  /* Arguments 1.. of the GOMP_SIMT_ENTER call are the addresses of the
     privatized variables (or null_pointer_node placeholders).  */
  for (unsigned i = 1; i < gimple_call_num_args (enter_stmt); i++)
    {
      tree *argp = gimple_call_arg_ptr (enter_stmt, i);
      if (*argp == null_pointer_node)
	continue;
      gcc_assert (TREE_CODE (*argp) == ADDR_EXPR
		  && VAR_P (TREE_OPERAND (*argp, 0)));
      tree var = TREE_OPERAND (*argp, 0);

      /* Turn VAR into a field of the record, preserving its alignment
	 and volatility.  */
      tree field = build_decl (DECL_SOURCE_LOCATION (var), FIELD_DECL,
			       DECL_NAME (var), TREE_TYPE (var));
      SET_DECL_ALIGN (field, DECL_ALIGN (var));
      DECL_USER_ALIGN (field) = DECL_USER_ALIGN (var);
      TREE_THIS_VOLATILE (field) = TREE_THIS_VOLATILE (var);

      insert_field_into_struct (rectype, field);

      /* Redirect all accesses to VAR through the corresponding field of
	 the per-lane record via DECL_VALUE_EXPR.  */
      tree t = build_simple_mem_ref (simtrec);
      t = build3 (COMPONENT_REF, TREE_TYPE (var), t, field, NULL);
      TREE_THIS_VOLATILE (t) = TREE_THIS_VOLATILE (var);
      SET_DECL_VALUE_EXPR (var, t);
      DECL_HAS_VALUE_EXPR_P (var) = 1;
      *regimplify = true;
    }
  layout_type (rectype);
  tree size = TYPE_SIZE_UNIT (rectype);
  tree align = build_int_cst (TREE_TYPE (size), TYPE_ALIGN_UNIT (rectype));

  /* Replace the placeholder allocation call with one carrying the
     now-known size and alignment of the record.  */
  alloc_stmt
    = gimple_build_call_internal (IFN_GOMP_SIMT_ENTER_ALLOC, 2, size, align);
  gimple_call_set_lhs (alloc_stmt, simtrec);
  gsi_replace (gsi, alloc_stmt, false);
  gimple_stmt_iterator enter_gsi = gsi_for_stmt (enter_stmt);
  /* The GOMP_SIMT_ENTER call has served its purpose: reduce it to a
     plain copy of its first argument.  */
  enter_stmt = gimple_build_assign (simduid, gimple_call_arg (enter_stmt, 0));
  gsi_replace (&enter_gsi, enter_stmt, false);

  use_operand_p use;
  gimple *exit_stmt;
  if (single_imm_use (simtrec, &use, &exit_stmt))
    {
      /* Clobber the per-lane record right before the matching
	 GOMP_SIMT_EXIT, marking the end of its lifetime.  */
      gcc_assert (gimple_call_internal_p (exit_stmt, IFN_GOMP_SIMT_EXIT));
      gimple_stmt_iterator exit_gsi = gsi_for_stmt (exit_stmt);
      tree clobber = build_clobber (rectype);
      exit_stmt = gimple_build_assign (build_simple_mem_ref (simtrec), clobber);
      gsi_insert_before (&exit_gsi, exit_stmt, GSI_SAME_STMT);
    }
  else
    gcc_checking_assert (has_zero_uses (simtrec));
}
2572
2573 /* Callback for walk_gimple_stmt used to scan for SIMT-privatized variables. */
2574
2575 static tree
find_simtpriv_var_op(tree * tp,int * walk_subtrees,void *)2576 find_simtpriv_var_op (tree *tp, int *walk_subtrees, void *)
2577 {
2578 tree t = *tp;
2579
2580 if (VAR_P (t)
2581 && DECL_HAS_VALUE_EXPR_P (t)
2582 && lookup_attribute ("omp simt private", DECL_ATTRIBUTES (t)))
2583 {
2584 *walk_subtrees = 0;
2585 return t;
2586 }
2587 return NULL_TREE;
2588 }
2589
2590 /* Cleanup uses of SIMT placeholder internal functions: on non-SIMT targets,
2591 VF is 1 and LANE is 0; on SIMT targets, VF is folded to a constant, and
2592 LANE is kept to be expanded to RTL later on. Also cleanup all other SIMT
2593 internal functions on non-SIMT targets, and likewise some SIMD internal
2594 functions on SIMT targets. */
2595
static unsigned int
execute_omp_device_lower ()
{
  /* VF == 1 means a non-SIMT target; the SIMT placeholders then fold to
     trivial values.  */
  int vf = targetm.simt.vf ? targetm.simt.vf () : 1;
  bool regimplify = false;
  basic_block bb;
  gimple_stmt_iterator gsi;
  bool calls_declare_variant_alt
    = cgraph_node::get (cfun->decl)->calls_declare_variant_alt;
  FOR_EACH_BB_FN (bb, cfun)
    for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
      {
	gimple *stmt = gsi_stmt (gsi);
	if (!is_gimple_call (stmt))
	  continue;
	if (!gimple_call_internal_p (stmt))
	  {
	    /* Resolve 'declare variant' calls to the variant selected
	       for this compilation context.  */
	    if (calls_declare_variant_alt)
	      if (tree fndecl = gimple_call_fndecl (stmt))
		{
		  tree new_fndecl = omp_resolve_declare_variant (fndecl);
		  if (new_fndecl != fndecl)
		    {
		      gimple_call_set_fndecl (stmt, new_fndecl);
		      update_stmt (stmt);
		    }
		}
	    continue;
	  }
	/* RHS == NULL_TREE below means "leave the call as-is" (it will be
	   expanded to RTL later); otherwise the call is replaced by an
	   assignment of RHS (or a nop if there is no LHS).  */
	tree lhs = gimple_call_lhs (stmt), rhs = NULL_TREE;
	tree type = lhs ? TREE_TYPE (lhs) : integer_type_node;
	switch (gimple_call_internal_fn (stmt))
	  {
	  case IFN_GOMP_USE_SIMT:
	    rhs = vf == 1 ? integer_zero_node : integer_one_node;
	    break;
	  case IFN_GOMP_SIMT_ENTER:
	    rhs = vf == 1 ? gimple_call_arg (stmt, 0) : NULL_TREE;
	    goto simtreg_enter_exit;
	  case IFN_GOMP_SIMT_ENTER_ALLOC:
	    if (vf != 1)
	      ompdevlow_adjust_simt_enter (&gsi, &regimplify);
	    rhs = vf == 1 ? null_pointer_node : NULL_TREE;
	    goto simtreg_enter_exit;
	  case IFN_GOMP_SIMT_EXIT:
	  simtreg_enter_exit:
	    if (vf != 1)
	      continue;
	    unlink_stmt_vdef (stmt);
	    break;
	  case IFN_GOMP_SIMT_LANE:
	  case IFN_GOMP_SIMT_LAST_LANE:
	    rhs = vf == 1 ? build_zero_cst (type) : NULL_TREE;
	    break;
	  case IFN_GOMP_SIMT_VF:
	    rhs = build_int_cst (type, vf);
	    break;
	  case IFN_GOMP_SIMT_ORDERED_PRED:
	    rhs = vf == 1 ? integer_zero_node : NULL_TREE;
	    if (rhs || !lhs)
	      unlink_stmt_vdef (stmt);
	    break;
	  case IFN_GOMP_SIMT_VOTE_ANY:
	  case IFN_GOMP_SIMT_XCHG_BFLY:
	  case IFN_GOMP_SIMT_XCHG_IDX:
	    rhs = vf == 1 ? gimple_call_arg (stmt, 0) : NULL_TREE;
	    break;
	  case IFN_GOMP_SIMD_LANE:
	  case IFN_GOMP_SIMD_LAST_LANE:
	    /* SIMD placeholders fold the other way round: trivial on SIMT
	       targets.  */
	    rhs = vf != 1 ? build_zero_cst (type) : NULL_TREE;
	    break;
	  case IFN_GOMP_SIMD_VF:
	    rhs = vf != 1 ? build_one_cst (type) : NULL_TREE;
	    break;
	  default:
	    continue;
	  }
	if (lhs && !rhs)
	  continue;
	stmt = lhs ? gimple_build_assign (lhs, rhs) : gimple_build_nop ();
	gsi_replace (&gsi, stmt, false);
      }
  /* If 'ompdevlow_adjust_simt_enter' attached value-exprs to privatized
     variables, regimplify every statement referencing them; walk in
     reverse so clobbers are seen before other uses.  */
  if (regimplify)
    FOR_EACH_BB_REVERSE_FN (bb, cfun)
      for (gsi = gsi_last_bb (bb); !gsi_end_p (gsi); gsi_prev (&gsi))
	if (walk_gimple_stmt (&gsi, NULL, find_simtpriv_var_op, NULL))
	  {
	    if (gimple_clobber_p (gsi_stmt (gsi)))
	      gsi_remove (&gsi, true);
	    else
	      gimple_regimplify_operands (gsi_stmt (gsi), &gsi);
	  }
  if (vf != 1)
    cfun->has_force_vectorize_loops = false;
  return 0;
}
2692
2693 namespace {
2694
/* Pass descriptor for the OpenMP device-lowering pass ('ompdevlow'):
   a GIMPLE pass requiring a CFG, providing PROP_gimple_lomp_dev and
   updating SSA afterwards.  */

const pass_data pass_data_omp_device_lower =
{
  GIMPLE_PASS, /* type */
  "ompdevlow", /* name */
  OPTGROUP_OMP, /* optinfo_flags */
  TV_NONE, /* tv_id */
  PROP_cfg, /* properties_required */
  PROP_gimple_lomp_dev, /* properties_provided */
  0, /* properties_destroyed */
  0, /* todo_flags_start */
  TODO_update_ssa, /* todo_flags_finish */
};
2707
/* Pass wrapper dispatching to 'execute_omp_device_lower'.  */

class pass_omp_device_lower : public gimple_opt_pass
{
public:
  pass_omp_device_lower (gcc::context *ctxt)
    : gimple_opt_pass (pass_data_omp_device_lower, ctxt)
  {}

  /* opt_pass methods: */
  /* Run if device lowering is still pending for this function, or if
     'declare variant' calls may need resolving under OpenMP.  */
  virtual bool gate (function *fun)
  {
    return (!(fun->curr_properties & PROP_gimple_lomp_dev)
	    || (flag_openmp
		&& cgraph_node::get (fun->decl)->calls_declare_variant_alt));
  }
  virtual unsigned int execute (function *)
  {
    return execute_omp_device_lower ();
  }

}; // class pass_omp_device_lower
2728
2729 } // anon namespace
2730
/* Instantiate the 'ompdevlow' pass for the pass manager.  */

gimple_opt_pass *
make_pass_omp_device_lower (gcc::context *ctxt)
{
  return new pass_omp_device_lower (ctxt);
}
2736
2737 /* "omp declare target link" handling pass. */
2738
2739 namespace {
2740
/* Pass descriptor for the 'omp declare target link' pass
   ('omptargetlink'): a GIMPLE pass requiring SSA form, updating SSA
   afterwards.  */

const pass_data pass_data_omp_target_link =
{
  GIMPLE_PASS, /* type */
  "omptargetlink", /* name */
  OPTGROUP_OMP, /* optinfo_flags */
  TV_NONE, /* tv_id */
  PROP_ssa, /* properties_required */
  0, /* properties_provided */
  0, /* properties_destroyed */
  0, /* todo_flags_start */
  TODO_update_ssa, /* todo_flags_finish */
};
2753
/* Pass wrapper for the "omp declare target link" handling; the 'execute'
   method is defined out-of-class below.  */

class pass_omp_target_link : public gimple_opt_pass
{
public:
  pass_omp_target_link (gcc::context *ctxt)
    : gimple_opt_pass (pass_data_omp_target_link, ctxt)
  {}

  /* opt_pass methods: */
  /* Only relevant on the device (accelerator) compiler, and there only
     for functions that are actually offloaded.  */
  virtual bool gate (function *fun)
  {
#ifdef ACCEL_COMPILER
    return offloading_function_p (fun->decl);
#else
    (void) fun;
    return false;
#endif
  }

  virtual unsigned execute (function *);
};
2774
2775 /* Callback for walk_gimple_stmt used to scan for link var operands. */
2776
2777 static tree
find_link_var_op(tree * tp,int * walk_subtrees,void *)2778 find_link_var_op (tree *tp, int *walk_subtrees, void *)
2779 {
2780 tree t = *tp;
2781
2782 if (VAR_P (t)
2783 && DECL_HAS_VALUE_EXPR_P (t)
2784 && is_global_var (t)
2785 && lookup_attribute ("omp declare target link", DECL_ATTRIBUTES (t)))
2786 {
2787 *walk_subtrees = 0;
2788 return t;
2789 }
2790
2791 return NULL_TREE;
2792 }
2793
2794 unsigned
execute(function * fun)2795 pass_omp_target_link::execute (function *fun)
2796 {
2797 basic_block bb;
2798 FOR_EACH_BB_FN (bb, fun)
2799 {
2800 gimple_stmt_iterator gsi;
2801 for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
2802 {
2803 if (gimple_call_builtin_p (gsi_stmt (gsi), BUILT_IN_GOMP_TARGET))
2804 {
2805 /* Nullify the second argument of __builtin_GOMP_target_ext. */
2806 gimple_call_set_arg (gsi_stmt (gsi), 1, null_pointer_node);
2807 update_stmt (gsi_stmt (gsi));
2808 }
2809 if (walk_gimple_stmt (&gsi, NULL, find_link_var_op, NULL))
2810 gimple_regimplify_operands (gsi_stmt (gsi), &gsi);
2811 }
2812 }
2813
2814 return 0;
2815 }
2816
2817 } // anon namespace
2818
/* Instantiate the 'omptargetlink' pass for the pass manager.  */

gimple_opt_pass *
make_pass_omp_target_link (gcc::context *ctxt)
{
  return new pass_omp_target_link (ctxt);
}
2824