1 /* Bits of OpenMP and OpenACC handling that is specific to device offloading
2 and a lowering pass for OpenACC device directives.
3
4 Copyright (C) 2005-2020 Free Software Foundation, Inc.
5
6 This file is part of GCC.
7
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
12
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
21
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "tree.h"
28 #include "gimple.h"
29 #include "tree-pass.h"
30 #include "ssa.h"
31 #include "cgraph.h"
32 #include "pretty-print.h"
33 #include "diagnostic-core.h"
34 #include "fold-const.h"
35 #include "internal-fn.h"
36 #include "langhooks.h"
37 #include "gimplify.h"
38 #include "gimple-iterator.h"
39 #include "gimplify-me.h"
40 #include "gimple-walk.h"
41 #include "tree-cfg.h"
42 #include "tree-into-ssa.h"
43 #include "tree-nested.h"
44 #include "stor-layout.h"
45 #include "common/common-target.h"
46 #include "omp-general.h"
47 #include "omp-offload.h"
48 #include "lto-section-names.h"
49 #include "gomp-constants.h"
50 #include "gimple-pretty-print.h"
51 #include "intl.h"
52 #include "stringpool.h"
53 #include "attribs.h"
54 #include "cfgloop.h"
55
56 /* Describe the OpenACC looping structure of a function. The entire
57 function is held in a 'NULL' loop. */
58
struct oacc_loop
{
  oacc_loop *parent; /* Containing loop.  */

  oacc_loop *child; /* First inner loop.  */

  oacc_loop *sibling; /* Next loop within same parent.  */

  location_t loc; /* Location of the loop start.  */

  gcall *marker; /* Initial head marker.  */

  /* Head/tail marker IFN_UNIQUE calls, recorded per sequence position
     by the discovery walk (heads first-to-last, tails last-to-first).  */
  gcall *heads[GOMP_DIM_MAX]; /* Head marker functions.  */
  gcall *tails[GOMP_DIM_MAX]; /* Tail marker functions.  */

  tree routine; /* Pseudo-loop enclosing a routine.  */

  unsigned mask; /* Partitioning mask.  */
  unsigned e_mask; /* Partitioning of element loops (when tiling).  */
  unsigned inner; /* Partitioning of inner loops.  */
  unsigned flags; /* Partitioning flags.  */
  vec<gcall *> ifns; /* Contained loop abstraction functions.  */
  tree chunk_size; /* Chunk size.  */
  gcall *head_end; /* Final marker of head sequence.  */
};
84
/* Holds offload tables with decls: functions outlined for offloading and
   "omp declare target" variables.  Consumed by omp_finish_file.  */
vec<tree, va_gc> *offload_funcs, *offload_vars;
87
88 /* Return level at which oacc routine may spawn a partitioned loop, or
89 -1 if it is not a routine (i.e. is an offload fn). */
90
91 int
oacc_fn_attrib_level(tree attr)92 oacc_fn_attrib_level (tree attr)
93 {
94 tree pos = TREE_VALUE (attr);
95
96 if (!TREE_PURPOSE (pos))
97 return -1;
98
99 int ix = 0;
100 for (ix = 0; ix != GOMP_DIM_MAX;
101 ix++, pos = TREE_CHAIN (pos))
102 if (!integer_zerop (TREE_PURPOSE (pos)))
103 break;
104
105 return ix;
106 }
107
108 /* Helper function for omp_finish_file routine. Takes decls from V_DECLS and
109 adds their addresses and sizes to constructor-vector V_CTOR. */
110
static void
add_decls_addresses_to_decl_constructor (vec<tree, va_gc> *v_decls,
					 vec<constructor_elt, va_gc> *v_ctor)
{
  unsigned len = vec_safe_length (v_decls);
  for (unsigned i = 0; i < len; i++)
    {
      tree it = (*v_decls)[i];
      bool is_var = VAR_P (it);
      /* On the accelerator compiler a link var must also carry a value
	 expression pointing at its table slot.  */
      bool is_link_var
	= is_var
#ifdef ACCEL_COMPILER
	  && DECL_HAS_VALUE_EXPR_P (it)
#endif
	  && lookup_attribute ("omp declare target link", DECL_ATTRIBUTES (it));

      /* See also omp_finish_file and output_offload_tables in lto-cgraph.c.  */
      if (!in_lto_p && !symtab_node::get (it))
	continue;

      /* Functions contribute just an address; variables additionally
	 contribute a size entry (appended below).  */
      tree size = NULL_TREE;
      if (is_var)
	size = fold_convert (const_ptr_type_node, DECL_SIZE_UNIT (it));

      tree addr;
      if (!is_link_var)
	addr = build_fold_addr_expr (it);
      else
	{
#ifdef ACCEL_COMPILER
	  /* For "omp declare target link" vars add address of the pointer to
	     the target table, instead of address of the var.  */
	  tree value_expr = DECL_VALUE_EXPR (it);
	  tree link_ptr_decl = TREE_OPERAND (value_expr, 0);
	  varpool_node::finalize_decl (link_ptr_decl);
	  addr = build_fold_addr_expr (link_ptr_decl);
#else
	  addr = build_fold_addr_expr (it);
#endif

	  /* Most significant bit of the size marks "omp declare target link"
	     vars in host and target tables.  */
	  unsigned HOST_WIDE_INT isize = tree_to_uhwi (size);
	  isize |= 1ULL << (int_size_in_bytes (const_ptr_type_node)
			    * BITS_PER_UNIT - 1);
	  size = wide_int_to_tree (const_ptr_type_node, isize);
	}

      CONSTRUCTOR_APPEND_ELT (v_ctor, NULL_TREE, addr);
      if (is_var)
	CONSTRUCTOR_APPEND_ELT (v_ctor, NULL_TREE, size);
    }
}
164
165 /* Create new symbols containing (address, size) pairs for global variables,
166 marked with "omp declare target" attribute, as well as addresses for the
167 functions, which are outlined offloading regions. */
void
omp_finish_file (void)
{
  unsigned num_funcs = vec_safe_length (offload_funcs);
  unsigned num_vars = vec_safe_length (offload_vars);

  /* Nothing registered for offloading; no tables to emit.  */
  if (num_funcs == 0 && num_vars == 0)
    return;

  if (targetm_common.have_named_sections)
    {
      vec<constructor_elt, va_gc> *v_f, *v_v;
      vec_alloc (v_f, num_funcs);
      /* Each variable contributes two table entries: address and size.  */
      vec_alloc (v_v, num_vars * 2);

      add_decls_addresses_to_decl_constructor (offload_funcs, v_f);
      add_decls_addresses_to_decl_constructor (offload_vars, v_v);

      tree vars_decl_type = build_array_type_nelts (pointer_sized_int_node,
						    vec_safe_length (v_v));
      tree funcs_decl_type = build_array_type_nelts (pointer_sized_int_node,
						     num_funcs);
      SET_TYPE_ALIGN (vars_decl_type, TYPE_ALIGN (pointer_sized_int_node));
      SET_TYPE_ALIGN (funcs_decl_type, TYPE_ALIGN (pointer_sized_int_node));
      tree ctor_v = build_constructor (vars_decl_type, v_v);
      tree ctor_f = build_constructor (funcs_decl_type, v_f);
      TREE_CONSTANT (ctor_v) = TREE_CONSTANT (ctor_f) = 1;
      TREE_STATIC (ctor_v) = TREE_STATIC (ctor_f) = 1;
      tree funcs_decl = build_decl (UNKNOWN_LOCATION, VAR_DECL,
				    get_identifier (".offload_func_table"),
				    funcs_decl_type);
      tree vars_decl = build_decl (UNKNOWN_LOCATION, VAR_DECL,
				   get_identifier (".offload_var_table"),
				   vars_decl_type);
      TREE_STATIC (funcs_decl) = TREE_STATIC (vars_decl) = 1;
      /* Do not align tables more than TYPE_ALIGN (pointer_sized_int_node),
	 otherwise a joint table in a binary will contain padding between
	 tables from multiple object files.  */
      DECL_USER_ALIGN (funcs_decl) = DECL_USER_ALIGN (vars_decl) = 1;
      SET_DECL_ALIGN (funcs_decl, TYPE_ALIGN (funcs_decl_type));
      SET_DECL_ALIGN (vars_decl, TYPE_ALIGN (vars_decl_type));
      DECL_INITIAL (funcs_decl) = ctor_f;
      DECL_INITIAL (vars_decl) = ctor_v;
      set_decl_section_name (funcs_decl, OFFLOAD_FUNC_TABLE_SECTION_NAME);
      set_decl_section_name (vars_decl, OFFLOAD_VAR_TABLE_SECTION_NAME);

      varpool_node::finalize_decl (vars_decl);
      varpool_node::finalize_decl (funcs_decl);
    }
  else
    {
      /* No named-section support: record each symbol with the target
	 directly instead of emitting tables.  */
      for (unsigned i = 0; i < num_funcs; i++)
	{
	  tree it = (*offload_funcs)[i];
	  /* See also add_decls_addresses_to_decl_constructor
	     and output_offload_tables in lto-cgraph.c.  */
	  if (!in_lto_p && !symtab_node::get (it))
	    continue;
	  targetm.record_offload_symbol (it);
	}
      for (unsigned i = 0; i < num_vars; i++)
	{
	  tree it = (*offload_vars)[i];
	  if (!in_lto_p && !symtab_node::get (it))
	    continue;
#ifdef ACCEL_COMPILER
	  /* "omp declare target link" vars are represented via the
	     pointer into the target table; record that instead.  */
	  if (DECL_HAS_VALUE_EXPR_P (it)
	      && lookup_attribute ("omp declare target link",
				   DECL_ATTRIBUTES (it)))
	    {
	      tree value_expr = DECL_VALUE_EXPR (it);
	      tree link_ptr_decl = TREE_OPERAND (value_expr, 0);
	      targetm.record_offload_symbol (link_ptr_decl);
	      varpool_node::finalize_decl (link_ptr_decl);
	    }
	  else
#endif
	    targetm.record_offload_symbol (it);
	}
    }
}
249
250 /* Call dim_pos (POS == true) or dim_size (POS == false) builtins for
251 axis DIM. Return a tmp var holding the result. */
252
253 static tree
oacc_dim_call(bool pos,int dim,gimple_seq * seq)254 oacc_dim_call (bool pos, int dim, gimple_seq *seq)
255 {
256 tree arg = build_int_cst (unsigned_type_node, dim);
257 tree size = create_tmp_var (integer_type_node);
258 enum internal_fn fn = pos ? IFN_GOACC_DIM_POS : IFN_GOACC_DIM_SIZE;
259 gimple *call = gimple_build_call_internal (fn, 1, arg);
260
261 gimple_call_set_lhs (call, size);
262 gimple_seq_add_stmt (seq, call);
263
264 return size;
265 }
266
267 /* Find the number of threads (POS = false), or thread number (POS =
268 true) for an OpenACC region partitioned as MASK. Setup code
269 required for the calculation is added to SEQ. */
270
static tree
oacc_thread_numbers (bool pos, int mask, gimple_seq *seq)
{
  /* For a size query start the product at 1; for a position query the
     accumulator starts empty and is seeded by the first dimension.  */
  tree res = pos ? NULL_TREE : build_int_cst (unsigned_type_node, 1);
  unsigned ix;

  /* Start at gang level, and examine relevant dimension indices.  */
  for (ix = GOMP_DIM_GANG; ix != GOMP_DIM_MAX; ix++)
    if (GOMP_DIM_MASK (ix) & mask)
      {
	if (res)
	  {
	    /* We had an outer index, so scale that by the size of
	       this dimension.  */
	    tree n = oacc_dim_call (false, ix, seq);
	    res = fold_build2 (MULT_EXPR, integer_type_node, res, n);
	  }
	if (pos)
	  {
	    /* Determine index in this dimension.  */
	    tree id = oacc_dim_call (true, ix, seq);
	    if (res)
	      res = fold_build2 (PLUS_EXPR, integer_type_node, res, id);
	    else
	      res = id;
	  }
      }

  /* Position query over an empty mask: thread number is 0.  */
  if (res == NULL_TREE)
    res = integer_zero_node;

  return res;
}
304
305 /* Transform IFN_GOACC_LOOP calls to actual code. See
306 expand_oacc_for for where these are generated. At the vector
307 level, we stride loops, such that each member of a warp will
308 operate on adjacent iterations. At the worker and gang level,
309 each gang/warp executes a set of contiguous iterations. Chunking
310 can override this such that each iteration engine executes a
311 contiguous chunk, and then moves on to stride to the next chunk. */
312
static void
oacc_xform_loop (gcall *call)
{
  gimple_stmt_iterator gsi = gsi_for_stmt (call);
  /* Call arguments: 0 = kind, 1 = direction, 2 = range, 3 = step,
     4 = chunk size, 5 = partitioning mask, 6 = offset/chunk-no.  */
  enum ifn_goacc_loop_kind code
    = (enum ifn_goacc_loop_kind) TREE_INT_CST_LOW (gimple_call_arg (call, 0));
  tree dir = gimple_call_arg (call, 1);
  tree range = gimple_call_arg (call, 2);
  tree step = gimple_call_arg (call, 3);
  tree chunk_size = NULL_TREE;
  unsigned mask = (unsigned) TREE_INT_CST_LOW (gimple_call_arg (call, 5));
  tree lhs = gimple_call_lhs (call);
  tree type = NULL_TREE;
  tree diff_type = TREE_TYPE (range);
  tree r = NULL_TREE;
  gimple_seq seq = NULL;
  bool chunking = false, striding = true;
  unsigned outer_mask = mask & (~mask + 1); // Outermost partitioning
  unsigned inner_mask = mask & ~outer_mask; // Inner partitioning (if any)

  /* Skip lowering if return value of IFN_GOACC_LOOP call is not used.  */
  if (!lhs)
    {
      gsi_replace_with_seq (&gsi, seq, true);
      return;
    }

  type = TREE_TYPE (lhs);

#ifdef ACCEL_COMPILER
  chunk_size = gimple_call_arg (call, 4);
  if (integer_minus_onep (chunk_size)  /* Force static allocation.  */
      || integer_zerop (chunk_size))   /* Default (also static).  */
    {
      /* If we're at the gang level, we want each to execute a
	 contiguous run of iterations.  Otherwise we want each element
	 to stride.  */
      striding = !(outer_mask & GOMP_DIM_MASK (GOMP_DIM_GANG));
      chunking = false;
    }
  else
    {
      /* Chunk of size 1 is striding.  */
      striding = integer_onep (chunk_size);
      chunking = !striding;
    }
#endif

  /* striding=true, chunking=true
       -> invalid.
     striding=true, chunking=false
       -> chunks=1
     striding=false,chunking=true
       -> chunks=ceil (range/(chunksize*threads*step))
     striding=false,chunking=false
       -> chunk_size=ceil(range/(threads*step)),chunks=1  */
  push_gimplify_context (true);

  switch (code)
    {
    default: gcc_unreachable ();

    case IFN_GOACC_LOOP_CHUNKS:
      /* Number of chunks; 1 unless explicitly chunking.  */
      if (!chunking)
	r = build_int_cst (type, 1);
      else
	{
	  /* chunk_max
	     = (range - dir) / (chunks * step * num_threads) + dir  */
	  tree per = oacc_thread_numbers (false, mask, &seq);
	  per = fold_convert (type, per);
	  chunk_size = fold_convert (type, chunk_size);
	  per = fold_build2 (MULT_EXPR, type, per, chunk_size);
	  per = fold_build2 (MULT_EXPR, type, per, step);
	  r = build2 (MINUS_EXPR, type, range, dir);
	  r = build2 (PLUS_EXPR, type, r, per);
	  r = build2 (TRUNC_DIV_EXPR, type, r, per);
	}
      break;

    case IFN_GOACC_LOOP_STEP:
      {
	/* If striding, step by the entire compute volume, otherwise
	   step by the inner volume.  */
	unsigned volume = striding ? mask : inner_mask;

	r = oacc_thread_numbers (false, volume, &seq);
	r = build2 (MULT_EXPR, type, fold_convert (type, r), step);
      }
      break;

    case IFN_GOACC_LOOP_OFFSET:
      /* Enable vectorization on non-SIMT targets.  */
      if (!targetm.simt.vf
	  && outer_mask == GOMP_DIM_MASK (GOMP_DIM_VECTOR)
	  /* If not -fno-tree-loop-vectorize, hint that we want to vectorize
	     the loop.  */
	  && (flag_tree_loop_vectorize
	      || !global_options_set.x_flag_tree_loop_vectorize))
	{
	  basic_block bb = gsi_bb (gsi);
	  class loop *parent = bb->loop_father;
	  class loop *body = parent->inner;

	  parent->force_vectorize = true;
	  parent->safelen = INT_MAX;

	  /* "Chunking loops" may have inner loops.  */
	  if (parent->inner)
	    {
	      body->force_vectorize = true;
	      body->safelen = INT_MAX;
	    }

	  cfun->has_force_vectorize_loops = true;
	}
      if (striding)
	{
	  /* Striding: each thread starts at its own position.  */
	  r = oacc_thread_numbers (true, mask, &seq);
	  r = fold_convert (diff_type, r);
	}
      else
	{
	  /* Chunked (or whole-range) distribution: compute this
	     thread's starting iteration from the outer position times
	     the chunk span plus the inner position.  */
	  tree inner_size = oacc_thread_numbers (false, inner_mask, &seq);
	  tree outer_size = oacc_thread_numbers (false, outer_mask, &seq);
	  tree volume = fold_build2 (MULT_EXPR, TREE_TYPE (inner_size),
				     inner_size, outer_size);

	  volume = fold_convert (diff_type, volume);
	  if (chunking)
	    chunk_size = fold_convert (diff_type, chunk_size);
	  else
	    {
	      /* Not chunking: one chunk covering the whole range,
		 chunk_size = ceil (range / (threads * step)).  */
	      tree per = fold_build2 (MULT_EXPR, diff_type, volume, step);

	      chunk_size = build2 (MINUS_EXPR, diff_type, range, dir);
	      chunk_size = build2 (PLUS_EXPR, diff_type, chunk_size, per);
	      chunk_size = build2 (TRUNC_DIV_EXPR, diff_type, chunk_size, per);
	    }

	  tree span = build2 (MULT_EXPR, diff_type, chunk_size,
			      fold_convert (diff_type, inner_size));
	  r = oacc_thread_numbers (true, outer_mask, &seq);
	  r = fold_convert (diff_type, r);
	  r = build2 (MULT_EXPR, diff_type, r, span);

	  tree inner = oacc_thread_numbers (true, inner_mask, &seq);
	  inner = fold_convert (diff_type, inner);
	  r = fold_build2 (PLUS_EXPR, diff_type, r, inner);

	  if (chunking)
	    {
	      /* Advance by the chunk number (argument 6) times the
		 volume covered by one chunk across all threads.  */
	      tree chunk = fold_convert (diff_type, gimple_call_arg (call, 6));
	      tree per
		= fold_build2 (MULT_EXPR, diff_type, volume, chunk_size);
	      per = build2 (MULT_EXPR, diff_type, per, chunk);

	      r = build2 (PLUS_EXPR, diff_type, r, per);
	    }
	}
      r = fold_build2 (MULT_EXPR, diff_type, r, step);
      if (type != diff_type)
	r = fold_convert (type, r);
      break;

    case IFN_GOACC_LOOP_BOUND:
      if (striding)
	r = range;
      else
	{
	  /* Bound = offset + span * step, clamped to range.  */
	  tree inner_size = oacc_thread_numbers (false, inner_mask, &seq);
	  tree outer_size = oacc_thread_numbers (false, outer_mask, &seq);
	  tree volume = fold_build2 (MULT_EXPR, TREE_TYPE (inner_size),
				     inner_size, outer_size);

	  volume = fold_convert (diff_type, volume);
	  if (chunking)
	    chunk_size = fold_convert (diff_type, chunk_size);
	  else
	    {
	      /* Not chunking: mirror the OFFSET computation above.  */
	      tree per = fold_build2 (MULT_EXPR, diff_type, volume, step);

	      chunk_size = build2 (MINUS_EXPR, diff_type, range, dir);
	      chunk_size = build2 (PLUS_EXPR, diff_type, chunk_size, per);
	      chunk_size = build2 (TRUNC_DIV_EXPR, diff_type, chunk_size, per);
	    }

	  tree span = build2 (MULT_EXPR, diff_type, chunk_size,
			      fold_convert (diff_type, inner_size));

	  r = fold_build2 (MULT_EXPR, diff_type, span, step);

	  tree offset = gimple_call_arg (call, 6);
	  r = build2 (PLUS_EXPR, diff_type, r,
		      fold_convert (diff_type, offset));
	  /* Clamp with MIN for an upward loop, MAX for a downward one.  */
	  r = build2 (integer_onep (dir) ? MIN_EXPR : MAX_EXPR,
		      diff_type, r, range);
	}
      if (diff_type != type)
	r = fold_convert (type, r);
      break;
    }

  gimplify_assign (lhs, r, &seq);

  pop_gimplify_context (NULL);

  gsi_replace_with_seq (&gsi, seq, true);
}
522
523 /* Transform a GOACC_TILE call. Determines the element loop span for
524 the specified loop of the nest. This is 1 if we're not tiling.
525
526 GOACC_TILE (collapse_count, loop_no, tile_arg, gwv_tile, gwv_element); */
527
static void
oacc_xform_tile (gcall *call)
{
  gimple_stmt_iterator gsi = gsi_for_stmt (call);
  unsigned collapse = tree_to_uhwi (gimple_call_arg (call, 0));
  /* Inner loops have higher loop_nos.  */
  unsigned loop_no = tree_to_uhwi (gimple_call_arg (call, 1));
  tree tile_size = gimple_call_arg (call, 2);
  unsigned e_mask = tree_to_uhwi (gimple_call_arg (call, 4));
  tree lhs = gimple_call_lhs (call);
  tree type = TREE_TYPE (lhs);
  gimple_seq seq = NULL;
  tree span = build_int_cst (type, 1);

  /* Element loops may only be vector- and/or worker-partitioned.  */
  gcc_assert (!(e_mask
		& ~(GOMP_DIM_MASK (GOMP_DIM_VECTOR)
		    | GOMP_DIM_MASK (GOMP_DIM_WORKER))));
  push_gimplify_context (!seen_error ());

#ifndef ACCEL_COMPILER
  /* Partitioning disabled on host compilers.  */
  e_mask = 0;
#endif
  if (!e_mask)
    /* Not partitioning.  */
    span = integer_one_node;
  else if (!integer_zerop (tile_size))
    /* User explicitly specified size.  */
    span = tile_size;
  else
    {
      /* Pick a size based on the partitioning of the element loop and
	 the number of loop nests.  */
      tree first_size = NULL_TREE;
      tree second_size = NULL_TREE;

      if (e_mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR))
	first_size = oacc_dim_call (false, GOMP_DIM_VECTOR, &seq);
      if (e_mask & GOMP_DIM_MASK (GOMP_DIM_WORKER))
	second_size = oacc_dim_call (false, GOMP_DIM_WORKER, &seq);

      /* Worker-only partitioning: treat the worker size as first.  */
      if (!first_size)
	{
	  first_size = second_size;
	  second_size = NULL_TREE;
	}

      if (loop_no + 1 == collapse)
	{
	  /* Innermost loop: use the first size; a single-loop nest
	     (loop_no == 0 here implies collapse == 1) folds in the
	     second size as well.  */
	  span = first_size;
	  if (!loop_no && second_size)
	    span = fold_build2 (MULT_EXPR, TREE_TYPE (span),
				span, second_size);
	}
      else if (loop_no + 2 == collapse)
	span = second_size;
      else
	span = NULL_TREE;

      if (!span)
	/* There's no obvious element size for this loop.  Options
	   are 1, first_size or some non-unity constant (32 is my
	   favourite).  We should gather some statistics.  */
	span = first_size;
    }

  span = fold_convert (type, span);
  gimplify_assign (lhs, span, &seq);

  pop_gimplify_context (NULL);

  gsi_replace_with_seq (&gsi, seq, true);
}
601
/* Default partitioned and minimum partitioned dimensions.  Reset by
   oacc_parse_default_dims (-1 = unset default, minimum 1) and then
   validated via targetm.goacc.validate_dims.  */

static int oacc_default_dims[GOMP_DIM_MAX];
static int oacc_min_dims[GOMP_DIM_MAX];
606
607 int
oacc_get_default_dim(int dim)608 oacc_get_default_dim (int dim)
609 {
610 gcc_assert (0 <= dim && dim < GOMP_DIM_MAX);
611 return oacc_default_dims[dim];
612 }
613
614 int
oacc_get_min_dim(int dim)615 oacc_get_min_dim (int dim)
616 {
617 gcc_assert (0 <= dim && dim < GOMP_DIM_MAX);
618 return oacc_min_dims[dim];
619 }
620
621 /* Parse the default dimension parameter. This is a set of
622 :-separated optional compute dimensions. Each specified dimension
623 is a positive integer. When device type support is added, it is
624 planned to be a comma separated list of such compute dimensions,
625 with all but the first prefixed by the colon-terminated device
626 type. */
627
static void
oacc_parse_default_dims (const char *dims)
{
  int ix;

  /* Reset: -1 marks an unset default; minimum dimension size is 1.  */
  for (ix = GOMP_DIM_MAX; ix--;)
    {
      oacc_default_dims[ix] = -1;
      oacc_min_dims[ix] = 1;
    }

#ifndef ACCEL_COMPILER
  /* Cannot be overridden on the host.  */
  dims = NULL;
#endif
  if (dims)
    {
      const char *pos = dims;

      for (ix = 0; *pos && ix != GOMP_DIM_MAX; ix++)
	{
	  /* Dimensions after the first are introduced by ':'.  */
	  if (ix)
	    {
	      if (*pos != ':')
		goto malformed;
	      pos++;
	    }

	  /* An empty component (next char is ':') leaves this
	     dimension at its default.  */
	  if (*pos != ':')
	    {
	      long val;
	      const char *eptr;

	      errno = 0;
	      val = strtol (pos, CONST_CAST (char **, &eptr), 10);
	      /* Reject parse failures, non-positive values and values
		 that do not fit in an int.  */
	      if (errno || val <= 0 || (int) val != val)
		goto malformed;
	      pos = eptr;
	      oacc_default_dims[ix] = (int) val;
	    }
	}
      /* Trailing garbage (or too many components) is an error.  */
      if (*pos)
	{
	malformed:
	  error_at (UNKNOWN_LOCATION,
		    "%<-fopenacc-dim%> operand is malformed at %qs", pos);
	}
    }

  /* Allow the backend to validate the dimensions.  */
  targetm.goacc.validate_dims (NULL_TREE, oacc_default_dims, -1, 0);
  targetm.goacc.validate_dims (NULL_TREE, oacc_min_dims, -2, 0);
}
681
682 /* Validate and update the dimensions for offloaded FN. ATTRS is the
683 raw attribute. DIMS is an array of dimensions, which is filled in.
684 LEVEL is the partitioning level of a routine, or -1 for an offload
685 region itself. USED is the mask of partitioned execution in the
686 function. */
687
static void
oacc_validate_dims (tree fn, tree attrs, int *dims, int level, unsigned used)
{
  tree purpose[GOMP_DIM_MAX];
  unsigned ix;
  tree pos = TREE_VALUE (attrs);

  /* Make sure the attribute creator attached the dimension
     information.  */
  gcc_assert (pos);

  /* Unpack the attribute's TREE_LIST into DIMS, remembering each
     TREE_PURPOSE so the list can be rebuilt below.  -1 marks an
     unspecified dimension.  */
  for (ix = 0; ix != GOMP_DIM_MAX; ix++)
    {
      purpose[ix] = TREE_PURPOSE (pos);
      tree val = TREE_VALUE (pos);
      dims[ix] = val ? TREE_INT_CST_LOW (val) : -1;
      pos = TREE_CHAIN (pos);
    }

  bool changed = targetm.goacc.validate_dims (fn, dims, level, used);

  /* Default anything left to 1 or a partitioned default.  */
  for (ix = 0; ix != GOMP_DIM_MAX; ix++)
    if (dims[ix] < 0)
      {
	/* The OpenACC spec says 'If the [num_gangs] clause is not
	   specified, an implementation-defined default will be used;
	   the default may depend on the code within the construct.'
	   (2.5.6).  Thus an implementation is free to choose
	   non-unity default for a parallel region that doesn't have
	   any gang-partitioned loops.  However, it appears that there
	   is a sufficient body of user code that expects non-gang
	   partitioned regions to not execute in gang-redundant mode.
	   So we (a) don't warn about the non-portability and (b) pick
	   the minimum permissible dimension size when there is no
	   partitioned execution.  Otherwise we pick the global
	   default for the dimension, which the user can control.  The
	   same wording and logic applies to num_workers and
	   vector_length, however the worker- or vector- single
	   execution doesn't have the same impact as gang-redundant
	   execution.  (If the minimum gang-level partitioning is not 1,
	   the target is probably too confusing.)  */
	dims[ix] = (used & GOMP_DIM_MASK (ix)
		    ? oacc_default_dims[ix] : oacc_min_dims[ix]);
	changed = true;
      }

  if (changed)
    {
      /* Replace the attribute with new values.  */
      pos = NULL_TREE;
      for (ix = GOMP_DIM_MAX; ix--;)
	pos = tree_cons (purpose[ix],
			 build_int_cst (integer_type_node, dims[ix]), pos);
      oacc_replace_fn_attrib (fn, pos);
    }
}
745
746 /* Create an empty OpenACC loop structure at LOC. */
747
748 static oacc_loop *
new_oacc_loop_raw(oacc_loop * parent,location_t loc)749 new_oacc_loop_raw (oacc_loop *parent, location_t loc)
750 {
751 oacc_loop *loop = XCNEW (oacc_loop);
752
753 loop->parent = parent;
754
755 if (parent)
756 {
757 loop->sibling = parent->child;
758 parent->child = loop;
759 }
760
761 loop->loc = loc;
762 return loop;
763 }
764
765 /* Create an outermost, dummy OpenACC loop for offloaded function
766 DECL. */
767
768 static oacc_loop *
new_oacc_loop_outer(tree decl)769 new_oacc_loop_outer (tree decl)
770 {
771 return new_oacc_loop_raw (NULL, DECL_SOURCE_LOCATION (decl));
772 }
773
774 /* Start a new OpenACC loop structure beginning at head marker HEAD.
775 Link into PARENT loop. Return the new loop. */
776
777 static oacc_loop *
new_oacc_loop(oacc_loop * parent,gcall * marker)778 new_oacc_loop (oacc_loop *parent, gcall *marker)
779 {
780 oacc_loop *loop = new_oacc_loop_raw (parent, gimple_location (marker));
781
782 loop->marker = marker;
783
784 /* TODO: This is where device_type flattening would occur for the loop
785 flags. */
786
787 loop->flags = TREE_INT_CST_LOW (gimple_call_arg (marker, 3));
788
789 tree chunk_size = integer_zero_node;
790 if (loop->flags & OLF_GANG_STATIC)
791 chunk_size = gimple_call_arg (marker, 4);
792 loop->chunk_size = chunk_size;
793
794 return loop;
795 }
796
/* Create a dummy loop encompassing a call to an OpenACC routine.
798 Extract the routine's partitioning requirements. */
799
static void
new_oacc_loop_routine (oacc_loop *parent, gcall *call, tree decl, tree attrs)
{
  oacc_loop *loop = new_oacc_loop_raw (parent, gimple_location (call));
  int level = oacc_fn_attrib_level (attrs);

  /* DECL must be a routine, not an offload function.  */
  gcc_assert (level >= 0);

  loop->marker = call;
  loop->routine = decl;
  /* Set the mask bits for dimensions LEVEL .. GOMP_DIM_MAX-1, i.e.
     the axes at which the routine itself may partition.  */
  loop->mask = ((GOMP_DIM_MASK (GOMP_DIM_MAX) - 1)
		^ (GOMP_DIM_MASK (level) - 1));
}
813
814 /* Finish off the current OpenACC loop ending at tail marker TAIL.
815 Return the parent loop. */
816
817 static oacc_loop *
finish_oacc_loop(oacc_loop * loop)818 finish_oacc_loop (oacc_loop *loop)
819 {
820 /* If the loop has been collapsed, don't partition it. */
821 if (loop->ifns.is_empty ())
822 loop->mask = loop->flags = 0;
823 return loop->parent;
824 }
825
826 /* Free all OpenACC loop structures within LOOP (inclusive). */
827
828 static void
free_oacc_loop(oacc_loop * loop)829 free_oacc_loop (oacc_loop *loop)
830 {
831 if (loop->sibling)
832 free_oacc_loop (loop->sibling);
833 if (loop->child)
834 free_oacc_loop (loop->child);
835
836 loop->ifns.release ();
837 free (loop);
838 }
839
840 /* Dump out the OpenACC loop head or tail beginning at FROM. */
841
static void
dump_oacc_loop_part (FILE *file, gcall *from, int depth,
		     const char *title, int level)
{
  enum ifn_unique_kind kind
    = (enum ifn_unique_kind) TREE_INT_CST_LOW (gimple_call_arg (from, 0));

  fprintf (file, "%*s%s-%d:\n", depth * 2, "", title, level);
  /* Print statements from FROM up to (not including) the next
     IFN_UNIQUE marker of the same kind, which ends this sequence.  */
  for (gimple_stmt_iterator gsi = gsi_for_stmt (from);;)
    {
      gimple *stmt = gsi_stmt (gsi);

      if (gimple_call_internal_p (stmt, IFN_UNIQUE))
	{
	  enum ifn_unique_kind k
	    = ((enum ifn_unique_kind) TREE_INT_CST_LOW
	       (gimple_call_arg (stmt, 0)));

	  if (k == kind && stmt != from)
	    break;
	}
      print_gimple_stmt (file, stmt, depth * 2 + 2);

      gsi_next (&gsi);
      /* Follow the single successor when we fall off the end of a
	 block (possibly skipping empty blocks).  */
      while (gsi_end_p (gsi))
	gsi = gsi_start_bb (single_succ (gsi_bb (gsi)));
    }
}
870
871 /* Dump OpenACC loop LOOP, its children, and its siblings. */
872
static void
dump_oacc_loop (FILE *file, oacc_loop *loop, int depth)
{
  int ix;

  fprintf (file, "%*sLoop %x(%x) %s:%u\n", depth * 2, "",
	   loop->flags, loop->mask,
	   LOCATION_FILE (loop->loc), LOCATION_LINE (loop->loc));

  if (loop->marker)
    print_gimple_stmt (file, loop->marker, depth * 2);

  if (loop->routine)
    fprintf (file, "%*sRoutine %s:%u:%s\n",
	     depth * 2, "", DECL_SOURCE_FILE (loop->routine),
	     DECL_SOURCE_LINE (loop->routine),
	     IDENTIFIER_POINTER (DECL_NAME (loop->routine)));

  /* Dump head sequences outermost-first and tail sequences
     innermost-first, mirroring their execution order.  */
  for (ix = GOMP_DIM_GANG; ix != GOMP_DIM_MAX; ix++)
    if (loop->heads[ix])
      dump_oacc_loop_part (file, loop->heads[ix], depth, "Head", ix);
  for (ix = GOMP_DIM_MAX; ix--;)
    if (loop->tails[ix])
      dump_oacc_loop_part (file, loop->tails[ix], depth, "Tail", ix);

  if (loop->child)
    dump_oacc_loop (file, loop->child, depth + 1);
  if (loop->sibling)
    dump_oacc_loop (file, loop->sibling, depth);
}
903
904 void debug_oacc_loop (oacc_loop *);
905
906 /* Dump loops to stderr. */
907
DEBUG_FUNCTION void
debug_oacc_loop (oacc_loop *loop)
{
  /* Entry point for use from the debugger; dumps at depth 0.  */
  dump_oacc_loop (stderr, loop, 0);
}
913
914 /* Provide diagnostics on OpenACC loop LOOP, its children, and its
915 siblings. */
916
917 static void
inform_oacc_loop(const oacc_loop * loop)918 inform_oacc_loop (const oacc_loop *loop)
919 {
920 const char *gang
921 = loop->mask & GOMP_DIM_MASK (GOMP_DIM_GANG) ? " gang" : "";
922 const char *worker
923 = loop->mask & GOMP_DIM_MASK (GOMP_DIM_WORKER) ? " worker" : "";
924 const char *vector
925 = loop->mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR) ? " vector" : "";
926 const char *seq = loop->mask == 0 ? " seq" : "";
927 const dump_user_location_t loc
928 = dump_user_location_t::from_location_t (loop->loc);
929 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, loc,
930 "assigned OpenACC%s%s%s%s loop parallelism\n", gang, worker,
931 vector, seq);
932
933 if (loop->child)
934 inform_oacc_loop (loop->child);
935 if (loop->sibling)
936 inform_oacc_loop (loop->sibling);
937 }
938
939 /* DFS walk of basic blocks BB onwards, creating OpenACC loop
940 structures as we go. By construction these loops are properly
941 nested. */
942
static void
oacc_loop_discover_walk (oacc_loop *loop, basic_block bb)
{
  /* MARKER counts head/tail marker calls seen so far in the current
     sequence; REMAINING counts those still expected.  */
  int marker = 0;
  int remaining = 0;

  if (bb->flags & BB_VISITED)
    return;

 follow:
  bb->flags |= BB_VISITED;

  /* Scan for loop markers.  */
  for (gimple_stmt_iterator gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
       gsi_next (&gsi))
    {
      gimple *stmt = gsi_stmt (gsi);

      if (!is_gimple_call (stmt))
	continue;

      gcall *call = as_a <gcall *> (stmt);

      /* If this is a routine, make a dummy loop for it.  */
      if (tree decl = gimple_call_fndecl (call))
	if (tree attrs = oacc_get_fn_attrib (decl))
	  {
	    gcc_assert (!marker);
	    new_oacc_loop_routine (loop, call, decl, attrs);
	  }

      if (!gimple_call_internal_p (call))
	continue;

      switch (gimple_call_internal_fn (call))
	{
	default:
	  break;

	case IFN_GOACC_LOOP:
	case IFN_GOACC_TILE:
	  /* Record the abstraction function, so we can manipulate it
	     later.  */
	  loop->ifns.safe_push (call);
	  break;

	case IFN_UNIQUE:
	  enum ifn_unique_kind kind
	    = (enum ifn_unique_kind) (TREE_INT_CST_LOW
				      (gimple_call_arg (call, 0)));
	  if (kind == IFN_UNIQUE_OACC_HEAD_MARK
	      || kind == IFN_UNIQUE_OACC_TAIL_MARK)
	    {
	      /* A two-argument marker terminates the sequence.  */
	      if (gimple_call_num_args (call) == 2)
		{
		  gcc_assert (marker && !remaining);
		  marker = 0;
		  if (kind == IFN_UNIQUE_OACC_TAIL_MARK)
		    loop = finish_oacc_loop (loop);
		  else
		    loop->head_end = call;
		}
	      else
		{
		  /* Argument 2 carries the remaining marker count of
		     the sequence.  */
		  int count = TREE_INT_CST_LOW (gimple_call_arg (call, 2));

		  if (!marker)
		    {
		      /* The first head marker opens a new loop.  */
		      if (kind == IFN_UNIQUE_OACC_HEAD_MARK)
			loop = new_oacc_loop (loop, call);
		      remaining = count;
		    }
		  gcc_assert (count == remaining);
		  if (remaining)
		    {
		      remaining--;
		      /* Heads are recorded first-to-last, tails
			 last-to-first.  */
		      if (kind == IFN_UNIQUE_OACC_HEAD_MARK)
			loop->heads[marker] = call;
		      else
			loop->tails[remaining] = call;
		    }
		  marker++;
		}
	    }
	}
    }
  /* An unterminated marker sequence must continue in the fallthrough
     successor block.  */
  if (remaining || marker)
    {
      bb = single_succ (bb);
      gcc_assert (single_pred_p (bb) && !(bb->flags & BB_VISITED));
      goto follow;
    }

  /* Walk successor blocks.  */
  edge e;
  edge_iterator ei;

  FOR_EACH_EDGE (e, ei, bb->succs)
    oacc_loop_discover_walk (loop, e->dest);
}
1043
1044 /* LOOP is the first sibling. Reverse the order in place and return
1045 the new first sibling. Recurse to child loops. */
1046
1047 static oacc_loop *
oacc_loop_sibling_nreverse(oacc_loop * loop)1048 oacc_loop_sibling_nreverse (oacc_loop *loop)
1049 {
1050 oacc_loop *last = NULL;
1051 do
1052 {
1053 if (loop->child)
1054 loop->child = oacc_loop_sibling_nreverse (loop->child);
1055
1056 oacc_loop *next = loop->sibling;
1057 loop->sibling = last;
1058 last = loop;
1059 loop = next;
1060 }
1061 while (loop);
1062
1063 return last;
1064 }
1065
/* Discover the OpenACC loops marked up by HEAD and TAIL markers for
   the current function.  Returns the root of the loop tree.  */

static oacc_loop *
oacc_loop_discovery ()
{
  /* Clear basic block flags, in particular BB_VISITED which we're going to use
     in the following.  */
  clear_bb_flags ();

  /* The entire function is represented by an outermost dummy loop.  */
  oacc_loop *top = new_oacc_loop_outer (current_function_decl);
  oacc_loop_discover_walk (top, ENTRY_BLOCK_PTR_FOR_FN (cfun));

  /* The siblings were constructed in reverse order, reverse them so
     that diagnostics come out in an unsurprising order.  */
  top = oacc_loop_sibling_nreverse (top);

  return top;
}
1085
/* Transform the abstract internal function markers starting at FROM
   to be for partitioning level LEVEL.  Stop when we meet another HEAD
   or TAIL marker.  */

static void
oacc_loop_xform_head_tail (gcall *from, int level)
{
  /* The kind of FROM itself; reaching another marker of the same kind
     terminates the scan.  */
  enum ifn_unique_kind kind
    = (enum ifn_unique_kind) TREE_INT_CST_LOW (gimple_call_arg (from, 0));
  tree replacement = build_int_cst (unsigned_type_node, level);

  for (gimple_stmt_iterator gsi = gsi_for_stmt (from);;)
    {
      gimple *stmt = gsi_stmt (gsi);

      if (gimple_call_internal_p (stmt, IFN_UNIQUE))
	{
	  enum ifn_unique_kind k
	    = ((enum ifn_unique_kind)
	       TREE_INT_CST_LOW (gimple_call_arg (stmt, 0)));

	  /* FORK/JOIN carry their level in argument 2.  */
	  if (k == IFN_UNIQUE_OACC_FORK || k == IFN_UNIQUE_OACC_JOIN)
	    *gimple_call_arg_ptr (stmt, 2) = replacement;
	  else if (k == kind && stmt != from)
	    break;
	}
      else if (gimple_call_internal_p (stmt, IFN_GOACC_REDUCTION))
	/* Reductions carry their level in argument 3.  */
	*gimple_call_arg_ptr (stmt, 3) = replacement;

      gsi_next (&gsi);
      /* Fell off the end of the block: continue in the (single)
	 successor block.  */
      while (gsi_end_p (gsi))
	gsi = gsi_start_bb (single_succ (gsi_bb (gsi)));
    }
}
1120
/* Process the discovered OpenACC loops, setting the correct
   partitioning level etc.  */

static void
oacc_loop_process (oacc_loop *loop)
{
  /* Children are processed before this loop, siblings after it.  */
  if (loop->child)
    oacc_loop_process (loop->child);

  if (loop->mask && !loop->routine)
    {
      int ix;
      tree mask_arg = build_int_cst (unsigned_type_node, loop->mask);
      tree e_mask_arg = build_int_cst (unsigned_type_node, loop->e_mask);
      tree chunk_arg = loop->chunk_size;
      gcall *call;

      /* Fill the now-known partitioning masks (and chunk size) into
	 the recorded abstraction functions.  */
      for (ix = 0; loop->ifns.iterate (ix, &call); ix++)
	switch (gimple_call_internal_fn (call))
	  {
	  case IFN_GOACC_LOOP:
	    {
	      /* An argument 5 of -1 selects the element-loop mask.  */
	      bool is_e = gimple_call_arg (call, 5) == integer_minus_one_node;
	      gimple_call_set_arg (call, 5, is_e ? e_mask_arg : mask_arg);
	      if (!is_e)
		gimple_call_set_arg (call, 4, chunk_arg);
	    }
	    break;

	  case IFN_GOACC_TILE:
	    gimple_call_set_arg (call, 3, mask_arg);
	    gimple_call_set_arg (call, 4, e_mask_arg);
	    break;

	  default:
	    gcc_unreachable ();
	  }

      /* Rewrite the head and tail marker sequences with the concrete
	 dimension each uses, scanning the combined mask from the
	 gang level inwards.  */
      unsigned dim = GOMP_DIM_GANG;
      unsigned mask = loop->mask | loop->e_mask;
      for (ix = 0; ix != GOMP_DIM_MAX && mask; ix++)
	{
	  while (!(GOMP_DIM_MASK (dim) & mask))
	    dim++;

	  oacc_loop_xform_head_tail (loop->heads[ix], dim);
	  oacc_loop_xform_head_tail (loop->tails[ix], dim);

	  mask ^= GOMP_DIM_MASK (dim);
	}
    }

  if (loop->sibling)
    oacc_loop_process (loop->sibling);
}
1176
/* Walk the OpenACC loop hierarchy checking and assigning the
   programmer-specified partitionings.  OUTER_MASK is the partitioning
   this loop is contained within.  Return mask of partitioning
   encountered.  If any auto loops are discovered, set GOMP_DIM_MAX
   bit.  */

static unsigned
oacc_loop_fixed_partitions (oacc_loop *loop, unsigned outer_mask)
{
  unsigned this_mask = loop->mask;
  unsigned mask_all = 0;
  bool noisy = true;

#ifdef ACCEL_COMPILER
  /* When device_type is supported, we want the device compiler to be
     noisy, if the loop parameters are device_type-specific.  */
  noisy = false;
#endif

  if (!loop->routine)
    {
      bool auto_par = (loop->flags & OLF_AUTO) != 0;
      bool seq_par = (loop->flags & OLF_SEQ) != 0;
      bool tiling = (loop->flags & OLF_TILE) != 0;

      /* Extract the explicitly requested dimensions from the flag
	 word.  */
      this_mask = ((loop->flags >> OLF_DIM_BASE)
		   & (GOMP_DIM_MASK (GOMP_DIM_MAX) - 1));

      /* Apply auto partitioning if this is a non-partitioned regular
	 loop, or (no more than) single axis tiled loop.  */
      bool maybe_auto
	= !seq_par && this_mask == (tiling ? this_mask & -this_mask : 0);

      /* Explicit dimensions, 'auto' and 'seq' are mutually
	 exclusive.  */
      if ((this_mask != 0) + auto_par + seq_par > 1)
	{
	  if (noisy)
	    error_at (loop->loc,
		      seq_par
		      ? G_("%<seq%> overrides other OpenACC loop specifiers")
		      : G_("%<auto%> conflicts with other OpenACC loop "
			   "specifiers"));
	  maybe_auto = false;
	  loop->flags &= ~OLF_AUTO;
	  if (seq_par)
	    {
	      /* 'seq' wins; drop all the dimension bits.  */
	      loop->flags
		&= ~((GOMP_DIM_MASK (GOMP_DIM_MAX) - 1) << OLF_DIM_BASE);
	      this_mask = 0;
	    }
	}

      if (maybe_auto && (loop->flags & OLF_INDEPENDENT))
	{
	  /* Mark for auto partitioning; the caller is told via the
	     GOMP_DIM_MAX bit in the returned mask.  */
	  loop->flags |= OLF_AUTO;
	  mask_all |= GOMP_DIM_MASK (GOMP_DIM_MAX);
	}
    }

  if (this_mask & outer_mask)
    {
      /* A dimension requested here is already in use outside.  Find
	 whether a containing loop uses it, or it was disallowed by a
	 containing routine, for the appropriate diagnostic.  */
      const oacc_loop *outer;
      for (outer = loop->parent; outer; outer = outer->parent)
	if ((outer->mask | outer->e_mask) & this_mask)
	  break;

      if (noisy)
	{
	  if (outer)
	    {
	      error_at (loop->loc,
			loop->routine
			? G_("routine call uses same OpenACC parallelism"
			     " as containing loop")
			: G_("inner loop uses same OpenACC parallelism"
			     " as containing loop"));
	      inform (outer->loc, "containing loop here");
	    }
	  else
	    error_at (loop->loc,
		      loop->routine
		      ? G_("routine call uses OpenACC parallelism disallowed"
			   " by containing routine")
		      : G_("loop uses OpenACC parallelism disallowed"
			   " by containing routine"));

	  if (loop->routine)
	    inform (DECL_SOURCE_LOCATION (loop->routine),
		    "routine %qD declared here", loop->routine);
	}
      /* Discard the conflicting dimensions.  */
      this_mask &= ~outer_mask;
    }
  else
    {
      unsigned outermost = least_bit_hwi (this_mask);

      /* The outermost dimension requested here must be inner to
	 everything already used outside.  */
      if (outermost && outermost <= outer_mask)
	{
	  if (noisy)
	    {
	      error_at (loop->loc,
			"incorrectly nested OpenACC loop parallelism");

	      const oacc_loop *outer;
	      for (outer = loop->parent;
		   outer->flags && outer->flags < outermost;
		   outer = outer->parent)
		continue;
	      inform (outer->loc, "containing loop here");
	    }

	  this_mask &= ~outermost;
	}
    }

  mask_all |= this_mask;

  if (loop->flags & OLF_TILE)
    {
      /* When tiling, vector goes to the element loop, and failing
	 that we put worker there.  The std doesn't contemplate
	 specifying all three.  We choose to put worker and vector on
	 the element loops in that case.  */
      unsigned this_e_mask = this_mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR);
      if (!this_e_mask || this_mask & GOMP_DIM_MASK (GOMP_DIM_GANG))
	this_e_mask |= this_mask & GOMP_DIM_MASK (GOMP_DIM_WORKER);

      loop->e_mask = this_e_mask;
      this_mask ^= this_e_mask;
    }

  loop->mask = this_mask;

  if (dump_file)
    fprintf (dump_file, "Loop %s:%d user specified %d & %d\n",
	     LOCATION_FILE (loop->loc), LOCATION_LINE (loop->loc),
	     loop->mask, loop->e_mask);

  /* Children are nested within this loop's partitioning; siblings
     share OUTER_MASK.  */
  if (loop->child)
    {
      unsigned tmp_mask = outer_mask | this_mask | loop->e_mask;
      loop->inner = oacc_loop_fixed_partitions (loop->child, tmp_mask);
      mask_all |= loop->inner;
    }

  if (loop->sibling)
    mask_all |= oacc_loop_fixed_partitions (loop->sibling, outer_mask);

  return mask_all;
}
1326
/* Walk the OpenACC loop hierarchy to assign auto-partitioned loops.
   OUTER_MASK is the partitioning this loop is contained within.
   OUTER_ASSIGN is true if an outer loop is being auto-partitioned.
   Return the cumulative partitioning used by this loop, siblings and
   children.  */

static unsigned
oacc_loop_auto_partitions (oacc_loop *loop, unsigned outer_mask,
			   bool outer_assign)
{
  bool assign = (loop->flags & OLF_AUTO) && (loop->flags & OLF_INDEPENDENT);
  bool noisy = true;
  bool tiling = loop->flags & OLF_TILE;

#ifdef ACCEL_COMPILER
  /* When device_type is supported, we want the device compiler to be
     noisy, if the loop parameters are device_type-specific.  */
  noisy = false;
#endif

  if (assign && (!outer_assign || loop->inner))
    {
      /* Allocate outermost and non-innermost loops at the outermost
	 non-innermost available level.  */
      unsigned this_mask = GOMP_DIM_MASK (GOMP_DIM_GANG);

      /* Find the first outermost available partition.  */
      while (this_mask <= outer_mask)
	this_mask <<= 1;

      /* Grab two axes if tiling, and we've not assigned anything  */
      if (tiling && !(loop->mask | loop->e_mask))
	this_mask |= this_mask << 1;

      /* Prohibit the innermost partitioning at the moment.  */
      this_mask &= GOMP_DIM_MASK (GOMP_DIM_MAX - 1) - 1;

      /* Don't use any dimension explicitly claimed by an inner loop. */
      this_mask &= ~loop->inner;

      if (tiling && !loop->e_mask)
	{
	  /* If we got two axes, allocate the inner one to the element
	     loop.  */
	  loop->e_mask = this_mask & (this_mask << 1);
	  this_mask ^= loop->e_mask;
	}

      loop->mask |= this_mask;
    }

  /* Visit children first so LOOP->INNER is known before the
     innermost-level assignment below.  */
  if (loop->child)
    {
      unsigned tmp_mask = outer_mask | loop->mask | loop->e_mask;
      loop->inner = oacc_loop_auto_partitions (loop->child, tmp_mask,
					       outer_assign | assign);
    }

  if (assign && (!loop->mask || (tiling && !loop->e_mask) || !outer_assign))
    {
      /* Allocate the loop at the innermost available level.  Note
	 that we do this even if we already assigned this loop the
	 outermost available level above.  That way we'll partition
	 this along 2 axes, if they are available.  */
      unsigned this_mask = 0;

      /* Determine the outermost partitioning used within this loop.  */
      this_mask = loop->inner | GOMP_DIM_MASK (GOMP_DIM_MAX);
      this_mask = least_bit_hwi (this_mask);

      /* Pick the partitioning just inside that one.  */
      this_mask >>= 1;

      /* And avoid picking one use by an outer loop.  */
      this_mask &= ~outer_mask;

      /* If tiling and we failed completely above, grab the next one
	 too.  Making sure it doesn't hit an outer loop.  */
      if (tiling)
	{
	  this_mask &= ~(loop->e_mask | loop->mask);
	  unsigned tile_mask = ((this_mask >> 1)
				& ~(outer_mask | loop->e_mask | loop->mask));

	  if (tile_mask || loop->mask)
	    {
	      loop->e_mask |= this_mask;
	      this_mask = tile_mask;
	    }
	  if (!loop->e_mask && noisy)
	    warning_at (loop->loc, 0,
			"insufficient partitioning available"
			" to parallelize element loop");
	}

      loop->mask |= this_mask;
      if (!loop->mask && noisy)
	warning_at (loop->loc, 0,
		    tiling
		    ? G_("insufficient partitioning available"
			 " to parallelize tile loop")
		    : G_("insufficient partitioning available"
			 " to parallelize loop"));
    }

  if (assign && dump_file)
    fprintf (dump_file, "Auto loop %s:%d assigned %d & %d\n",
	     LOCATION_FILE (loop->loc), LOCATION_LINE (loop->loc),
	     loop->mask, loop->e_mask);

  unsigned inner_mask = 0;

  if (loop->sibling)
    inner_mask |= oacc_loop_auto_partitions (loop->sibling,
					     outer_mask, outer_assign);

  inner_mask |= loop->inner | loop->mask | loop->e_mask;

  return inner_mask;
}
1447
1448 /* Walk the OpenACC loop heirarchy to check and assign partitioning
1449 axes. Return mask of partitioning. */
1450
1451 static unsigned
oacc_loop_partition(oacc_loop * loop,unsigned outer_mask)1452 oacc_loop_partition (oacc_loop *loop, unsigned outer_mask)
1453 {
1454 unsigned mask_all = oacc_loop_fixed_partitions (loop, outer_mask);
1455
1456 if (mask_all & GOMP_DIM_MASK (GOMP_DIM_MAX))
1457 {
1458 mask_all ^= GOMP_DIM_MASK (GOMP_DIM_MAX);
1459 mask_all |= oacc_loop_auto_partitions (loop, outer_mask, false);
1460 }
1461 return mask_all;
1462 }
1463
1464 /* Default fork/join early expander. Delete the function calls if
1465 there is no RTL expander. */
1466
1467 bool
default_goacc_fork_join(gcall * ARG_UNUSED (call),const int * ARG_UNUSED (dims),bool is_fork)1468 default_goacc_fork_join (gcall *ARG_UNUSED (call),
1469 const int *ARG_UNUSED (dims), bool is_fork)
1470 {
1471 if (is_fork)
1472 return targetm.have_oacc_fork ();
1473 else
1474 return targetm.have_oacc_join ();
1475 }
1476
/* Default goacc.reduction early expander.

   LHS-opt = IFN_REDUCTION (KIND, RES_PTR, VAR, LEVEL, OP, OFFSET)
   If RES_PTR is not integer-zerop:
       SETUP - emit 'LHS = *RES_PTR', LHS = NULL
       TEARDOWN - emit '*RES_PTR = VAR'
   If LHS is not NULL
       emit 'LHS = VAR'   */

void
default_goacc_reduction (gcall *call)
{
  unsigned code = (unsigned)TREE_INT_CST_LOW (gimple_call_arg (call, 0));
  gimple_stmt_iterator gsi = gsi_for_stmt (call);
  tree lhs = gimple_call_lhs (call);
  tree var = gimple_call_arg (call, 2);
  gimple_seq seq = NULL;

  if (code == IFN_GOACC_REDUCTION_SETUP
      || code == IFN_GOACC_REDUCTION_TEARDOWN)
    {
      /* Setup and Teardown need to copy from/to the receiver object,
	 if there is one.  */
      tree ref_to_res = gimple_call_arg (call, 1);

      if (!integer_zerop (ref_to_res))
	{
	  tree dst = build_simple_mem_ref (ref_to_res);
	  tree src = var;

	  if (code == IFN_GOACC_REDUCTION_SETUP)
	    {
	      /* SETUP copies in the other direction, and the LHS is
		 satisfied here instead of below.  */
	      src = dst;
	      dst = lhs;
	      lhs = NULL;
	    }
	  gimple_seq_add_stmt (&seq, gimple_build_assign (dst, src));
	}
    }

  /* Copy VAR to LHS, if there is an LHS.  */
  if (lhs)
    gimple_seq_add_stmt (&seq, gimple_build_assign (lhs, var));

  /* Replace the internal-function call with the (possibly empty)
     assignment sequence.  */
  gsi_replace_with_seq (&gsi, seq, true);
}
1523
/* Main entry point for oacc transformations which run on the device
   compiler after LTO, so we know what the target device is at this
   point (including the host fallback).  */

static unsigned int
execute_oacc_device_lower ()
{
  tree attrs = oacc_get_fn_attrib (current_function_decl);

  if (!attrs)
    /* Not an offloaded function.  */
    return 0;

  /* Parse the default dim argument exactly once; afterwards
     flag_openacc_dims is pointed at itself as a "done" sentinel.  */
  if ((const void *)flag_openacc_dims != &flag_openacc_dims)
    {
      oacc_parse_default_dims (flag_openacc_dims);
      flag_openacc_dims = (char *)&flag_openacc_dims;
    }

  /* Classify the offload region kind; exactly one must hold.  */
  bool is_oacc_parallel
    = (lookup_attribute ("oacc parallel",
			 DECL_ATTRIBUTES (current_function_decl)) != NULL);
  bool is_oacc_kernels
    = (lookup_attribute ("oacc kernels",
			 DECL_ATTRIBUTES (current_function_decl)) != NULL);
  bool is_oacc_serial
    = (lookup_attribute ("oacc serial",
			 DECL_ATTRIBUTES (current_function_decl)) != NULL);
  int fn_level = oacc_fn_attrib_level (attrs);
  bool is_oacc_routine = (fn_level >= 0);
  gcc_checking_assert (is_oacc_parallel
		       + is_oacc_kernels
		       + is_oacc_serial
		       + is_oacc_routine
		       == 1);

  bool is_oacc_kernels_parallelized
    = (lookup_attribute ("oacc kernels parallelized",
			 DECL_ATTRIBUTES (current_function_decl)) != NULL);
  if (is_oacc_kernels_parallelized)
    gcc_checking_assert (is_oacc_kernels);

  if (dump_file)
    {
      if (is_oacc_parallel)
	fprintf (dump_file, "Function is OpenACC parallel offload\n");
      else if (is_oacc_kernels)
	fprintf (dump_file, "Function is %s OpenACC kernels offload\n",
		 (is_oacc_kernels_parallelized
		  ? "parallelized" : "unparallelized"));
      else if (is_oacc_serial)
	fprintf (dump_file, "Function is OpenACC serial offload\n");
      else if (is_oacc_routine)
	fprintf (dump_file, "Function is OpenACC routine level %d\n",
		 fn_level);
      else
	gcc_unreachable ();
    }

  /* Unparallelized OpenACC kernels constructs must get launched as 1 x 1 x 1
     kernels, so remove the parallelism dimensions function attributes
     potentially set earlier on.  */
  if (is_oacc_kernels && !is_oacc_kernels_parallelized)
    {
      oacc_set_fn_attrib (current_function_decl, NULL, NULL);
      attrs = oacc_get_fn_attrib (current_function_decl);
    }

  /* Discover, partition and process the loops.  */
  oacc_loop *loops = oacc_loop_discovery ();

  /* For a routine, the dimensions outside FN_LEVEL count as already
     in use.  */
  unsigned outer_mask = 0;
  if (is_oacc_routine)
    outer_mask = GOMP_DIM_MASK (fn_level) - 1;
  unsigned used_mask = oacc_loop_partition (loops, outer_mask);
  /* OpenACC kernels constructs are special: they currently don't use the
     generic oacc_loop infrastructure and attribute/dimension processing.  */
  if (is_oacc_kernels && is_oacc_kernels_parallelized)
    {
      /* Parallelized OpenACC kernels constructs use gang parallelism.  See
	 also tree-parloops.c:create_parallel_loop.  */
      used_mask |= GOMP_DIM_MASK (GOMP_DIM_GANG);
    }

  int dims[GOMP_DIM_MAX];
  oacc_validate_dims (current_function_decl, attrs, dims, fn_level, used_mask);

  if (dump_file)
    {
      const char *comma = "Compute dimensions [";
      for (int ix = 0; ix != GOMP_DIM_MAX; ix++, comma = ", ")
	fprintf (dump_file, "%s%d", comma, dims[ix]);
      fprintf (dump_file, "]\n");
    }

  oacc_loop_process (loops);
  if (dump_file)
    {
      fprintf (dump_file, "OpenACC loops\n");
      dump_oacc_loop (dump_file, loops, 0);
      fprintf (dump_file, "\n");
    }
  if (dump_enabled_p ())
    {
      oacc_loop *l = loops;
      /* OpenACC kernels constructs are special: they currently don't use the
	 generic oacc_loop infrastructure.  */
      if (is_oacc_kernels)
	{
	  /* Create a fake oacc_loop for diagnostic purposes.  */
	  l = new_oacc_loop_raw (NULL,
				 DECL_SOURCE_LOCATION (current_function_decl));
	  l->mask = used_mask;
	}
      else
	{
	  /* Skip the outermost, dummy OpenACC loop  */
	  l = l->child;
	}
      if (l)
	inform_oacc_loop (l);
      if (is_oacc_kernels)
	free_oacc_loop (l);
    }

  /* Offloaded targets may introduce new basic blocks, which require
     dominance information to update SSA.  */
  calculate_dominance_info (CDI_DOMINATORS);

  /* Now lower internal loop functions to target-specific code
     sequences.  */
  basic_block bb;
  FOR_ALL_BB_FN (bb, cfun)
    for (gimple_stmt_iterator gsi = gsi_start_bb (bb); !gsi_end_p (gsi);)
      {
	gimple *stmt = gsi_stmt (gsi);
	if (!is_gimple_call (stmt))
	  {
	    gsi_next (&gsi);
	    continue;
	  }

	gcall *call = as_a <gcall *> (stmt);
	if (!gimple_call_internal_p (call))
	  {
	    gsi_next (&gsi);
	    continue;
	  }

	/* Rewind to allow rescan.  */
	gsi_prev (&gsi);
	bool rescan = false, remove = false;
	enum internal_fn ifn_code = gimple_call_internal_fn (call);

	switch (ifn_code)
	  {
	  default: break;

	  case IFN_GOACC_TILE:
	    oacc_xform_tile (call);
	    rescan = true;
	    break;

	  case IFN_GOACC_LOOP:
	    oacc_xform_loop (call);
	    rescan = true;
	    break;

	  case IFN_GOACC_REDUCTION:
	    /* Mark the function for SSA renaming.  */
	    mark_virtual_operands_for_renaming (cfun);

	    /* If the level is -1, this ended up being an unused
	       axis.  Handle as a default.  */
	    if (integer_minus_onep (gimple_call_arg (call, 3)))
	      default_goacc_reduction (call);
	    else
	      targetm.goacc.reduction (call);
	    rescan = true;
	    break;

	  case IFN_UNIQUE:
	    {
	      enum ifn_unique_kind kind
		= ((enum ifn_unique_kind)
		   TREE_INT_CST_LOW (gimple_call_arg (call, 0)));

	      switch (kind)
		{
		default:
		  break;

		case IFN_UNIQUE_OACC_FORK:
		case IFN_UNIQUE_OACC_JOIN:
		  /* A level of -1 marks an unused axis; otherwise the
		     target hook decides whether the call survives.  */
		  if (integer_minus_onep (gimple_call_arg (call, 2)))
		    remove = true;
		  else if (!targetm.goacc.fork_join
			   (call, dims, kind == IFN_UNIQUE_OACC_FORK))
		    remove = true;
		  break;

		case IFN_UNIQUE_OACC_HEAD_MARK:
		case IFN_UNIQUE_OACC_TAIL_MARK:
		  /* Markers have served their purpose; drop them.  */
		  remove = true;
		  break;
		}
	      break;
	    }
	  }

	if (gsi_end_p (gsi))
	  /* We rewound past the beginning of the BB.  */
	  gsi = gsi_start_bb (bb);
	else
	  /* Undo the rewind.  */
	  gsi_next (&gsi);

	if (remove)
	  {
	    if (gimple_vdef (call))
	      replace_uses_by (gimple_vdef (call), gimple_vuse (call));
	    if (gimple_call_lhs (call))
	      {
		/* Propagate the data dependency var.  */
		gimple *ass = gimple_build_assign (gimple_call_lhs (call),
						   gimple_call_arg (call, 1));
		gsi_replace (&gsi, ass, false);
	      }
	    else
	      gsi_remove (&gsi, true);
	  }
	else if (!rescan)
	  /* If not rescanning, advance over the call.  */
	  gsi_next (&gsi);
      }

  free_oacc_loop (loops);

  return 0;
}
1765
1766 /* Default launch dimension validator. Force everything to 1. A
1767 backend that wants to provide larger dimensions must override this
1768 hook. */
1769
1770 bool
default_goacc_validate_dims(tree ARG_UNUSED (decl),int * dims,int ARG_UNUSED (fn_level),unsigned ARG_UNUSED (used))1771 default_goacc_validate_dims (tree ARG_UNUSED (decl), int *dims,
1772 int ARG_UNUSED (fn_level),
1773 unsigned ARG_UNUSED (used))
1774 {
1775 bool changed = false;
1776
1777 for (unsigned ix = 0; ix != GOMP_DIM_MAX; ix++)
1778 {
1779 if (dims[ix] != 1)
1780 {
1781 dims[ix] = 1;
1782 changed = true;
1783 }
1784 }
1785
1786 return changed;
1787 }
1788
/* Default dimension bound is unknown on accelerator and 1 on host.  */

int
default_goacc_dim_limit (int ARG_UNUSED (axis))
{
#ifdef ACCEL_COMPILER
  /* Zero encodes "no known bound" for the accelerator.  */
  return 0;
#else
  /* The host fallback runs everything with dimension 1.  */
  return 1;
#endif
}
1800
namespace {

/* Pass descriptor for the OpenACC device lowering pass below.  */

const pass_data pass_data_oacc_device_lower =
{
  GIMPLE_PASS, /* type */
  "oaccdevlow", /* name */
  OPTGROUP_OMP, /* optinfo_flags */
  TV_NONE, /* tv_id */
  PROP_cfg, /* properties_required */
  0 /* Possibly PROP_gimple_eomp.  */, /* properties_provided */
  0, /* properties_destroyed */
  0, /* todo_flags_start */
  TODO_update_ssa | TODO_cleanup_cfg, /* todo_flags_finish */
};

/* GIMPLE pass wrapper around execute_oacc_device_lower.  */

class pass_oacc_device_lower : public gimple_opt_pass
{
public:
  pass_oacc_device_lower (gcc::context *ctxt)
    : gimple_opt_pass (pass_data_oacc_device_lower, ctxt)
  {}

  /* opt_pass methods: */
  /* Run only when compiling for OpenACC.  */
  virtual bool gate (function *) { return flag_openacc; };

  virtual unsigned int execute (function *)
    {
      return execute_oacc_device_lower ();
    }

}; // class pass_oacc_device_lower

} // anon namespace
1834
/* Instantiate the OpenACC device lowering pass for the pass
   manager.  */

gimple_opt_pass *
make_pass_oacc_device_lower (gcc::context *ctxt)
{
  return new pass_oacc_device_lower (ctxt);
}
1840
1841
/* Rewrite GOMP_SIMT_ENTER_ALLOC call given by GSI and remove the preceding
   GOMP_SIMT_ENTER call identifying the privatized variables, which are
   turned to structure fields and receive a DECL_VALUE_EXPR accordingly.
   Set *REGIMPLIFY to true, except if no privatized variables were seen.  */

static void
ompdevlow_adjust_simt_enter (gimple_stmt_iterator *gsi, bool *regimplify)
{
  gimple *alloc_stmt = gsi_stmt (*gsi);
  tree simtrec = gimple_call_lhs (alloc_stmt);
  tree simduid = gimple_call_arg (alloc_stmt, 0);
  /* The ALLOC call's first argument is the SSA name defined by the
     matching GOMP_SIMT_ENTER call.  */
  gimple *enter_stmt = SSA_NAME_DEF_STMT (simduid);
  gcc_assert (gimple_call_internal_p (enter_stmt, IFN_GOMP_SIMT_ENTER));
  /* Build an artificial record type collecting the privatized
     variables as fields.  */
  tree rectype = lang_hooks.types.make_type (RECORD_TYPE);
  TYPE_ARTIFICIAL (rectype) = TYPE_NAMELESS (rectype) = 1;
  TREE_ADDRESSABLE (rectype) = 1;
  TREE_TYPE (simtrec) = build_pointer_type (rectype);
  for (unsigned i = 1; i < gimple_call_num_args (enter_stmt); i++)
    {
      /* Arguments 1.. of the ENTER call are '&var' per privatized
	 variable, or a null pointer for entries to skip.  */
      tree *argp = gimple_call_arg_ptr (enter_stmt, i);
      if (*argp == null_pointer_node)
	continue;
      gcc_assert (TREE_CODE (*argp) == ADDR_EXPR
		  && VAR_P (TREE_OPERAND (*argp, 0)));
      tree var = TREE_OPERAND (*argp, 0);

      /* Create a field mirroring the variable's name, type, alignment
	 and volatility.  */
      tree field = build_decl (DECL_SOURCE_LOCATION (var), FIELD_DECL,
			       DECL_NAME (var), TREE_TYPE (var));
      SET_DECL_ALIGN (field, DECL_ALIGN (var));
      DECL_USER_ALIGN (field) = DECL_USER_ALIGN (var);
      TREE_THIS_VOLATILE (field) = TREE_THIS_VOLATILE (var);

      insert_field_into_struct (rectype, field);

      /* Give VAR a DECL_VALUE_EXPR of SIMTREC->FIELD so its uses
	 resolve into the record.  */
      tree t = build_simple_mem_ref (simtrec);
      t = build3 (COMPONENT_REF, TREE_TYPE (var), t, field, NULL);
      TREE_THIS_VOLATILE (t) = TREE_THIS_VOLATILE (var);
      SET_DECL_VALUE_EXPR (var, t);
      DECL_HAS_VALUE_EXPR_P (var) = 1;
      *regimplify = true;
    }
  layout_type (rectype);
  tree size = TYPE_SIZE_UNIT (rectype);
  tree align = build_int_cst (TREE_TYPE (size), TYPE_ALIGN_UNIT (rectype));

  /* Replace the ALLOC call with one carrying the record's concrete
     size and alignment.  */
  alloc_stmt
    = gimple_build_call_internal (IFN_GOMP_SIMT_ENTER_ALLOC, 2, size, align);
  gimple_call_set_lhs (alloc_stmt, simtrec);
  gsi_replace (gsi, alloc_stmt, false);
  /* The ENTER call degenerates to a plain copy of its first
     argument.  */
  gimple_stmt_iterator enter_gsi = gsi_for_stmt (enter_stmt);
  enter_stmt = gimple_build_assign (simduid, gimple_call_arg (enter_stmt, 0));
  gsi_replace (&enter_gsi, enter_stmt, false);

  use_operand_p use;
  gimple *exit_stmt;
  if (single_imm_use (simtrec, &use, &exit_stmt))
    {
      /* Clobber the record at the matching GOMP_SIMT_EXIT.  */
      gcc_assert (gimple_call_internal_p (exit_stmt, IFN_GOMP_SIMT_EXIT));
      gimple_stmt_iterator exit_gsi = gsi_for_stmt (exit_stmt);
      tree clobber = build_clobber (rectype);
      exit_stmt = gimple_build_assign (build_simple_mem_ref (simtrec), clobber);
      gsi_insert_before (&exit_gsi, exit_stmt, GSI_SAME_STMT);
    }
  else
    gcc_checking_assert (has_zero_uses (simtrec));
}
1908
1909 /* Callback for walk_gimple_stmt used to scan for SIMT-privatized variables. */
1910
1911 static tree
find_simtpriv_var_op(tree * tp,int * walk_subtrees,void *)1912 find_simtpriv_var_op (tree *tp, int *walk_subtrees, void *)
1913 {
1914 tree t = *tp;
1915
1916 if (VAR_P (t)
1917 && DECL_HAS_VALUE_EXPR_P (t)
1918 && lookup_attribute ("omp simt private", DECL_ATTRIBUTES (t)))
1919 {
1920 *walk_subtrees = 0;
1921 return t;
1922 }
1923 return NULL_TREE;
1924 }
1925
/* Cleanup uses of SIMT placeholder internal functions: on non-SIMT targets,
   VF is 1 and LANE is 0; on SIMT targets, VF is folded to a constant, and
   LANE is kept to be expanded to RTL later on.  Also cleanup all other SIMT
   internal functions on non-SIMT targets, and likewise some SIMD internal
   functions on SIMT targets.  */

static unsigned int
execute_omp_device_lower ()
{
  /* VF == 1 indicates a target without SIMT support.  */
  int vf = targetm.simt.vf ? targetm.simt.vf () : 1;
  bool regimplify = false;
  basic_block bb;
  gimple_stmt_iterator gsi;
  FOR_EACH_BB_FN (bb, cfun)
    for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
      {
	gimple *stmt = gsi_stmt (gsi);
	if (!is_gimple_call (stmt) || !gimple_call_internal_p (stmt))
	  continue;
	/* RHS stays NULL_TREE for a call that must be kept as-is.  */
	tree lhs = gimple_call_lhs (stmt), rhs = NULL_TREE;
	tree type = lhs ? TREE_TYPE (lhs) : integer_type_node;
	switch (gimple_call_internal_fn (stmt))
	  {
	  case IFN_GOMP_USE_SIMT:
	    rhs = vf == 1 ? integer_zero_node : integer_one_node;
	    break;
	  case IFN_GOMP_SIMT_ENTER:
	    rhs = vf == 1 ? gimple_call_arg (stmt, 0) : NULL_TREE;
	    goto simtreg_enter_exit;
	  case IFN_GOMP_SIMT_ENTER_ALLOC:
	    if (vf != 1)
	      ompdevlow_adjust_simt_enter (&gsi, &regimplify);
	    rhs = vf == 1 ? null_pointer_node : NULL_TREE;
	    goto simtreg_enter_exit;
	  case IFN_GOMP_SIMT_EXIT:
	  simtreg_enter_exit:
	    if (vf != 1)
	      continue;
	    /* Dropping the call: detach its virtual operands.  */
	    unlink_stmt_vdef (stmt);
	    break;
	  case IFN_GOMP_SIMT_LANE:
	  case IFN_GOMP_SIMT_LAST_LANE:
	    rhs = vf == 1 ? build_zero_cst (type) : NULL_TREE;
	    break;
	  case IFN_GOMP_SIMT_VF:
	    rhs = build_int_cst (type, vf);
	    break;
	  case IFN_GOMP_SIMT_ORDERED_PRED:
	    rhs = vf == 1 ? integer_zero_node : NULL_TREE;
	    if (rhs || !lhs)
	      unlink_stmt_vdef (stmt);
	    break;
	  case IFN_GOMP_SIMT_VOTE_ANY:
	  case IFN_GOMP_SIMT_XCHG_BFLY:
	  case IFN_GOMP_SIMT_XCHG_IDX:
	    /* On non-SIMT targets these reduce to their first
	       argument.  */
	    rhs = vf == 1 ? gimple_call_arg (stmt, 0) : NULL_TREE;
	    break;
	  case IFN_GOMP_SIMD_LANE:
	  case IFN_GOMP_SIMD_LAST_LANE:
	    rhs = vf != 1 ? build_zero_cst (type) : NULL_TREE;
	    break;
	  case IFN_GOMP_SIMD_VF:
	    rhs = vf != 1 ? build_one_cst (type) : NULL_TREE;
	    break;
	  default:
	    continue;
	  }
	if (lhs && !rhs)
	  continue;
	/* Fold the call to a plain assignment, or drop it.  */
	stmt = lhs ? gimple_build_assign (lhs, rhs) : gimple_build_nop ();
	gsi_replace (&gsi, stmt, false);
      }
  if (regimplify)
    /* Remove clobbers of SIMT-privatized variables and regimplify
       their remaining uses, so the DECL_VALUE_EXPRs installed by
       ompdevlow_adjust_simt_enter take effect.  */
    FOR_EACH_BB_REVERSE_FN (bb, cfun)
      for (gsi = gsi_last_bb (bb); !gsi_end_p (gsi); gsi_prev (&gsi))
	if (walk_gimple_stmt (&gsi, NULL, find_simtpriv_var_op, NULL))
	  {
	    if (gimple_clobber_p (gsi_stmt (gsi)))
	      gsi_remove (&gsi, true);
	    else
	      gimple_regimplify_operands (gsi_stmt (gsi), &gsi);
	  }
  if (vf != 1)
    cfun->has_force_vectorize_loops = false;
  return 0;
}
2012
namespace {

/* Pass descriptor for the OpenMP device lowering pass below.  */

const pass_data pass_data_omp_device_lower =
{
  GIMPLE_PASS, /* type */
  "ompdevlow", /* name */
  OPTGROUP_OMP, /* optinfo_flags */
  TV_NONE, /* tv_id */
  PROP_cfg, /* properties_required */
  PROP_gimple_lomp_dev, /* properties_provided */
  0, /* properties_destroyed */
  0, /* todo_flags_start */
  TODO_update_ssa, /* todo_flags_finish */
};

/* GIMPLE pass wrapper around execute_omp_device_lower.  */

class pass_omp_device_lower : public gimple_opt_pass
{
public:
  pass_omp_device_lower (gcc::context *ctxt)
    : gimple_opt_pass (pass_data_omp_device_lower, ctxt)
  {}

  /* opt_pass methods: */
  /* Run only if the function hasn't been device-lowered already.  */
  virtual bool gate (function *fun)
    {
      return !(fun->curr_properties & PROP_gimple_lomp_dev);
    }
  virtual unsigned int execute (function *)
    {
      return execute_omp_device_lower ();
    }

}; // class pass_omp_device_lower

} // anon namespace
2048
/* Instantiate the OpenMP device lowering pass for the pass
   manager.  */

gimple_opt_pass *
make_pass_omp_device_lower (gcc::context *ctxt)
{
  return new pass_omp_device_lower (ctxt);
}
2054
2055 /* "omp declare target link" handling pass. */
2056
2057 namespace {
2058
2059 const pass_data pass_data_omp_target_link =
2060 {
2061 GIMPLE_PASS, /* type */
2062 "omptargetlink", /* name */
2063 OPTGROUP_OMP, /* optinfo_flags */
2064 TV_NONE, /* tv_id */
2065 PROP_ssa, /* properties_required */
2066 0, /* properties_provided */
2067 0, /* properties_destroyed */
2068 0, /* todo_flags_start */
2069 TODO_update_ssa, /* todo_flags_finish */
2070 };
2071
2072 class pass_omp_target_link : public gimple_opt_pass
2073 {
2074 public:
pass_omp_target_link(gcc::context * ctxt)2075 pass_omp_target_link (gcc::context *ctxt)
2076 : gimple_opt_pass (pass_data_omp_target_link, ctxt)
2077 {}
2078
2079 /* opt_pass methods: */
gate(function * fun)2080 virtual bool gate (function *fun)
2081 {
2082 #ifdef ACCEL_COMPILER
2083 return offloading_function_p (fun->decl);
2084 #else
2085 (void) fun;
2086 return false;
2087 #endif
2088 }
2089
2090 virtual unsigned execute (function *);
2091 };
2092
2093 /* Callback for walk_gimple_stmt used to scan for link var operands. */
2094
2095 static tree
find_link_var_op(tree * tp,int * walk_subtrees,void *)2096 find_link_var_op (tree *tp, int *walk_subtrees, void *)
2097 {
2098 tree t = *tp;
2099
2100 if (VAR_P (t)
2101 && DECL_HAS_VALUE_EXPR_P (t)
2102 && is_global_var (t)
2103 && lookup_attribute ("omp declare target link", DECL_ATTRIBUTES (t)))
2104 {
2105 *walk_subtrees = 0;
2106 return t;
2107 }
2108
2109 return NULL_TREE;
2110 }
2111
2112 unsigned
execute(function * fun)2113 pass_omp_target_link::execute (function *fun)
2114 {
2115 basic_block bb;
2116 FOR_EACH_BB_FN (bb, fun)
2117 {
2118 gimple_stmt_iterator gsi;
2119 for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
2120 if (walk_gimple_stmt (&gsi, NULL, find_link_var_op, NULL))
2121 gimple_regimplify_operands (gsi_stmt (gsi), &gsi);
2122 }
2123
2124 return 0;
2125 }
2126
2127 } // anon namespace
2128
2129 gimple_opt_pass *
make_pass_omp_target_link(gcc::context * ctxt)2130 make_pass_omp_target_link (gcc::context *ctxt)
2131 {
2132 return new pass_omp_target_link (ctxt);
2133 }
2134