1 /* Bits of OpenMP and OpenACC handling that is specific to device offloading
2    and a lowering pass for OpenACC device directives.
3 
4    Copyright (C) 2005-2019 Free Software Foundation, Inc.
5 
6 This file is part of GCC.
7 
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
12 
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
16 for more details.
17 
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3.  If not see
20 <http://www.gnu.org/licenses/>.  */
21 
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "tree.h"
28 #include "gimple.h"
29 #include "tree-pass.h"
30 #include "ssa.h"
31 #include "cgraph.h"
32 #include "pretty-print.h"
33 #include "diagnostic-core.h"
34 #include "fold-const.h"
35 #include "internal-fn.h"
36 #include "langhooks.h"
37 #include "gimplify.h"
38 #include "gimple-iterator.h"
39 #include "gimplify-me.h"
40 #include "gimple-walk.h"
41 #include "tree-cfg.h"
42 #include "tree-into-ssa.h"
43 #include "tree-nested.h"
44 #include "stor-layout.h"
45 #include "common/common-target.h"
46 #include "omp-general.h"
47 #include "omp-offload.h"
48 #include "lto-section-names.h"
49 #include "gomp-constants.h"
50 #include "gimple-pretty-print.h"
51 #include "intl.h"
52 #include "stringpool.h"
53 #include "attribs.h"
54 #include "cfgloop.h"
55 
/* Describe the OpenACC looping structure of a function.  The entire
   function is held in a 'NULL' loop.  */

struct oacc_loop
{
  oacc_loop *parent; /* Containing loop.  */

  oacc_loop *child; /* First inner loop.  */

  oacc_loop *sibling; /* Next loop within same parent.  */

  location_t loc; /* Location of the loop start.  */

  gcall *marker; /* Initial head marker.  */

  /* Per-dimension head/tail marker IFN_UNIQUE calls, indexed by
     GOMP_DIM_* level.  */
  gcall *heads[GOMP_DIM_MAX];  /* Head marker functions.  */
  gcall *tails[GOMP_DIM_MAX];  /* Tail marker functions.  */

  tree routine;  /* Pseudo-loop enclosing a routine.  */

  unsigned mask;   /* Partitioning mask.  */
  unsigned e_mask; /* Partitioning of element loops (when tiling).  */
  unsigned inner;  /* Partitioning of inner loops.  */
  unsigned flags;  /* Partitioning flags.  */
  /* IFN_GOACC_LOOP / IFN_GOACC_TILE calls contained in this loop,
     recorded so they can be rewritten later.  */
  vec<gcall *> ifns;  /* Contained loop abstraction functions.  */
  tree chunk_size; /* Chunk size.  */
  gcall *head_end; /* Final marker of head sequence.  */
};
84 
/* Holds offload tables with decls: the functions and variables marked
   for offloading.  Consumed by omp_finish_file when emitting the
   offload tables.  */
vec<tree, va_gc> *offload_funcs, *offload_vars;
87 
88 /* Return level at which oacc routine may spawn a partitioned loop, or
89    -1 if it is not a routine (i.e. is an offload fn).  */
90 
91 int
oacc_fn_attrib_level(tree attr)92 oacc_fn_attrib_level (tree attr)
93 {
94   tree pos = TREE_VALUE (attr);
95 
96   if (!TREE_PURPOSE (pos))
97     return -1;
98 
99   int ix = 0;
100   for (ix = 0; ix != GOMP_DIM_MAX;
101        ix++, pos = TREE_CHAIN (pos))
102     if (!integer_zerop (TREE_PURPOSE (pos)))
103       break;
104 
105   return ix;
106 }
107 
/* Helper function for omp_finish_file routine.  Takes decls from V_DECLS and
   adds their addresses and sizes to constructor-vector V_CTOR.  Each
   variable contributes two elements (address, size); each function
   contributes only its address.  */

static void
add_decls_addresses_to_decl_constructor (vec<tree, va_gc> *v_decls,
					 vec<constructor_elt, va_gc> *v_ctor)
{
  unsigned len = vec_safe_length (v_decls);
  for (unsigned i = 0; i < len; i++)
    {
      tree it = (*v_decls)[i];
      bool is_var = VAR_P (it);
      bool is_link_var
	= is_var
#ifdef ACCEL_COMPILER
	  /* On the accelerator side, a linked var has been given a value
	     expression referring to its target-table pointer.  */
	  && DECL_HAS_VALUE_EXPR_P (it)
#endif
	  && lookup_attribute ("omp declare target link", DECL_ATTRIBUTES (it));

      /* SIZE is only computed for variables.  IS_LINK_VAR implies IS_VAR,
	 so the tree_to_uhwi (size) use below never sees NULL_TREE.  */
      tree size = NULL_TREE;
      if (is_var)
	size = fold_convert (const_ptr_type_node, DECL_SIZE_UNIT (it));

      tree addr;
      if (!is_link_var)
	addr = build_fold_addr_expr (it);
      else
	{
#ifdef ACCEL_COMPILER
	  /* For "omp declare target link" vars add address of the pointer to
	     the target table, instead of address of the var.  */
	  tree value_expr = DECL_VALUE_EXPR (it);
	  tree link_ptr_decl = TREE_OPERAND (value_expr, 0);
	  varpool_node::finalize_decl (link_ptr_decl);
	  addr = build_fold_addr_expr (link_ptr_decl);
#else
	  addr = build_fold_addr_expr (it);
#endif

	  /* Most significant bit of the size marks "omp declare target link"
	     vars in host and target tables.  */
	  unsigned HOST_WIDE_INT isize = tree_to_uhwi (size);
	  isize |= 1ULL << (int_size_in_bytes (const_ptr_type_node)
			    * BITS_PER_UNIT - 1);
	  size = wide_int_to_tree (const_ptr_type_node, isize);
	}

      CONSTRUCTOR_APPEND_ELT (v_ctor, NULL_TREE, addr);
      if (is_var)
	CONSTRUCTOR_APPEND_ELT (v_ctor, NULL_TREE, size);
    }
}
160 
/* Create new symbols containing (address, size) pairs for global variables,
   marked with "omp declare target" attribute, as well as addresses for the
   functions, which are outlined offloading regions.  On targets without
   named sections, fall back to the target hook instead.  */
void
omp_finish_file (void)
{
  unsigned num_funcs = vec_safe_length (offload_funcs);
  unsigned num_vars = vec_safe_length (offload_vars);

  /* Nothing to emit.  */
  if (num_funcs == 0 && num_vars == 0)
    return;

  if (targetm_common.have_named_sections)
    {
      vec<constructor_elt, va_gc> *v_f, *v_v;
      vec_alloc (v_f, num_funcs);
      /* Each variable contributes an (address, size) pair.  */
      vec_alloc (v_v, num_vars * 2);

      add_decls_addresses_to_decl_constructor (offload_funcs, v_f);
      add_decls_addresses_to_decl_constructor (offload_vars, v_v);

      tree vars_decl_type = build_array_type_nelts (pointer_sized_int_node,
						    num_vars * 2);
      tree funcs_decl_type = build_array_type_nelts (pointer_sized_int_node,
						     num_funcs);
      SET_TYPE_ALIGN (vars_decl_type, TYPE_ALIGN (pointer_sized_int_node));
      SET_TYPE_ALIGN (funcs_decl_type, TYPE_ALIGN (pointer_sized_int_node));
      tree ctor_v = build_constructor (vars_decl_type, v_v);
      tree ctor_f = build_constructor (funcs_decl_type, v_f);
      TREE_CONSTANT (ctor_v) = TREE_CONSTANT (ctor_f) = 1;
      TREE_STATIC (ctor_v) = TREE_STATIC (ctor_f) = 1;
      tree funcs_decl = build_decl (UNKNOWN_LOCATION, VAR_DECL,
				    get_identifier (".offload_func_table"),
				    funcs_decl_type);
      tree vars_decl = build_decl (UNKNOWN_LOCATION, VAR_DECL,
				   get_identifier (".offload_var_table"),
				   vars_decl_type);
      TREE_STATIC (funcs_decl) = TREE_STATIC (vars_decl) = 1;
      /* Do not align tables more than TYPE_ALIGN (pointer_sized_int_node),
	 otherwise a joint table in a binary will contain padding between
	 tables from multiple object files.  */
      DECL_USER_ALIGN (funcs_decl) = DECL_USER_ALIGN (vars_decl) = 1;
      SET_DECL_ALIGN (funcs_decl, TYPE_ALIGN (funcs_decl_type));
      SET_DECL_ALIGN (vars_decl, TYPE_ALIGN (vars_decl_type));
      DECL_INITIAL (funcs_decl) = ctor_f;
      DECL_INITIAL (vars_decl) = ctor_v;
      /* Place the tables in the dedicated offload sections so the linker
	 concatenates them across object files.  */
      set_decl_section_name (funcs_decl, OFFLOAD_FUNC_TABLE_SECTION_NAME);
      set_decl_section_name (vars_decl, OFFLOAD_VAR_TABLE_SECTION_NAME);

      varpool_node::finalize_decl (vars_decl);
      varpool_node::finalize_decl (funcs_decl);
    }
  else
    {
      /* No named-section support: let the target record each symbol
	 individually.  */
      for (unsigned i = 0; i < num_funcs; i++)
	{
	  tree it = (*offload_funcs)[i];
	  targetm.record_offload_symbol (it);
	}
      for (unsigned i = 0; i < num_vars; i++)
	{
	  tree it = (*offload_vars)[i];
	  targetm.record_offload_symbol (it);
	}
    }
}
227 
228 /* Call dim_pos (POS == true) or dim_size (POS == false) builtins for
229    axis DIM.  Return a tmp var holding the result.  */
230 
231 static tree
oacc_dim_call(bool pos,int dim,gimple_seq * seq)232 oacc_dim_call (bool pos, int dim, gimple_seq *seq)
233 {
234   tree arg = build_int_cst (unsigned_type_node, dim);
235   tree size = create_tmp_var (integer_type_node);
236   enum internal_fn fn = pos ? IFN_GOACC_DIM_POS : IFN_GOACC_DIM_SIZE;
237   gimple *call = gimple_build_call_internal (fn, 1, arg);
238 
239   gimple_call_set_lhs (call, size);
240   gimple_seq_add_stmt (seq, call);
241 
242   return size;
243 }
244 
245 /* Find the number of threads (POS = false), or thread number (POS =
246    true) for an OpenACC region partitioned as MASK.  Setup code
247    required for the calculation is added to SEQ.  */
248 
249 static tree
oacc_thread_numbers(bool pos,int mask,gimple_seq * seq)250 oacc_thread_numbers (bool pos, int mask, gimple_seq *seq)
251 {
252   tree res = pos ? NULL_TREE : build_int_cst (unsigned_type_node, 1);
253   unsigned ix;
254 
255   /* Start at gang level, and examine relevant dimension indices.  */
256   for (ix = GOMP_DIM_GANG; ix != GOMP_DIM_MAX; ix++)
257     if (GOMP_DIM_MASK (ix) & mask)
258       {
259 	if (res)
260 	  {
261 	    /* We had an outer index, so scale that by the size of
262 	       this dimension.  */
263 	    tree n = oacc_dim_call (false, ix, seq);
264 	    res = fold_build2 (MULT_EXPR, integer_type_node, res, n);
265 	  }
266 	if (pos)
267 	  {
268 	    /* Determine index in this dimension.  */
269 	    tree id = oacc_dim_call (true, ix, seq);
270 	    if (res)
271 	      res = fold_build2 (PLUS_EXPR, integer_type_node, res, id);
272 	    else
273 	      res = id;
274 	  }
275       }
276 
277   if (res == NULL_TREE)
278     res = integer_zero_node;
279 
280   return res;
281 }
282 
/* Transform IFN_GOACC_LOOP calls to actual code.  See
   expand_oacc_for for where these are generated.  At the vector
   level, we stride loops, such that each member of a warp will
   operate on adjacent iterations.  At the worker and gang level,
   each gang/warp executes a set of contiguous iterations.  Chunking
   can override this such that each iteration engine executes a
   contiguous chunk, and then moves on to stride to the next chunk.  */

static void
oacc_xform_loop (gcall *call)
{
  gimple_stmt_iterator gsi = gsi_for_stmt (call);
  /* Argument layout: 0 = loop-kind code, 1 = direction, 2 = range,
     3 = step, 4 = chunk size, 5 = partitioning mask, 6 = chunk number
     (CHUNKS/OFFSET) or computed offset (BOUND).  */
  enum ifn_goacc_loop_kind code
    = (enum ifn_goacc_loop_kind) TREE_INT_CST_LOW (gimple_call_arg (call, 0));
  tree dir = gimple_call_arg (call, 1);
  tree range = gimple_call_arg (call, 2);
  tree step = gimple_call_arg (call, 3);
  tree chunk_size = NULL_TREE;
  unsigned mask = (unsigned) TREE_INT_CST_LOW (gimple_call_arg (call, 5));
  tree lhs = gimple_call_lhs (call);
  tree type = TREE_TYPE (lhs);
  tree diff_type = TREE_TYPE (range);
  tree r = NULL_TREE;
  gimple_seq seq = NULL;
  bool chunking = false, striding = true;
  unsigned outer_mask = mask & (~mask + 1); // Outermost partitioning
  unsigned inner_mask = mask & ~outer_mask; // Inner partitioning (if any)

#ifdef ACCEL_COMPILER
  /* Only the accelerator compiler honors the requested chunking; the
     host compiler keeps the striding defaults set above.  */
  chunk_size = gimple_call_arg (call, 4);
  if (integer_minus_onep (chunk_size)  /* Force static allocation.  */
      || integer_zerop (chunk_size))   /* Default (also static).  */
    {
      /* If we're at the gang level, we want each to execute a
	 contiguous run of iterations.  Otherwise we want each element
	 to stride.  */
      striding = !(outer_mask & GOMP_DIM_MASK (GOMP_DIM_GANG));
      chunking = false;
    }
  else
    {
      /* Chunk of size 1 is striding.  */
      striding = integer_onep (chunk_size);
      chunking = !striding;
    }
#endif

  /* striding=true, chunking=true
       -> invalid.
     striding=true, chunking=false
       -> chunks=1
     striding=false,chunking=true
       -> chunks=ceil (range/(chunksize*threads*step))
     striding=false,chunking=false
       -> chunk_size=ceil(range/(threads*step)),chunks=1  */
  push_gimplify_context (true);

  switch (code)
    {
    default: gcc_unreachable ();

    case IFN_GOACC_LOOP_CHUNKS:
      /* Compute the number of chunks each thread iterates over.  */
      if (!chunking)
	r = build_int_cst (type, 1);
      else
	{
	  /* chunk_max
	     = (range - dir) / (chunks * step * num_threads) + dir  */
	  tree per = oacc_thread_numbers (false, mask, &seq);
	  per = fold_convert (type, per);
	  chunk_size = fold_convert (type, chunk_size);
	  per = fold_build2 (MULT_EXPR, type, per, chunk_size);
	  per = fold_build2 (MULT_EXPR, type, per, step);
	  r = build2 (MINUS_EXPR, type, range, dir);
	  r = build2 (PLUS_EXPR, type, r, per);
	  r = build2 (TRUNC_DIV_EXPR, type, r, per);
	}
      break;

    case IFN_GOACC_LOOP_STEP:
      {
	/* If striding, step by the entire compute volume, otherwise
	   step by the inner volume.  */
	unsigned volume = striding ? mask : inner_mask;

	r = oacc_thread_numbers (false, volume, &seq);
	r = build2 (MULT_EXPR, type, fold_convert (type, r), step);
      }
      break;

    case IFN_GOACC_LOOP_OFFSET:
      /* Enable vectorization on non-SIMT targets.  */
      if (!targetm.simt.vf
	  && outer_mask == GOMP_DIM_MASK (GOMP_DIM_VECTOR)
	  /* If not -fno-tree-loop-vectorize, hint that we want to vectorize
	     the loop.  */
	  && (flag_tree_loop_vectorize
	      || !global_options_set.x_flag_tree_loop_vectorize))
	{
	  basic_block bb = gsi_bb (gsi);
	  struct loop *parent = bb->loop_father;
	  struct loop *body = parent->inner;

	  parent->force_vectorize = true;
	  parent->safelen = INT_MAX;

	  /* "Chunking loops" may have inner loops.  */
	  if (parent->inner)
	    {
	      body->force_vectorize = true;
	      body->safelen = INT_MAX;
	    }

	  cfun->has_force_vectorize_loops = true;
	}
      /* Compute this thread's starting iteration offset.  */
      if (striding)
	{
	  r = oacc_thread_numbers (true, mask, &seq);
	  r = fold_convert (diff_type, r);
	}
      else
	{
	  tree inner_size = oacc_thread_numbers (false, inner_mask, &seq);
	  tree outer_size = oacc_thread_numbers (false, outer_mask, &seq);
	  tree volume = fold_build2 (MULT_EXPR, TREE_TYPE (inner_size),
				     inner_size, outer_size);

	  volume = fold_convert (diff_type, volume);
	  if (chunking)
	    chunk_size = fold_convert (diff_type, chunk_size);
	  else
	    {
	      /* chunk_size = (range + per - dir) / per, i.e. the whole
		 range split once over the compute volume.  */
	      tree per = fold_build2 (MULT_EXPR, diff_type, volume, step);

	      chunk_size = build2 (MINUS_EXPR, diff_type, range, dir);
	      chunk_size = build2 (PLUS_EXPR, diff_type, chunk_size, per);
	      chunk_size = build2 (TRUNC_DIV_EXPR, diff_type, chunk_size, per);
	    }

	  tree span = build2 (MULT_EXPR, diff_type, chunk_size,
			      fold_convert (diff_type, inner_size));
	  r = oacc_thread_numbers (true, outer_mask, &seq);
	  r = fold_convert (diff_type, r);
	  r = build2 (MULT_EXPR, diff_type, r, span);

	  tree inner = oacc_thread_numbers (true, inner_mask, &seq);
	  inner = fold_convert (diff_type, inner);
	  r = fold_build2 (PLUS_EXPR, diff_type, r, inner);

	  if (chunking)
	    {
	      /* Advance by the chunk number (argument 6) times the
		 per-chunk volume.  */
	      tree chunk = fold_convert (diff_type, gimple_call_arg (call, 6));
	      tree per
		= fold_build2 (MULT_EXPR, diff_type, volume, chunk_size);
	      per = build2 (MULT_EXPR, diff_type, per, chunk);

	      r = build2 (PLUS_EXPR, diff_type, r, per);
	    }
	}
      r = fold_build2 (MULT_EXPR, diff_type, r, step);
      if (type != diff_type)
	r = fold_convert (type, r);
      break;

    case IFN_GOACC_LOOP_BOUND:
      /* Compute this thread's iteration bound from its offset
	 (argument 6), clamped to the loop range.  */
      if (striding)
	r = range;
      else
	{
	  tree inner_size = oacc_thread_numbers (false, inner_mask, &seq);
	  tree outer_size = oacc_thread_numbers (false, outer_mask, &seq);
	  tree volume = fold_build2 (MULT_EXPR, TREE_TYPE (inner_size),
				     inner_size, outer_size);

	  volume = fold_convert (diff_type, volume);
	  if (chunking)
	    chunk_size = fold_convert (diff_type, chunk_size);
	  else
	    {
	      /* Same whole-range chunk size computation as for
		 IFN_GOACC_LOOP_OFFSET above.  */
	      tree per = fold_build2 (MULT_EXPR, diff_type, volume, step);

	      chunk_size = build2 (MINUS_EXPR, diff_type, range, dir);
	      chunk_size = build2 (PLUS_EXPR, diff_type, chunk_size, per);
	      chunk_size = build2 (TRUNC_DIV_EXPR, diff_type, chunk_size, per);
	    }

	  tree span = build2 (MULT_EXPR, diff_type, chunk_size,
			      fold_convert (diff_type, inner_size));

	  r = fold_build2 (MULT_EXPR, diff_type, span, step);

	  tree offset = gimple_call_arg (call, 6);
	  r = build2 (PLUS_EXPR, diff_type, r,
		      fold_convert (diff_type, offset));
	  /* Clamp to the end of the range; MIN for an upward loop,
	     MAX for a downward one.  */
	  r = build2 (integer_onep (dir) ? MIN_EXPR : MAX_EXPR,
		      diff_type, r, range);
	}
      if (diff_type != type)
	r = fold_convert (type, r);
      break;
    }

  gimplify_assign (lhs, r, &seq);

  pop_gimplify_context (NULL);

  /* Replace the IFN_GOACC_LOOP call with the computed sequence.  */
  gsi_replace_with_seq (&gsi, seq, true);
}
491 
/* Transform a GOACC_TILE call.  Determines the element loop span for
   the specified loop of the nest.  This is 1 if we're not tiling.

   GOACC_TILE (collapse_count, loop_no, tile_arg, gwv_tile, gwv_element);  */

static void
oacc_xform_tile (gcall *call)
{
  gimple_stmt_iterator gsi = gsi_for_stmt (call);
  unsigned collapse = tree_to_uhwi (gimple_call_arg (call, 0));
  /* Inner loops have higher loop_nos.  */
  unsigned loop_no = tree_to_uhwi (gimple_call_arg (call, 1));
  tree tile_size = gimple_call_arg (call, 2);
  unsigned e_mask = tree_to_uhwi (gimple_call_arg (call, 4));
  tree lhs = gimple_call_lhs (call);
  tree type = TREE_TYPE (lhs);
  gimple_seq seq = NULL;
  tree span = build_int_cst (type, 1);

  /* Element loops can only be vector and/or worker partitioned.  */
  gcc_assert (!(e_mask
		& ~(GOMP_DIM_MASK (GOMP_DIM_VECTOR)
		    | GOMP_DIM_MASK (GOMP_DIM_WORKER))));
  push_gimplify_context (!seen_error ());

#ifndef ACCEL_COMPILER
  /* Partitioning disabled on host compilers.  */
  e_mask = 0;
#endif
  if (!e_mask)
    /* Not partitioning.  */
    span = integer_one_node;
  else if (!integer_zerop (tile_size))
    /* User explicitly specified size.  */
    span = tile_size;
  else
    {
      /* Pick a size based on the partitioning of the element loop and
	 the number of loop nests.  */
      tree first_size = NULL_TREE;
      tree second_size = NULL_TREE;

      if (e_mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR))
	first_size = oacc_dim_call (false, GOMP_DIM_VECTOR, &seq);
      if (e_mask & GOMP_DIM_MASK (GOMP_DIM_WORKER))
	second_size = oacc_dim_call (false, GOMP_DIM_WORKER, &seq);

      /* If only worker partitioning, shift it into FIRST_SIZE.  */
      if (!first_size)
	{
	  first_size = second_size;
	  second_size = NULL_TREE;
	}

      if (loop_no + 1 == collapse)
	{
	  /* Innermost loop of the nest.  */
	  span = first_size;
	  if (!loop_no && second_size)
	    span = fold_build2 (MULT_EXPR, TREE_TYPE (span),
				span, second_size);
	}
      else if (loop_no + 2 == collapse)
	span = second_size;
      else
	span = NULL_TREE;

      if (!span)
	/* There's no obvious element size for this loop.  Options
	   are 1, first_size or some non-unity constant (32 is my
	   favourite).   We should gather some statistics.  */
	span = first_size;
    }

  span = fold_convert (type, span);
  gimplify_assign (lhs, span, &seq);

  pop_gimplify_context (NULL);

  /* Replace the GOACC_TILE call with the computed span.  */
  gsi_replace_with_seq (&gsi, seq, true);
}
570 
/* Default partitioned and minimum partitioned dimensions.  Filled in
   by oacc_parse_default_dims and read via the accessors below.  */

static int oacc_default_dims[GOMP_DIM_MAX];
static int oacc_min_dims[GOMP_DIM_MAX];
575 
576 int
oacc_get_default_dim(int dim)577 oacc_get_default_dim (int dim)
578 {
579   gcc_assert (0 <= dim && dim < GOMP_DIM_MAX);
580   return oacc_default_dims[dim];
581 }
582 
583 int
oacc_get_min_dim(int dim)584 oacc_get_min_dim (int dim)
585 {
586   gcc_assert (0 <= dim && dim < GOMP_DIM_MAX);
587   return oacc_min_dims[dim];
588 }
589 
/* Parse the default dimension parameter.  This is a set of
   :-separated optional compute dimensions.  Each specified dimension
   is a positive integer.  When device type support is added, it is
   planned to be a comma separated list of such compute dimensions,
   with all but the first prefixed by the colon-terminated device
   type.  */

static void
oacc_parse_default_dims (const char *dims)
{
  int ix;

  /* Reset: -1 means "unspecified", minimum defaults to 1.  */
  for (ix = GOMP_DIM_MAX; ix--;)
    {
      oacc_default_dims[ix] = -1;
      oacc_min_dims[ix] = 1;
    }

#ifndef ACCEL_COMPILER
  /* Cannot be overridden on the host.  */
  dims = NULL;
#endif
  if (dims)
    {
      const char *pos = dims;

      for (ix = 0; *pos && ix != GOMP_DIM_MAX; ix++)
	{
	  /* Dimensions after the first are preceded by a colon.  */
	  if (ix)
	    {
	      if (*pos != ':')
		goto malformed;
	      pos++;
	    }

	  /* An immediately-following colon means this dimension was
	     left unspecified.  */
	  if (*pos != ':')
	    {
	      long val;
	      const char *eptr;

	      errno = 0;
	      val = strtol (pos, CONST_CAST (char **, &eptr), 10);
	      /* Reject parse errors, non-positive values, and values
		 that do not fit in an int.  */
	      if (errno || val <= 0 || (int) val != val)
		goto malformed;
	      pos = eptr;
	      oacc_default_dims[ix] = (int) val;
	    }
	}
      /* Trailing junk after the last dimension is also malformed.  */
      if (*pos)
	{
	malformed:
	  error_at (UNKNOWN_LOCATION,
		    "%<-fopenacc-dim%> operand is malformed at %qs", pos);
	}
    }

  /* Allow the backend to validate the dimensions.  */
  targetm.goacc.validate_dims (NULL_TREE, oacc_default_dims, -1, 0);
  targetm.goacc.validate_dims (NULL_TREE, oacc_min_dims, -2, 0);
}
650 
/* Validate and update the dimensions for offloaded FN.  ATTRS is the
   raw attribute.  DIMS is an array of dimensions, which is filled in.
   LEVEL is the partitioning level of a routine, or -1 for an offload
   region itself.  USED is the mask of partitioned execution in the
   function.  */

static void
oacc_validate_dims (tree fn, tree attrs, int *dims, int level, unsigned used)
{
  tree purpose[GOMP_DIM_MAX];
  unsigned ix;
  tree pos = TREE_VALUE (attrs);

  /* Make sure the attribute creator attached the dimension
     information.  */
  gcc_assert (pos);

  /* Pull the per-dimension sizes out of the attribute; -1 marks an
     unspecified dimension.  Remember each TREE_PURPOSE so the
     attribute can be rebuilt below.  */
  for (ix = 0; ix != GOMP_DIM_MAX; ix++)
    {
      purpose[ix] = TREE_PURPOSE (pos);
      tree val = TREE_VALUE (pos);
      dims[ix] = val ? TREE_INT_CST_LOW (val) : -1;
      pos = TREE_CHAIN (pos);
    }

  bool changed = targetm.goacc.validate_dims (fn, dims, level, used);

  /* Default anything left to 1 or a partitioned default.  */
  for (ix = 0; ix != GOMP_DIM_MAX; ix++)
    if (dims[ix] < 0)
      {
	/* The OpenACC spec says 'If the [num_gangs] clause is not
	   specified, an implementation-defined default will be used;
	   the default may depend on the code within the construct.'
	   (2.5.6).  Thus an implementation is free to choose
	   non-unity default for a parallel region that doesn't have
	   any gang-partitioned loops.  However, it appears that there
	   is a sufficient body of user code that expects non-gang
	   partitioned regions to not execute in gang-redundant mode.
	   So we (a) don't warn about the non-portability and (b) pick
	   the minimum permissible dimension size when there is no
	   partitioned execution.  Otherwise we pick the global
	   default for the dimension, which the user can control.  The
	   same wording and logic applies to num_workers and
	   vector_length, however the worker- or vector- single
	   execution doesn't have the same impact as gang-redundant
	   execution.  (If the minimum gang-level partitioning is not 1,
	   the target is probably too confusing.)  */
	dims[ix] = (used & GOMP_DIM_MASK (ix)
		    ? oacc_default_dims[ix] : oacc_min_dims[ix]);
	changed = true;
      }

  if (changed)
    {
      /* Replace the attribute with new values.  */
      pos = NULL_TREE;
      for (ix = GOMP_DIM_MAX; ix--;)
	pos = tree_cons (purpose[ix],
			 build_int_cst (integer_type_node, dims[ix]), pos);
      oacc_replace_fn_attrib (fn, pos);
    }
}
714 
715 /* Create an empty OpenACC loop structure at LOC.  */
716 
717 static oacc_loop *
new_oacc_loop_raw(oacc_loop * parent,location_t loc)718 new_oacc_loop_raw (oacc_loop *parent, location_t loc)
719 {
720   oacc_loop *loop = XCNEW (oacc_loop);
721 
722   loop->parent = parent;
723 
724   if (parent)
725     {
726       loop->sibling = parent->child;
727       parent->child = loop;
728     }
729 
730   loop->loc = loc;
731   return loop;
732 }
733 
734 /* Create an outermost, dummy OpenACC loop for offloaded function
735    DECL.  */
736 
737 static oacc_loop *
new_oacc_loop_outer(tree decl)738 new_oacc_loop_outer (tree decl)
739 {
740   return new_oacc_loop_raw (NULL, DECL_SOURCE_LOCATION (decl));
741 }
742 
743 /* Start a new OpenACC loop  structure beginning at head marker HEAD.
744    Link into PARENT loop.  Return the new loop.  */
745 
746 static oacc_loop *
new_oacc_loop(oacc_loop * parent,gcall * marker)747 new_oacc_loop (oacc_loop *parent, gcall *marker)
748 {
749   oacc_loop *loop = new_oacc_loop_raw (parent, gimple_location (marker));
750 
751   loop->marker = marker;
752 
753   /* TODO: This is where device_type flattening would occur for the loop
754      flags.  */
755 
756   loop->flags = TREE_INT_CST_LOW (gimple_call_arg (marker, 3));
757 
758   tree chunk_size = integer_zero_node;
759   if (loop->flags & OLF_GANG_STATIC)
760     chunk_size = gimple_call_arg (marker, 4);
761   loop->chunk_size = chunk_size;
762 
763   return loop;
764 }
765 
766 /* Create a dummy loop encompassing a call to a openACC routine.
767    Extract the routine's partitioning requirements.  */
768 
769 static void
new_oacc_loop_routine(oacc_loop * parent,gcall * call,tree decl,tree attrs)770 new_oacc_loop_routine (oacc_loop *parent, gcall *call, tree decl, tree attrs)
771 {
772   oacc_loop *loop = new_oacc_loop_raw (parent, gimple_location (call));
773   int level = oacc_fn_attrib_level (attrs);
774 
775   gcc_assert (level >= 0);
776 
777   loop->marker = call;
778   loop->routine = decl;
779   loop->mask = ((GOMP_DIM_MASK (GOMP_DIM_MAX) - 1)
780 		^ (GOMP_DIM_MASK (level) - 1));
781 }
782 
783 /* Finish off the current OpenACC loop ending at tail marker TAIL.
784    Return the parent loop.  */
785 
786 static oacc_loop *
finish_oacc_loop(oacc_loop * loop)787 finish_oacc_loop (oacc_loop *loop)
788 {
789   /* If the loop has been collapsed, don't partition it.  */
790   if (loop->ifns.is_empty ())
791     loop->mask = loop->flags = 0;
792   return loop->parent;
793 }
794 
795 /* Free all OpenACC loop structures within LOOP (inclusive).  */
796 
797 static void
free_oacc_loop(oacc_loop * loop)798 free_oacc_loop (oacc_loop *loop)
799 {
800   if (loop->sibling)
801     free_oacc_loop (loop->sibling);
802   if (loop->child)
803     free_oacc_loop (loop->child);
804 
805   loop->ifns.release ();
806   free (loop);
807 }
808 
/* Dump out the OpenACC loop head or tail beginning at FROM.  TITLE
   and LEVEL label the output; DEPTH controls indentation.  The dump
   stops at the next IFN_UNIQUE call of the same kind as FROM.  */

static void
dump_oacc_loop_part (FILE *file, gcall *from, int depth,
		     const char *title, int level)
{
  enum ifn_unique_kind kind
    = (enum ifn_unique_kind) TREE_INT_CST_LOW (gimple_call_arg (from, 0));

  fprintf (file, "%*s%s-%d:\n", depth * 2, "", title, level);
  for (gimple_stmt_iterator gsi = gsi_for_stmt (from);;)
    {
      gimple *stmt = gsi_stmt (gsi);

      /* Another IFN_UNIQUE of the same kind terminates the part.  */
      if (gimple_call_internal_p (stmt, IFN_UNIQUE))
	{
	  enum ifn_unique_kind k
	    = ((enum ifn_unique_kind) TREE_INT_CST_LOW
	       (gimple_call_arg (stmt, 0)));

	  if (k == kind && stmt != from)
	    break;
	}
      print_gimple_stmt (file, stmt, depth * 2 + 2);

      /* Follow the statement chain across basic-block boundaries;
	 each exhausted block is left through its single successor.  */
      gsi_next (&gsi);
      while (gsi_end_p (gsi))
	gsi = gsi_start_bb (single_succ (gsi_bb (gsi)));
    }
}
839 
840 /* Dump OpenACC loop LOOP, its children, and its siblings.  */
841 
842 static void
dump_oacc_loop(FILE * file,oacc_loop * loop,int depth)843 dump_oacc_loop (FILE *file, oacc_loop *loop, int depth)
844 {
845   int ix;
846 
847   fprintf (file, "%*sLoop %x(%x) %s:%u\n", depth * 2, "",
848 	   loop->flags, loop->mask,
849 	   LOCATION_FILE (loop->loc), LOCATION_LINE (loop->loc));
850 
851   if (loop->marker)
852     print_gimple_stmt (file, loop->marker, depth * 2);
853 
854   if (loop->routine)
855     fprintf (file, "%*sRoutine %s:%u:%s\n",
856 	     depth * 2, "", DECL_SOURCE_FILE (loop->routine),
857 	     DECL_SOURCE_LINE (loop->routine),
858 	     IDENTIFIER_POINTER (DECL_NAME (loop->routine)));
859 
860   for (ix = GOMP_DIM_GANG; ix != GOMP_DIM_MAX; ix++)
861     if (loop->heads[ix])
862       dump_oacc_loop_part (file, loop->heads[ix], depth, "Head", ix);
863   for (ix = GOMP_DIM_MAX; ix--;)
864     if (loop->tails[ix])
865       dump_oacc_loop_part (file, loop->tails[ix], depth, "Tail", ix);
866 
867   if (loop->child)
868     dump_oacc_loop (file, loop->child, depth + 1);
869   if (loop->sibling)
870     dump_oacc_loop (file, loop->sibling, depth);
871 }
872 
873 void debug_oacc_loop (oacc_loop *);
874 
875 /* Dump loops to stderr.  */
876 
877 DEBUG_FUNCTION void
debug_oacc_loop(oacc_loop * loop)878 debug_oacc_loop (oacc_loop *loop)
879 {
880   dump_oacc_loop (stderr, loop, 0);
881 }
882 
883 /* Provide diagnostics on OpenACC loop LOOP, its children, and its
884    siblings.  */
885 
886 static void
inform_oacc_loop(const oacc_loop * loop)887 inform_oacc_loop (const oacc_loop *loop)
888 {
889   const char *gang
890     = loop->mask & GOMP_DIM_MASK (GOMP_DIM_GANG) ? " gang" : "";
891   const char *worker
892     = loop->mask & GOMP_DIM_MASK (GOMP_DIM_WORKER) ? " worker" : "";
893   const char *vector
894     = loop->mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR) ? " vector" : "";
895   const char *seq = loop->mask == 0 ? " seq" : "";
896   const dump_user_location_t loc
897     = dump_user_location_t::from_location_t (loop->loc);
898   dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, loc,
899 		   "assigned OpenACC%s%s%s%s loop parallelism\n", gang, worker,
900 		   vector, seq);
901 
902   if (loop->child)
903     inform_oacc_loop (loop->child);
904   if (loop->sibling)
905     inform_oacc_loop (loop->sibling);
906 }
907 
908 /* DFS walk of basic blocks BB onwards, creating OpenACC loop
909    structures as we go.  By construction these loops are properly
910    nested.  */
911 
static void
oacc_loop_discover_walk (oacc_loop *loop, basic_block bb)
{
  int marker = 0;	/* Number of markers seen so far in the current
			   head/tail marker sequence.  */
  int remaining = 0;	/* Markers still expected to complete it.  */

  if (bb->flags & BB_VISITED)
    return;

 follow:
  bb->flags |= BB_VISITED;

  /* Scan for loop markers.  */
  for (gimple_stmt_iterator gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
       gsi_next (&gsi))
    {
      gimple *stmt = gsi_stmt (gsi);

      if (!is_gimple_call (stmt))
	continue;

      gcall *call = as_a <gcall *> (stmt);

      /* If this is a routine, make a dummy loop for it.  */
      if (tree decl = gimple_call_fndecl (call))
	if (tree attrs = oacc_get_fn_attrib (decl))
	  {
	    /* Routine calls may not appear inside a marker sequence.  */
	    gcc_assert (!marker);
	    new_oacc_loop_routine (loop, call, decl, attrs);
	  }

      if (!gimple_call_internal_p (call))
	continue;

      switch (gimple_call_internal_fn (call))
	{
	default:
	  break;

	case IFN_GOACC_LOOP:
	case IFN_GOACC_TILE:
	  /* Record the abstraction function, so we can manipulate it
	     later.  */
	  loop->ifns.safe_push (call);
	  break;

	case IFN_UNIQUE:
	  enum ifn_unique_kind kind
	    = (enum ifn_unique_kind) (TREE_INT_CST_LOW
				      (gimple_call_arg (call, 0)));
	  if (kind == IFN_UNIQUE_OACC_HEAD_MARK
	      || kind == IFN_UNIQUE_OACC_TAIL_MARK)
	    {
	      if (gimple_call_num_args (call) == 2)
		{
		  /* A two-argument marker terminates the sequence: a
		     tail-end closes the current loop, a head-end only
		     records where the head sequence finished.  */
		  gcc_assert (marker && !remaining);
		  marker = 0;
		  if (kind == IFN_UNIQUE_OACC_TAIL_MARK)
		    loop = finish_oacc_loop (loop);
		  else
		    loop->head_end = call;
		}
	      else
		{
		  /* Argument 2 carries the total marker count for this
		     sequence and must agree on every marker in it.  */
		  int count = TREE_INT_CST_LOW (gimple_call_arg (call, 2));

		  if (!marker)
		    {
		      /* First marker of a sequence; a head mark opens
			 a new (child) loop.  */
		      if (kind == IFN_UNIQUE_OACC_HEAD_MARK)
			loop = new_oacc_loop (loop, call);
		      remaining = count;
		    }
		  gcc_assert (count == remaining);
		  if (remaining)
		    {
		      remaining--;
		      /* Heads are recorded first-to-last, tails
			 last-to-first.  */
		      if (kind == IFN_UNIQUE_OACC_HEAD_MARK)
			loop->heads[marker] = call;
		      else
			loop->tails[remaining] = call;
		    }
		  marker++;
		}
	    }
	}
    }
  if (remaining || marker)
    {
      /* A marker sequence is still open; it must continue in the
	 single fall-through successor block.  */
      bb = single_succ (bb);
      gcc_assert (single_pred_p (bb) && !(bb->flags & BB_VISITED));
      goto follow;
    }

  /* Walk successor blocks.  */
  edge e;
  edge_iterator ei;

  FOR_EACH_EDGE (e, ei, bb->succs)
    oacc_loop_discover_walk (loop, e->dest);
}
1012 
1013 /* LOOP is the first sibling.  Reverse the order in place and return
1014    the new first sibling.  Recurse to child loops.  */
1015 
1016 static oacc_loop *
oacc_loop_sibling_nreverse(oacc_loop * loop)1017 oacc_loop_sibling_nreverse (oacc_loop *loop)
1018 {
1019   oacc_loop *last = NULL;
1020   do
1021     {
1022       if (loop->child)
1023 	loop->child = oacc_loop_sibling_nreverse (loop->child);
1024 
1025       oacc_loop *next = loop->sibling;
1026       loop->sibling = last;
1027       last = loop;
1028       loop = next;
1029     }
1030   while (loop);
1031 
1032   return last;
1033 }
1034 
1035 /* Discover the OpenACC loops marked up by HEAD and TAIL markers for
1036    the current function.  */
1037 
1038 static oacc_loop *
oacc_loop_discovery()1039 oacc_loop_discovery ()
1040 {
1041   /* Clear basic block flags, in particular BB_VISITED which we're going to use
1042      in the following.  */
1043   clear_bb_flags ();
1044 
1045   oacc_loop *top = new_oacc_loop_outer (current_function_decl);
1046   oacc_loop_discover_walk (top, ENTRY_BLOCK_PTR_FOR_FN (cfun));
1047 
1048   /* The siblings were constructed in reverse order, reverse them so
1049      that diagnostics come out in an unsurprising order.  */
1050   top = oacc_loop_sibling_nreverse (top);
1051 
1052   return top;
1053 }
1054 
1055 /* Transform the abstract internal function markers starting at FROM
1056    to be for partitioning level LEVEL.  Stop when we meet another HEAD
1057    or TAIL  marker.  */
1058 
static void
oacc_loop_xform_head_tail (gcall *from, int level)
{
  /* Kind of the marker we start at; meeting another marker of the same
     kind (other than FROM itself) terminates the scan.  */
  enum ifn_unique_kind kind
    = (enum ifn_unique_kind) TREE_INT_CST_LOW (gimple_call_arg (from, 0));
  tree replacement = build_int_cst (unsigned_type_node, level);

  for (gimple_stmt_iterator gsi = gsi_for_stmt (from);;)
    {
      gimple *stmt = gsi_stmt (gsi);

      if (gimple_call_internal_p (stmt, IFN_UNIQUE))
	{
	  enum ifn_unique_kind k
	    = ((enum ifn_unique_kind)
	       TREE_INT_CST_LOW (gimple_call_arg (stmt, 0)));

	  /* Fork/join markers take the partitioning level as argument
	     2; rewrite it to LEVEL.  */
	  if (k == IFN_UNIQUE_OACC_FORK || k == IFN_UNIQUE_OACC_JOIN)
	    *gimple_call_arg_ptr (stmt, 2) = replacement;
	  else if (k == kind && stmt != from)
	    /* Reached the next head/tail marker: done.  */
	    break;
	}
      else if (gimple_call_internal_p (stmt, IFN_GOACC_REDUCTION))
	/* Reductions carry the level as argument 3.  */
	*gimple_call_arg_ptr (stmt, 3) = replacement;

      gsi_next (&gsi);
      /* Marker sequences can span basic blocks; follow the single
	 fall-through successor when we run off the end of one.  */
      while (gsi_end_p (gsi))
	gsi = gsi_start_bb (single_succ (gsi_bb (gsi)));
    }
}
1089 
1090 /* Process the discovered OpenACC loops, setting the correct
1091    partitioning level etc.  */
1092 
static void
oacc_loop_process (oacc_loop *loop)
{
  /* Finalize children first, depth-first.  */
  if (loop->child)
    oacc_loop_process (loop->child);

  /* Routines are dummy loops; only real, partitioned loops have
     abstraction functions and markers to rewrite.  */
  if (loop->mask && !loop->routine)
    {
      int ix;
      tree mask_arg = build_int_cst (unsigned_type_node, loop->mask);
      tree e_mask_arg = build_int_cst (unsigned_type_node, loop->e_mask);
      tree chunk_arg = loop->chunk_size;
      gcall *call;

      /* Fill the now-known partitioning masks (and chunk size) into
	 the IFN_GOACC_LOOP/IFN_GOACC_TILE calls recorded during
	 discovery.  */
      for (ix = 0; loop->ifns.iterate (ix, &call); ix++)
	switch (gimple_call_internal_fn (call))
	  {
	  case IFN_GOACC_LOOP:
	    {
	      /* A -1 in argument 5 flags the element loop of a tiled
		 loop; it receives the element mask instead.  */
	      bool is_e = gimple_call_arg (call, 5) == integer_minus_one_node;
	      gimple_call_set_arg (call, 5, is_e ? e_mask_arg : mask_arg);
	      if (!is_e)
		gimple_call_set_arg (call, 4, chunk_arg);
	    }
	    break;

	  case IFN_GOACC_TILE:
	    gimple_call_set_arg (call, 3, mask_arg);
	    gimple_call_set_arg (call, 4, e_mask_arg);
	    break;

	  default:
	    gcc_unreachable ();
	  }

      /* Assign each used dimension, outermost first, to the
	 successive head/tail marker sequences of this loop.  */
      unsigned dim = GOMP_DIM_GANG;
      unsigned mask = loop->mask | loop->e_mask;
      for (ix = 0; ix != GOMP_DIM_MAX && mask; ix++)
	{
	  /* Advance to the next dimension present in MASK.  */
	  while (!(GOMP_DIM_MASK (dim) & mask))
	    dim++;

	  oacc_loop_xform_head_tail (loop->heads[ix], dim);
	  oacc_loop_xform_head_tail (loop->tails[ix], dim);

	  mask ^= GOMP_DIM_MASK (dim);
	}
    }

  if (loop->sibling)
    oacc_loop_process (loop->sibling);
}
1145 
/* Walk the OpenACC loop hierarchy checking and assigning the
   programmer-specified partitionings.  OUTER_MASK is the partitioning
   this loop is contained within.  Return mask of partitioning
   encountered.  If any auto loops are discovered, set GOMP_DIM_MAX
   bit.  */
1151 
static unsigned
oacc_loop_fixed_partitions (oacc_loop *loop, unsigned outer_mask)
{
  unsigned this_mask = loop->mask;
  unsigned mask_all = 0;
  bool noisy = true;

#ifdef ACCEL_COMPILER
  /* When device_type is supported, we want the device compiler to be
     noisy, if the loop parameters are device_type-specific.  */
  noisy = false;
#endif

  if (!loop->routine)
    {
      bool auto_par = (loop->flags & OLF_AUTO) != 0;
      bool seq_par = (loop->flags & OLF_SEQ) != 0;
      bool tiling = (loop->flags & OLF_TILE) != 0;

      /* The explicitly requested partitioning axes live in the flag
	 bits above OLF_DIM_BASE.  */
      this_mask = ((loop->flags >> OLF_DIM_BASE)
		   & (GOMP_DIM_MASK (GOMP_DIM_MAX) - 1));

      /* Apply auto partitioning if this is a non-partitioned regular
	 loop, or (no more than) single axis tiled loop.  */
      bool maybe_auto
	= !seq_par && this_mask == (tiling ? this_mask & -this_mask : 0);

      /* Explicit axes, 'auto' and 'seq' are mutually exclusive.  */
      if ((this_mask != 0) + auto_par + seq_par > 1)
	{
	  if (noisy)
	    error_at (loop->loc,
		      seq_par
		      ? G_("%<seq%> overrides other OpenACC loop specifiers")
		      : G_("%<auto%> conflicts with other OpenACC loop "
			   "specifiers"));
	  maybe_auto = false;
	  loop->flags &= ~OLF_AUTO;
	  if (seq_par)
	    {
	      /* 'seq' wins: drop the explicit axes as well.  */
	      loop->flags
		&= ~((GOMP_DIM_MASK (GOMP_DIM_MAX) - 1) << OLF_DIM_BASE);
	      this_mask = 0;
	    }
	}

      if (maybe_auto && (loop->flags & OLF_INDEPENDENT))
	{
	  /* Mark for auto partitioning; the GOMP_DIM_MAX bit tells the
	     caller that a later auto-partitioning pass is needed.  */
	  loop->flags |= OLF_AUTO;
	  mask_all |= GOMP_DIM_MASK (GOMP_DIM_MAX);
	}
    }

  if (this_mask & outer_mask)
    {
      /* The loop (or routine call) reuses parallelism already claimed
	 outside it.  Locate the claiming ancestor loop, if any, for
	 the diagnostic.  */
      const oacc_loop *outer;
      for (outer = loop->parent; outer; outer = outer->parent)
	if ((outer->mask | outer->e_mask) & this_mask)
	  break;

      if (noisy)
	{
	  if (outer)
	    {
	      error_at (loop->loc,
			loop->routine
			? G_("routine call uses same OpenACC parallelism"
			     " as containing loop")
			: G_("inner loop uses same OpenACC parallelism"
			     " as containing loop"));
	      inform (outer->loc, "containing loop here");
	    }
	  else
	    /* No ancestor loop claims it, so the conflict comes from
	       the containing routine's partitioning level.  */
	    error_at (loop->loc,
		      loop->routine
		      ? G_("routine call uses OpenACC parallelism disallowed"
			   " by containing routine")
		      : G_("loop uses OpenACC parallelism disallowed"
			   " by containing routine"));

	  if (loop->routine)
	    inform (DECL_SOURCE_LOCATION (loop->routine),
		    "routine %qD declared here", loop->routine);
	}
      this_mask &= ~outer_mask;
    }
  else
    {
      /* Check nesting order: this loop's outermost axis must be
	 strictly inner to everything already used outside it.  */
      unsigned outermost = least_bit_hwi (this_mask);

      if (outermost && outermost <= outer_mask)
	{
	  if (noisy)
	    {
	      error_at (loop->loc,
			"incorrectly nested OpenACC loop parallelism");

	      const oacc_loop *outer;
	      for (outer = loop->parent;
		   outer->flags && outer->flags < outermost;
		   outer = outer->parent)
		continue;
	      inform (outer->loc, "containing loop here");
	    }

	  this_mask &= ~outermost;
	}
    }

  mask_all |= this_mask;

  if (loop->flags & OLF_TILE)
    {
      /* When tiling, vector goes to the element loop, and failing
	 that we put worker there.  The std doesn't contemplate
	 specifying all three.  We choose to put worker and vector on
	 the element loops in that case.  */
      unsigned this_e_mask = this_mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR);
      if (!this_e_mask || this_mask & GOMP_DIM_MASK (GOMP_DIM_GANG))
	this_e_mask |= this_mask & GOMP_DIM_MASK (GOMP_DIM_WORKER);

      loop->e_mask = this_e_mask;
      this_mask ^= this_e_mask;
    }

  loop->mask = this_mask;

  if (dump_file)
    fprintf (dump_file, "Loop %s:%d user specified %d & %d\n",
	     LOCATION_FILE (loop->loc), LOCATION_LINE (loop->loc),
	     loop->mask, loop->e_mask);

  if (loop->child)
    {
      /* Recurse into children with the accumulated outer mask.  */
      unsigned tmp_mask = outer_mask | this_mask | loop->e_mask;
      loop->inner = oacc_loop_fixed_partitions (loop->child, tmp_mask);
      mask_all |= loop->inner;
    }

  if (loop->sibling)
    mask_all |= oacc_loop_fixed_partitions (loop->sibling, outer_mask);

  return mask_all;
}
1295 
/* Walk the OpenACC loop hierarchy to assign auto-partitioned loops.
   OUTER_MASK is the partitioning this loop is contained within.
   OUTER_ASSIGN is true if an outer loop is being auto-partitioned.
   Return the cumulative partitioning used by this loop, siblings and
   children.  */
1301 
static unsigned
oacc_loop_auto_partitions (oacc_loop *loop, unsigned outer_mask,
			   bool outer_assign)
{
  /* Only 'independent auto' loops get partitioning assigned here.  */
  bool assign = (loop->flags & OLF_AUTO) && (loop->flags & OLF_INDEPENDENT);
  bool noisy = true;
  bool tiling = loop->flags & OLF_TILE;

#ifdef ACCEL_COMPILER
  /* When device_type is supported, we want the device compiler to be
     noisy, if the loop parameters are device_type-specific.  */
  noisy = false;
#endif

  if (assign && (!outer_assign || loop->inner))
    {
      /* Allocate outermost and non-innermost loops at the outermost
	 non-innermost available level.  */
      unsigned this_mask = GOMP_DIM_MASK (GOMP_DIM_GANG);

      /* Find the first outermost available partition. */
      while (this_mask <= outer_mask)
	this_mask <<= 1;

      /* Grab two axes if tiling, and we've not assigned anything  */
      if (tiling && !(loop->mask | loop->e_mask))
	this_mask |= this_mask << 1;

      /* Prohibit the innermost partitioning at the moment.  */
      this_mask &= GOMP_DIM_MASK (GOMP_DIM_MAX - 1) - 1;

      /* Don't use any dimension explicitly claimed by an inner loop. */
      this_mask &= ~loop->inner;

      if (tiling && !loop->e_mask)
	{
	  /* If we got two axes, allocate the inner one to the element
	     loop.  */
	  loop->e_mask = this_mask & (this_mask << 1);
	  this_mask ^= loop->e_mask;
	}

      loop->mask |= this_mask;
    }

  if (loop->child)
    {
      /* Recurse into children before picking the innermost level, so
	 that LOOP->INNER reflects their choices.  */
      unsigned tmp_mask = outer_mask | loop->mask | loop->e_mask;
      loop->inner = oacc_loop_auto_partitions (loop->child, tmp_mask,
					       outer_assign | assign);
    }

  if (assign && (!loop->mask || (tiling && !loop->e_mask) || !outer_assign))
    {
      /* Allocate the loop at the innermost available level.  Note
	 that we do this even if we already assigned this loop the
	 outermost available level above.  That way we'll partition
	 this along 2 axes, if they are available.  */
      unsigned this_mask = 0;

      /* Determine the outermost partitioning used within this loop.  */
      this_mask = loop->inner | GOMP_DIM_MASK (GOMP_DIM_MAX);
      this_mask = least_bit_hwi (this_mask);

      /* Pick the partitioning just inside that one.  */
      this_mask >>= 1;

      /* And avoid picking one use by an outer loop.  */
      this_mask &= ~outer_mask;

      /* If tiling and we failed completely above, grab the next one
	 too.  Making sure it doesn't hit an outer loop.  */
      if (tiling)
	{
	  this_mask &= ~(loop->e_mask | loop->mask);
	  unsigned tile_mask = ((this_mask >> 1)
				& ~(outer_mask | loop->e_mask | loop->mask));

	  if (tile_mask || loop->mask)
	    {
	      /* The inner axis goes to the element loop, the next one
		 out (if any) to the tile loop.  */
	      loop->e_mask |= this_mask;
	      this_mask = tile_mask;
	    }
	  if (!loop->e_mask && noisy)
	    warning_at (loop->loc, 0,
			"insufficient partitioning available"
			" to parallelize element loop");
	}

      loop->mask |= this_mask;
      if (!loop->mask && noisy)
	warning_at (loop->loc, 0,
		    tiling
		    ? G_("insufficient partitioning available"
			 " to parallelize tile loop")
		    : G_("insufficient partitioning available"
			 " to parallelize loop"));
    }

  if (assign && dump_file)
    fprintf (dump_file, "Auto loop %s:%d assigned %d & %d\n",
	     LOCATION_FILE (loop->loc), LOCATION_LINE (loop->loc),
	     loop->mask, loop->e_mask);

  unsigned inner_mask = 0;

  /* Accumulate the partitioning used by this loop, its siblings and
     its children.  */
  if (loop->sibling)
    inner_mask |= oacc_loop_auto_partitions (loop->sibling,
					     outer_mask, outer_assign);

  inner_mask |= loop->inner | loop->mask | loop->e_mask;

  return inner_mask;
}
1416 
/* Walk the OpenACC loop hierarchy to check and assign partitioning
   axes.  Return mask of partitioning.  */
1419 
1420 static unsigned
oacc_loop_partition(oacc_loop * loop,unsigned outer_mask)1421 oacc_loop_partition (oacc_loop *loop, unsigned outer_mask)
1422 {
1423   unsigned mask_all = oacc_loop_fixed_partitions (loop, outer_mask);
1424 
1425   if (mask_all & GOMP_DIM_MASK (GOMP_DIM_MAX))
1426     {
1427       mask_all ^= GOMP_DIM_MASK (GOMP_DIM_MAX);
1428       mask_all |= oacc_loop_auto_partitions (loop, outer_mask, false);
1429     }
1430   return mask_all;
1431 }
1432 
1433 /* Default fork/join early expander.  Delete the function calls if
1434    there is no RTL expander.  */
1435 
1436 bool
default_goacc_fork_join(gcall * ARG_UNUSED (call),const int * ARG_UNUSED (dims),bool is_fork)1437 default_goacc_fork_join (gcall *ARG_UNUSED (call),
1438 			 const int *ARG_UNUSED (dims), bool is_fork)
1439 {
1440   if (is_fork)
1441     return targetm.have_oacc_fork ();
1442   else
1443     return targetm.have_oacc_join ();
1444 }
1445 
1446 /* Default goacc.reduction early expander.
1447 
1448    LHS-opt = IFN_REDUCTION (KIND, RES_PTR, VAR, LEVEL, OP, OFFSET)
1449    If RES_PTR is not integer-zerop:
1450        SETUP - emit 'LHS = *RES_PTR', LHS = NULL
1451        TEARDOWN - emit '*RES_PTR = VAR'
1452    If LHS is not NULL
1453        emit 'LHS = VAR'   */
1454 
void
default_goacc_reduction (gcall *call)
{
  unsigned code = (unsigned)TREE_INT_CST_LOW (gimple_call_arg (call, 0));
  gimple_stmt_iterator gsi = gsi_for_stmt (call);
  tree lhs = gimple_call_lhs (call);
  tree var = gimple_call_arg (call, 2);
  gimple_seq seq = NULL;

  if (code == IFN_GOACC_REDUCTION_SETUP
      || code == IFN_GOACC_REDUCTION_TEARDOWN)
    {
      /* Setup and Teardown need to copy from/to the receiver object,
	 if there is one.  */
      tree ref_to_res = gimple_call_arg (call, 1);

      if (!integer_zerop (ref_to_res))
	{
	  tree dst = build_simple_mem_ref (ref_to_res);
	  tree src = var;

	  if (code == IFN_GOACC_REDUCTION_SETUP)
	    {
	      /* Setup copies the other way round: *RES_PTR into LHS.
		 Clearing LHS also suppresses the trailing 'LHS = VAR'
		 copy below.  */
	      src = dst;
	      dst = lhs;
	      lhs = NULL;
	    }
	  gimple_seq_add_stmt (&seq, gimple_build_assign (dst, src));
	}
    }

  /* Copy VAR to LHS, if there is an LHS.  */
  if (lhs)
    gimple_seq_add_stmt (&seq, gimple_build_assign (lhs, var));

  /* Replace the internal-function call with the (possibly empty)
     assignment sequence.  */
  gsi_replace_with_seq (&gsi, seq, true);
}
1492 
1493 /* Main entry point for oacc transformations which run on the device
1494    compiler after LTO, so we know what the target device is at this
1495    point (including the host fallback).  */
1496 
static unsigned int
execute_oacc_device_lower ()
{
  tree attrs = oacc_get_fn_attrib (current_function_decl);

  if (!attrs)
    /* Not an offloaded function.  */
    return 0;

  /* Parse the default dim argument exactly once.  */
  if ((const void *)flag_openacc_dims != &flag_openacc_dims)
    {
      oacc_parse_default_dims (flag_openacc_dims);
      /* Point the option at itself as an 'already parsed' sentinel.  */
      flag_openacc_dims = (char *)&flag_openacc_dims;
    }

  bool is_oacc_kernels
    = (lookup_attribute ("oacc kernels",
			 DECL_ATTRIBUTES (current_function_decl)) != NULL);
  bool is_oacc_kernels_parallelized
    = (lookup_attribute ("oacc kernels parallelized",
			 DECL_ATTRIBUTES (current_function_decl)) != NULL);

  /* Unparallelized OpenACC kernels constructs must get launched as 1 x 1 x 1
     kernels, so remove the parallelism dimensions function attributes
     potentially set earlier on.  */
  if (is_oacc_kernels && !is_oacc_kernels_parallelized)
    {
      oacc_set_fn_attrib (current_function_decl, NULL, NULL);
      attrs = oacc_get_fn_attrib (current_function_decl);
    }

  /* Discover, partition and process the loops.  */
  oacc_loop *loops = oacc_loop_discovery ();
  int fn_level = oacc_fn_attrib_level (attrs);

  if (dump_file)
    {
      if (fn_level >= 0)
	fprintf (dump_file, "Function is OpenACC routine level %d\n",
		 fn_level);
      else if (is_oacc_kernels)
	fprintf (dump_file, "Function is %s OpenACC kernels offload\n",
		 (is_oacc_kernels_parallelized
		  ? "parallelized" : "unparallelized"));
      else
	fprintf (dump_file, "Function is OpenACC parallel offload\n");
    }

  /* A routine at level N may not use the dimensions outside it.  */
  unsigned outer_mask = fn_level >= 0 ? GOMP_DIM_MASK (fn_level) - 1 : 0;
  unsigned used_mask = oacc_loop_partition (loops, outer_mask);
  /* OpenACC kernels constructs are special: they currently don't use the
     generic oacc_loop infrastructure and attribute/dimension processing.  */
  if (is_oacc_kernels && is_oacc_kernels_parallelized)
    {
      /* Parallelized OpenACC kernels constructs use gang parallelism.  See
	 also tree-parloops.c:create_parallel_loop.  */
      used_mask |= GOMP_DIM_MASK (GOMP_DIM_GANG);
    }

  int dims[GOMP_DIM_MAX];
  oacc_validate_dims (current_function_decl, attrs, dims, fn_level, used_mask);

  if (dump_file)
    {
      const char *comma = "Compute dimensions [";
      for (int ix = 0; ix != GOMP_DIM_MAX; ix++, comma = ", ")
	fprintf (dump_file, "%s%d", comma, dims[ix]);
      fprintf (dump_file, "]\n");
    }

  oacc_loop_process (loops);
  if (dump_file)
    {
      fprintf (dump_file, "OpenACC loops\n");
      dump_oacc_loop (dump_file, loops, 0);
      fprintf (dump_file, "\n");
    }
  if (dump_enabled_p ())
    {
      oacc_loop *l = loops;
      /* OpenACC kernels constructs are special: they currently don't use the
	 generic oacc_loop infrastructure.  */
      if (is_oacc_kernels)
	{
	  /* Create a fake oacc_loop for diagnostic purposes.  */
	  l = new_oacc_loop_raw (NULL,
				 DECL_SOURCE_LOCATION (current_function_decl));
	  l->mask = used_mask;
	}
      else
	{
	  /* Skip the outermost, dummy OpenACC loop  */
	  l = l->child;
	}
      if (l)
	inform_oacc_loop (l);
      if (is_oacc_kernels)
	free_oacc_loop (l);
    }

  /* Offloaded targets may introduce new basic blocks, which require
     dominance information to update SSA.  */
  calculate_dominance_info (CDI_DOMINATORS);

  /* Now lower internal loop functions to target-specific code
     sequences.  */
  basic_block bb;
  FOR_ALL_BB_FN (bb, cfun)
    for (gimple_stmt_iterator gsi = gsi_start_bb (bb); !gsi_end_p (gsi);)
      {
	gimple *stmt = gsi_stmt (gsi);
	if (!is_gimple_call (stmt))
	  {
	    gsi_next (&gsi);
	    continue;
	  }

	gcall *call = as_a <gcall *> (stmt);
	if (!gimple_call_internal_p (call))
	  {
	    gsi_next (&gsi);
	    continue;
	  }

	/* Rewind to allow rescan.  */
	gsi_prev (&gsi);
	bool rescan = false, remove = false;
	enum  internal_fn ifn_code = gimple_call_internal_fn (call);

	switch (ifn_code)
	  {
	  default: break;

	  case IFN_GOACC_TILE:
	    oacc_xform_tile (call);
	    rescan = true;
	    break;

	  case IFN_GOACC_LOOP:
	    oacc_xform_loop (call);
	    rescan = true;
	    break;

	  case IFN_GOACC_REDUCTION:
	    /* Mark the function for SSA renaming.  */
	    mark_virtual_operands_for_renaming (cfun);

	    /* If the level is -1, this ended up being an unused
	       axis.  Handle as a default.  */
	    if (integer_minus_onep (gimple_call_arg (call, 3)))
	      default_goacc_reduction (call);
	    else
	      targetm.goacc.reduction (call);
	    rescan = true;
	    break;

	  case IFN_UNIQUE:
	    {
	      enum ifn_unique_kind kind
		= ((enum ifn_unique_kind)
		   TREE_INT_CST_LOW (gimple_call_arg (call, 0)));

	      switch (kind)
		{
		default:
		  break;

		case IFN_UNIQUE_OACC_FORK:
		case IFN_UNIQUE_OACC_JOIN:
		  /* A -1 level means the axis went unused; the target
		     hook decides whether the call is kept for RTL
		     expansion.  */
		  if (integer_minus_onep (gimple_call_arg (call, 2)))
		    remove = true;
		  else if (!targetm.goacc.fork_join
			   (call, dims, kind == IFN_UNIQUE_OACC_FORK))
		    remove = true;
		  break;

		case IFN_UNIQUE_OACC_HEAD_MARK:
		case IFN_UNIQUE_OACC_TAIL_MARK:
		  /* Markers have served their purpose; drop them.  */
		  remove = true;
		  break;
		}
	      break;
	    }
	  }

	if (gsi_end_p (gsi))
	  /* We rewound past the beginning of the BB.  */
	  gsi = gsi_start_bb (bb);
	else
	  /* Undo the rewind.  */
	  gsi_next (&gsi);

	if (remove)
	  {
	    if (gimple_vdef (call))
	      replace_uses_by (gimple_vdef (call), gimple_vuse (call));
	    if (gimple_call_lhs (call))
	      {
		/* Propagate the data dependency var.  */
		gimple *ass = gimple_build_assign (gimple_call_lhs (call),
						   gimple_call_arg (call, 1));
		gsi_replace (&gsi, ass,  false);
	      }
	    else
	      gsi_remove (&gsi, true);
	  }
	else if (!rescan)
	  /* If not rescanning, advance over the call.  */
	  gsi_next (&gsi);
      }

  free_oacc_loop (loops);

  return 0;
}
1713 
1714 /* Default launch dimension validator.  Force everything to 1.  A
1715    backend that wants to provide larger dimensions must override this
1716    hook.  */
1717 
1718 bool
default_goacc_validate_dims(tree ARG_UNUSED (decl),int * dims,int ARG_UNUSED (fn_level),unsigned ARG_UNUSED (used))1719 default_goacc_validate_dims (tree ARG_UNUSED (decl), int *dims,
1720 			     int ARG_UNUSED (fn_level),
1721 			     unsigned ARG_UNUSED (used))
1722 {
1723   bool changed = false;
1724 
1725   for (unsigned ix = 0; ix != GOMP_DIM_MAX; ix++)
1726     {
1727       if (dims[ix] != 1)
1728 	{
1729 	  dims[ix] = 1;
1730 	  changed = true;
1731 	}
1732     }
1733 
1734   return changed;
1735 }
1736 
1737 /* Default dimension bound is unknown on accelerator and 1 on host.  */
1738 
1739 int
default_goacc_dim_limit(int ARG_UNUSED (axis))1740 default_goacc_dim_limit (int ARG_UNUSED (axis))
1741 {
1742 #ifdef ACCEL_COMPILER
1743   return 0;
1744 #else
1745   return 1;
1746 #endif
1747 }
1748 
namespace {

/* Pass descriptor for the OpenACC device lowering pass.  */

const pass_data pass_data_oacc_device_lower =
{
  GIMPLE_PASS, /* type */
  "oaccdevlow", /* name */
  OPTGROUP_OMP, /* optinfo_flags */
  TV_NONE, /* tv_id */
  PROP_cfg, /* properties_required */
  0 /* Possibly PROP_gimple_eomp.  */, /* properties_provided */
  0, /* properties_destroyed */
  0, /* todo_flags_start */
  TODO_update_ssa | TODO_cleanup_cfg, /* todo_flags_finish */
};

/* GIMPLE pass wrapping execute_oacc_device_lower; gated on OpenACC
   being enabled.  */

class pass_oacc_device_lower : public gimple_opt_pass
{
public:
  pass_oacc_device_lower (gcc::context *ctxt)
    : gimple_opt_pass (pass_data_oacc_device_lower, ctxt)
  {}

  /* opt_pass methods: */
  virtual bool gate (function *) { return flag_openacc; };

  virtual unsigned int execute (function *)
    {
      return execute_oacc_device_lower ();
    }

}; // class pass_oacc_device_lower

} // anon namespace
1782 
1783 gimple_opt_pass *
make_pass_oacc_device_lower(gcc::context * ctxt)1784 make_pass_oacc_device_lower (gcc::context *ctxt)
1785 {
1786   return new pass_oacc_device_lower (ctxt);
1787 }
1788 
1789 
1790 /* Rewrite GOMP_SIMT_ENTER_ALLOC call given by GSI and remove the preceding
1791    GOMP_SIMT_ENTER call identifying the privatized variables, which are
1792    turned to structure fields and receive a DECL_VALUE_EXPR accordingly.
1793    Set *REGIMPLIFY to true, except if no privatized variables were seen.  */
1794 
static void
ompdevlow_adjust_simt_enter (gimple_stmt_iterator *gsi, bool *regimplify)
{
  gimple *alloc_stmt = gsi_stmt (*gsi);
  tree simtrec = gimple_call_lhs (alloc_stmt);
  tree simduid = gimple_call_arg (alloc_stmt, 0);
  gimple *enter_stmt = SSA_NAME_DEF_STMT (simduid);
  gcc_assert (gimple_call_internal_p (enter_stmt, IFN_GOMP_SIMT_ENTER));
  /* Build the record type that will hold the privatized variables.  */
  tree rectype = lang_hooks.types.make_type (RECORD_TYPE);
  TYPE_ARTIFICIAL (rectype) = TYPE_NAMELESS (rectype) = 1;
  TREE_ADDRESSABLE (rectype) = 1;
  TREE_TYPE (simtrec) = build_pointer_type (rectype);
  /* Arguments 1.. of the GOMP_SIMT_ENTER call are the addresses of the
     variables to privatize (or null_pointer_node placeholders).  */
  for (unsigned i = 1; i < gimple_call_num_args (enter_stmt); i++)
    {
      tree *argp = gimple_call_arg_ptr (enter_stmt, i);
      if (*argp == null_pointer_node)
	continue;
      gcc_assert (TREE_CODE (*argp) == ADDR_EXPR
		  && VAR_P (TREE_OPERAND (*argp, 0)));
      tree var = TREE_OPERAND (*argp, 0);

      /* Create a field mirroring the variable's type and alignment.  */
      tree field = build_decl (DECL_SOURCE_LOCATION (var), FIELD_DECL,
			       DECL_NAME (var), TREE_TYPE (var));
      SET_DECL_ALIGN (field, DECL_ALIGN (var));
      DECL_USER_ALIGN (field) = DECL_USER_ALIGN (var);
      TREE_THIS_VOLATILE (field) = TREE_THIS_VOLATILE (var);

      insert_field_into_struct (rectype, field);

      /* Redirect accesses to VAR through SIMTREC->FIELD via a
	 DECL_VALUE_EXPR; this requires regimplification.  */
      tree t = build_simple_mem_ref (simtrec);
      t = build3 (COMPONENT_REF, TREE_TYPE (var), t, field, NULL);
      TREE_THIS_VOLATILE (t) = TREE_THIS_VOLATILE (var);
      SET_DECL_VALUE_EXPR (var, t);
      DECL_HAS_VALUE_EXPR_P (var) = 1;
      *regimplify = true;
    }
  layout_type (rectype);
  tree size = TYPE_SIZE_UNIT (rectype);
  tree align = build_int_cst (TREE_TYPE (size), TYPE_ALIGN_UNIT (rectype));

  /* Re-emit the allocation call with the now-known size/alignment.  */
  alloc_stmt
    = gimple_build_call_internal (IFN_GOMP_SIMT_ENTER_ALLOC, 2, size, align);
  gimple_call_set_lhs (alloc_stmt, simtrec);
  gsi_replace (gsi, alloc_stmt, false);
  /* The original GOMP_SIMT_ENTER call degenerates to a plain copy of
     its first argument into SIMDUID.  */
  gimple_stmt_iterator enter_gsi = gsi_for_stmt (enter_stmt);
  enter_stmt = gimple_build_assign (simduid, gimple_call_arg (enter_stmt, 0));
  gsi_replace (&enter_gsi, enter_stmt, false);

  use_operand_p use;
  gimple *exit_stmt;
  if (single_imm_use (simtrec, &use, &exit_stmt))
    {
      /* Clobber the record at the matching GOMP_SIMT_EXIT, marking its
	 storage dead from there on.  */
      gcc_assert (gimple_call_internal_p (exit_stmt, IFN_GOMP_SIMT_EXIT));
      gimple_stmt_iterator exit_gsi = gsi_for_stmt (exit_stmt);
      tree clobber = build_constructor (rectype, NULL);
      TREE_THIS_VOLATILE (clobber) = 1;
      exit_stmt = gimple_build_assign (build_simple_mem_ref (simtrec), clobber);
      gsi_insert_before (&exit_gsi, exit_stmt, GSI_SAME_STMT);
    }
  else
    gcc_checking_assert (has_zero_uses (simtrec));
}
1857 
1858 /* Callback for walk_gimple_stmt used to scan for SIMT-privatized variables.  */
1859 
1860 static tree
find_simtpriv_var_op(tree * tp,int * walk_subtrees,void *)1861 find_simtpriv_var_op (tree *tp, int *walk_subtrees, void *)
1862 {
1863   tree t = *tp;
1864 
1865   if (VAR_P (t)
1866       && DECL_HAS_VALUE_EXPR_P (t)
1867       && lookup_attribute ("omp simt private", DECL_ATTRIBUTES (t)))
1868     {
1869       *walk_subtrees = 0;
1870       return t;
1871     }
1872   return NULL_TREE;
1873 }
1874 
/* Cleanup uses of SIMT placeholder internal functions: on non-SIMT targets,
   VF is 1 and LANE is 0; on SIMT targets, VF is folded to a constant, and
   LANE is kept to be expanded to RTL later on.  Also cleanup all other SIMT
   internal functions on non-SIMT targets, and likewise some SIMD internal
   functions on SIMT targets.  */

static unsigned int
execute_omp_device_lower ()
{
  /* VF == 1 means the target is not SIMT-capable, so every SIMT placeholder
     folds to a trivial value; VF != 1 means a SIMT target, where the SIMD
     placeholders fold instead and most SIMT ones are kept for RTL.  */
  int vf = targetm.simt.vf ? targetm.simt.vf () : 1;
  bool regimplify = false;
  basic_block bb;
  gimple_stmt_iterator gsi;
  FOR_EACH_BB_FN (bb, cfun)
    for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
      {
	gimple *stmt = gsi_stmt (gsi);
	if (!is_gimple_call (stmt) || !gimple_call_internal_p (stmt))
	  continue;
	/* RHS, when set by a case below, is the value the call folds to.
	   A non-NULL LHS with a NULL RHS means the call is left alone
	   (see the "if (lhs && !rhs) continue;" after the switch).  */
	tree lhs = gimple_call_lhs (stmt), rhs = NULL_TREE;
	tree type = lhs ? TREE_TYPE (lhs) : integer_type_node;
	switch (gimple_call_internal_fn (stmt))
	  {
	  case IFN_GOMP_USE_SIMT:
	    rhs = vf == 1 ? integer_zero_node : integer_one_node;
	    break;
	  case IFN_GOMP_SIMT_ENTER:
	    /* On non-SIMT targets this folds to its first argument.  */
	    rhs = vf == 1 ? gimple_call_arg (stmt, 0) : NULL_TREE;
	    goto simtreg_enter_exit;
	  case IFN_GOMP_SIMT_ENTER_ALLOC:
	    if (vf != 1)
	      /* Rewrite the allocation now that the set of SIMT-privatized
		 variables is known; may set REGIMPLIFY.  */
	      ompdevlow_adjust_simt_enter (&gsi, &regimplify);
	    rhs = vf == 1 ? null_pointer_node : NULL_TREE;
	    goto simtreg_enter_exit;
	  case IFN_GOMP_SIMT_EXIT:
	  simtreg_enter_exit:
	    /* Shared tail for the three cases above: on SIMT targets keep
	       the call; on non-SIMT targets drop its virtual definition
	       before it is replaced below.  */
	    if (vf != 1)
	      continue;
	    unlink_stmt_vdef (stmt);
	    break;
	  case IFN_GOMP_SIMT_LANE:
	  case IFN_GOMP_SIMT_LAST_LANE:
	    rhs = vf == 1 ? build_zero_cst (type) : NULL_TREE;
	    break;
	  case IFN_GOMP_SIMT_VF:
	    rhs = build_int_cst (type, vf);
	    break;
	  case IFN_GOMP_SIMT_ORDERED_PRED:
	    rhs = vf == 1 ? integer_zero_node : NULL_TREE;
	    /* Also drop the VDEF when folding, or when the result is unused
	       (the call then becomes a nop below).  */
	    if (rhs || !lhs)
	      unlink_stmt_vdef (stmt);
	    break;
	  case IFN_GOMP_SIMT_VOTE_ANY:
	  case IFN_GOMP_SIMT_XCHG_BFLY:
	  case IFN_GOMP_SIMT_XCHG_IDX:
	    /* With one lane, these reduce to passing through the value.  */
	    rhs = vf == 1 ? gimple_call_arg (stmt, 0) : NULL_TREE;
	    break;
	  case IFN_GOMP_SIMD_LANE:
	  case IFN_GOMP_SIMD_LAST_LANE:
	    /* SIMD placeholders fold only on SIMT targets (vf != 1).  */
	    rhs = vf != 1 ? build_zero_cst (type) : NULL_TREE;
	    break;
	  case IFN_GOMP_SIMD_VF:
	    rhs = vf != 1 ? build_one_cst (type) : NULL_TREE;
	    break;
	  default:
	    continue;
	  }
	if (lhs && !rhs)
	  continue;
	/* Replace the internal call with a plain assignment of the folded
	   value, or a nop when its result is unused.  */
	stmt = lhs ? gimple_build_assign (lhs, rhs) : gimple_build_nop ();
	gsi_replace (&gsi, stmt, false);
      }
  if (regimplify)
    /* ompdevlow_adjust_simt_enter gave some variables value expressions;
       revisit every statement mentioning such a variable, removing clobbers
       and re-gimplifying the rest.  Walks in reverse order.  */
    FOR_EACH_BB_REVERSE_FN (bb, cfun)
      for (gsi = gsi_last_bb (bb); !gsi_end_p (gsi); gsi_prev (&gsi))
	if (walk_gimple_stmt (&gsi, NULL, find_simtpriv_var_op, NULL))
	  {
	    if (gimple_clobber_p (gsi_stmt (gsi)))
	      gsi_remove (&gsi, true);
	    else
	      gimple_regimplify_operands (gsi_stmt (gsi), &gsi);
	  }
  if (vf != 1)
    /* NOTE(review): on SIMT targets the force-vectorize flag is dropped —
       presumably SIMT lanes subsume "omp simd" vectorization here.  */
    cfun->has_force_vectorize_loops = false;
  return 0;
}
1961 
namespace {

const pass_data pass_data_omp_device_lower =
{
  GIMPLE_PASS, /* type */
  "ompdevlow", /* name */
  OPTGROUP_OMP, /* optinfo_flags */
  TV_NONE, /* tv_id */
  PROP_cfg, /* properties_required */
  PROP_gimple_lomp_dev, /* properties_provided */
  0, /* properties_destroyed */
  0, /* todo_flags_start */
  TODO_update_ssa, /* todo_flags_finish */
};

/* GIMPLE pass wrapper around execute_omp_device_lower: folds SIMT/SIMD
   placeholder internal functions once the target's SIMT VF is known.  */

class pass_omp_device_lower : public gimple_opt_pass
{
public:
  pass_omp_device_lower (gcc::context *ctxt)
    : gimple_opt_pass (pass_data_omp_device_lower, ctxt)
  {}

  /* opt_pass methods: */
  /* Run only while the function lacks PROP_gimple_lomp_dev, the property
     this pass provides; this keeps the pass from running twice.  */
  virtual bool gate (function *fun)
    {
      return !(fun->curr_properties & PROP_gimple_lomp_dev);
    }
  virtual unsigned int execute (function *)
    {
      return execute_omp_device_lower ();
    }

}; // class pass_omp_device_lower

} // anon namespace
1997 
1998 gimple_opt_pass *
make_pass_omp_device_lower(gcc::context * ctxt)1999 make_pass_omp_device_lower (gcc::context *ctxt)
2000 {
2001   return new pass_omp_device_lower (ctxt);
2002 }
2003 
2004 /* "omp declare target link" handling pass.  */
2005 
namespace {

const pass_data pass_data_omp_target_link =
{
  GIMPLE_PASS,			/* type */
  "omptargetlink",		/* name */
  OPTGROUP_OMP,			/* optinfo_flags */
  TV_NONE,			/* tv_id */
  PROP_ssa,			/* properties_required */
  0,				/* properties_provided */
  0,				/* properties_destroyed */
  0,				/* todo_flags_start */
  TODO_update_ssa,		/* todo_flags_finish */
};

/* Pass that re-gimplifies statements referring to "omp declare target link"
   variables; active only in the accelerator (offload) compiler.  */

class pass_omp_target_link : public gimple_opt_pass
{
public:
  pass_omp_target_link (gcc::context *ctxt)
    : gimple_opt_pass (pass_data_omp_target_link, ctxt)
  {}

  /* opt_pass methods: */
  /* Gate: only in the accelerator compiler, and only for functions that
     are marked for offloading.  In the host compiler the pass is off.  */
  virtual bool gate (function *fun)
    {
#ifdef ACCEL_COMPILER
      return offloading_function_p (fun->decl);
#else
      (void) fun;
      return false;
#endif
    }

  virtual unsigned execute (function *);
};

/* Callback for walk_gimple_stmt used to scan for link var operands.  */

static tree
find_link_var_op (tree *tp, int *walk_subtrees, void *)
{
  tree t = *tp;

  /* Match global variables carrying the "omp declare target link" attribute
     whose accesses go through a value expression; do not descend into the
     matched variable.  */
  if (VAR_P (t)
      && DECL_HAS_VALUE_EXPR_P (t)
      && is_global_var (t)
      && lookup_attribute ("omp declare target link", DECL_ATTRIBUTES (t)))
    {
      *walk_subtrees = 0;
      return t;
    }

  return NULL_TREE;
}

/* Re-gimplify the operands of every statement that mentions an
   "omp declare target link" variable (as found by find_link_var_op).  */

unsigned
pass_omp_target_link::execute (function *fun)
{
  basic_block bb;
  FOR_EACH_BB_FN (bb, fun)
    {
      gimple_stmt_iterator gsi;
      for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
	if (walk_gimple_stmt (&gsi, NULL, find_link_var_op, NULL))
	  gimple_regimplify_operands (gsi_stmt (gsi), &gsi);
    }

  return 0;
}

} // anon namespace
2077 
2078 gimple_opt_pass *
make_pass_omp_target_link(gcc::context * ctxt)2079 make_pass_omp_target_link (gcc::context *ctxt)
2080 {
2081   return new pass_omp_target_link (ctxt);
2082 }
2083