1 /* Bits of OpenMP and OpenACC handling that is specific to device offloading
2    and a lowering pass for OpenACC device directives.
3 
4    Copyright (C) 2005-2020 Free Software Foundation, Inc.
5 
6 This file is part of GCC.
7 
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
12 
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
16 for more details.
17 
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3.  If not see
20 <http://www.gnu.org/licenses/>.  */
21 
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "tree.h"
28 #include "gimple.h"
29 #include "tree-pass.h"
30 #include "ssa.h"
31 #include "cgraph.h"
32 #include "pretty-print.h"
33 #include "diagnostic-core.h"
34 #include "fold-const.h"
35 #include "internal-fn.h"
36 #include "langhooks.h"
37 #include "gimplify.h"
38 #include "gimple-iterator.h"
39 #include "gimplify-me.h"
40 #include "gimple-walk.h"
41 #include "tree-cfg.h"
42 #include "tree-into-ssa.h"
43 #include "tree-nested.h"
44 #include "stor-layout.h"
45 #include "common/common-target.h"
46 #include "omp-general.h"
47 #include "omp-offload.h"
48 #include "lto-section-names.h"
49 #include "gomp-constants.h"
50 #include "gimple-pretty-print.h"
51 #include "intl.h"
52 #include "stringpool.h"
53 #include "attribs.h"
54 #include "cfgloop.h"
55 
/* Describe the OpenACC looping structure of a function.  The entire
   function is held in a 'NULL' loop.  Loops form a tree (parent/child)
   with sibling chains linking loops that share a parent.  */

struct oacc_loop
{
  oacc_loop *parent; /* Containing loop.  NULL for the outermost loop.  */

  oacc_loop *child; /* First inner loop.  */

  oacc_loop *sibling; /* Next loop within same parent.  */

  location_t loc; /* Location of the loop start.  */

  gcall *marker; /* Initial head marker.  */

  gcall *heads[GOMP_DIM_MAX];  /* Head marker functions, one per dimension.  */
  gcall *tails[GOMP_DIM_MAX];  /* Tail marker functions, one per dimension.  */

  tree routine;  /* Pseudo-loop enclosing a routine.  FNDECL of the routine.  */

  unsigned mask;   /* Partitioning mask.  Bits are GOMP_DIM_MASK values.  */
  unsigned e_mask; /* Partitioning of element loops (when tiling).  */
  unsigned inner;  /* Partitioning of inner loops.  */
  unsigned flags;  /* Partitioning flags (OLF_*).  */
  vec<gcall *> ifns;  /* Contained loop abstraction functions
			 (IFN_GOACC_LOOP / IFN_GOACC_TILE calls).  */
  tree chunk_size; /* Chunk size.  */
  gcall *head_end; /* Final marker of head sequence.  */
};
84 
/* Holds offload tables with decls: functions outlined from offload
   regions and variables marked "omp declare target".  Consumed by
   omp_finish_file when emitting the offload tables.  */
vec<tree, va_gc> *offload_funcs, *offload_vars;
87 
88 /* Return level at which oacc routine may spawn a partitioned loop, or
89    -1 if it is not a routine (i.e. is an offload fn).  */
90 
91 int
oacc_fn_attrib_level(tree attr)92 oacc_fn_attrib_level (tree attr)
93 {
94   tree pos = TREE_VALUE (attr);
95 
96   if (!TREE_PURPOSE (pos))
97     return -1;
98 
99   int ix = 0;
100   for (ix = 0; ix != GOMP_DIM_MAX;
101        ix++, pos = TREE_CHAIN (pos))
102     if (!integer_zerop (TREE_PURPOSE (pos)))
103       break;
104 
105   return ix;
106 }
107 
/* Helper function for omp_finish_file routine.  Takes decls from V_DECLS and
   adds their addresses and sizes to constructor-vector V_CTOR.  Functions
   contribute one element (their address); variables contribute two
   (address, then size).  */

static void
add_decls_addresses_to_decl_constructor (vec<tree, va_gc> *v_decls,
					 vec<constructor_elt, va_gc> *v_ctor)
{
  unsigned len = vec_safe_length (v_decls);
  for (unsigned i = 0; i < len; i++)
    {
      tree it = (*v_decls)[i];
      bool is_var = VAR_P (it);
      /* On the accelerator side a link variable is only recognized once
	 its value expression (the pointer indirection) is in place.  */
      bool is_link_var
	= is_var
#ifdef ACCEL_COMPILER
	  && DECL_HAS_VALUE_EXPR_P (it)
#endif
	  && lookup_attribute ("omp declare target link", DECL_ATTRIBUTES (it));

      /* See also omp_finish_file and output_offload_tables in lto-cgraph.c.  */
      if (!in_lto_p && !symtab_node::get (it))
	continue;

      /* Only variables carry a size slot; is_link_var implies is_var,
	 so SIZE is always set when used below.  */
      tree size = NULL_TREE;
      if (is_var)
	size = fold_convert (const_ptr_type_node, DECL_SIZE_UNIT (it));

      tree addr;
      if (!is_link_var)
	addr = build_fold_addr_expr (it);
      else
	{
#ifdef ACCEL_COMPILER
	  /* For "omp declare target link" vars add address of the pointer to
	     the target table, instead of address of the var.  */
	  tree value_expr = DECL_VALUE_EXPR (it);
	  tree link_ptr_decl = TREE_OPERAND (value_expr, 0);
	  varpool_node::finalize_decl (link_ptr_decl);
	  addr = build_fold_addr_expr (link_ptr_decl);
#else
	  addr = build_fold_addr_expr (it);
#endif

	  /* Most significant bit of the size marks "omp declare target link"
	     vars in host and target tables.  */
	  unsigned HOST_WIDE_INT isize = tree_to_uhwi (size);
	  isize |= 1ULL << (int_size_in_bytes (const_ptr_type_node)
			    * BITS_PER_UNIT - 1);
	  size = wide_int_to_tree (const_ptr_type_node, isize);
	}

      CONSTRUCTOR_APPEND_ELT (v_ctor, NULL_TREE, addr);
      if (is_var)
	CONSTRUCTOR_APPEND_ELT (v_ctor, NULL_TREE, size);
    }
}
164 
/* Create new symbols containing (address, size) pairs for global variables,
   marked with "omp declare target" attribute, as well as addresses for the
   functions, which are outlined offloading regions.  */
void
omp_finish_file (void)
{
  unsigned num_funcs = vec_safe_length (offload_funcs);
  unsigned num_vars = vec_safe_length (offload_vars);

  /* Nothing marked for offloading: no tables to emit.  */
  if (num_funcs == 0 && num_vars == 0)
    return;

  if (targetm_common.have_named_sections)
    {
      /* Emit the tables as constant arrays in dedicated sections, so
	 the per-object sections get concatenated at link time.  */
      vec<constructor_elt, va_gc> *v_f, *v_v;
      vec_alloc (v_f, num_funcs);
      /* Each variable entry occupies two slots: address and size.  */
      vec_alloc (v_v, num_vars * 2);

      add_decls_addresses_to_decl_constructor (offload_funcs, v_f);
      add_decls_addresses_to_decl_constructor (offload_vars, v_v);

      tree vars_decl_type = build_array_type_nelts (pointer_sized_int_node,
						    vec_safe_length (v_v));
      tree funcs_decl_type = build_array_type_nelts (pointer_sized_int_node,
						     num_funcs);
      SET_TYPE_ALIGN (vars_decl_type, TYPE_ALIGN (pointer_sized_int_node));
      SET_TYPE_ALIGN (funcs_decl_type, TYPE_ALIGN (pointer_sized_int_node));
      tree ctor_v = build_constructor (vars_decl_type, v_v);
      tree ctor_f = build_constructor (funcs_decl_type, v_f);
      TREE_CONSTANT (ctor_v) = TREE_CONSTANT (ctor_f) = 1;
      TREE_STATIC (ctor_v) = TREE_STATIC (ctor_f) = 1;
      tree funcs_decl = build_decl (UNKNOWN_LOCATION, VAR_DECL,
				    get_identifier (".offload_func_table"),
				    funcs_decl_type);
      tree vars_decl = build_decl (UNKNOWN_LOCATION, VAR_DECL,
				   get_identifier (".offload_var_table"),
				   vars_decl_type);
      TREE_STATIC (funcs_decl) = TREE_STATIC (vars_decl) = 1;
      /* Do not align tables more than TYPE_ALIGN (pointer_sized_int_node),
	 otherwise a joint table in a binary will contain padding between
	 tables from multiple object files.  */
      DECL_USER_ALIGN (funcs_decl) = DECL_USER_ALIGN (vars_decl) = 1;
      SET_DECL_ALIGN (funcs_decl, TYPE_ALIGN (funcs_decl_type));
      SET_DECL_ALIGN (vars_decl, TYPE_ALIGN (vars_decl_type));
      DECL_INITIAL (funcs_decl) = ctor_f;
      DECL_INITIAL (vars_decl) = ctor_v;
      set_decl_section_name (funcs_decl, OFFLOAD_FUNC_TABLE_SECTION_NAME);
      set_decl_section_name (vars_decl, OFFLOAD_VAR_TABLE_SECTION_NAME);

      varpool_node::finalize_decl (vars_decl);
      varpool_node::finalize_decl (funcs_decl);
    }
  else
    {
      /* No named-section support: hand each symbol to the backend
	 individually via the record_offload_symbol hook.  */
      for (unsigned i = 0; i < num_funcs; i++)
	{
	  tree it = (*offload_funcs)[i];
	  /* See also add_decls_addresses_to_decl_constructor
	     and output_offload_tables in lto-cgraph.c.  */
	  if (!in_lto_p && !symtab_node::get (it))
	    continue;
	  targetm.record_offload_symbol (it);
	}
      for (unsigned i = 0; i < num_vars; i++)
	{
	  tree it = (*offload_vars)[i];
	  if (!in_lto_p && !symtab_node::get (it))
	    continue;
#ifdef ACCEL_COMPILER
	  if (DECL_HAS_VALUE_EXPR_P (it)
	      && lookup_attribute ("omp declare target link",
				   DECL_ATTRIBUTES (it)))
	    {
	      /* For "declare target link" vars, record the pointer decl
		 from the value expression rather than the var itself.  */
	      tree value_expr = DECL_VALUE_EXPR (it);
	      tree link_ptr_decl = TREE_OPERAND (value_expr, 0);
	      targetm.record_offload_symbol (link_ptr_decl);
	      varpool_node::finalize_decl (link_ptr_decl);
	    }
	  else
#endif
	    targetm.record_offload_symbol (it);
	}
    }
}
249 
250 /* Call dim_pos (POS == true) or dim_size (POS == false) builtins for
251    axis DIM.  Return a tmp var holding the result.  */
252 
253 static tree
oacc_dim_call(bool pos,int dim,gimple_seq * seq)254 oacc_dim_call (bool pos, int dim, gimple_seq *seq)
255 {
256   tree arg = build_int_cst (unsigned_type_node, dim);
257   tree size = create_tmp_var (integer_type_node);
258   enum internal_fn fn = pos ? IFN_GOACC_DIM_POS : IFN_GOACC_DIM_SIZE;
259   gimple *call = gimple_build_call_internal (fn, 1, arg);
260 
261   gimple_call_set_lhs (call, size);
262   gimple_seq_add_stmt (seq, call);
263 
264   return size;
265 }
266 
267 /* Find the number of threads (POS = false), or thread number (POS =
268    true) for an OpenACC region partitioned as MASK.  Setup code
269    required for the calculation is added to SEQ.  */
270 
271 static tree
oacc_thread_numbers(bool pos,int mask,gimple_seq * seq)272 oacc_thread_numbers (bool pos, int mask, gimple_seq *seq)
273 {
274   tree res = pos ? NULL_TREE : build_int_cst (unsigned_type_node, 1);
275   unsigned ix;
276 
277   /* Start at gang level, and examine relevant dimension indices.  */
278   for (ix = GOMP_DIM_GANG; ix != GOMP_DIM_MAX; ix++)
279     if (GOMP_DIM_MASK (ix) & mask)
280       {
281 	if (res)
282 	  {
283 	    /* We had an outer index, so scale that by the size of
284 	       this dimension.  */
285 	    tree n = oacc_dim_call (false, ix, seq);
286 	    res = fold_build2 (MULT_EXPR, integer_type_node, res, n);
287 	  }
288 	if (pos)
289 	  {
290 	    /* Determine index in this dimension.  */
291 	    tree id = oacc_dim_call (true, ix, seq);
292 	    if (res)
293 	      res = fold_build2 (PLUS_EXPR, integer_type_node, res, id);
294 	    else
295 	      res = id;
296 	  }
297       }
298 
299   if (res == NULL_TREE)
300     res = integer_zero_node;
301 
302   return res;
303 }
304 
/* Transform IFN_GOACC_LOOP calls to actual code.  See
   expand_oacc_for for where these are generated.  At the vector
   level, we stride loops, such that each member of a warp will
   operate on adjacent iterations.  At the worker and gang level,
   each gang/warp executes a set of contiguous iterations.  Chunking
   can override this such that each iteration engine executes a
   contiguous chunk, and then moves on to stride to the next chunk.  */

static void
oacc_xform_loop (gcall *call)
{
  gimple_stmt_iterator gsi = gsi_for_stmt (call);
  enum ifn_goacc_loop_kind code
    = (enum ifn_goacc_loop_kind) TREE_INT_CST_LOW (gimple_call_arg (call, 0));
  tree dir = gimple_call_arg (call, 1);   /* +1 or -1 iteration direction.  */
  tree range = gimple_call_arg (call, 2);
  tree step = gimple_call_arg (call, 3);
  tree chunk_size = NULL_TREE;
  unsigned mask = (unsigned) TREE_INT_CST_LOW (gimple_call_arg (call, 5));
  tree lhs = gimple_call_lhs (call);
  tree type = NULL_TREE;
  tree diff_type = TREE_TYPE (range);
  tree r = NULL_TREE;
  gimple_seq seq = NULL;
  bool chunking = false, striding = true;
  unsigned outer_mask = mask & (~mask + 1); // Outermost partitioning
  unsigned inner_mask = mask & ~outer_mask; // Inner partitioning (if any)

  /* Skip lowering if return value of IFN_GOACC_LOOP call is not used.  */
  if (!lhs)
    {
      gsi_replace_with_seq (&gsi, seq, true);
      return;
    }

  type = TREE_TYPE (lhs);

#ifdef ACCEL_COMPILER
  chunk_size = gimple_call_arg (call, 4);
  if (integer_minus_onep (chunk_size)  /* Force static allocation.  */
      || integer_zerop (chunk_size))   /* Default (also static).  */
    {
      /* If we're at the gang level, we want each to execute a
	 contiguous run of iterations.  Otherwise we want each element
	 to stride.  */
      striding = !(outer_mask & GOMP_DIM_MASK (GOMP_DIM_GANG));
      chunking = false;
    }
  else
    {
      /* Chunk of size 1 is striding.  */
      striding = integer_onep (chunk_size);
      chunking = !striding;
    }
#endif

  /* striding=true, chunking=true
       -> invalid.
     striding=true, chunking=false
       -> chunks=1
     striding=false,chunking=true
       -> chunks=ceil (range/(chunksize*threads*step))
     striding=false,chunking=false
       -> chunk_size=ceil(range/(threads*step)),chunks=1  */
  push_gimplify_context (true);

  switch (code)
    {
    default: gcc_unreachable ();

    case IFN_GOACC_LOOP_CHUNKS:
      /* Number of chunks each thread executes.  */
      if (!chunking)
	r = build_int_cst (type, 1);
      else
	{
	  /* chunk_max
	     = (range - dir) / (chunks * step * num_threads) + dir  */
	  tree per = oacc_thread_numbers (false, mask, &seq);
	  per = fold_convert (type, per);
	  chunk_size = fold_convert (type, chunk_size);
	  per = fold_build2 (MULT_EXPR, type, per, chunk_size);
	  per = fold_build2 (MULT_EXPR, type, per, step);
	  r = build2 (MINUS_EXPR, type, range, dir);
	  r = build2 (PLUS_EXPR, type, r, per);
	  r = build2 (TRUNC_DIV_EXPR, type, r, per);
	}
      break;

    case IFN_GOACC_LOOP_STEP:
      {
	/* If striding, step by the entire compute volume, otherwise
	   step by the inner volume.  */
	unsigned volume = striding ? mask : inner_mask;

	r = oacc_thread_numbers (false, volume, &seq);
	r = build2 (MULT_EXPR, type, fold_convert (type, r), step);
      }
      break;

    case IFN_GOACC_LOOP_OFFSET:
      /* Starting iteration offset for this thread.  */
      /* Enable vectorization on non-SIMT targets.  */
      if (!targetm.simt.vf
	  && outer_mask == GOMP_DIM_MASK (GOMP_DIM_VECTOR)
	  /* If not -fno-tree-loop-vectorize, hint that we want to vectorize
	     the loop.  */
	  && (flag_tree_loop_vectorize
	      || !global_options_set.x_flag_tree_loop_vectorize))
	{
	  basic_block bb = gsi_bb (gsi);
	  class loop *parent = bb->loop_father;
	  class loop *body = parent->inner;

	  parent->force_vectorize = true;
	  parent->safelen = INT_MAX;

	  /* "Chunking loops" may have inner loops.  */
	  if (parent->inner)
	    {
	      body->force_vectorize = true;
	      body->safelen = INT_MAX;
	    }

	  cfun->has_force_vectorize_loops = true;
	}
      if (striding)
	{
	  /* Striding: offset is simply this thread's linear number.  */
	  r = oacc_thread_numbers (true, mask, &seq);
	  r = fold_convert (diff_type, r);
	}
      else
	{
	  /* Contiguous chunks: offset = outer_pos * span + inner_pos
	     (+ chunk_no * volume * chunk_size when chunking).  */
	  tree inner_size = oacc_thread_numbers (false, inner_mask, &seq);
	  tree outer_size = oacc_thread_numbers (false, outer_mask, &seq);
	  tree volume = fold_build2 (MULT_EXPR, TREE_TYPE (inner_size),
				     inner_size, outer_size);

	  volume = fold_convert (diff_type, volume);
	  if (chunking)
	    chunk_size = fold_convert (diff_type, chunk_size);
	  else
	    {
	      /* chunk_size = ceil (range / (volume * step)), rounded
		 towards DIR.  */
	      tree per = fold_build2 (MULT_EXPR, diff_type, volume, step);

	      chunk_size = build2 (MINUS_EXPR, diff_type, range, dir);
	      chunk_size = build2 (PLUS_EXPR, diff_type, chunk_size, per);
	      chunk_size = build2 (TRUNC_DIV_EXPR, diff_type, chunk_size, per);
	    }

	  tree span = build2 (MULT_EXPR, diff_type, chunk_size,
			      fold_convert (diff_type, inner_size));
	  r = oacc_thread_numbers (true, outer_mask, &seq);
	  r = fold_convert (diff_type, r);
	  r = build2 (MULT_EXPR, diff_type, r, span);

	  tree inner = oacc_thread_numbers (true, inner_mask, &seq);
	  inner = fold_convert (diff_type, inner);
	  r = fold_build2 (PLUS_EXPR, diff_type, r, inner);

	  if (chunking)
	    {
	      /* Advance by whole chunks already executed (argument 6 is
		 the chunk number).  */
	      tree chunk = fold_convert (diff_type, gimple_call_arg (call, 6));
	      tree per
		= fold_build2 (MULT_EXPR, diff_type, volume, chunk_size);
	      per = build2 (MULT_EXPR, diff_type, per, chunk);

	      r = build2 (PLUS_EXPR, diff_type, r, per);
	    }
	}
      r = fold_build2 (MULT_EXPR, diff_type, r, step);
      if (type != diff_type)
	r = fold_convert (type, r);
      break;

    case IFN_GOACC_LOOP_BOUND:
      /* Iteration bound for this thread.  */
      if (striding)
	r = range;
      else
	{
	  /* Same span computation as the OFFSET case above.  */
	  tree inner_size = oacc_thread_numbers (false, inner_mask, &seq);
	  tree outer_size = oacc_thread_numbers (false, outer_mask, &seq);
	  tree volume = fold_build2 (MULT_EXPR, TREE_TYPE (inner_size),
				     inner_size, outer_size);

	  volume = fold_convert (diff_type, volume);
	  if (chunking)
	    chunk_size = fold_convert (diff_type, chunk_size);
	  else
	    {
	      tree per = fold_build2 (MULT_EXPR, diff_type, volume, step);

	      chunk_size = build2 (MINUS_EXPR, diff_type, range, dir);
	      chunk_size = build2 (PLUS_EXPR, diff_type, chunk_size, per);
	      chunk_size = build2 (TRUNC_DIV_EXPR, diff_type, chunk_size, per);
	    }

	  tree span = build2 (MULT_EXPR, diff_type, chunk_size,
			      fold_convert (diff_type, inner_size));

	  r = fold_build2 (MULT_EXPR, diff_type, span, step);

	  /* Argument 6 is the offset this bound pairs with; clamp the
	     bound to RANGE according to the iteration direction.  */
	  tree offset = gimple_call_arg (call, 6);
	  r = build2 (PLUS_EXPR, diff_type, r,
		      fold_convert (diff_type, offset));
	  r = build2 (integer_onep (dir) ? MIN_EXPR : MAX_EXPR,
		      diff_type, r, range);
	}
      if (diff_type != type)
	r = fold_convert (type, r);
      break;
    }

  gimplify_assign (lhs, r, &seq);

  pop_gimplify_context (NULL);

  gsi_replace_with_seq (&gsi, seq, true);
}
522 
/* Transform a GOACC_TILE call.  Determines the element loop span for
   the specified loop of the nest.  This is 1 if we're not tiling.

   GOACC_TILE (collapse_count, loop_no, tile_arg, gwv_tile, gwv_element);  */

static void
oacc_xform_tile (gcall *call)
{
  gimple_stmt_iterator gsi = gsi_for_stmt (call);
  unsigned collapse = tree_to_uhwi (gimple_call_arg (call, 0));
  /* Inner loops have higher loop_nos.  */
  unsigned loop_no = tree_to_uhwi (gimple_call_arg (call, 1));
  tree tile_size = gimple_call_arg (call, 2);
  unsigned e_mask = tree_to_uhwi (gimple_call_arg (call, 4));
  tree lhs = gimple_call_lhs (call);
  tree type = TREE_TYPE (lhs);
  gimple_seq seq = NULL;
  tree span = build_int_cst (type, 1);

  /* Element loops may only be vector and/or worker partitioned.  */
  gcc_assert (!(e_mask
		& ~(GOMP_DIM_MASK (GOMP_DIM_VECTOR)
		    | GOMP_DIM_MASK (GOMP_DIM_WORKER))));
  push_gimplify_context (!seen_error ());

#ifndef ACCEL_COMPILER
  /* Partitioning disabled on host compilers.  */
  e_mask = 0;
#endif
  if (!e_mask)
    /* Not partitioning.  */
    span = integer_one_node;
  else if (!integer_zerop (tile_size))
    /* User explicitly specified size.  */
    span = tile_size;
  else
    {
      /* Pick a size based on the partitioning of the element loop and
	 the number of loop nests.  */
      tree first_size = NULL_TREE;
      tree second_size = NULL_TREE;

      if (e_mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR))
	first_size = oacc_dim_call (false, GOMP_DIM_VECTOR, &seq);
      if (e_mask & GOMP_DIM_MASK (GOMP_DIM_WORKER))
	second_size = oacc_dim_call (false, GOMP_DIM_WORKER, &seq);

      /* If only the worker dimension was partitioned, treat it as the
	 first (innermost) size.  */
      if (!first_size)
	{
	  first_size = second_size;
	  second_size = NULL_TREE;
	}

      if (loop_no + 1 == collapse)
	{
	  /* Innermost loop of the nest.  */
	  span = first_size;
	  if (!loop_no && second_size)
	    span = fold_build2 (MULT_EXPR, TREE_TYPE (span),
				span, second_size);
	}
      else if (loop_no + 2 == collapse)
	/* Second-innermost loop.  */
	span = second_size;
      else
	span = NULL_TREE;

      if (!span)
	/* There's no obvious element size for this loop.  Options
	   are 1, first_size or some non-unity constant (32 is my
	   favourite).   We should gather some statistics.  */
	span = first_size;
    }

  span = fold_convert (type, span);
  gimplify_assign (lhs, span, &seq);

  pop_gimplify_context (NULL);

  gsi_replace_with_seq (&gsi, seq, true);
}
601 
/* Default partitioned and minimum partitioned dimensions.  Filled in
   by oacc_parse_default_dims: defaults start at -1 (unset) and
   minimums at 1, then the backend validates/adjusts both.  */

static int oacc_default_dims[GOMP_DIM_MAX];
static int oacc_min_dims[GOMP_DIM_MAX];
606 
607 int
oacc_get_default_dim(int dim)608 oacc_get_default_dim (int dim)
609 {
610   gcc_assert (0 <= dim && dim < GOMP_DIM_MAX);
611   return oacc_default_dims[dim];
612 }
613 
614 int
oacc_get_min_dim(int dim)615 oacc_get_min_dim (int dim)
616 {
617   gcc_assert (0 <= dim && dim < GOMP_DIM_MAX);
618   return oacc_min_dims[dim];
619 }
620 
/* Parse the default dimension parameter.  This is a set of
   :-separated optional compute dimensions.  Each specified dimension
   is a positive integer.  When device type support is added, it is
   planned to be a comma separated list of such compute dimensions,
   with all but the first prefixed by the colon-terminated device
   type.  */

static void
oacc_parse_default_dims (const char *dims)
{
  int ix;

  /* Reset: -1 marks an unset default; minimums start at 1.  */
  for (ix = GOMP_DIM_MAX; ix--;)
    {
      oacc_default_dims[ix] = -1;
      oacc_min_dims[ix] = 1;
    }

#ifndef ACCEL_COMPILER
  /* Cannot be overridden on the host.  */
  dims = NULL;
#endif
  if (dims)
    {
      const char *pos = dims;

      for (ix = 0; *pos && ix != GOMP_DIM_MAX; ix++)
	{
	  /* Dimensions after the first are preceded by a colon.  */
	  if (ix)
	    {
	      if (*pos != ':')
		goto malformed;
	      pos++;
	    }

	  /* An immediately-following colon means this dimension was
	     omitted; leave its default unset.  */
	  if (*pos != ':')
	    {
	      long val;
	      const char *eptr;

	      errno = 0;
	      val = strtol (pos, CONST_CAST (char **, &eptr), 10);
	      /* Reject parse errors, non-positive values, and values
		 that do not fit in an int.  */
	      if (errno || val <= 0 || (int) val != val)
		goto malformed;
	      pos = eptr;
	      oacc_default_dims[ix] = (int) val;
	    }
	}
      /* Trailing junk after the last dimension is also malformed.  */
      if (*pos)
	{
	malformed:
	  error_at (UNKNOWN_LOCATION,
		    "%<-fopenacc-dim%> operand is malformed at %qs", pos);
	}
    }

  /* Allow the backend to validate the dimensions.  */
  targetm.goacc.validate_dims (NULL_TREE, oacc_default_dims, -1, 0);
  targetm.goacc.validate_dims (NULL_TREE, oacc_min_dims, -2, 0);
}
681 
/* Validate and update the dimensions for offloaded FN.  ATTRS is the
   raw attribute.  DIMS is an array of dimensions, which is filled in.
   LEVEL is the partitioning level of a routine, or -1 for an offload
   region itself.  USED is the mask of partitioned execution in the
   function.  */

static void
oacc_validate_dims (tree fn, tree attrs, int *dims, int level, unsigned used)
{
  tree purpose[GOMP_DIM_MAX];
  unsigned ix;
  tree pos = TREE_VALUE (attrs);

  /* Make sure the attribute creator attached the dimension
     information.  */
  gcc_assert (pos);

  /* Extract the current per-dimension values from the attribute list;
     a missing TREE_VALUE means "unset" (-1).  */
  for (ix = 0; ix != GOMP_DIM_MAX; ix++)
    {
      purpose[ix] = TREE_PURPOSE (pos);
      tree val = TREE_VALUE (pos);
      dims[ix] = val ? TREE_INT_CST_LOW (val) : -1;
      pos = TREE_CHAIN (pos);
    }

  bool changed = targetm.goacc.validate_dims (fn, dims, level, used);

  /* Default anything left to 1 or a partitioned default.  */
  for (ix = 0; ix != GOMP_DIM_MAX; ix++)
    if (dims[ix] < 0)
      {
	/* The OpenACC spec says 'If the [num_gangs] clause is not
	   specified, an implementation-defined default will be used;
	   the default may depend on the code within the construct.'
	   (2.5.6).  Thus an implementation is free to choose
	   non-unity default for a parallel region that doesn't have
	   any gang-partitioned loops.  However, it appears that there
	   is a sufficient body of user code that expects non-gang
	   partitioned regions to not execute in gang-redundant mode.
	   So we (a) don't warn about the non-portability and (b) pick
	   the minimum permissible dimension size when there is no
	   partitioned execution.  Otherwise we pick the global
	   default for the dimension, which the user can control.  The
	   same wording and logic applies to num_workers and
	   vector_length, however the worker- or vector- single
	   execution doesn't have the same impact as gang-redundant
	   execution.  (If the minimum gang-level partioning is not 1,
	   the target is probably too confusing.)  */
	dims[ix] = (used & GOMP_DIM_MASK (ix)
		    ? oacc_default_dims[ix] : oacc_min_dims[ix]);
	changed = true;
      }

  if (changed)
    {
      /* Replace the attribute with new values.  Build the list back
	 to front so entry IX ends up in position IX.  */
      pos = NULL_TREE;
      for (ix = GOMP_DIM_MAX; ix--;)
	pos = tree_cons (purpose[ix],
			 build_int_cst (integer_type_node, dims[ix]), pos);
      oacc_replace_fn_attrib (fn, pos);
    }
}
745 
746 /* Create an empty OpenACC loop structure at LOC.  */
747 
748 static oacc_loop *
new_oacc_loop_raw(oacc_loop * parent,location_t loc)749 new_oacc_loop_raw (oacc_loop *parent, location_t loc)
750 {
751   oacc_loop *loop = XCNEW (oacc_loop);
752 
753   loop->parent = parent;
754 
755   if (parent)
756     {
757       loop->sibling = parent->child;
758       parent->child = loop;
759     }
760 
761   loop->loc = loc;
762   return loop;
763 }
764 
765 /* Create an outermost, dummy OpenACC loop for offloaded function
766    DECL.  */
767 
768 static oacc_loop *
new_oacc_loop_outer(tree decl)769 new_oacc_loop_outer (tree decl)
770 {
771   return new_oacc_loop_raw (NULL, DECL_SOURCE_LOCATION (decl));
772 }
773 
774 /* Start a new OpenACC loop  structure beginning at head marker HEAD.
775    Link into PARENT loop.  Return the new loop.  */
776 
777 static oacc_loop *
new_oacc_loop(oacc_loop * parent,gcall * marker)778 new_oacc_loop (oacc_loop *parent, gcall *marker)
779 {
780   oacc_loop *loop = new_oacc_loop_raw (parent, gimple_location (marker));
781 
782   loop->marker = marker;
783 
784   /* TODO: This is where device_type flattening would occur for the loop
785      flags.  */
786 
787   loop->flags = TREE_INT_CST_LOW (gimple_call_arg (marker, 3));
788 
789   tree chunk_size = integer_zero_node;
790   if (loop->flags & OLF_GANG_STATIC)
791     chunk_size = gimple_call_arg (marker, 4);
792   loop->chunk_size = chunk_size;
793 
794   return loop;
795 }
796 
797 /* Create a dummy loop encompassing a call to a openACC routine.
798    Extract the routine's partitioning requirements.  */
799 
800 static void
new_oacc_loop_routine(oacc_loop * parent,gcall * call,tree decl,tree attrs)801 new_oacc_loop_routine (oacc_loop *parent, gcall *call, tree decl, tree attrs)
802 {
803   oacc_loop *loop = new_oacc_loop_raw (parent, gimple_location (call));
804   int level = oacc_fn_attrib_level (attrs);
805 
806   gcc_assert (level >= 0);
807 
808   loop->marker = call;
809   loop->routine = decl;
810   loop->mask = ((GOMP_DIM_MASK (GOMP_DIM_MAX) - 1)
811 		^ (GOMP_DIM_MASK (level) - 1));
812 }
813 
814 /* Finish off the current OpenACC loop ending at tail marker TAIL.
815    Return the parent loop.  */
816 
817 static oacc_loop *
finish_oacc_loop(oacc_loop * loop)818 finish_oacc_loop (oacc_loop *loop)
819 {
820   /* If the loop has been collapsed, don't partition it.  */
821   if (loop->ifns.is_empty ())
822     loop->mask = loop->flags = 0;
823   return loop->parent;
824 }
825 
826 /* Free all OpenACC loop structures within LOOP (inclusive).  */
827 
828 static void
free_oacc_loop(oacc_loop * loop)829 free_oacc_loop (oacc_loop *loop)
830 {
831   if (loop->sibling)
832     free_oacc_loop (loop->sibling);
833   if (loop->child)
834     free_oacc_loop (loop->child);
835 
836   loop->ifns.release ();
837   free (loop);
838 }
839 
/* Dump out the OpenACC loop head or tail beginning at FROM.  TITLE
   and LEVEL label the output.  Statements are printed until another
   IFN_UNIQUE marker of the same kind is reached, following the
   single-successor chain of basic blocks when needed.  */

static void
dump_oacc_loop_part (FILE *file, gcall *from, int depth,
		     const char *title, int level)
{
  /* The marker kind (head or tail) that terminates this sequence.  */
  enum ifn_unique_kind kind
    = (enum ifn_unique_kind) TREE_INT_CST_LOW (gimple_call_arg (from, 0));

  fprintf (file, "%*s%s-%d:\n", depth * 2, "", title, level);
  for (gimple_stmt_iterator gsi = gsi_for_stmt (from);;)
    {
      gimple *stmt = gsi_stmt (gsi);

      if (gimple_call_internal_p (stmt, IFN_UNIQUE))
	{
	  enum ifn_unique_kind k
	    = ((enum ifn_unique_kind) TREE_INT_CST_LOW
	       (gimple_call_arg (stmt, 0)));

	  /* Stop at the next marker of the same kind (but print FROM
	     itself).  */
	  if (k == kind && stmt != from)
	    break;
	}
      print_gimple_stmt (file, stmt, depth * 2 + 2);

      gsi_next (&gsi);
      /* Cross into the single successor block when this one is
	 exhausted.  */
      while (gsi_end_p (gsi))
	gsi = gsi_start_bb (single_succ (gsi_bb (gsi)));
    }
}
870 
871 /* Dump OpenACC loop LOOP, its children, and its siblings.  */
872 
873 static void
dump_oacc_loop(FILE * file,oacc_loop * loop,int depth)874 dump_oacc_loop (FILE *file, oacc_loop *loop, int depth)
875 {
876   int ix;
877 
878   fprintf (file, "%*sLoop %x(%x) %s:%u\n", depth * 2, "",
879 	   loop->flags, loop->mask,
880 	   LOCATION_FILE (loop->loc), LOCATION_LINE (loop->loc));
881 
882   if (loop->marker)
883     print_gimple_stmt (file, loop->marker, depth * 2);
884 
885   if (loop->routine)
886     fprintf (file, "%*sRoutine %s:%u:%s\n",
887 	     depth * 2, "", DECL_SOURCE_FILE (loop->routine),
888 	     DECL_SOURCE_LINE (loop->routine),
889 	     IDENTIFIER_POINTER (DECL_NAME (loop->routine)));
890 
891   for (ix = GOMP_DIM_GANG; ix != GOMP_DIM_MAX; ix++)
892     if (loop->heads[ix])
893       dump_oacc_loop_part (file, loop->heads[ix], depth, "Head", ix);
894   for (ix = GOMP_DIM_MAX; ix--;)
895     if (loop->tails[ix])
896       dump_oacc_loop_part (file, loop->tails[ix], depth, "Tail", ix);
897 
898   if (loop->child)
899     dump_oacc_loop (file, loop->child, depth + 1);
900   if (loop->sibling)
901     dump_oacc_loop (file, loop->sibling, depth);
902 }
903 
904 void debug_oacc_loop (oacc_loop *);
905 
/* Dump loops to stderr.  Convenience entry point for use from a
   debugger.  */

DEBUG_FUNCTION void
debug_oacc_loop (oacc_loop *loop)
{
  dump_oacc_loop (stderr, loop, 0);
}
913 
914 /* Provide diagnostics on OpenACC loop LOOP, its children, and its
915    siblings.  */
916 
917 static void
inform_oacc_loop(const oacc_loop * loop)918 inform_oacc_loop (const oacc_loop *loop)
919 {
920   const char *gang
921     = loop->mask & GOMP_DIM_MASK (GOMP_DIM_GANG) ? " gang" : "";
922   const char *worker
923     = loop->mask & GOMP_DIM_MASK (GOMP_DIM_WORKER) ? " worker" : "";
924   const char *vector
925     = loop->mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR) ? " vector" : "";
926   const char *seq = loop->mask == 0 ? " seq" : "";
927   const dump_user_location_t loc
928     = dump_user_location_t::from_location_t (loop->loc);
929   dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, loc,
930 		   "assigned OpenACC%s%s%s%s loop parallelism\n", gang, worker,
931 		   vector, seq);
932 
933   if (loop->child)
934     inform_oacc_loop (loop->child);
935   if (loop->sibling)
936     inform_oacc_loop (loop->sibling);
937 }
938 
939 /* DFS walk of basic blocks BB onwards, creating OpenACC loop
940    structures as we go.  By construction these loops are properly
941    nested.  */
942 
static void
oacc_loop_discover_walk (oacc_loop *loop, basic_block bb)
{
  /* MARKER counts the head/tail marker calls seen so far in the current
     marker sequence; REMAINING counts how many more are expected.  Both
     are zero when we are not inside a marker sequence.  */
  int marker = 0;
  int remaining = 0;

  if (bb->flags & BB_VISITED)
    return;

 follow:
  bb->flags |= BB_VISITED;

  /* Scan for loop markers.  */
  for (gimple_stmt_iterator gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
       gsi_next (&gsi))
    {
      gimple *stmt = gsi_stmt (gsi);

      if (!is_gimple_call (stmt))
	continue;

      gcall *call = as_a <gcall *> (stmt);

      /* If this is a routine, make a dummy loop for it.  */
      if (tree decl = gimple_call_fndecl (call))
	if (tree attrs = oacc_get_fn_attrib (decl))
	  {
	    /* A routine call must not appear inside a marker sequence.  */
	    gcc_assert (!marker);
	    new_oacc_loop_routine (loop, call, decl, attrs);
	  }

      if (!gimple_call_internal_p (call))
	continue;

      switch (gimple_call_internal_fn (call))
	{
	default:
	  break;

	case IFN_GOACC_LOOP:
	case IFN_GOACC_TILE:
	  /* Record the abstraction function, so we can manipulate it
	     later.  */
	  loop->ifns.safe_push (call);
	  break;

	case IFN_UNIQUE:
	  enum ifn_unique_kind kind
	    = (enum ifn_unique_kind) (TREE_INT_CST_LOW
				      (gimple_call_arg (call, 0)));
	  if (kind == IFN_UNIQUE_OACC_HEAD_MARK
	      || kind == IFN_UNIQUE_OACC_TAIL_MARK)
	    {
	      /* A two-argument marker is the terminator of a marker
		 sequence; longer markers carry the sequence length in
		 argument 2.  */
	      if (gimple_call_num_args (call) == 2)
		{
		  gcc_assert (marker && !remaining);
		  marker = 0;
		  if (kind == IFN_UNIQUE_OACC_TAIL_MARK)
		    /* End of the loop body: pop back to the parent.  */
		    loop = finish_oacc_loop (loop);
		  else
		    loop->head_end = call;
		}
	      else
		{
		  int count = TREE_INT_CST_LOW (gimple_call_arg (call, 2));

		  if (!marker)
		    {
		      /* First marker of a sequence: a head marker opens
			 a new (child) loop.  */
		      if (kind == IFN_UNIQUE_OACC_HEAD_MARK)
			loop = new_oacc_loop (loop, call);
		      remaining = count;
		    }
		  /* All markers of one sequence carry the same count.  */
		  gcc_assert (count == remaining);
		  if (remaining)
		    {
		      remaining--;
		      /* Heads fill in forwards, tails backwards.  */
		      if (kind == IFN_UNIQUE_OACC_HEAD_MARK)
			loop->heads[marker] = call;
		      else
			loop->tails[remaining] = call;
		    }
		  marker++;
		}
	    }
	}
    }
  if (remaining || marker)
    {
      /* A marker sequence continues into the (single) successor block;
	 keep scanning there before doing the regular DFS.  */
      bb = single_succ (bb);
      gcc_assert (single_pred_p (bb) && !(bb->flags & BB_VISITED));
      goto follow;
    }

  /* Walk successor blocks.  */
  edge e;
  edge_iterator ei;

  FOR_EACH_EDGE (e, ei, bb->succs)
    oacc_loop_discover_walk (loop, e->dest);
}
1043 
1044 /* LOOP is the first sibling.  Reverse the order in place and return
1045    the new first sibling.  Recurse to child loops.  */
1046 
1047 static oacc_loop *
oacc_loop_sibling_nreverse(oacc_loop * loop)1048 oacc_loop_sibling_nreverse (oacc_loop *loop)
1049 {
1050   oacc_loop *last = NULL;
1051   do
1052     {
1053       if (loop->child)
1054 	loop->child = oacc_loop_sibling_nreverse (loop->child);
1055 
1056       oacc_loop *next = loop->sibling;
1057       loop->sibling = last;
1058       last = loop;
1059       loop = next;
1060     }
1061   while (loop);
1062 
1063   return last;
1064 }
1065 
1066 /* Discover the OpenACC loops marked up by HEAD and TAIL markers for
1067    the current function.  */
1068 
1069 static oacc_loop *
oacc_loop_discovery()1070 oacc_loop_discovery ()
1071 {
1072   /* Clear basic block flags, in particular BB_VISITED which we're going to use
1073      in the following.  */
1074   clear_bb_flags ();
1075 
1076   oacc_loop *top = new_oacc_loop_outer (current_function_decl);
1077   oacc_loop_discover_walk (top, ENTRY_BLOCK_PTR_FOR_FN (cfun));
1078 
1079   /* The siblings were constructed in reverse order, reverse them so
1080      that diagnostics come out in an unsurprising order.  */
1081   top = oacc_loop_sibling_nreverse (top);
1082 
1083   return top;
1084 }
1085 
1086 /* Transform the abstract internal function markers starting at FROM
1087    to be for partitioning level LEVEL.  Stop when we meet another HEAD
1088    or TAIL  marker.  */
1089 
static void
oacc_loop_xform_head_tail (gcall *from, int level)
{
  /* Remember what kind of marker FROM is; the sequence ends at the next
     marker of the same kind.  */
  enum ifn_unique_kind kind
    = (enum ifn_unique_kind) TREE_INT_CST_LOW (gimple_call_arg (from, 0));
  tree replacement = build_int_cst (unsigned_type_node, level);

  for (gimple_stmt_iterator gsi = gsi_for_stmt (from);;)
    {
      gimple *stmt = gsi_stmt (gsi);

      if (gimple_call_internal_p (stmt, IFN_UNIQUE))
	{
	  enum ifn_unique_kind k
	    = ((enum ifn_unique_kind)
	       TREE_INT_CST_LOW (gimple_call_arg (stmt, 0)));

	  /* Rewrite the partitioning level of fork/join markers.  */
	  if (k == IFN_UNIQUE_OACC_FORK || k == IFN_UNIQUE_OACC_JOIN)
	    *gimple_call_arg_ptr (stmt, 2) = replacement;
	  else if (k == kind && stmt != from)
	    /* Hit the next marker of the same kind -- done.  */
	    break;
	}
      else if (gimple_call_internal_p (stmt, IFN_GOACC_REDUCTION))
	/* Reductions carry the level in argument 3.  */
	*gimple_call_arg_ptr (stmt, 3) = replacement;

      gsi_next (&gsi);
      /* The sequence may span basic blocks; follow the (single)
	 successor chain when we run off the end of a block.  */
      while (gsi_end_p (gsi))
	gsi = gsi_start_bb (single_succ (gsi_bb (gsi)));
    }
}
1120 
1121 /* Process the discovered OpenACC loops, setting the correct
1122    partitioning level etc.  */
1123 
static void
oacc_loop_process (oacc_loop *loop)
{
  /* Children first, so inner loops are finalized before their parent.  */
  if (loop->child)
    oacc_loop_process (loop->child);

  /* Routines are handled elsewhere; unpartitioned loops need nothing.  */
  if (loop->mask && !loop->routine)
    {
      int ix;
      tree mask_arg = build_int_cst (unsigned_type_node, loop->mask);
      tree e_mask_arg = build_int_cst (unsigned_type_node, loop->e_mask);
      tree chunk_arg = loop->chunk_size;
      gcall *call;

      /* Fill the final masks (and chunking) into the recorded
	 abstraction functions.  */
      for (ix = 0; loop->ifns.iterate (ix, &call); ix++)
	switch (gimple_call_internal_fn (call))
	  {
	  case IFN_GOACC_LOOP:
	    {
	      /* A -1 in argument 5 flags the element-loop variant.  */
	      bool is_e = gimple_call_arg (call, 5) == integer_minus_one_node;
	      gimple_call_set_arg (call, 5, is_e ? e_mask_arg : mask_arg);
	      if (!is_e)
		gimple_call_set_arg (call, 4, chunk_arg);
	    }
	    break;

	  case IFN_GOACC_TILE:
	    gimple_call_set_arg (call, 3, mask_arg);
	    gimple_call_set_arg (call, 4, e_mask_arg);
	    break;

	  default:
	    gcc_unreachable ();
	  }

      /* Assign each used dimension, outermost first, to the head/tail
	 marker pairs in order.  */
      unsigned dim = GOMP_DIM_GANG;
      unsigned mask = loop->mask | loop->e_mask;
      for (ix = 0; ix != GOMP_DIM_MAX && mask; ix++)
	{
	  /* Skip to the next dimension actually used by this loop.  */
	  while (!(GOMP_DIM_MASK (dim) & mask))
	    dim++;

	  oacc_loop_xform_head_tail (loop->heads[ix], dim);
	  oacc_loop_xform_head_tail (loop->tails[ix], dim);

	  mask ^= GOMP_DIM_MASK (dim);
	}
    }

  if (loop->sibling)
    oacc_loop_process (loop->sibling);
}
1176 
/* Walk the OpenACC loop hierarchy checking and assigning the
   programmer-specified partitionings.  OUTER_MASK is the partitioning
   this loop is contained within.  Return mask of partitioning
   encountered.  If any auto loops are discovered, set GOMP_DIM_MAX
   bit.  */
1182 
static unsigned
oacc_loop_fixed_partitions (oacc_loop *loop, unsigned outer_mask)
{
  unsigned this_mask = loop->mask;
  unsigned mask_all = 0;
  bool noisy = true;

#ifdef ACCEL_COMPILER
  /* When device_type is supported, we want the device compiler to be
     noisy, if the loop parameters are device_type-specific.  */
  noisy = false;
#endif

  if (!loop->routine)
    {
      bool auto_par = (loop->flags & OLF_AUTO) != 0;
      bool seq_par = (loop->flags & OLF_SEQ) != 0;
      bool tiling = (loop->flags & OLF_TILE) != 0;

      /* The explicitly requested dimensions are encoded in the loop
	 flags above OLF_DIM_BASE.  */
      this_mask = ((loop->flags >> OLF_DIM_BASE)
		   & (GOMP_DIM_MASK (GOMP_DIM_MAX) - 1));

      /* Apply auto partitioning if this is a non-partitioned regular
	 loop, or (no more than) single axis tiled loop.  */
      bool maybe_auto
	= !seq_par && this_mask == (tiling ? this_mask & -this_mask : 0);

      /* At most one of {explicit dims, auto, seq} may be specified.  */
      if ((this_mask != 0) + auto_par + seq_par > 1)
	{
	  if (noisy)
	    error_at (loop->loc,
		      seq_par
		      ? G_("%<seq%> overrides other OpenACC loop specifiers")
		      : G_("%<auto%> conflicts with other OpenACC loop "
			   "specifiers"))
;
	  maybe_auto = false;
	  loop->flags &= ~OLF_AUTO;
	  if (seq_par)
	    {
	      loop->flags
		&= ~((GOMP_DIM_MASK (GOMP_DIM_MAX) - 1) << OLF_DIM_BASE);
	      this_mask = 0;
	    }
	}

      if (maybe_auto && (loop->flags & OLF_INDEPENDENT))
	{
	  /* Flag this loop for auto-partitioning; the GOMP_DIM_MAX bit
	     in the returned mask tells the caller auto loops exist.  */
	  loop->flags |= OLF_AUTO;
	  mask_all |= GOMP_DIM_MASK (GOMP_DIM_MAX);
	}
    }

  if (this_mask & outer_mask)
    {
      /* This loop requests an axis already used by an enclosing loop
	 (or disallowed by a containing routine's level).  */
      const oacc_loop *outer;
      for (outer = loop->parent; outer; outer = outer->parent)
	if ((outer->mask | outer->e_mask) & this_mask)
	  break;

      if (noisy)
	{
	  if (outer)
	    {
	      error_at (loop->loc,
			loop->routine
			? G_("routine call uses same OpenACC parallelism"
			     " as containing loop")
			: G_("inner loop uses same OpenACC parallelism"
			     " as containing loop"));
	      inform (outer->loc, "containing loop here");
	    }
	  else
	    error_at (loop->loc,
		      loop->routine
		      ? G_("routine call uses OpenACC parallelism disallowed"
			   " by containing routine")
		      : G_("loop uses OpenACC parallelism disallowed"
			   " by containing routine"));

	  if (loop->routine)
	    inform (DECL_SOURCE_LOCATION (loop->routine),
		    "routine %qD declared here", loop->routine);
	}
      /* Drop the conflicting axes.  */
      this_mask &= ~outer_mask;
    }
  else
    {
      /* Check nesting order: this loop's outermost axis must be inner
	 to everything the enclosing loops used.  */
      unsigned outermost = least_bit_hwi (this_mask);

      if (outermost && outermost <= outer_mask)
	{
	  if (noisy)
	    {
	      error_at (loop->loc,
			"incorrectly nested OpenACC loop parallelism");

	      const oacc_loop *outer;
	      for (outer = loop->parent;
		   outer->flags && outer->flags < outermost;
		   outer = outer->parent)
		continue;
	      inform (outer->loc, "containing loop here");
	    }

	  this_mask &= ~outermost;
	}
    }

  mask_all |= this_mask;

  if (loop->flags & OLF_TILE)
    {
      /* When tiling, vector goes to the element loop, and failing
	 that we put worker there.  The std doesn't contemplate
	 specifying all three.  We choose to put worker and vector on
	 the element loops in that case.  */
      unsigned this_e_mask = this_mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR);
      if (!this_e_mask || this_mask & GOMP_DIM_MASK (GOMP_DIM_GANG))
	this_e_mask |= this_mask & GOMP_DIM_MASK (GOMP_DIM_WORKER);

      loop->e_mask = this_e_mask;
      this_mask ^= this_e_mask;
    }

  loop->mask = this_mask;

  if (dump_file)
    fprintf (dump_file, "Loop %s:%d user specified %d & %d\n",
	     LOCATION_FILE (loop->loc), LOCATION_LINE (loop->loc),
	     loop->mask, loop->e_mask);

  if (loop->child)
    {
      /* Children may not reuse any axis used here or further out.  */
      unsigned tmp_mask = outer_mask | this_mask | loop->e_mask;
      loop->inner = oacc_loop_fixed_partitions (loop->child, tmp_mask);
      mask_all |= loop->inner;
    }

  if (loop->sibling)
    mask_all |= oacc_loop_fixed_partitions (loop->sibling, outer_mask);

  return mask_all;
}
1326 
/* Walk the OpenACC loop hierarchy to assign auto-partitioned loops.
   OUTER_MASK is the partitioning this loop is contained within.
   OUTER_ASSIGN is true if an outer loop is being auto-partitioned.
   Return the cumulative partitioning used by this loop, siblings and
   children.  */
1332 
static unsigned
oacc_loop_auto_partitions (oacc_loop *loop, unsigned outer_mask,
			   bool outer_assign)
{
  /* Only loops marked both auto and independent are auto-partitioned.  */
  bool assign = (loop->flags & OLF_AUTO) && (loop->flags & OLF_INDEPENDENT);
  bool noisy = true;
  bool tiling = loop->flags & OLF_TILE;

#ifdef ACCEL_COMPILER
  /* When device_type is supported, we want the device compiler to be
     noisy, if the loop parameters are device_type-specific.  */
  noisy = false;
#endif

  if (assign && (!outer_assign || loop->inner))
    {
      /* Allocate outermost and non-innermost loops at the outermost
	 non-innermost available level.  */
      unsigned this_mask = GOMP_DIM_MASK (GOMP_DIM_GANG);

      /* Find the first outermost available partition. */
      while (this_mask <= outer_mask)
	this_mask <<= 1;

      /* Grab two axes if tiling, and we've not assigned anything.  */
      if (tiling && !(loop->mask | loop->e_mask))
	this_mask |= this_mask << 1;

      /* Prohibit the innermost partitioning at the moment.  */
      this_mask &= GOMP_DIM_MASK (GOMP_DIM_MAX - 1) - 1;

      /* Don't use any dimension explicitly claimed by an inner loop. */
      this_mask &= ~loop->inner;

      if (tiling && !loop->e_mask)
	{
	  /* If we got two axes, allocate the inner one to the element
	     loop.  */
	  loop->e_mask = this_mask & (this_mask << 1);
	  this_mask ^= loop->e_mask;
	}

      loop->mask |= this_mask;
    }

  if (loop->child)
    {
      /* Children may not use axes assigned here or further out.  */
      unsigned tmp_mask = outer_mask | loop->mask | loop->e_mask;
      loop->inner = oacc_loop_auto_partitions (loop->child, tmp_mask,
					       outer_assign | assign);
    }

  /* Second phase, after children are assigned: fill in the innermost
     available level for loops still lacking one.  */
  if (assign && (!loop->mask || (tiling && !loop->e_mask) || !outer_assign))
    {
      /* Allocate the loop at the innermost available level.  Note
	 that we do this even if we already assigned this loop the
	 outermost available level above.  That way we'll partition
	 this along 2 axes, if they are available.  */
      unsigned this_mask = 0;

      /* Determine the outermost partitioning used within this loop.  */
      this_mask = loop->inner | GOMP_DIM_MASK (GOMP_DIM_MAX);
      this_mask = least_bit_hwi (this_mask);

      /* Pick the partitioning just inside that one.  */
      this_mask >>= 1;

      /* And avoid picking one use by an outer loop.  */
      this_mask &= ~outer_mask;

      /* If tiling and we failed completely above, grab the next one
	 too.  Making sure it doesn't hit an outer loop.  */
      if (tiling)
	{
	  this_mask &= ~(loop->e_mask | loop->mask);
	  unsigned tile_mask = ((this_mask >> 1)
				& ~(outer_mask | loop->e_mask | loop->mask));

	  if (tile_mask || loop->mask)
	    {
	      loop->e_mask |= this_mask;
	      this_mask = tile_mask;
	    }
	  if (!loop->e_mask && noisy)
	    warning_at (loop->loc, 0,
			"insufficient partitioning available"
			" to parallelize element loop");
	}

      loop->mask |= this_mask;
      if (!loop->mask && noisy)
	warning_at (loop->loc, 0,
		    tiling
		    ? G_("insufficient partitioning available"
			 " to parallelize tile loop")
		    : G_("insufficient partitioning available"
			 " to parallelize loop"));
    }

  if (assign && dump_file)
    fprintf (dump_file, "Auto loop %s:%d assigned %d & %d\n",
	     LOCATION_FILE (loop->loc), LOCATION_LINE (loop->loc),
	     loop->mask, loop->e_mask);

  unsigned inner_mask = 0;

  if (loop->sibling)
    inner_mask |= oacc_loop_auto_partitions (loop->sibling,
					     outer_mask, outer_assign);

  inner_mask |= loop->inner | loop->mask | loop->e_mask;

  return inner_mask;
}
1447 
/* Walk the OpenACC loop hierarchy to check and assign partitioning
   axes.  Return mask of partitioning.  */
1450 
1451 static unsigned
oacc_loop_partition(oacc_loop * loop,unsigned outer_mask)1452 oacc_loop_partition (oacc_loop *loop, unsigned outer_mask)
1453 {
1454   unsigned mask_all = oacc_loop_fixed_partitions (loop, outer_mask);
1455 
1456   if (mask_all & GOMP_DIM_MASK (GOMP_DIM_MAX))
1457     {
1458       mask_all ^= GOMP_DIM_MASK (GOMP_DIM_MAX);
1459       mask_all |= oacc_loop_auto_partitions (loop, outer_mask, false);
1460     }
1461   return mask_all;
1462 }
1463 
1464 /* Default fork/join early expander.  Delete the function calls if
1465    there is no RTL expander.  */
1466 
1467 bool
default_goacc_fork_join(gcall * ARG_UNUSED (call),const int * ARG_UNUSED (dims),bool is_fork)1468 default_goacc_fork_join (gcall *ARG_UNUSED (call),
1469 			 const int *ARG_UNUSED (dims), bool is_fork)
1470 {
1471   if (is_fork)
1472     return targetm.have_oacc_fork ();
1473   else
1474     return targetm.have_oacc_join ();
1475 }
1476 
1477 /* Default goacc.reduction early expander.
1478 
1479    LHS-opt = IFN_REDUCTION (KIND, RES_PTR, VAR, LEVEL, OP, OFFSET)
1480    If RES_PTR is not integer-zerop:
1481        SETUP - emit 'LHS = *RES_PTR', LHS = NULL
1482        TEARDOWN - emit '*RES_PTR = VAR'
1483    If LHS is not NULL
1484        emit 'LHS = VAR'   */
1485 
void
default_goacc_reduction (gcall *call)
{
  unsigned code = (unsigned)TREE_INT_CST_LOW (gimple_call_arg (call, 0));
  gimple_stmt_iterator gsi = gsi_for_stmt (call);
  tree lhs = gimple_call_lhs (call);
  tree var = gimple_call_arg (call, 2);
  gimple_seq seq = NULL;

  if (code == IFN_GOACC_REDUCTION_SETUP
      || code == IFN_GOACC_REDUCTION_TEARDOWN)
    {
      /* Setup and Teardown need to copy from/to the receiver object,
	 if there is one.  */
      tree ref_to_res = gimple_call_arg (call, 1);

      if (!integer_zerop (ref_to_res))
	{
	  tree dst = build_simple_mem_ref (ref_to_res);
	  tree src = var;

	  if (code == IFN_GOACC_REDUCTION_SETUP)
	    {
	      /* SETUP copies the other way: LHS = *RES_PTR.  Clearing
		 LHS here suppresses the fallback LHS = VAR below.  */
	      src = dst;
	      dst = lhs;
	      lhs = NULL;
	    }
	  gimple_seq_add_stmt (&seq, gimple_build_assign (dst, src));
	}
    }

  /* Copy VAR to LHS, if there is an LHS.  */
  if (lhs)
    gimple_seq_add_stmt (&seq, gimple_build_assign (lhs, var));

  /* Replace the internal call with the (possibly empty) sequence.  */
  gsi_replace_with_seq (&gsi, seq, true);
}
1523 
1524 /* Main entry point for oacc transformations which run on the device
1525    compiler after LTO, so we know what the target device is at this
1526    point (including the host fallback).  */
1527 
static unsigned int
execute_oacc_device_lower ()
{
  tree attrs = oacc_get_fn_attrib (current_function_decl);

  if (!attrs)
    /* Not an offloaded function.  */
    return 0;

  /* Parse the default dim argument exactly once; afterwards the flag
     is repointed at itself as a "parsed already" sentinel.  */
  if ((const void *)flag_openacc_dims != &flag_openacc_dims)
    {
      oacc_parse_default_dims (flag_openacc_dims);
      flag_openacc_dims = (char *)&flag_openacc_dims;
    }

  bool is_oacc_kernels
    = (lookup_attribute ("oacc kernels",
			 DECL_ATTRIBUTES (current_function_decl)) != NULL);
  bool is_oacc_kernels_parallelized
    = (lookup_attribute ("oacc kernels parallelized",
			 DECL_ATTRIBUTES (current_function_decl)) != NULL);

  /* Unparallelized OpenACC kernels constructs must get launched as 1 x 1 x 1
     kernels, so remove the parallelism dimensions function attributes
     potentially set earlier on.  */
  if (is_oacc_kernels && !is_oacc_kernels_parallelized)
    {
      oacc_set_fn_attrib (current_function_decl, NULL, NULL);
      attrs = oacc_get_fn_attrib (current_function_decl);
    }

  /* Discover, partition and process the loops.  */
  oacc_loop *loops = oacc_loop_discovery ();
  int fn_level = oacc_fn_attrib_level (attrs);

  if (dump_file)
    {
      if (fn_level >= 0)
	fprintf (dump_file, "Function is OpenACC routine level %d\n",
		 fn_level);
      else if (is_oacc_kernels)
	fprintf (dump_file, "Function is %s OpenACC kernels offload\n",
		 (is_oacc_kernels_parallelized
		  ? "parallelized" : "unparallelized"));
      else
	fprintf (dump_file, "Function is OpenACC parallel offload\n");
    }

  /* A routine at level N may not use axes N and outer.  */
  unsigned outer_mask = fn_level >= 0 ? GOMP_DIM_MASK (fn_level) - 1 : 0;
  unsigned used_mask = oacc_loop_partition (loops, outer_mask);
  /* OpenACC kernels constructs are special: they currently don't use the
     generic oacc_loop infrastructure and attribute/dimension processing.  */
  if (is_oacc_kernels && is_oacc_kernels_parallelized)
    {
      /* Parallelized OpenACC kernels constructs use gang parallelism.  See
	 also tree-parloops.c:create_parallel_loop.  */
      used_mask |= GOMP_DIM_MASK (GOMP_DIM_GANG);
    }

  int dims[GOMP_DIM_MAX];
  oacc_validate_dims (current_function_decl, attrs, dims, fn_level, used_mask);

  if (dump_file)
    {
      const char *comma = "Compute dimensions [";
      for (int ix = 0; ix != GOMP_DIM_MAX; ix++, comma = ", ")
	fprintf (dump_file, "%s%d", comma, dims[ix]);
      fprintf (dump_file, "]\n");
    }

  oacc_loop_process (loops);
  if (dump_file)
    {
      fprintf (dump_file, "OpenACC loops\n");
      dump_oacc_loop (dump_file, loops, 0);
      fprintf (dump_file, "\n");
    }
  if (dump_enabled_p ())
    {
      oacc_loop *l = loops;
      /* OpenACC kernels constructs are special: they currently don't use the
	 generic oacc_loop infrastructure.  */
      if (is_oacc_kernels)
	{
	  /* Create a fake oacc_loop for diagnostic purposes.  */
	  l = new_oacc_loop_raw (NULL,
				 DECL_SOURCE_LOCATION (current_function_decl));
	  l->mask = used_mask;
	}
      else
	{
	  /* Skip the outermost, dummy OpenACC loop  */
	  l = l->child;
	}
      if (l)
	inform_oacc_loop (l);
      if (is_oacc_kernels)
	free_oacc_loop (l);
    }

  /* Offloaded targets may introduce new basic blocks, which require
     dominance information to update SSA.  */
  calculate_dominance_info (CDI_DOMINATORS);

  /* Now lower internal loop functions to target-specific code
     sequences.  */
  basic_block bb;
  FOR_ALL_BB_FN (bb, cfun)
    for (gimple_stmt_iterator gsi = gsi_start_bb (bb); !gsi_end_p (gsi);)
      {
	gimple *stmt = gsi_stmt (gsi);
	if (!is_gimple_call (stmt))
	  {
	    gsi_next (&gsi);
	    continue;
	  }

	gcall *call = as_a <gcall *> (stmt);
	if (!gimple_call_internal_p (call))
	  {
	    gsi_next (&gsi);
	    continue;
	  }

	/* Rewind to allow rescan.  */
	gsi_prev (&gsi);
	bool rescan = false, remove = false;
	enum  internal_fn ifn_code = gimple_call_internal_fn (call);

	switch (ifn_code)
	  {
	  default: break;

	  case IFN_GOACC_TILE:
	    oacc_xform_tile (call);
	    rescan = true;
	    break;

	  case IFN_GOACC_LOOP:
	    oacc_xform_loop (call);
	    rescan = true;
	    break;

	  case IFN_GOACC_REDUCTION:
	    /* Mark the function for SSA renaming.  */
	    mark_virtual_operands_for_renaming (cfun);

	    /* If the level is -1, this ended up being an unused
	       axis.  Handle as a default.  */
	    if (integer_minus_onep (gimple_call_arg (call, 3)))
	      default_goacc_reduction (call);
	    else
	      targetm.goacc.reduction (call);
	    rescan = true;
	    break;

	  case IFN_UNIQUE:
	    {
	      enum ifn_unique_kind kind
		= ((enum ifn_unique_kind)
		   TREE_INT_CST_LOW (gimple_call_arg (call, 0)));

	      switch (kind)
		{
		default:
		  break;

		case IFN_UNIQUE_OACC_FORK:
		case IFN_UNIQUE_OACC_JOIN:
		  /* Level -1 means the axis is unused; also drop the
		     call if the target has no fork/join expander.  */
		  if (integer_minus_onep (gimple_call_arg (call, 2)))
		    remove = true;
		  else if (!targetm.goacc.fork_join
			   (call, dims, kind == IFN_UNIQUE_OACC_FORK))
		    remove = true;
		  break;

		case IFN_UNIQUE_OACC_HEAD_MARK:
		case IFN_UNIQUE_OACC_TAIL_MARK:
		  /* Markers have served their purpose; delete them.  */
		  remove = true;
		  break;
		}
	      break;
	    }
	  }

	/* Re-position the iterator after the rewind above.  */
	if (gsi_end_p (gsi))
	  /* We rewound past the beginning of the BB.  */
	  gsi = gsi_start_bb (bb);
	else
	  /* Undo the rewind.  */
	  gsi_next (&gsi);

	if (remove)
	  {
	    if (gimple_vdef (call))
	      replace_uses_by (gimple_vdef (call), gimple_vuse (call));
	    if (gimple_call_lhs (call))
	      {
		/* Propagate the data dependency var.  */
		gimple *ass = gimple_build_assign (gimple_call_lhs (call),
						   gimple_call_arg (call, 1));
		gsi_replace (&gsi, ass,  false);
	      }
	    else
	      gsi_remove (&gsi, true);
	  }
	else if (!rescan)
	  /* If not rescanning, advance over the call.  */
	  gsi_next (&gsi);
      }

  free_oacc_loop (loops);

  return 0;
}
1744 
1745 /* Default launch dimension validator.  Force everything to 1.  A
1746    backend that wants to provide larger dimensions must override this
1747    hook.  */
1748 
1749 bool
default_goacc_validate_dims(tree ARG_UNUSED (decl),int * dims,int ARG_UNUSED (fn_level),unsigned ARG_UNUSED (used))1750 default_goacc_validate_dims (tree ARG_UNUSED (decl), int *dims,
1751 			     int ARG_UNUSED (fn_level),
1752 			     unsigned ARG_UNUSED (used))
1753 {
1754   bool changed = false;
1755 
1756   for (unsigned ix = 0; ix != GOMP_DIM_MAX; ix++)
1757     {
1758       if (dims[ix] != 1)
1759 	{
1760 	  dims[ix] = 1;
1761 	  changed = true;
1762 	}
1763     }
1764 
1765   return changed;
1766 }
1767 
1768 /* Default dimension bound is unknown on accelerator and 1 on host.  */
1769 
int
default_goacc_dim_limit (int ARG_UNUSED (axis))
{
#ifdef ACCEL_COMPILER
  /* On the accelerator the bound is unknown; 0 means "no limit".  */
  return 0;
#else
  /* Host fallback executes everything single-threaded.  */
  return 1;
#endif
}
1779 
namespace {

/* Pass metadata for the OpenACC device-lowering GIMPLE pass.  */
const pass_data pass_data_oacc_device_lower =
{
  GIMPLE_PASS, /* type */
  "oaccdevlow", /* name */
  OPTGROUP_OMP, /* optinfo_flags */
  TV_NONE, /* tv_id */
  PROP_cfg, /* properties_required */
  0 /* Possibly PROP_gimple_eomp.  */, /* properties_provided */
  0, /* properties_destroyed */
  0, /* todo_flags_start */
  TODO_update_ssa | TODO_cleanup_cfg, /* todo_flags_finish */
};

/* Pass wrapper around execute_oacc_device_lower, gated on -fopenacc.  */
class pass_oacc_device_lower : public gimple_opt_pass
{
public:
  pass_oacc_device_lower (gcc::context *ctxt)
    : gimple_opt_pass (pass_data_oacc_device_lower, ctxt)
  {}

  /* opt_pass methods: */
  virtual bool gate (function *) { return flag_openacc; };

  virtual unsigned int execute (function *)
    {
      return execute_oacc_device_lower ();
    }

}; // class pass_oacc_device_lower

} // anon namespace
1813 
gimple_opt_pass *
make_pass_oacc_device_lower (gcc::context *ctxt)
{
  /* Factory used by the pass manager to instantiate the pass.  */
  return new pass_oacc_device_lower (ctxt);
}
1819 
1820 
1821 /* Rewrite GOMP_SIMT_ENTER_ALLOC call given by GSI and remove the preceding
1822    GOMP_SIMT_ENTER call identifying the privatized variables, which are
1823    turned to structure fields and receive a DECL_VALUE_EXPR accordingly.
1824    Set *REGIMPLIFY to true, except if no privatized variables were seen.  */
1825 
static void
ompdevlow_adjust_simt_enter (gimple_stmt_iterator *gsi, bool *regimplify)
{
  gimple *alloc_stmt = gsi_stmt (*gsi);
  tree simtrec = gimple_call_lhs (alloc_stmt);
  tree simduid = gimple_call_arg (alloc_stmt, 0);
  /* The first argument links back to the GOMP_SIMT_ENTER call that
     lists the privatized variables.  */
  gimple *enter_stmt = SSA_NAME_DEF_STMT (simduid);
  gcc_assert (gimple_call_internal_p (enter_stmt, IFN_GOMP_SIMT_ENTER));
  /* Build an artificial record type holding one field per privatized
     variable; SIMTREC becomes a pointer to it.  */
  tree rectype = lang_hooks.types.make_type (RECORD_TYPE);
  TYPE_ARTIFICIAL (rectype) = TYPE_NAMELESS (rectype) = 1;
  TREE_ADDRESSABLE (rectype) = 1;
  TREE_TYPE (simtrec) = build_pointer_type (rectype);
  for (unsigned i = 1; i < gimple_call_num_args (enter_stmt); i++)
    {
      tree *argp = gimple_call_arg_ptr (enter_stmt, i);
      if (*argp == null_pointer_node)
	continue;
      gcc_assert (TREE_CODE (*argp) == ADDR_EXPR
		  && VAR_P (TREE_OPERAND (*argp, 0)));
      tree var = TREE_OPERAND (*argp, 0);

      /* Create a field mirroring the variable's type and alignment.  */
      tree field = build_decl (DECL_SOURCE_LOCATION (var), FIELD_DECL,
			       DECL_NAME (var), TREE_TYPE (var));
      SET_DECL_ALIGN (field, DECL_ALIGN (var));
      DECL_USER_ALIGN (field) = DECL_USER_ALIGN (var);
      TREE_THIS_VOLATILE (field) = TREE_THIS_VOLATILE (var);

      insert_field_into_struct (rectype, field);

      /* Redirect all uses of VAR through SIMTREC->field via a
	 DECL_VALUE_EXPR; this requires regimplification.  */
      tree t = build_simple_mem_ref (simtrec);
      t = build3 (COMPONENT_REF, TREE_TYPE (var), t, field, NULL);
      TREE_THIS_VOLATILE (t) = TREE_THIS_VOLATILE (var);
      SET_DECL_VALUE_EXPR (var, t);
      DECL_HAS_VALUE_EXPR_P (var) = 1;
      *regimplify = true;
    }
  layout_type (rectype);
  tree size = TYPE_SIZE_UNIT (rectype);
  tree align = build_int_cst (TREE_TYPE (size), TYPE_ALIGN_UNIT (rectype));

  /* Replace the alloc call with one carrying the now-known size and
     alignment of the record.  */
  alloc_stmt
    = gimple_build_call_internal (IFN_GOMP_SIMT_ENTER_ALLOC, 2, size, align);
  gimple_call_set_lhs (alloc_stmt, simtrec);
  gsi_replace (gsi, alloc_stmt, false);
  /* The original ENTER call degenerates to a plain copy of its first
     argument.  */
  gimple_stmt_iterator enter_gsi = gsi_for_stmt (enter_stmt);
  enter_stmt = gimple_build_assign (simduid, gimple_call_arg (enter_stmt, 0));
  gsi_replace (&enter_gsi, enter_stmt, false);

  use_operand_p use;
  gimple *exit_stmt;
  if (single_imm_use (simtrec, &use, &exit_stmt))
    {
      /* Mark the end of the record's lifetime with a clobber just
	 before the matching SIMT_EXIT.  */
      gcc_assert (gimple_call_internal_p (exit_stmt, IFN_GOMP_SIMT_EXIT));
      gimple_stmt_iterator exit_gsi = gsi_for_stmt (exit_stmt);
      tree clobber = build_clobber (rectype);
      exit_stmt = gimple_build_assign (build_simple_mem_ref (simtrec), clobber);
      gsi_insert_before (&exit_gsi, exit_stmt, GSI_SAME_STMT);
    }
  else
    gcc_checking_assert (has_zero_uses (simtrec));
}
1887 
1888 /* Callback for walk_gimple_stmt used to scan for SIMT-privatized variables.  */
1889 
1890 static tree
find_simtpriv_var_op(tree * tp,int * walk_subtrees,void *)1891 find_simtpriv_var_op (tree *tp, int *walk_subtrees, void *)
1892 {
1893   tree t = *tp;
1894 
1895   if (VAR_P (t)
1896       && DECL_HAS_VALUE_EXPR_P (t)
1897       && lookup_attribute ("omp simt private", DECL_ATTRIBUTES (t)))
1898     {
1899       *walk_subtrees = 0;
1900       return t;
1901     }
1902   return NULL_TREE;
1903 }
1904 
1905 /* Cleanup uses of SIMT placeholder internal functions: on non-SIMT targets,
1906    VF is 1 and LANE is 0; on SIMT targets, VF is folded to a constant, and
1907    LANE is kept to be expanded to RTL later on.  Also cleanup all other SIMT
1908    internal functions on non-SIMT targets, and likewise some SIMD internal
1909    functions on SIMT targets.  */
1910 
static unsigned int
execute_omp_device_lower ()
{
  /* VF is the SIMT vectorization factor reported by the target, or 1 when
     the target provides no SIMT support.  */
  int vf = targetm.simt.vf ? targetm.simt.vf () : 1;
  bool regimplify = false;
  basic_block bb;
  gimple_stmt_iterator gsi;
  FOR_EACH_BB_FN (bb, cfun)
    for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
      {
	gimple *stmt = gsi_stmt (gsi);
	if (!is_gimple_call (stmt) || !gimple_call_internal_p (stmt))
	  continue;
	/* Convention for the switch below: a non-NULL RHS means the call is
	   folded to "LHS = RHS" (or a nop if there is no LHS) at the bottom;
	   a NULL RHS together with a non-NULL LHS keeps the call as-is.  */
	tree lhs = gimple_call_lhs (stmt), rhs = NULL_TREE;
	tree type = lhs ? TREE_TYPE (lhs) : integer_type_node;
	switch (gimple_call_internal_fn (stmt))
	  {
	  case IFN_GOMP_USE_SIMT:
	    rhs = vf == 1 ? integer_zero_node : integer_one_node;
	    break;
	  case IFN_GOMP_SIMT_ENTER:
	    /* Without SIMT, the privatized block is just the original
	       allocation passed as the first argument.  */
	    rhs = vf == 1 ? gimple_call_arg (stmt, 0) : NULL_TREE;
	    goto simtreg_enter_exit;
	  case IFN_GOMP_SIMT_ENTER_ALLOC:
	    /* On SIMT targets, rewrite the placeholder into a sized
	       allocation of a record gathering the privatized vars.  */
	    if (vf != 1)
	      ompdevlow_adjust_simt_enter (&gsi, &regimplify);
	    rhs = vf == 1 ? null_pointer_node : NULL_TREE;
	    goto simtreg_enter_exit;
	  case IFN_GOMP_SIMT_EXIT:
	  simtreg_enter_exit:
	    if (vf != 1)
	      continue;
	    /* The call is deleted below; drop its virtual definition so
	       the SSA web stays consistent.  */
	    unlink_stmt_vdef (stmt);
	    break;
	  case IFN_GOMP_SIMT_LANE:
	  case IFN_GOMP_SIMT_LAST_LANE:
	    /* Lane number is 0 without SIMT; kept for RTL expansion
	       otherwise.  */
	    rhs = vf == 1 ? build_zero_cst (type) : NULL_TREE;
	    break;
	  case IFN_GOMP_SIMT_VF:
	    rhs = build_int_cst (type, vf);
	    break;
	  case IFN_GOMP_SIMT_ORDERED_PRED:
	    rhs = vf == 1 ? integer_zero_node : NULL_TREE;
	    if (rhs || !lhs)
	      unlink_stmt_vdef (stmt);
	    break;
	  case IFN_GOMP_SIMT_VOTE_ANY:
	  case IFN_GOMP_SIMT_XCHG_BFLY:
	  case IFN_GOMP_SIMT_XCHG_IDX:
	    /* These reduce to their first argument when there is only one
	       lane.  */
	    rhs = vf == 1 ? gimple_call_arg (stmt, 0) : NULL_TREE;
	    break;
	  case IFN_GOMP_SIMD_LANE:
	  case IFN_GOMP_SIMD_LAST_LANE:
	    /* Conversely, SIMD placeholders collapse on SIMT targets.  */
	    rhs = vf != 1 ? build_zero_cst (type) : NULL_TREE;
	    break;
	  case IFN_GOMP_SIMD_VF:
	    rhs = vf != 1 ? build_one_cst (type) : NULL_TREE;
	    break;
	  default:
	    continue;
	  }
	if (lhs && !rhs)
	  continue;
	stmt = lhs ? gimple_build_assign (lhs, rhs) : gimple_build_nop ();
	gsi_replace (&gsi, stmt, false);
      }
  /* If ompdevlow_adjust_simt_enter installed DECL_VALUE_EXPRs, walk all
     statements again (in reverse so removals do not disturb the walk) and
     regimplify references to the privatized variables; clobbers of them
     are simply dropped.  */
  if (regimplify)
    FOR_EACH_BB_REVERSE_FN (bb, cfun)
      for (gsi = gsi_last_bb (bb); !gsi_end_p (gsi); gsi_prev (&gsi))
	if (walk_gimple_stmt (&gsi, NULL, find_simtpriv_var_op, NULL))
	  {
	    if (gimple_clobber_p (gsi_stmt (gsi)))
	      gsi_remove (&gsi, true);
	    else
	      gimple_regimplify_operands (gsi_stmt (gsi), &gsi);
	  }
  if (vf != 1)
    cfun->has_force_vectorize_loops = false;
  return 0;
}
1991 
namespace {

/* Pass descriptor for the "ompdevlow" pass above.  */

const pass_data pass_data_omp_device_lower =
{
  GIMPLE_PASS, /* type */
  "ompdevlow", /* name */
  OPTGROUP_OMP, /* optinfo_flags */
  TV_NONE, /* tv_id */
  PROP_cfg, /* properties_required */
  PROP_gimple_lomp_dev, /* properties_provided */
  0, /* properties_destroyed */
  0, /* todo_flags_start */
  TODO_update_ssa, /* todo_flags_finish */
};

class pass_omp_device_lower : public gimple_opt_pass
{
public:
  pass_omp_device_lower (gcc::context *ctxt)
    : gimple_opt_pass (pass_data_omp_device_lower, ctxt)
  {}

  /* opt_pass methods: */
  /* Run only if the function has not yet been lowered for the device,
     as recorded by PROP_gimple_lomp_dev.  */
  virtual bool gate (function *fun)
    {
      return !(fun->curr_properties & PROP_gimple_lomp_dev);
    }
  virtual unsigned int execute (function *)
    {
      return execute_omp_device_lower ();
    }

}; // class pass_omp_device_lower

} // anon namespace
2027 
2028 gimple_opt_pass *
make_pass_omp_device_lower(gcc::context * ctxt)2029 make_pass_omp_device_lower (gcc::context *ctxt)
2030 {
2031   return new pass_omp_device_lower (ctxt);
2032 }
2033 
2034 /* "omp declare target link" handling pass.  */
2035 
2036 namespace {
2037 
/* Pass descriptor for the "omptargetlink" pass below.  */

const pass_data pass_data_omp_target_link =
{
  GIMPLE_PASS,			/* type */
  "omptargetlink",		/* name */
  OPTGROUP_OMP,			/* optinfo_flags */
  TV_NONE,			/* tv_id */
  PROP_ssa,			/* properties_required */
  0,				/* properties_provided */
  0,				/* properties_destroyed */
  0,				/* todo_flags_start */
  TODO_update_ssa,		/* todo_flags_finish */
};
2050 
class pass_omp_target_link : public gimple_opt_pass
{
public:
  pass_omp_target_link (gcc::context *ctxt)
    : gimple_opt_pass (pass_data_omp_target_link, ctxt)
  {}

  /* opt_pass methods: */
  /* Link-variable lowering is only meaningful in the offload (accel)
     compiler, and there only for functions that are offloaded.  */
  virtual bool gate (function *fun)
    {
#ifdef ACCEL_COMPILER
      return offloading_function_p (fun->decl);
#else
      (void) fun;
      return false;
#endif
    }

  virtual unsigned execute (function *);
};
2071 
2072 /* Callback for walk_gimple_stmt used to scan for link var operands.  */
2073 
2074 static tree
find_link_var_op(tree * tp,int * walk_subtrees,void *)2075 find_link_var_op (tree *tp, int *walk_subtrees, void *)
2076 {
2077   tree t = *tp;
2078 
2079   if (VAR_P (t)
2080       && DECL_HAS_VALUE_EXPR_P (t)
2081       && is_global_var (t)
2082       && lookup_attribute ("omp declare target link", DECL_ATTRIBUTES (t)))
2083     {
2084       *walk_subtrees = 0;
2085       return t;
2086     }
2087 
2088   return NULL_TREE;
2089 }
2090 
2091 unsigned
execute(function * fun)2092 pass_omp_target_link::execute (function *fun)
2093 {
2094   basic_block bb;
2095   FOR_EACH_BB_FN (bb, fun)
2096     {
2097       gimple_stmt_iterator gsi;
2098       for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
2099 	if (walk_gimple_stmt (&gsi, NULL, find_link_var_op, NULL))
2100 	  gimple_regimplify_operands (gsi_stmt (gsi), &gsi);
2101     }
2102 
2103   return 0;
2104 }
2105 
2106 } // anon namespace
2107 
2108 gimple_opt_pass *
make_pass_omp_target_link(gcc::context * ctxt)2109 make_pass_omp_target_link (gcc::context *ctxt)
2110 {
2111   return new pass_omp_target_link (ctxt);
2112 }
2113