1 /* Bits of OpenMP and OpenACC handling that is specific to device offloading
2    and a lowering pass for OpenACC device directives.
3 
4    Copyright (C) 2005-2020 Free Software Foundation, Inc.
5 
6 This file is part of GCC.
7 
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
12 
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
16 for more details.
17 
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3.  If not see
20 <http://www.gnu.org/licenses/>.  */
21 
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "tree.h"
28 #include "gimple.h"
29 #include "tree-pass.h"
30 #include "ssa.h"
31 #include "cgraph.h"
32 #include "pretty-print.h"
33 #include "diagnostic-core.h"
34 #include "fold-const.h"
35 #include "internal-fn.h"
36 #include "langhooks.h"
37 #include "gimplify.h"
38 #include "gimple-iterator.h"
39 #include "gimplify-me.h"
40 #include "gimple-walk.h"
41 #include "tree-cfg.h"
42 #include "tree-into-ssa.h"
43 #include "tree-nested.h"
44 #include "stor-layout.h"
45 #include "common/common-target.h"
46 #include "omp-general.h"
47 #include "omp-offload.h"
48 #include "lto-section-names.h"
49 #include "gomp-constants.h"
50 #include "gimple-pretty-print.h"
51 #include "intl.h"
52 #include "stringpool.h"
53 #include "attribs.h"
54 #include "cfgloop.h"
55 
56 /* Describe the OpenACC looping structure of a function.  The entire
57    function is held in a 'NULL' loop.  */
58 
59 struct oacc_loop
60 {
61   oacc_loop *parent; /* Containing loop.  */
62 
63   oacc_loop *child; /* First inner loop.  */
64 
65   oacc_loop *sibling; /* Next loop within same parent.  */
66 
67   location_t loc; /* Location of the loop start.  */
68 
69   gcall *marker; /* Initial head marker.  */
70 
71   gcall *heads[GOMP_DIM_MAX];  /* Head marker functions.  */
72   gcall *tails[GOMP_DIM_MAX];  /* Tail marker functions.  */
73 
74   tree routine;  /* Pseudo-loop enclosing a routine.  */
75 
76   unsigned mask;   /* Partitioning mask.  */
77   unsigned e_mask; /* Partitioning of element loops (when tiling).  */
78   unsigned inner;  /* Partitioning of inner loops.  */
79   unsigned flags;  /* Partitioning flags.  */
80   vec<gcall *> ifns;  /* Contained loop abstraction functions.  */
81   tree chunk_size; /* Chunk size.  */
82   gcall *head_end; /* Final marker of head sequence.  */
83 };
84 
85 /* Holds offload tables with decls.  */
86 vec<tree, va_gc> *offload_funcs, *offload_vars;
87 
88 /* Return level at which oacc routine may spawn a partitioned loop, or
89    -1 if it is not a routine (i.e. is an offload fn).  */
90 
91 int
oacc_fn_attrib_level(tree attr)92 oacc_fn_attrib_level (tree attr)
93 {
94   tree pos = TREE_VALUE (attr);
95 
96   if (!TREE_PURPOSE (pos))
97     return -1;
98 
99   int ix = 0;
100   for (ix = 0; ix != GOMP_DIM_MAX;
101        ix++, pos = TREE_CHAIN (pos))
102     if (!integer_zerop (TREE_PURPOSE (pos)))
103       break;
104 
105   return ix;
106 }
107 
108 /* Helper function for omp_finish_file routine.  Takes decls from V_DECLS and
109    adds their addresses and sizes to constructor-vector V_CTOR.  */
110 
111 static void
add_decls_addresses_to_decl_constructor(vec<tree,va_gc> * v_decls,vec<constructor_elt,va_gc> * v_ctor)112 add_decls_addresses_to_decl_constructor (vec<tree, va_gc> *v_decls,
113 					 vec<constructor_elt, va_gc> *v_ctor)
114 {
115   unsigned len = vec_safe_length (v_decls);
116   for (unsigned i = 0; i < len; i++)
117     {
118       tree it = (*v_decls)[i];
119       bool is_var = VAR_P (it);
120       bool is_link_var
121 	= is_var
122 #ifdef ACCEL_COMPILER
123 	  && DECL_HAS_VALUE_EXPR_P (it)
124 #endif
125 	  && lookup_attribute ("omp declare target link", DECL_ATTRIBUTES (it));
126 
127       /* See also omp_finish_file and output_offload_tables in lto-cgraph.c.  */
128       if (!in_lto_p && !symtab_node::get (it))
129 	continue;
130 
131       tree size = NULL_TREE;
132       if (is_var)
133 	size = fold_convert (const_ptr_type_node, DECL_SIZE_UNIT (it));
134 
135       tree addr;
136       if (!is_link_var)
137 	addr = build_fold_addr_expr (it);
138       else
139 	{
140 #ifdef ACCEL_COMPILER
141 	  /* For "omp declare target link" vars add address of the pointer to
142 	     the target table, instead of address of the var.  */
143 	  tree value_expr = DECL_VALUE_EXPR (it);
144 	  tree link_ptr_decl = TREE_OPERAND (value_expr, 0);
145 	  varpool_node::finalize_decl (link_ptr_decl);
146 	  addr = build_fold_addr_expr (link_ptr_decl);
147 #else
148 	  addr = build_fold_addr_expr (it);
149 #endif
150 
151 	  /* Most significant bit of the size marks "omp declare target link"
152 	     vars in host and target tables.  */
153 	  unsigned HOST_WIDE_INT isize = tree_to_uhwi (size);
154 	  isize |= 1ULL << (int_size_in_bytes (const_ptr_type_node)
155 			    * BITS_PER_UNIT - 1);
156 	  size = wide_int_to_tree (const_ptr_type_node, isize);
157 	}
158 
159       CONSTRUCTOR_APPEND_ELT (v_ctor, NULL_TREE, addr);
160       if (is_var)
161 	CONSTRUCTOR_APPEND_ELT (v_ctor, NULL_TREE, size);
162     }
163 }
164 
165 /* Create new symbols containing (address, size) pairs for global variables,
166    marked with "omp declare target" attribute, as well as addresses for the
167    functions, which are outlined offloading regions.  */
168 void
omp_finish_file(void)169 omp_finish_file (void)
170 {
171   unsigned num_funcs = vec_safe_length (offload_funcs);
172   unsigned num_vars = vec_safe_length (offload_vars);
173 
174   if (num_funcs == 0 && num_vars == 0)
175     return;
176 
177   if (targetm_common.have_named_sections)
178     {
179       vec<constructor_elt, va_gc> *v_f, *v_v;
180       vec_alloc (v_f, num_funcs);
181       vec_alloc (v_v, num_vars * 2);
182 
183       add_decls_addresses_to_decl_constructor (offload_funcs, v_f);
184       add_decls_addresses_to_decl_constructor (offload_vars, v_v);
185 
186       tree vars_decl_type = build_array_type_nelts (pointer_sized_int_node,
187 						    vec_safe_length (v_v));
188       tree funcs_decl_type = build_array_type_nelts (pointer_sized_int_node,
189 						     num_funcs);
190       SET_TYPE_ALIGN (vars_decl_type, TYPE_ALIGN (pointer_sized_int_node));
191       SET_TYPE_ALIGN (funcs_decl_type, TYPE_ALIGN (pointer_sized_int_node));
192       tree ctor_v = build_constructor (vars_decl_type, v_v);
193       tree ctor_f = build_constructor (funcs_decl_type, v_f);
194       TREE_CONSTANT (ctor_v) = TREE_CONSTANT (ctor_f) = 1;
195       TREE_STATIC (ctor_v) = TREE_STATIC (ctor_f) = 1;
196       tree funcs_decl = build_decl (UNKNOWN_LOCATION, VAR_DECL,
197 				    get_identifier (".offload_func_table"),
198 				    funcs_decl_type);
199       tree vars_decl = build_decl (UNKNOWN_LOCATION, VAR_DECL,
200 				   get_identifier (".offload_var_table"),
201 				   vars_decl_type);
202       TREE_STATIC (funcs_decl) = TREE_STATIC (vars_decl) = 1;
203       /* Do not align tables more than TYPE_ALIGN (pointer_sized_int_node),
204 	 otherwise a joint table in a binary will contain padding between
205 	 tables from multiple object files.  */
206       DECL_USER_ALIGN (funcs_decl) = DECL_USER_ALIGN (vars_decl) = 1;
207       SET_DECL_ALIGN (funcs_decl, TYPE_ALIGN (funcs_decl_type));
208       SET_DECL_ALIGN (vars_decl, TYPE_ALIGN (vars_decl_type));
209       DECL_INITIAL (funcs_decl) = ctor_f;
210       DECL_INITIAL (vars_decl) = ctor_v;
211       set_decl_section_name (funcs_decl, OFFLOAD_FUNC_TABLE_SECTION_NAME);
212       set_decl_section_name (vars_decl, OFFLOAD_VAR_TABLE_SECTION_NAME);
213 
214       varpool_node::finalize_decl (vars_decl);
215       varpool_node::finalize_decl (funcs_decl);
216     }
217   else
218     {
219       for (unsigned i = 0; i < num_funcs; i++)
220 	{
221 	  tree it = (*offload_funcs)[i];
222 	  /* See also add_decls_addresses_to_decl_constructor
223 	     and output_offload_tables in lto-cgraph.c.  */
224 	  if (!in_lto_p && !symtab_node::get (it))
225 	    continue;
226 	  targetm.record_offload_symbol (it);
227 	}
228       for (unsigned i = 0; i < num_vars; i++)
229 	{
230 	  tree it = (*offload_vars)[i];
231 	  if (!in_lto_p && !symtab_node::get (it))
232 	    continue;
233 #ifdef ACCEL_COMPILER
234 	  if (DECL_HAS_VALUE_EXPR_P (it)
235 	      && lookup_attribute ("omp declare target link",
236 				   DECL_ATTRIBUTES (it)))
237 	    {
238 	      tree value_expr = DECL_VALUE_EXPR (it);
239 	      tree link_ptr_decl = TREE_OPERAND (value_expr, 0);
240 	      targetm.record_offload_symbol (link_ptr_decl);
241 	      varpool_node::finalize_decl (link_ptr_decl);
242 	    }
243 	  else
244 #endif
245 	    targetm.record_offload_symbol (it);
246 	}
247     }
248 }
249 
250 /* Call dim_pos (POS == true) or dim_size (POS == false) builtins for
251    axis DIM.  Return a tmp var holding the result.  */
252 
253 static tree
oacc_dim_call(bool pos,int dim,gimple_seq * seq)254 oacc_dim_call (bool pos, int dim, gimple_seq *seq)
255 {
256   tree arg = build_int_cst (unsigned_type_node, dim);
257   tree size = create_tmp_var (integer_type_node);
258   enum internal_fn fn = pos ? IFN_GOACC_DIM_POS : IFN_GOACC_DIM_SIZE;
259   gimple *call = gimple_build_call_internal (fn, 1, arg);
260 
261   gimple_call_set_lhs (call, size);
262   gimple_seq_add_stmt (seq, call);
263 
264   return size;
265 }
266 
267 /* Find the number of threads (POS = false), or thread number (POS =
268    true) for an OpenACC region partitioned as MASK.  Setup code
269    required for the calculation is added to SEQ.  */
270 
271 static tree
oacc_thread_numbers(bool pos,int mask,gimple_seq * seq)272 oacc_thread_numbers (bool pos, int mask, gimple_seq *seq)
273 {
274   tree res = pos ? NULL_TREE : build_int_cst (unsigned_type_node, 1);
275   unsigned ix;
276 
277   /* Start at gang level, and examine relevant dimension indices.  */
278   for (ix = GOMP_DIM_GANG; ix != GOMP_DIM_MAX; ix++)
279     if (GOMP_DIM_MASK (ix) & mask)
280       {
281 	if (res)
282 	  {
283 	    /* We had an outer index, so scale that by the size of
284 	       this dimension.  */
285 	    tree n = oacc_dim_call (false, ix, seq);
286 	    res = fold_build2 (MULT_EXPR, integer_type_node, res, n);
287 	  }
288 	if (pos)
289 	  {
290 	    /* Determine index in this dimension.  */
291 	    tree id = oacc_dim_call (true, ix, seq);
292 	    if (res)
293 	      res = fold_build2 (PLUS_EXPR, integer_type_node, res, id);
294 	    else
295 	      res = id;
296 	  }
297       }
298 
299   if (res == NULL_TREE)
300     res = integer_zero_node;
301 
302   return res;
303 }
304 
305 /* Transform IFN_GOACC_LOOP calls to actual code.  See
306    expand_oacc_for for where these are generated.  At the vector
307    level, we stride loops, such that each member of a warp will
308    operate on adjacent iterations.  At the worker and gang level,
309    each gang/warp executes a set of contiguous iterations.  Chunking
310    can override this such that each iteration engine executes a
311    contiguous chunk, and then moves on to stride to the next chunk.  */
312 
313 static void
oacc_xform_loop(gcall * call)314 oacc_xform_loop (gcall *call)
315 {
316   gimple_stmt_iterator gsi = gsi_for_stmt (call);
317   enum ifn_goacc_loop_kind code
318     = (enum ifn_goacc_loop_kind) TREE_INT_CST_LOW (gimple_call_arg (call, 0));
319   tree dir = gimple_call_arg (call, 1);
320   tree range = gimple_call_arg (call, 2);
321   tree step = gimple_call_arg (call, 3);
322   tree chunk_size = NULL_TREE;
323   unsigned mask = (unsigned) TREE_INT_CST_LOW (gimple_call_arg (call, 5));
324   tree lhs = gimple_call_lhs (call);
325   tree type = NULL_TREE;
326   tree diff_type = TREE_TYPE (range);
327   tree r = NULL_TREE;
328   gimple_seq seq = NULL;
329   bool chunking = false, striding = true;
330   unsigned outer_mask = mask & (~mask + 1); // Outermost partitioning
331   unsigned inner_mask = mask & ~outer_mask; // Inner partitioning (if any)
332 
333   /* Skip lowering if return value of IFN_GOACC_LOOP call is not used.  */
334   if (!lhs)
335     {
336       gsi_replace_with_seq (&gsi, seq, true);
337       return;
338     }
339 
340   type = TREE_TYPE (lhs);
341 
342 #ifdef ACCEL_COMPILER
343   chunk_size = gimple_call_arg (call, 4);
344   if (integer_minus_onep (chunk_size)  /* Force static allocation.  */
345       || integer_zerop (chunk_size))   /* Default (also static).  */
346     {
347       /* If we're at the gang level, we want each to execute a
348 	 contiguous run of iterations.  Otherwise we want each element
349 	 to stride.  */
350       striding = !(outer_mask & GOMP_DIM_MASK (GOMP_DIM_GANG));
351       chunking = false;
352     }
353   else
354     {
355       /* Chunk of size 1 is striding.  */
356       striding = integer_onep (chunk_size);
357       chunking = !striding;
358     }
359 #endif
360 
361   /* striding=true, chunking=true
362        -> invalid.
363      striding=true, chunking=false
364        -> chunks=1
365      striding=false,chunking=true
366        -> chunks=ceil (range/(chunksize*threads*step))
367      striding=false,chunking=false
368        -> chunk_size=ceil(range/(threads*step)),chunks=1  */
369   push_gimplify_context (true);
370 
371   switch (code)
372     {
373     default: gcc_unreachable ();
374 
375     case IFN_GOACC_LOOP_CHUNKS:
376       if (!chunking)
377 	r = build_int_cst (type, 1);
378       else
379 	{
380 	  /* chunk_max
381 	     = (range - dir) / (chunks * step * num_threads) + dir  */
382 	  tree per = oacc_thread_numbers (false, mask, &seq);
383 	  per = fold_convert (type, per);
384 	  chunk_size = fold_convert (type, chunk_size);
385 	  per = fold_build2 (MULT_EXPR, type, per, chunk_size);
386 	  per = fold_build2 (MULT_EXPR, type, per, step);
387 	  r = build2 (MINUS_EXPR, type, range, dir);
388 	  r = build2 (PLUS_EXPR, type, r, per);
389 	  r = build2 (TRUNC_DIV_EXPR, type, r, per);
390 	}
391       break;
392 
393     case IFN_GOACC_LOOP_STEP:
394       {
395 	/* If striding, step by the entire compute volume, otherwise
396 	   step by the inner volume.  */
397 	unsigned volume = striding ? mask : inner_mask;
398 
399 	r = oacc_thread_numbers (false, volume, &seq);
400 	r = build2 (MULT_EXPR, type, fold_convert (type, r), step);
401       }
402       break;
403 
404     case IFN_GOACC_LOOP_OFFSET:
405       /* Enable vectorization on non-SIMT targets.  */
406       if (!targetm.simt.vf
407 	  && outer_mask == GOMP_DIM_MASK (GOMP_DIM_VECTOR)
408 	  /* If not -fno-tree-loop-vectorize, hint that we want to vectorize
409 	     the loop.  */
410 	  && (flag_tree_loop_vectorize
411 	      || !global_options_set.x_flag_tree_loop_vectorize))
412 	{
413 	  basic_block bb = gsi_bb (gsi);
414 	  class loop *parent = bb->loop_father;
415 	  class loop *body = parent->inner;
416 
417 	  parent->force_vectorize = true;
418 	  parent->safelen = INT_MAX;
419 
420 	  /* "Chunking loops" may have inner loops.  */
421 	  if (parent->inner)
422 	    {
423 	      body->force_vectorize = true;
424 	      body->safelen = INT_MAX;
425 	    }
426 
427 	  cfun->has_force_vectorize_loops = true;
428 	}
429       if (striding)
430 	{
431 	  r = oacc_thread_numbers (true, mask, &seq);
432 	  r = fold_convert (diff_type, r);
433 	}
434       else
435 	{
436 	  tree inner_size = oacc_thread_numbers (false, inner_mask, &seq);
437 	  tree outer_size = oacc_thread_numbers (false, outer_mask, &seq);
438 	  tree volume = fold_build2 (MULT_EXPR, TREE_TYPE (inner_size),
439 				     inner_size, outer_size);
440 
441 	  volume = fold_convert (diff_type, volume);
442 	  if (chunking)
443 	    chunk_size = fold_convert (diff_type, chunk_size);
444 	  else
445 	    {
446 	      tree per = fold_build2 (MULT_EXPR, diff_type, volume, step);
447 
448 	      chunk_size = build2 (MINUS_EXPR, diff_type, range, dir);
449 	      chunk_size = build2 (PLUS_EXPR, diff_type, chunk_size, per);
450 	      chunk_size = build2 (TRUNC_DIV_EXPR, diff_type, chunk_size, per);
451 	    }
452 
453 	  tree span = build2 (MULT_EXPR, diff_type, chunk_size,
454 			      fold_convert (diff_type, inner_size));
455 	  r = oacc_thread_numbers (true, outer_mask, &seq);
456 	  r = fold_convert (diff_type, r);
457 	  r = build2 (MULT_EXPR, diff_type, r, span);
458 
459 	  tree inner = oacc_thread_numbers (true, inner_mask, &seq);
460 	  inner = fold_convert (diff_type, inner);
461 	  r = fold_build2 (PLUS_EXPR, diff_type, r, inner);
462 
463 	  if (chunking)
464 	    {
465 	      tree chunk = fold_convert (diff_type, gimple_call_arg (call, 6));
466 	      tree per
467 		= fold_build2 (MULT_EXPR, diff_type, volume, chunk_size);
468 	      per = build2 (MULT_EXPR, diff_type, per, chunk);
469 
470 	      r = build2 (PLUS_EXPR, diff_type, r, per);
471 	    }
472 	}
473       r = fold_build2 (MULT_EXPR, diff_type, r, step);
474       if (type != diff_type)
475 	r = fold_convert (type, r);
476       break;
477 
478     case IFN_GOACC_LOOP_BOUND:
479       if (striding)
480 	r = range;
481       else
482 	{
483 	  tree inner_size = oacc_thread_numbers (false, inner_mask, &seq);
484 	  tree outer_size = oacc_thread_numbers (false, outer_mask, &seq);
485 	  tree volume = fold_build2 (MULT_EXPR, TREE_TYPE (inner_size),
486 				     inner_size, outer_size);
487 
488 	  volume = fold_convert (diff_type, volume);
489 	  if (chunking)
490 	    chunk_size = fold_convert (diff_type, chunk_size);
491 	  else
492 	    {
493 	      tree per = fold_build2 (MULT_EXPR, diff_type, volume, step);
494 
495 	      chunk_size = build2 (MINUS_EXPR, diff_type, range, dir);
496 	      chunk_size = build2 (PLUS_EXPR, diff_type, chunk_size, per);
497 	      chunk_size = build2 (TRUNC_DIV_EXPR, diff_type, chunk_size, per);
498 	    }
499 
500 	  tree span = build2 (MULT_EXPR, diff_type, chunk_size,
501 			      fold_convert (diff_type, inner_size));
502 
503 	  r = fold_build2 (MULT_EXPR, diff_type, span, step);
504 
505 	  tree offset = gimple_call_arg (call, 6);
506 	  r = build2 (PLUS_EXPR, diff_type, r,
507 		      fold_convert (diff_type, offset));
508 	  r = build2 (integer_onep (dir) ? MIN_EXPR : MAX_EXPR,
509 		      diff_type, r, range);
510 	}
511       if (diff_type != type)
512 	r = fold_convert (type, r);
513       break;
514     }
515 
516   gimplify_assign (lhs, r, &seq);
517 
518   pop_gimplify_context (NULL);
519 
520   gsi_replace_with_seq (&gsi, seq, true);
521 }
522 
523 /* Transform a GOACC_TILE call.  Determines the element loop span for
524    the specified loop of the nest.  This is 1 if we're not tiling.
525 
526    GOACC_TILE (collapse_count, loop_no, tile_arg, gwv_tile, gwv_element);  */
527 
528 static void
oacc_xform_tile(gcall * call)529 oacc_xform_tile (gcall *call)
530 {
531   gimple_stmt_iterator gsi = gsi_for_stmt (call);
532   unsigned collapse = tree_to_uhwi (gimple_call_arg (call, 0));
533   /* Inner loops have higher loop_nos.  */
534   unsigned loop_no = tree_to_uhwi (gimple_call_arg (call, 1));
535   tree tile_size = gimple_call_arg (call, 2);
536   unsigned e_mask = tree_to_uhwi (gimple_call_arg (call, 4));
537   tree lhs = gimple_call_lhs (call);
538   tree type = TREE_TYPE (lhs);
539   gimple_seq seq = NULL;
540   tree span = build_int_cst (type, 1);
541 
542   gcc_assert (!(e_mask
543 		& ~(GOMP_DIM_MASK (GOMP_DIM_VECTOR)
544 		    | GOMP_DIM_MASK (GOMP_DIM_WORKER))));
545   push_gimplify_context (!seen_error ());
546 
547 #ifndef ACCEL_COMPILER
548   /* Partitioning disabled on host compilers.  */
549   e_mask = 0;
550 #endif
551   if (!e_mask)
552     /* Not paritioning.  */
553     span = integer_one_node;
554   else if (!integer_zerop (tile_size))
555     /* User explicitly specified size.  */
556     span = tile_size;
557   else
558     {
559       /* Pick a size based on the paritioning of the element loop and
560 	 the number of loop nests.  */
561       tree first_size = NULL_TREE;
562       tree second_size = NULL_TREE;
563 
564       if (e_mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR))
565 	first_size = oacc_dim_call (false, GOMP_DIM_VECTOR, &seq);
566       if (e_mask & GOMP_DIM_MASK (GOMP_DIM_WORKER))
567 	second_size = oacc_dim_call (false, GOMP_DIM_WORKER, &seq);
568 
569       if (!first_size)
570 	{
571 	  first_size = second_size;
572 	  second_size = NULL_TREE;
573 	}
574 
575       if (loop_no + 1 == collapse)
576 	{
577 	  span = first_size;
578 	  if (!loop_no && second_size)
579 	    span = fold_build2 (MULT_EXPR, TREE_TYPE (span),
580 				span, second_size);
581 	}
582       else if (loop_no + 2 == collapse)
583 	span = second_size;
584       else
585 	span = NULL_TREE;
586 
587       if (!span)
588 	/* There's no obvious element size for this loop.  Options
589 	   are 1, first_size or some non-unity constant (32 is my
590 	   favourite).   We should gather some statistics.  */
591 	span = first_size;
592     }
593 
594   span = fold_convert (type, span);
595   gimplify_assign (lhs, span, &seq);
596 
597   pop_gimplify_context (NULL);
598 
599   gsi_replace_with_seq (&gsi, seq, true);
600 }
601 
602 /* Default partitioned and minimum partitioned dimensions.  */
603 
604 static int oacc_default_dims[GOMP_DIM_MAX];
605 static int oacc_min_dims[GOMP_DIM_MAX];
606 
607 int
oacc_get_default_dim(int dim)608 oacc_get_default_dim (int dim)
609 {
610   gcc_assert (0 <= dim && dim < GOMP_DIM_MAX);
611   return oacc_default_dims[dim];
612 }
613 
614 int
oacc_get_min_dim(int dim)615 oacc_get_min_dim (int dim)
616 {
617   gcc_assert (0 <= dim && dim < GOMP_DIM_MAX);
618   return oacc_min_dims[dim];
619 }
620 
621 /* Parse the default dimension parameter.  This is a set of
622    :-separated optional compute dimensions.  Each specified dimension
623    is a positive integer.  When device type support is added, it is
624    planned to be a comma separated list of such compute dimensions,
625    with all but the first prefixed by the colon-terminated device
626    type.  */
627 
628 static void
oacc_parse_default_dims(const char * dims)629 oacc_parse_default_dims (const char *dims)
630 {
631   int ix;
632 
633   for (ix = GOMP_DIM_MAX; ix--;)
634     {
635       oacc_default_dims[ix] = -1;
636       oacc_min_dims[ix] = 1;
637     }
638 
639 #ifndef ACCEL_COMPILER
640   /* Cannot be overridden on the host.  */
641   dims = NULL;
642 #endif
643   if (dims)
644     {
645       const char *pos = dims;
646 
647       for (ix = 0; *pos && ix != GOMP_DIM_MAX; ix++)
648 	{
649 	  if (ix)
650 	    {
651 	      if (*pos != ':')
652 		goto malformed;
653 	      pos++;
654 	    }
655 
656 	  if (*pos != ':')
657 	    {
658 	      long val;
659 	      const char *eptr;
660 
661 	      errno = 0;
662 	      val = strtol (pos, CONST_CAST (char **, &eptr), 10);
663 	      if (errno || val <= 0 || (int) val != val)
664 		goto malformed;
665 	      pos = eptr;
666 	      oacc_default_dims[ix] = (int) val;
667 	    }
668 	}
669       if (*pos)
670 	{
671 	malformed:
672 	  error_at (UNKNOWN_LOCATION,
673 		    "%<-fopenacc-dim%> operand is malformed at %qs", pos);
674 	}
675     }
676 
677   /* Allow the backend to validate the dimensions.  */
678   targetm.goacc.validate_dims (NULL_TREE, oacc_default_dims, -1, 0);
679   targetm.goacc.validate_dims (NULL_TREE, oacc_min_dims, -2, 0);
680 }
681 
682 /* Validate and update the dimensions for offloaded FN.  ATTRS is the
683    raw attribute.  DIMS is an array of dimensions, which is filled in.
684    LEVEL is the partitioning level of a routine, or -1 for an offload
685    region itself.  USED is the mask of partitioned execution in the
686    function.  */
687 
688 static void
oacc_validate_dims(tree fn,tree attrs,int * dims,int level,unsigned used)689 oacc_validate_dims (tree fn, tree attrs, int *dims, int level, unsigned used)
690 {
691   tree purpose[GOMP_DIM_MAX];
692   unsigned ix;
693   tree pos = TREE_VALUE (attrs);
694 
695   /* Make sure the attribute creator attached the dimension
696      information.  */
697   gcc_assert (pos);
698 
699   for (ix = 0; ix != GOMP_DIM_MAX; ix++)
700     {
701       purpose[ix] = TREE_PURPOSE (pos);
702       tree val = TREE_VALUE (pos);
703       dims[ix] = val ? TREE_INT_CST_LOW (val) : -1;
704       pos = TREE_CHAIN (pos);
705     }
706 
707   bool changed = targetm.goacc.validate_dims (fn, dims, level, used);
708 
709   /* Default anything left to 1 or a partitioned default.  */
710   for (ix = 0; ix != GOMP_DIM_MAX; ix++)
711     if (dims[ix] < 0)
712       {
713 	/* The OpenACC spec says 'If the [num_gangs] clause is not
714 	   specified, an implementation-defined default will be used;
715 	   the default may depend on the code within the construct.'
716 	   (2.5.6).  Thus an implementation is free to choose
717 	   non-unity default for a parallel region that doesn't have
718 	   any gang-partitioned loops.  However, it appears that there
719 	   is a sufficient body of user code that expects non-gang
720 	   partitioned regions to not execute in gang-redundant mode.
721 	   So we (a) don't warn about the non-portability and (b) pick
722 	   the minimum permissible dimension size when there is no
723 	   partitioned execution.  Otherwise we pick the global
724 	   default for the dimension, which the user can control.  The
725 	   same wording and logic applies to num_workers and
726 	   vector_length, however the worker- or vector- single
727 	   execution doesn't have the same impact as gang-redundant
728 	   execution.  (If the minimum gang-level partioning is not 1,
729 	   the target is probably too confusing.)  */
730 	dims[ix] = (used & GOMP_DIM_MASK (ix)
731 		    ? oacc_default_dims[ix] : oacc_min_dims[ix]);
732 	changed = true;
733       }
734 
735   if (changed)
736     {
737       /* Replace the attribute with new values.  */
738       pos = NULL_TREE;
739       for (ix = GOMP_DIM_MAX; ix--;)
740 	pos = tree_cons (purpose[ix],
741 			 build_int_cst (integer_type_node, dims[ix]), pos);
742       oacc_replace_fn_attrib (fn, pos);
743     }
744 }
745 
746 /* Create an empty OpenACC loop structure at LOC.  */
747 
748 static oacc_loop *
new_oacc_loop_raw(oacc_loop * parent,location_t loc)749 new_oacc_loop_raw (oacc_loop *parent, location_t loc)
750 {
751   oacc_loop *loop = XCNEW (oacc_loop);
752 
753   loop->parent = parent;
754 
755   if (parent)
756     {
757       loop->sibling = parent->child;
758       parent->child = loop;
759     }
760 
761   loop->loc = loc;
762   return loop;
763 }
764 
765 /* Create an outermost, dummy OpenACC loop for offloaded function
766    DECL.  */
767 
768 static oacc_loop *
new_oacc_loop_outer(tree decl)769 new_oacc_loop_outer (tree decl)
770 {
771   return new_oacc_loop_raw (NULL, DECL_SOURCE_LOCATION (decl));
772 }
773 
774 /* Start a new OpenACC loop  structure beginning at head marker HEAD.
775    Link into PARENT loop.  Return the new loop.  */
776 
777 static oacc_loop *
new_oacc_loop(oacc_loop * parent,gcall * marker)778 new_oacc_loop (oacc_loop *parent, gcall *marker)
779 {
780   oacc_loop *loop = new_oacc_loop_raw (parent, gimple_location (marker));
781 
782   loop->marker = marker;
783 
784   /* TODO: This is where device_type flattening would occur for the loop
785      flags.  */
786 
787   loop->flags = TREE_INT_CST_LOW (gimple_call_arg (marker, 3));
788 
789   tree chunk_size = integer_zero_node;
790   if (loop->flags & OLF_GANG_STATIC)
791     chunk_size = gimple_call_arg (marker, 4);
792   loop->chunk_size = chunk_size;
793 
794   return loop;
795 }
796 
797 /* Create a dummy loop encompassing a call to a openACC routine.
798    Extract the routine's partitioning requirements.  */
799 
800 static void
new_oacc_loop_routine(oacc_loop * parent,gcall * call,tree decl,tree attrs)801 new_oacc_loop_routine (oacc_loop *parent, gcall *call, tree decl, tree attrs)
802 {
803   oacc_loop *loop = new_oacc_loop_raw (parent, gimple_location (call));
804   int level = oacc_fn_attrib_level (attrs);
805 
806   gcc_assert (level >= 0);
807 
808   loop->marker = call;
809   loop->routine = decl;
810   loop->mask = ((GOMP_DIM_MASK (GOMP_DIM_MAX) - 1)
811 		^ (GOMP_DIM_MASK (level) - 1));
812 }
813 
814 /* Finish off the current OpenACC loop ending at tail marker TAIL.
815    Return the parent loop.  */
816 
817 static oacc_loop *
finish_oacc_loop(oacc_loop * loop)818 finish_oacc_loop (oacc_loop *loop)
819 {
820   /* If the loop has been collapsed, don't partition it.  */
821   if (loop->ifns.is_empty ())
822     loop->mask = loop->flags = 0;
823   return loop->parent;
824 }
825 
826 /* Free all OpenACC loop structures within LOOP (inclusive).  */
827 
828 static void
free_oacc_loop(oacc_loop * loop)829 free_oacc_loop (oacc_loop *loop)
830 {
831   if (loop->sibling)
832     free_oacc_loop (loop->sibling);
833   if (loop->child)
834     free_oacc_loop (loop->child);
835 
836   loop->ifns.release ();
837   free (loop);
838 }
839 
840 /* Dump out the OpenACC loop head or tail beginning at FROM.  */
841 
842 static void
dump_oacc_loop_part(FILE * file,gcall * from,int depth,const char * title,int level)843 dump_oacc_loop_part (FILE *file, gcall *from, int depth,
844 		     const char *title, int level)
845 {
846   enum ifn_unique_kind kind
847     = (enum ifn_unique_kind) TREE_INT_CST_LOW (gimple_call_arg (from, 0));
848 
849   fprintf (file, "%*s%s-%d:\n", depth * 2, "", title, level);
850   for (gimple_stmt_iterator gsi = gsi_for_stmt (from);;)
851     {
852       gimple *stmt = gsi_stmt (gsi);
853 
854       if (gimple_call_internal_p (stmt, IFN_UNIQUE))
855 	{
856 	  enum ifn_unique_kind k
857 	    = ((enum ifn_unique_kind) TREE_INT_CST_LOW
858 	       (gimple_call_arg (stmt, 0)));
859 
860 	  if (k == kind && stmt != from)
861 	    break;
862 	}
863       print_gimple_stmt (file, stmt, depth * 2 + 2);
864 
865       gsi_next (&gsi);
866       while (gsi_end_p (gsi))
867 	gsi = gsi_start_bb (single_succ (gsi_bb (gsi)));
868     }
869 }
870 
871 /* Dump OpenACC loop LOOP, its children, and its siblings.  */
872 
873 static void
dump_oacc_loop(FILE * file,oacc_loop * loop,int depth)874 dump_oacc_loop (FILE *file, oacc_loop *loop, int depth)
875 {
876   int ix;
877 
878   fprintf (file, "%*sLoop %x(%x) %s:%u\n", depth * 2, "",
879 	   loop->flags, loop->mask,
880 	   LOCATION_FILE (loop->loc), LOCATION_LINE (loop->loc));
881 
882   if (loop->marker)
883     print_gimple_stmt (file, loop->marker, depth * 2);
884 
885   if (loop->routine)
886     fprintf (file, "%*sRoutine %s:%u:%s\n",
887 	     depth * 2, "", DECL_SOURCE_FILE (loop->routine),
888 	     DECL_SOURCE_LINE (loop->routine),
889 	     IDENTIFIER_POINTER (DECL_NAME (loop->routine)));
890 
891   for (ix = GOMP_DIM_GANG; ix != GOMP_DIM_MAX; ix++)
892     if (loop->heads[ix])
893       dump_oacc_loop_part (file, loop->heads[ix], depth, "Head", ix);
894   for (ix = GOMP_DIM_MAX; ix--;)
895     if (loop->tails[ix])
896       dump_oacc_loop_part (file, loop->tails[ix], depth, "Tail", ix);
897 
898   if (loop->child)
899     dump_oacc_loop (file, loop->child, depth + 1);
900   if (loop->sibling)
901     dump_oacc_loop (file, loop->sibling, depth);
902 }
903 
904 void debug_oacc_loop (oacc_loop *);
905 
906 /* Dump loops to stderr.  */
907 
908 DEBUG_FUNCTION void
debug_oacc_loop(oacc_loop * loop)909 debug_oacc_loop (oacc_loop *loop)
910 {
911   dump_oacc_loop (stderr, loop, 0);
912 }
913 
914 /* Provide diagnostics on OpenACC loop LOOP, its children, and its
915    siblings.  */
916 
917 static void
inform_oacc_loop(const oacc_loop * loop)918 inform_oacc_loop (const oacc_loop *loop)
919 {
920   const char *gang
921     = loop->mask & GOMP_DIM_MASK (GOMP_DIM_GANG) ? " gang" : "";
922   const char *worker
923     = loop->mask & GOMP_DIM_MASK (GOMP_DIM_WORKER) ? " worker" : "";
924   const char *vector
925     = loop->mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR) ? " vector" : "";
926   const char *seq = loop->mask == 0 ? " seq" : "";
927   const dump_user_location_t loc
928     = dump_user_location_t::from_location_t (loop->loc);
929   dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, loc,
930 		   "assigned OpenACC%s%s%s%s loop parallelism\n", gang, worker,
931 		   vector, seq);
932 
933   if (loop->child)
934     inform_oacc_loop (loop->child);
935   if (loop->sibling)
936     inform_oacc_loop (loop->sibling);
937 }
938 
939 /* DFS walk of basic blocks BB onwards, creating OpenACC loop
940    structures as we go.  By construction these loops are properly
941    nested.  */
942 
943 static void
oacc_loop_discover_walk(oacc_loop * loop,basic_block bb)944 oacc_loop_discover_walk (oacc_loop *loop, basic_block bb)
945 {
946   int marker = 0;
947   int remaining = 0;
948 
949   if (bb->flags & BB_VISITED)
950     return;
951 
952  follow:
953   bb->flags |= BB_VISITED;
954 
955   /* Scan for loop markers.  */
956   for (gimple_stmt_iterator gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
957        gsi_next (&gsi))
958     {
959       gimple *stmt = gsi_stmt (gsi);
960 
961       if (!is_gimple_call (stmt))
962 	continue;
963 
964       gcall *call = as_a <gcall *> (stmt);
965 
966       /* If this is a routine, make a dummy loop for it.  */
967       if (tree decl = gimple_call_fndecl (call))
968 	if (tree attrs = oacc_get_fn_attrib (decl))
969 	  {
970 	    gcc_assert (!marker);
971 	    new_oacc_loop_routine (loop, call, decl, attrs);
972 	  }
973 
974       if (!gimple_call_internal_p (call))
975 	continue;
976 
977       switch (gimple_call_internal_fn (call))
978 	{
979 	default:
980 	  break;
981 
982 	case IFN_GOACC_LOOP:
983 	case IFN_GOACC_TILE:
984 	  /* Record the abstraction function, so we can manipulate it
985 	     later.  */
986 	  loop->ifns.safe_push (call);
987 	  break;
988 
989 	case IFN_UNIQUE:
990 	  enum ifn_unique_kind kind
991 	    = (enum ifn_unique_kind) (TREE_INT_CST_LOW
992 				      (gimple_call_arg (call, 0)));
993 	  if (kind == IFN_UNIQUE_OACC_HEAD_MARK
994 	      || kind == IFN_UNIQUE_OACC_TAIL_MARK)
995 	    {
996 	      if (gimple_call_num_args (call) == 2)
997 		{
998 		  gcc_assert (marker && !remaining);
999 		  marker = 0;
1000 		  if (kind == IFN_UNIQUE_OACC_TAIL_MARK)
1001 		    loop = finish_oacc_loop (loop);
1002 		  else
1003 		    loop->head_end = call;
1004 		}
1005 	      else
1006 		{
1007 		  int count = TREE_INT_CST_LOW (gimple_call_arg (call, 2));
1008 
1009 		  if (!marker)
1010 		    {
1011 		      if (kind == IFN_UNIQUE_OACC_HEAD_MARK)
1012 			loop = new_oacc_loop (loop, call);
1013 		      remaining = count;
1014 		    }
1015 		  gcc_assert (count == remaining);
1016 		  if (remaining)
1017 		    {
1018 		      remaining--;
1019 		      if (kind == IFN_UNIQUE_OACC_HEAD_MARK)
1020 			loop->heads[marker] = call;
1021 		      else
1022 			loop->tails[remaining] = call;
1023 		    }
1024 		  marker++;
1025 		}
1026 	    }
1027 	}
1028     }
1029   if (remaining || marker)
1030     {
1031       bb = single_succ (bb);
1032       gcc_assert (single_pred_p (bb) && !(bb->flags & BB_VISITED));
1033       goto follow;
1034     }
1035 
1036   /* Walk successor blocks.  */
1037   edge e;
1038   edge_iterator ei;
1039 
1040   FOR_EACH_EDGE (e, ei, bb->succs)
1041     oacc_loop_discover_walk (loop, e->dest);
1042 }
1043 
1044 /* LOOP is the first sibling.  Reverse the order in place and return
1045    the new first sibling.  Recurse to child loops.  */
1046 
1047 static oacc_loop *
oacc_loop_sibling_nreverse(oacc_loop * loop)1048 oacc_loop_sibling_nreverse (oacc_loop *loop)
1049 {
1050   oacc_loop *last = NULL;
1051   do
1052     {
1053       if (loop->child)
1054 	loop->child = oacc_loop_sibling_nreverse (loop->child);
1055 
1056       oacc_loop *next = loop->sibling;
1057       loop->sibling = last;
1058       last = loop;
1059       loop = next;
1060     }
1061   while (loop);
1062 
1063   return last;
1064 }
1065 
1066 /* Discover the OpenACC loops marked up by HEAD and TAIL markers for
1067    the current function.  */
1068 
1069 static oacc_loop *
oacc_loop_discovery()1070 oacc_loop_discovery ()
1071 {
1072   /* Clear basic block flags, in particular BB_VISITED which we're going to use
1073      in the following.  */
1074   clear_bb_flags ();
1075 
1076   oacc_loop *top = new_oacc_loop_outer (current_function_decl);
1077   oacc_loop_discover_walk (top, ENTRY_BLOCK_PTR_FOR_FN (cfun));
1078 
1079   /* The siblings were constructed in reverse order, reverse them so
1080      that diagnostics come out in an unsurprising order.  */
1081   top = oacc_loop_sibling_nreverse (top);
1082 
1083   return top;
1084 }
1085 
1086 /* Transform the abstract internal function markers starting at FROM
1087    to be for partitioning level LEVEL.  Stop when we meet another HEAD
1088    or TAIL  marker.  */
1089 
1090 static void
oacc_loop_xform_head_tail(gcall * from,int level)1091 oacc_loop_xform_head_tail (gcall *from, int level)
1092 {
1093   enum ifn_unique_kind kind
1094     = (enum ifn_unique_kind) TREE_INT_CST_LOW (gimple_call_arg (from, 0));
1095   tree replacement = build_int_cst (unsigned_type_node, level);
1096 
1097   for (gimple_stmt_iterator gsi = gsi_for_stmt (from);;)
1098     {
1099       gimple *stmt = gsi_stmt (gsi);
1100 
1101       if (gimple_call_internal_p (stmt, IFN_UNIQUE))
1102 	{
1103 	  enum ifn_unique_kind k
1104 	    = ((enum ifn_unique_kind)
1105 	       TREE_INT_CST_LOW (gimple_call_arg (stmt, 0)));
1106 
1107 	  if (k == IFN_UNIQUE_OACC_FORK || k == IFN_UNIQUE_OACC_JOIN)
1108 	    *gimple_call_arg_ptr (stmt, 2) = replacement;
1109 	  else if (k == kind && stmt != from)
1110 	    break;
1111 	}
1112       else if (gimple_call_internal_p (stmt, IFN_GOACC_REDUCTION))
1113 	*gimple_call_arg_ptr (stmt, 3) = replacement;
1114 
1115       gsi_next (&gsi);
1116       while (gsi_end_p (gsi))
1117 	gsi = gsi_start_bb (single_succ (gsi_bb (gsi)));
1118     }
1119 }
1120 
1121 /* Process the discovered OpenACC loops, setting the correct
1122    partitioning level etc.  */
1123 
1124 static void
oacc_loop_process(oacc_loop * loop)1125 oacc_loop_process (oacc_loop *loop)
1126 {
1127   if (loop->child)
1128     oacc_loop_process (loop->child);
1129 
1130   if (loop->mask && !loop->routine)
1131     {
1132       int ix;
1133       tree mask_arg = build_int_cst (unsigned_type_node, loop->mask);
1134       tree e_mask_arg = build_int_cst (unsigned_type_node, loop->e_mask);
1135       tree chunk_arg = loop->chunk_size;
1136       gcall *call;
1137 
1138       for (ix = 0; loop->ifns.iterate (ix, &call); ix++)
1139 	switch (gimple_call_internal_fn (call))
1140 	  {
1141 	  case IFN_GOACC_LOOP:
1142 	    {
1143 	      bool is_e = gimple_call_arg (call, 5) == integer_minus_one_node;
1144 	      gimple_call_set_arg (call, 5, is_e ? e_mask_arg : mask_arg);
1145 	      if (!is_e)
1146 		gimple_call_set_arg (call, 4, chunk_arg);
1147 	    }
1148 	    break;
1149 
1150 	  case IFN_GOACC_TILE:
1151 	    gimple_call_set_arg (call, 3, mask_arg);
1152 	    gimple_call_set_arg (call, 4, e_mask_arg);
1153 	    break;
1154 
1155 	  default:
1156 	    gcc_unreachable ();
1157 	  }
1158 
1159       unsigned dim = GOMP_DIM_GANG;
1160       unsigned mask = loop->mask | loop->e_mask;
1161       for (ix = 0; ix != GOMP_DIM_MAX && mask; ix++)
1162 	{
1163 	  while (!(GOMP_DIM_MASK (dim) & mask))
1164 	    dim++;
1165 
1166 	  oacc_loop_xform_head_tail (loop->heads[ix], dim);
1167 	  oacc_loop_xform_head_tail (loop->tails[ix], dim);
1168 
1169 	  mask ^= GOMP_DIM_MASK (dim);
1170 	}
1171     }
1172 
1173   if (loop->sibling)
1174     oacc_loop_process (loop->sibling);
1175 }
1176 
1177 /* Walk the OpenACC loop heirarchy checking and assigning the
1178    programmer-specified partitionings.  OUTER_MASK is the partitioning
1179    this loop is contained within.  Return mask of partitioning
1180    encountered.  If any auto loops are discovered, set GOMP_DIM_MAX
1181    bit.  */
1182 
1183 static unsigned
oacc_loop_fixed_partitions(oacc_loop * loop,unsigned outer_mask)1184 oacc_loop_fixed_partitions (oacc_loop *loop, unsigned outer_mask)
1185 {
1186   unsigned this_mask = loop->mask;
1187   unsigned mask_all = 0;
1188   bool noisy = true;
1189 
1190 #ifdef ACCEL_COMPILER
1191   /* When device_type is supported, we want the device compiler to be
1192      noisy, if the loop parameters are device_type-specific.  */
1193   noisy = false;
1194 #endif
1195 
1196   if (!loop->routine)
1197     {
1198       bool auto_par = (loop->flags & OLF_AUTO) != 0;
1199       bool seq_par = (loop->flags & OLF_SEQ) != 0;
1200       bool tiling = (loop->flags & OLF_TILE) != 0;
1201 
1202       this_mask = ((loop->flags >> OLF_DIM_BASE)
1203 		   & (GOMP_DIM_MASK (GOMP_DIM_MAX) - 1));
1204 
1205       /* Apply auto partitioning if this is a non-partitioned regular
1206 	 loop, or (no more than) single axis tiled loop.  */
1207       bool maybe_auto
1208 	= !seq_par && this_mask == (tiling ? this_mask & -this_mask : 0);
1209 
1210       if ((this_mask != 0) + auto_par + seq_par > 1)
1211 	{
1212 	  if (noisy)
1213 	    error_at (loop->loc,
1214 		      seq_par
1215 		      ? G_("%<seq%> overrides other OpenACC loop specifiers")
1216 		      : G_("%<auto%> conflicts with other OpenACC loop "
1217 			   "specifiers"));
1218 	  maybe_auto = false;
1219 	  loop->flags &= ~OLF_AUTO;
1220 	  if (seq_par)
1221 	    {
1222 	      loop->flags
1223 		&= ~((GOMP_DIM_MASK (GOMP_DIM_MAX) - 1) << OLF_DIM_BASE);
1224 	      this_mask = 0;
1225 	    }
1226 	}
1227 
1228       if (maybe_auto && (loop->flags & OLF_INDEPENDENT))
1229 	{
1230 	  loop->flags |= OLF_AUTO;
1231 	  mask_all |= GOMP_DIM_MASK (GOMP_DIM_MAX);
1232 	}
1233     }
1234 
1235   if (this_mask & outer_mask)
1236     {
1237       const oacc_loop *outer;
1238       for (outer = loop->parent; outer; outer = outer->parent)
1239 	if ((outer->mask | outer->e_mask) & this_mask)
1240 	  break;
1241 
1242       if (noisy)
1243 	{
1244 	  if (outer)
1245 	    {
1246 	      error_at (loop->loc,
1247 			loop->routine
1248 			? G_("routine call uses same OpenACC parallelism"
1249 			     " as containing loop")
1250 			: G_("inner loop uses same OpenACC parallelism"
1251 			     " as containing loop"));
1252 	      inform (outer->loc, "containing loop here");
1253 	    }
1254 	  else
1255 	    error_at (loop->loc,
1256 		      loop->routine
1257 		      ? G_("routine call uses OpenACC parallelism disallowed"
1258 			   " by containing routine")
1259 		      : G_("loop uses OpenACC parallelism disallowed"
1260 			   " by containing routine"));
1261 
1262 	  if (loop->routine)
1263 	    inform (DECL_SOURCE_LOCATION (loop->routine),
1264 		    "routine %qD declared here", loop->routine);
1265 	}
1266       this_mask &= ~outer_mask;
1267     }
1268   else
1269     {
1270       unsigned outermost = least_bit_hwi (this_mask);
1271 
1272       if (outermost && outermost <= outer_mask)
1273 	{
1274 	  if (noisy)
1275 	    {
1276 	      error_at (loop->loc,
1277 			"incorrectly nested OpenACC loop parallelism");
1278 
1279 	      const oacc_loop *outer;
1280 	      for (outer = loop->parent;
1281 		   outer->flags && outer->flags < outermost;
1282 		   outer = outer->parent)
1283 		continue;
1284 	      inform (outer->loc, "containing loop here");
1285 	    }
1286 
1287 	  this_mask &= ~outermost;
1288 	}
1289     }
1290 
1291   mask_all |= this_mask;
1292 
1293   if (loop->flags & OLF_TILE)
1294     {
1295       /* When tiling, vector goes to the element loop, and failing
1296 	 that we put worker there.  The std doesn't contemplate
1297 	 specifying all three.  We choose to put worker and vector on
1298 	 the element loops in that case.  */
1299       unsigned this_e_mask = this_mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR);
1300       if (!this_e_mask || this_mask & GOMP_DIM_MASK (GOMP_DIM_GANG))
1301 	this_e_mask |= this_mask & GOMP_DIM_MASK (GOMP_DIM_WORKER);
1302 
1303       loop->e_mask = this_e_mask;
1304       this_mask ^= this_e_mask;
1305     }
1306 
1307   loop->mask = this_mask;
1308 
1309   if (dump_file)
1310     fprintf (dump_file, "Loop %s:%d user specified %d & %d\n",
1311 	     LOCATION_FILE (loop->loc), LOCATION_LINE (loop->loc),
1312 	     loop->mask, loop->e_mask);
1313 
1314   if (loop->child)
1315     {
1316       unsigned tmp_mask = outer_mask | this_mask | loop->e_mask;
1317       loop->inner = oacc_loop_fixed_partitions (loop->child, tmp_mask);
1318       mask_all |= loop->inner;
1319     }
1320 
1321   if (loop->sibling)
1322     mask_all |= oacc_loop_fixed_partitions (loop->sibling, outer_mask);
1323 
1324   return mask_all;
1325 }
1326 
1327 /* Walk the OpenACC loop heirarchy to assign auto-partitioned loops.
1328    OUTER_MASK is the partitioning this loop is contained within.
1329    OUTER_ASSIGN is true if an outer loop is being auto-partitioned.
1330    Return the cumulative partitioning used by this loop, siblings and
1331    children.  */
1332 
1333 static unsigned
oacc_loop_auto_partitions(oacc_loop * loop,unsigned outer_mask,bool outer_assign)1334 oacc_loop_auto_partitions (oacc_loop *loop, unsigned outer_mask,
1335 			   bool outer_assign)
1336 {
1337   bool assign = (loop->flags & OLF_AUTO) && (loop->flags & OLF_INDEPENDENT);
1338   bool noisy = true;
1339   bool tiling = loop->flags & OLF_TILE;
1340 
1341 #ifdef ACCEL_COMPILER
1342   /* When device_type is supported, we want the device compiler to be
1343      noisy, if the loop parameters are device_type-specific.  */
1344   noisy = false;
1345 #endif
1346 
1347   if (assign && (!outer_assign || loop->inner))
1348     {
1349       /* Allocate outermost and non-innermost loops at the outermost
1350 	 non-innermost available level.  */
1351       unsigned this_mask = GOMP_DIM_MASK (GOMP_DIM_GANG);
1352 
1353       /* Find the first outermost available partition. */
1354       while (this_mask <= outer_mask)
1355 	this_mask <<= 1;
1356 
1357       /* Grab two axes if tiling, and we've not assigned anything  */
1358       if (tiling && !(loop->mask | loop->e_mask))
1359 	this_mask |= this_mask << 1;
1360 
1361       /* Prohibit the innermost partitioning at the moment.  */
1362       this_mask &= GOMP_DIM_MASK (GOMP_DIM_MAX - 1) - 1;
1363 
1364       /* Don't use any dimension explicitly claimed by an inner loop. */
1365       this_mask &= ~loop->inner;
1366 
1367       if (tiling && !loop->e_mask)
1368 	{
1369 	  /* If we got two axes, allocate the inner one to the element
1370 	     loop.  */
1371 	  loop->e_mask = this_mask & (this_mask << 1);
1372 	  this_mask ^= loop->e_mask;
1373 	}
1374 
1375       loop->mask |= this_mask;
1376     }
1377 
1378   if (loop->child)
1379     {
1380       unsigned tmp_mask = outer_mask | loop->mask | loop->e_mask;
1381       loop->inner = oacc_loop_auto_partitions (loop->child, tmp_mask,
1382 					       outer_assign | assign);
1383     }
1384 
1385   if (assign && (!loop->mask || (tiling && !loop->e_mask) || !outer_assign))
1386     {
1387       /* Allocate the loop at the innermost available level.  Note
1388 	 that we do this even if we already assigned this loop the
1389 	 outermost available level above.  That way we'll partition
1390 	 this along 2 axes, if they are available.  */
1391       unsigned this_mask = 0;
1392 
1393       /* Determine the outermost partitioning used within this loop.  */
1394       this_mask = loop->inner | GOMP_DIM_MASK (GOMP_DIM_MAX);
1395       this_mask = least_bit_hwi (this_mask);
1396 
1397       /* Pick the partitioning just inside that one.  */
1398       this_mask >>= 1;
1399 
1400       /* And avoid picking one use by an outer loop.  */
1401       this_mask &= ~outer_mask;
1402 
1403       /* If tiling and we failed completely above, grab the next one
1404 	 too.  Making sure it doesn't hit an outer loop.  */
1405       if (tiling)
1406 	{
1407 	  this_mask &= ~(loop->e_mask | loop->mask);
1408 	  unsigned tile_mask = ((this_mask >> 1)
1409 				& ~(outer_mask | loop->e_mask | loop->mask));
1410 
1411 	  if (tile_mask || loop->mask)
1412 	    {
1413 	      loop->e_mask |= this_mask;
1414 	      this_mask = tile_mask;
1415 	    }
1416 	  if (!loop->e_mask && noisy)
1417 	    warning_at (loop->loc, 0,
1418 			"insufficient partitioning available"
1419 			" to parallelize element loop");
1420 	}
1421 
1422       loop->mask |= this_mask;
1423       if (!loop->mask && noisy)
1424 	warning_at (loop->loc, 0,
1425 		    tiling
1426 		    ? G_("insufficient partitioning available"
1427 			 " to parallelize tile loop")
1428 		    : G_("insufficient partitioning available"
1429 			 " to parallelize loop"));
1430     }
1431 
1432   if (assign && dump_file)
1433     fprintf (dump_file, "Auto loop %s:%d assigned %d & %d\n",
1434 	     LOCATION_FILE (loop->loc), LOCATION_LINE (loop->loc),
1435 	     loop->mask, loop->e_mask);
1436 
1437   unsigned inner_mask = 0;
1438 
1439   if (loop->sibling)
1440     inner_mask |= oacc_loop_auto_partitions (loop->sibling,
1441 					     outer_mask, outer_assign);
1442 
1443   inner_mask |= loop->inner | loop->mask | loop->e_mask;
1444 
1445   return inner_mask;
1446 }
1447 
1448 /* Walk the OpenACC loop heirarchy to check and assign partitioning
1449    axes.  Return mask of partitioning.  */
1450 
1451 static unsigned
oacc_loop_partition(oacc_loop * loop,unsigned outer_mask)1452 oacc_loop_partition (oacc_loop *loop, unsigned outer_mask)
1453 {
1454   unsigned mask_all = oacc_loop_fixed_partitions (loop, outer_mask);
1455 
1456   if (mask_all & GOMP_DIM_MASK (GOMP_DIM_MAX))
1457     {
1458       mask_all ^= GOMP_DIM_MASK (GOMP_DIM_MAX);
1459       mask_all |= oacc_loop_auto_partitions (loop, outer_mask, false);
1460     }
1461   return mask_all;
1462 }
1463 
1464 /* Default fork/join early expander.  Delete the function calls if
1465    there is no RTL expander.  */
1466 
1467 bool
default_goacc_fork_join(gcall * ARG_UNUSED (call),const int * ARG_UNUSED (dims),bool is_fork)1468 default_goacc_fork_join (gcall *ARG_UNUSED (call),
1469 			 const int *ARG_UNUSED (dims), bool is_fork)
1470 {
1471   if (is_fork)
1472     return targetm.have_oacc_fork ();
1473   else
1474     return targetm.have_oacc_join ();
1475 }
1476 
1477 /* Default goacc.reduction early expander.
1478 
1479    LHS-opt = IFN_REDUCTION (KIND, RES_PTR, VAR, LEVEL, OP, OFFSET)
1480    If RES_PTR is not integer-zerop:
1481        SETUP - emit 'LHS = *RES_PTR', LHS = NULL
1482        TEARDOWN - emit '*RES_PTR = VAR'
1483    If LHS is not NULL
1484        emit 'LHS = VAR'   */
1485 
1486 void
default_goacc_reduction(gcall * call)1487 default_goacc_reduction (gcall *call)
1488 {
1489   unsigned code = (unsigned)TREE_INT_CST_LOW (gimple_call_arg (call, 0));
1490   gimple_stmt_iterator gsi = gsi_for_stmt (call);
1491   tree lhs = gimple_call_lhs (call);
1492   tree var = gimple_call_arg (call, 2);
1493   gimple_seq seq = NULL;
1494 
1495   if (code == IFN_GOACC_REDUCTION_SETUP
1496       || code == IFN_GOACC_REDUCTION_TEARDOWN)
1497     {
1498       /* Setup and Teardown need to copy from/to the receiver object,
1499 	 if there is one.  */
1500       tree ref_to_res = gimple_call_arg (call, 1);
1501 
1502       if (!integer_zerop (ref_to_res))
1503 	{
1504 	  tree dst = build_simple_mem_ref (ref_to_res);
1505 	  tree src = var;
1506 
1507 	  if (code == IFN_GOACC_REDUCTION_SETUP)
1508 	    {
1509 	      src = dst;
1510 	      dst = lhs;
1511 	      lhs = NULL;
1512 	    }
1513 	  gimple_seq_add_stmt (&seq, gimple_build_assign (dst, src));
1514 	}
1515     }
1516 
1517   /* Copy VAR to LHS, if there is an LHS.  */
1518   if (lhs)
1519     gimple_seq_add_stmt (&seq, gimple_build_assign (lhs, var));
1520 
1521   gsi_replace_with_seq (&gsi, seq, true);
1522 }
1523 
1524 /* Main entry point for oacc transformations which run on the device
1525    compiler after LTO, so we know what the target device is at this
1526    point (including the host fallback).  */
1527 
1528 static unsigned int
execute_oacc_device_lower()1529 execute_oacc_device_lower ()
1530 {
1531   tree attrs = oacc_get_fn_attrib (current_function_decl);
1532 
1533   if (!attrs)
1534     /* Not an offloaded function.  */
1535     return 0;
1536 
1537   /* Parse the default dim argument exactly once.  */
1538   if ((const void *)flag_openacc_dims != &flag_openacc_dims)
1539     {
1540       oacc_parse_default_dims (flag_openacc_dims);
1541       flag_openacc_dims = (char *)&flag_openacc_dims;
1542     }
1543 
1544   bool is_oacc_parallel
1545     = (lookup_attribute ("oacc parallel",
1546 			 DECL_ATTRIBUTES (current_function_decl)) != NULL);
1547   bool is_oacc_kernels
1548     = (lookup_attribute ("oacc kernels",
1549 			 DECL_ATTRIBUTES (current_function_decl)) != NULL);
1550   bool is_oacc_serial
1551     = (lookup_attribute ("oacc serial",
1552 			 DECL_ATTRIBUTES (current_function_decl)) != NULL);
1553   int fn_level = oacc_fn_attrib_level (attrs);
1554   bool is_oacc_routine = (fn_level >= 0);
1555   gcc_checking_assert (is_oacc_parallel
1556 		       + is_oacc_kernels
1557 		       + is_oacc_serial
1558 		       + is_oacc_routine
1559 		       == 1);
1560 
1561   bool is_oacc_kernels_parallelized
1562     = (lookup_attribute ("oacc kernels parallelized",
1563 			 DECL_ATTRIBUTES (current_function_decl)) != NULL);
1564   if (is_oacc_kernels_parallelized)
1565     gcc_checking_assert (is_oacc_kernels);
1566 
1567   if (dump_file)
1568     {
1569       if (is_oacc_parallel)
1570 	fprintf (dump_file, "Function is OpenACC parallel offload\n");
1571       else if (is_oacc_kernels)
1572 	fprintf (dump_file, "Function is %s OpenACC kernels offload\n",
1573 		 (is_oacc_kernels_parallelized
1574 		  ? "parallelized" : "unparallelized"));
1575       else if (is_oacc_serial)
1576 	fprintf (dump_file, "Function is OpenACC serial offload\n");
1577       else if (is_oacc_routine)
1578 	fprintf (dump_file, "Function is OpenACC routine level %d\n",
1579 		 fn_level);
1580       else
1581 	gcc_unreachable ();
1582     }
1583 
1584   /* Unparallelized OpenACC kernels constructs must get launched as 1 x 1 x 1
1585      kernels, so remove the parallelism dimensions function attributes
1586      potentially set earlier on.  */
1587   if (is_oacc_kernels && !is_oacc_kernels_parallelized)
1588     {
1589       oacc_set_fn_attrib (current_function_decl, NULL, NULL);
1590       attrs = oacc_get_fn_attrib (current_function_decl);
1591     }
1592 
1593   /* Discover, partition and process the loops.  */
1594   oacc_loop *loops = oacc_loop_discovery ();
1595 
1596   unsigned outer_mask = 0;
1597   if (is_oacc_routine)
1598     outer_mask = GOMP_DIM_MASK (fn_level) - 1;
1599   unsigned used_mask = oacc_loop_partition (loops, outer_mask);
1600   /* OpenACC kernels constructs are special: they currently don't use the
1601      generic oacc_loop infrastructure and attribute/dimension processing.  */
1602   if (is_oacc_kernels && is_oacc_kernels_parallelized)
1603     {
1604       /* Parallelized OpenACC kernels constructs use gang parallelism.  See
1605 	 also tree-parloops.c:create_parallel_loop.  */
1606       used_mask |= GOMP_DIM_MASK (GOMP_DIM_GANG);
1607     }
1608 
1609   int dims[GOMP_DIM_MAX];
1610   oacc_validate_dims (current_function_decl, attrs, dims, fn_level, used_mask);
1611 
1612   if (dump_file)
1613     {
1614       const char *comma = "Compute dimensions [";
1615       for (int ix = 0; ix != GOMP_DIM_MAX; ix++, comma = ", ")
1616 	fprintf (dump_file, "%s%d", comma, dims[ix]);
1617       fprintf (dump_file, "]\n");
1618     }
1619 
1620   oacc_loop_process (loops);
1621   if (dump_file)
1622     {
1623       fprintf (dump_file, "OpenACC loops\n");
1624       dump_oacc_loop (dump_file, loops, 0);
1625       fprintf (dump_file, "\n");
1626     }
1627   if (dump_enabled_p ())
1628     {
1629       oacc_loop *l = loops;
1630       /* OpenACC kernels constructs are special: they currently don't use the
1631 	 generic oacc_loop infrastructure.  */
1632       if (is_oacc_kernels)
1633 	{
1634 	  /* Create a fake oacc_loop for diagnostic purposes.  */
1635 	  l = new_oacc_loop_raw (NULL,
1636 				 DECL_SOURCE_LOCATION (current_function_decl));
1637 	  l->mask = used_mask;
1638 	}
1639       else
1640 	{
1641 	  /* Skip the outermost, dummy OpenACC loop  */
1642 	  l = l->child;
1643 	}
1644       if (l)
1645 	inform_oacc_loop (l);
1646       if (is_oacc_kernels)
1647 	free_oacc_loop (l);
1648     }
1649 
1650   /* Offloaded targets may introduce new basic blocks, which require
1651      dominance information to update SSA.  */
1652   calculate_dominance_info (CDI_DOMINATORS);
1653 
1654   /* Now lower internal loop functions to target-specific code
1655      sequences.  */
1656   basic_block bb;
1657   FOR_ALL_BB_FN (bb, cfun)
1658     for (gimple_stmt_iterator gsi = gsi_start_bb (bb); !gsi_end_p (gsi);)
1659       {
1660 	gimple *stmt = gsi_stmt (gsi);
1661 	if (!is_gimple_call (stmt))
1662 	  {
1663 	    gsi_next (&gsi);
1664 	    continue;
1665 	  }
1666 
1667 	gcall *call = as_a <gcall *> (stmt);
1668 	if (!gimple_call_internal_p (call))
1669 	  {
1670 	    gsi_next (&gsi);
1671 	    continue;
1672 	  }
1673 
1674 	/* Rewind to allow rescan.  */
1675 	gsi_prev (&gsi);
1676 	bool rescan = false, remove = false;
1677 	enum  internal_fn ifn_code = gimple_call_internal_fn (call);
1678 
1679 	switch (ifn_code)
1680 	  {
1681 	  default: break;
1682 
1683 	  case IFN_GOACC_TILE:
1684 	    oacc_xform_tile (call);
1685 	    rescan = true;
1686 	    break;
1687 
1688 	  case IFN_GOACC_LOOP:
1689 	    oacc_xform_loop (call);
1690 	    rescan = true;
1691 	    break;
1692 
1693 	  case IFN_GOACC_REDUCTION:
1694 	    /* Mark the function for SSA renaming.  */
1695 	    mark_virtual_operands_for_renaming (cfun);
1696 
1697 	    /* If the level is -1, this ended up being an unused
1698 	       axis.  Handle as a default.  */
1699 	    if (integer_minus_onep (gimple_call_arg (call, 3)))
1700 	      default_goacc_reduction (call);
1701 	    else
1702 	      targetm.goacc.reduction (call);
1703 	    rescan = true;
1704 	    break;
1705 
1706 	  case IFN_UNIQUE:
1707 	    {
1708 	      enum ifn_unique_kind kind
1709 		= ((enum ifn_unique_kind)
1710 		   TREE_INT_CST_LOW (gimple_call_arg (call, 0)));
1711 
1712 	      switch (kind)
1713 		{
1714 		default:
1715 		  break;
1716 
1717 		case IFN_UNIQUE_OACC_FORK:
1718 		case IFN_UNIQUE_OACC_JOIN:
1719 		  if (integer_minus_onep (gimple_call_arg (call, 2)))
1720 		    remove = true;
1721 		  else if (!targetm.goacc.fork_join
1722 			   (call, dims, kind == IFN_UNIQUE_OACC_FORK))
1723 		    remove = true;
1724 		  break;
1725 
1726 		case IFN_UNIQUE_OACC_HEAD_MARK:
1727 		case IFN_UNIQUE_OACC_TAIL_MARK:
1728 		  remove = true;
1729 		  break;
1730 		}
1731 	      break;
1732 	    }
1733 	  }
1734 
1735 	if (gsi_end_p (gsi))
1736 	  /* We rewound past the beginning of the BB.  */
1737 	  gsi = gsi_start_bb (bb);
1738 	else
1739 	  /* Undo the rewind.  */
1740 	  gsi_next (&gsi);
1741 
1742 	if (remove)
1743 	  {
1744 	    if (gimple_vdef (call))
1745 	      replace_uses_by (gimple_vdef (call), gimple_vuse (call));
1746 	    if (gimple_call_lhs (call))
1747 	      {
1748 		/* Propagate the data dependency var.  */
1749 		gimple *ass = gimple_build_assign (gimple_call_lhs (call),
1750 						   gimple_call_arg (call, 1));
1751 		gsi_replace (&gsi, ass,  false);
1752 	      }
1753 	    else
1754 	      gsi_remove (&gsi, true);
1755 	  }
1756 	else if (!rescan)
1757 	  /* If not rescanning, advance over the call.  */
1758 	  gsi_next (&gsi);
1759       }
1760 
1761   free_oacc_loop (loops);
1762 
1763   return 0;
1764 }
1765 
1766 /* Default launch dimension validator.  Force everything to 1.  A
1767    backend that wants to provide larger dimensions must override this
1768    hook.  */
1769 
1770 bool
default_goacc_validate_dims(tree ARG_UNUSED (decl),int * dims,int ARG_UNUSED (fn_level),unsigned ARG_UNUSED (used))1771 default_goacc_validate_dims (tree ARG_UNUSED (decl), int *dims,
1772 			     int ARG_UNUSED (fn_level),
1773 			     unsigned ARG_UNUSED (used))
1774 {
1775   bool changed = false;
1776 
1777   for (unsigned ix = 0; ix != GOMP_DIM_MAX; ix++)
1778     {
1779       if (dims[ix] != 1)
1780 	{
1781 	  dims[ix] = 1;
1782 	  changed = true;
1783 	}
1784     }
1785 
1786   return changed;
1787 }
1788 
1789 /* Default dimension bound is unknown on accelerator and 1 on host.  */
1790 
1791 int
default_goacc_dim_limit(int ARG_UNUSED (axis))1792 default_goacc_dim_limit (int ARG_UNUSED (axis))
1793 {
1794 #ifdef ACCEL_COMPILER
1795   return 0;
1796 #else
1797   return 1;
1798 #endif
1799 }
1800 
1801 namespace {
1802 
1803 const pass_data pass_data_oacc_device_lower =
1804 {
1805   GIMPLE_PASS, /* type */
1806   "oaccdevlow", /* name */
1807   OPTGROUP_OMP, /* optinfo_flags */
1808   TV_NONE, /* tv_id */
1809   PROP_cfg, /* properties_required */
1810   0 /* Possibly PROP_gimple_eomp.  */, /* properties_provided */
1811   0, /* properties_destroyed */
1812   0, /* todo_flags_start */
1813   TODO_update_ssa | TODO_cleanup_cfg, /* todo_flags_finish */
1814 };
1815 
1816 class pass_oacc_device_lower : public gimple_opt_pass
1817 {
1818 public:
pass_oacc_device_lower(gcc::context * ctxt)1819   pass_oacc_device_lower (gcc::context *ctxt)
1820     : gimple_opt_pass (pass_data_oacc_device_lower, ctxt)
1821   {}
1822 
1823   /* opt_pass methods: */
gate(function *)1824   virtual bool gate (function *) { return flag_openacc; };
1825 
execute(function *)1826   virtual unsigned int execute (function *)
1827     {
1828       return execute_oacc_device_lower ();
1829     }
1830 
1831 }; // class pass_oacc_device_lower
1832 
1833 } // anon namespace
1834 
1835 gimple_opt_pass *
make_pass_oacc_device_lower(gcc::context * ctxt)1836 make_pass_oacc_device_lower (gcc::context *ctxt)
1837 {
1838   return new pass_oacc_device_lower (ctxt);
1839 }
1840 
1841 
1842 /* Rewrite GOMP_SIMT_ENTER_ALLOC call given by GSI and remove the preceding
1843    GOMP_SIMT_ENTER call identifying the privatized variables, which are
1844    turned to structure fields and receive a DECL_VALUE_EXPR accordingly.
1845    Set *REGIMPLIFY to true, except if no privatized variables were seen.  */
1846 
1847 static void
ompdevlow_adjust_simt_enter(gimple_stmt_iterator * gsi,bool * regimplify)1848 ompdevlow_adjust_simt_enter (gimple_stmt_iterator *gsi, bool *regimplify)
1849 {
1850   gimple *alloc_stmt = gsi_stmt (*gsi);
1851   tree simtrec = gimple_call_lhs (alloc_stmt);
1852   tree simduid = gimple_call_arg (alloc_stmt, 0);
1853   gimple *enter_stmt = SSA_NAME_DEF_STMT (simduid);
1854   gcc_assert (gimple_call_internal_p (enter_stmt, IFN_GOMP_SIMT_ENTER));
1855   tree rectype = lang_hooks.types.make_type (RECORD_TYPE);
1856   TYPE_ARTIFICIAL (rectype) = TYPE_NAMELESS (rectype) = 1;
1857   TREE_ADDRESSABLE (rectype) = 1;
1858   TREE_TYPE (simtrec) = build_pointer_type (rectype);
1859   for (unsigned i = 1; i < gimple_call_num_args (enter_stmt); i++)
1860     {
1861       tree *argp = gimple_call_arg_ptr (enter_stmt, i);
1862       if (*argp == null_pointer_node)
1863 	continue;
1864       gcc_assert (TREE_CODE (*argp) == ADDR_EXPR
1865 		  && VAR_P (TREE_OPERAND (*argp, 0)));
1866       tree var = TREE_OPERAND (*argp, 0);
1867 
1868       tree field = build_decl (DECL_SOURCE_LOCATION (var), FIELD_DECL,
1869 			       DECL_NAME (var), TREE_TYPE (var));
1870       SET_DECL_ALIGN (field, DECL_ALIGN (var));
1871       DECL_USER_ALIGN (field) = DECL_USER_ALIGN (var);
1872       TREE_THIS_VOLATILE (field) = TREE_THIS_VOLATILE (var);
1873 
1874       insert_field_into_struct (rectype, field);
1875 
1876       tree t = build_simple_mem_ref (simtrec);
1877       t = build3 (COMPONENT_REF, TREE_TYPE (var), t, field, NULL);
1878       TREE_THIS_VOLATILE (t) = TREE_THIS_VOLATILE (var);
1879       SET_DECL_VALUE_EXPR (var, t);
1880       DECL_HAS_VALUE_EXPR_P (var) = 1;
1881       *regimplify = true;
1882     }
1883   layout_type (rectype);
1884   tree size = TYPE_SIZE_UNIT (rectype);
1885   tree align = build_int_cst (TREE_TYPE (size), TYPE_ALIGN_UNIT (rectype));
1886 
1887   alloc_stmt
1888     = gimple_build_call_internal (IFN_GOMP_SIMT_ENTER_ALLOC, 2, size, align);
1889   gimple_call_set_lhs (alloc_stmt, simtrec);
1890   gsi_replace (gsi, alloc_stmt, false);
1891   gimple_stmt_iterator enter_gsi = gsi_for_stmt (enter_stmt);
1892   enter_stmt = gimple_build_assign (simduid, gimple_call_arg (enter_stmt, 0));
1893   gsi_replace (&enter_gsi, enter_stmt, false);
1894 
1895   use_operand_p use;
1896   gimple *exit_stmt;
1897   if (single_imm_use (simtrec, &use, &exit_stmt))
1898     {
1899       gcc_assert (gimple_call_internal_p (exit_stmt, IFN_GOMP_SIMT_EXIT));
1900       gimple_stmt_iterator exit_gsi = gsi_for_stmt (exit_stmt);
1901       tree clobber = build_clobber (rectype);
1902       exit_stmt = gimple_build_assign (build_simple_mem_ref (simtrec), clobber);
1903       gsi_insert_before (&exit_gsi, exit_stmt, GSI_SAME_STMT);
1904     }
1905   else
1906     gcc_checking_assert (has_zero_uses (simtrec));
1907 }
1908 
1909 /* Callback for walk_gimple_stmt used to scan for SIMT-privatized variables.  */
1910 
1911 static tree
find_simtpriv_var_op(tree * tp,int * walk_subtrees,void *)1912 find_simtpriv_var_op (tree *tp, int *walk_subtrees, void *)
1913 {
1914   tree t = *tp;
1915 
1916   if (VAR_P (t)
1917       && DECL_HAS_VALUE_EXPR_P (t)
1918       && lookup_attribute ("omp simt private", DECL_ATTRIBUTES (t)))
1919     {
1920       *walk_subtrees = 0;
1921       return t;
1922     }
1923   return NULL_TREE;
1924 }
1925 
1926 /* Cleanup uses of SIMT placeholder internal functions: on non-SIMT targets,
1927    VF is 1 and LANE is 0; on SIMT targets, VF is folded to a constant, and
1928    LANE is kept to be expanded to RTL later on.  Also cleanup all other SIMT
1929    internal functions on non-SIMT targets, and likewise some SIMD internal
1930    functions on SIMT targets.  */
1931 
1932 static unsigned int
execute_omp_device_lower()1933 execute_omp_device_lower ()
1934 {
1935   int vf = targetm.simt.vf ? targetm.simt.vf () : 1;
1936   bool regimplify = false;
1937   basic_block bb;
1938   gimple_stmt_iterator gsi;
1939   FOR_EACH_BB_FN (bb, cfun)
1940     for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
1941       {
1942 	gimple *stmt = gsi_stmt (gsi);
1943 	if (!is_gimple_call (stmt) || !gimple_call_internal_p (stmt))
1944 	  continue;
1945 	tree lhs = gimple_call_lhs (stmt), rhs = NULL_TREE;
1946 	tree type = lhs ? TREE_TYPE (lhs) : integer_type_node;
1947 	switch (gimple_call_internal_fn (stmt))
1948 	  {
1949 	  case IFN_GOMP_USE_SIMT:
1950 	    rhs = vf == 1 ? integer_zero_node : integer_one_node;
1951 	    break;
1952 	  case IFN_GOMP_SIMT_ENTER:
1953 	    rhs = vf == 1 ? gimple_call_arg (stmt, 0) : NULL_TREE;
1954 	    goto simtreg_enter_exit;
1955 	  case IFN_GOMP_SIMT_ENTER_ALLOC:
1956 	    if (vf != 1)
1957 	      ompdevlow_adjust_simt_enter (&gsi, &regimplify);
1958 	    rhs = vf == 1 ? null_pointer_node : NULL_TREE;
1959 	    goto simtreg_enter_exit;
1960 	  case IFN_GOMP_SIMT_EXIT:
1961 	  simtreg_enter_exit:
1962 	    if (vf != 1)
1963 	      continue;
1964 	    unlink_stmt_vdef (stmt);
1965 	    break;
1966 	  case IFN_GOMP_SIMT_LANE:
1967 	  case IFN_GOMP_SIMT_LAST_LANE:
1968 	    rhs = vf == 1 ? build_zero_cst (type) : NULL_TREE;
1969 	    break;
1970 	  case IFN_GOMP_SIMT_VF:
1971 	    rhs = build_int_cst (type, vf);
1972 	    break;
1973 	  case IFN_GOMP_SIMT_ORDERED_PRED:
1974 	    rhs = vf == 1 ? integer_zero_node : NULL_TREE;
1975 	    if (rhs || !lhs)
1976 	      unlink_stmt_vdef (stmt);
1977 	    break;
1978 	  case IFN_GOMP_SIMT_VOTE_ANY:
1979 	  case IFN_GOMP_SIMT_XCHG_BFLY:
1980 	  case IFN_GOMP_SIMT_XCHG_IDX:
1981 	    rhs = vf == 1 ? gimple_call_arg (stmt, 0) : NULL_TREE;
1982 	    break;
1983 	  case IFN_GOMP_SIMD_LANE:
1984 	  case IFN_GOMP_SIMD_LAST_LANE:
1985 	    rhs = vf != 1 ? build_zero_cst (type) : NULL_TREE;
1986 	    break;
1987 	  case IFN_GOMP_SIMD_VF:
1988 	    rhs = vf != 1 ? build_one_cst (type) : NULL_TREE;
1989 	    break;
1990 	  default:
1991 	    continue;
1992 	  }
1993 	if (lhs && !rhs)
1994 	  continue;
1995 	stmt = lhs ? gimple_build_assign (lhs, rhs) : gimple_build_nop ();
1996 	gsi_replace (&gsi, stmt, false);
1997       }
1998   if (regimplify)
1999     FOR_EACH_BB_REVERSE_FN (bb, cfun)
2000       for (gsi = gsi_last_bb (bb); !gsi_end_p (gsi); gsi_prev (&gsi))
2001 	if (walk_gimple_stmt (&gsi, NULL, find_simtpriv_var_op, NULL))
2002 	  {
2003 	    if (gimple_clobber_p (gsi_stmt (gsi)))
2004 	      gsi_remove (&gsi, true);
2005 	    else
2006 	      gimple_regimplify_operands (gsi_stmt (gsi), &gsi);
2007 	  }
2008   if (vf != 1)
2009     cfun->has_force_vectorize_loops = false;
2010   return 0;
2011 }
2012 
2013 namespace {
2014 
2015 const pass_data pass_data_omp_device_lower =
2016 {
2017   GIMPLE_PASS, /* type */
2018   "ompdevlow", /* name */
2019   OPTGROUP_OMP, /* optinfo_flags */
2020   TV_NONE, /* tv_id */
2021   PROP_cfg, /* properties_required */
2022   PROP_gimple_lomp_dev, /* properties_provided */
2023   0, /* properties_destroyed */
2024   0, /* todo_flags_start */
2025   TODO_update_ssa, /* todo_flags_finish */
2026 };
2027 
2028 class pass_omp_device_lower : public gimple_opt_pass
2029 {
2030 public:
pass_omp_device_lower(gcc::context * ctxt)2031   pass_omp_device_lower (gcc::context *ctxt)
2032     : gimple_opt_pass (pass_data_omp_device_lower, ctxt)
2033   {}
2034 
2035   /* opt_pass methods: */
gate(function * fun)2036   virtual bool gate (function *fun)
2037     {
2038       return !(fun->curr_properties & PROP_gimple_lomp_dev);
2039     }
execute(function *)2040   virtual unsigned int execute (function *)
2041     {
2042       return execute_omp_device_lower ();
2043     }
2044 
2045 }; // class pass_expand_omp_ssa
2046 
2047 } // anon namespace
2048 
2049 gimple_opt_pass *
make_pass_omp_device_lower(gcc::context * ctxt)2050 make_pass_omp_device_lower (gcc::context *ctxt)
2051 {
2052   return new pass_omp_device_lower (ctxt);
2053 }
2054 
2055 /* "omp declare target link" handling pass.  */
2056 
2057 namespace {
2058 
2059 const pass_data pass_data_omp_target_link =
2060 {
2061   GIMPLE_PASS,			/* type */
2062   "omptargetlink",		/* name */
2063   OPTGROUP_OMP,			/* optinfo_flags */
2064   TV_NONE,			/* tv_id */
2065   PROP_ssa,			/* properties_required */
2066   0,				/* properties_provided */
2067   0,				/* properties_destroyed */
2068   0,				/* todo_flags_start */
2069   TODO_update_ssa,		/* todo_flags_finish */
2070 };
2071 
2072 class pass_omp_target_link : public gimple_opt_pass
2073 {
2074 public:
pass_omp_target_link(gcc::context * ctxt)2075   pass_omp_target_link (gcc::context *ctxt)
2076     : gimple_opt_pass (pass_data_omp_target_link, ctxt)
2077   {}
2078 
2079   /* opt_pass methods: */
gate(function * fun)2080   virtual bool gate (function *fun)
2081     {
2082 #ifdef ACCEL_COMPILER
2083       return offloading_function_p (fun->decl);
2084 #else
2085       (void) fun;
2086       return false;
2087 #endif
2088     }
2089 
2090   virtual unsigned execute (function *);
2091 };
2092 
2093 /* Callback for walk_gimple_stmt used to scan for link var operands.  */
2094 
2095 static tree
find_link_var_op(tree * tp,int * walk_subtrees,void *)2096 find_link_var_op (tree *tp, int *walk_subtrees, void *)
2097 {
2098   tree t = *tp;
2099 
2100   if (VAR_P (t)
2101       && DECL_HAS_VALUE_EXPR_P (t)
2102       && is_global_var (t)
2103       && lookup_attribute ("omp declare target link", DECL_ATTRIBUTES (t)))
2104     {
2105       *walk_subtrees = 0;
2106       return t;
2107     }
2108 
2109   return NULL_TREE;
2110 }
2111 
2112 unsigned
execute(function * fun)2113 pass_omp_target_link::execute (function *fun)
2114 {
2115   basic_block bb;
2116   FOR_EACH_BB_FN (bb, fun)
2117     {
2118       gimple_stmt_iterator gsi;
2119       for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
2120 	if (walk_gimple_stmt (&gsi, NULL, find_link_var_op, NULL))
2121 	  gimple_regimplify_operands (gsi_stmt (gsi), &gsi);
2122     }
2123 
2124   return 0;
2125 }
2126 
2127 } // anon namespace
2128 
2129 gimple_opt_pass *
make_pass_omp_target_link(gcc::context * ctxt)2130 make_pass_omp_target_link (gcc::context *ctxt)
2131 {
2132   return new pass_omp_target_link (ctxt);
2133 }
2134