xref: /netbsd/external/gpl3/gcc/dist/gcc/omp-offload.cc (revision f0fbc68b)
1 /* Bits of OpenMP and OpenACC handling that is specific to device offloading
2    and a lowering pass for OpenACC device directives.
3 
4    Copyright (C) 2005-2022 Free Software Foundation, Inc.
5 
6 This file is part of GCC.
7 
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
12 
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
16 for more details.
17 
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3.  If not see
20 <http://www.gnu.org/licenses/>.  */
21 
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "tree.h"
28 #include "gimple.h"
29 #include "tree-pass.h"
30 #include "ssa.h"
31 #include "cgraph.h"
32 #include "pretty-print.h"
33 #include "diagnostic-core.h"
34 #include "fold-const.h"
35 #include "internal-fn.h"
36 #include "langhooks.h"
37 #include "gimplify.h"
38 #include "gimple-iterator.h"
39 #include "gimplify-me.h"
40 #include "gimple-walk.h"
41 #include "tree-cfg.h"
42 #include "tree-into-ssa.h"
43 #include "tree-nested.h"
44 #include "stor-layout.h"
45 #include "common/common-target.h"
46 #include "omp-general.h"
47 #include "omp-offload.h"
48 #include "lto-section-names.h"
49 #include "gomp-constants.h"
50 #include "gimple-pretty-print.h"
51 #include "intl.h"
52 #include "stringpool.h"
53 #include "attribs.h"
54 #include "cfgloop.h"
55 #include "context.h"
56 #include "convert.h"
57 #include "opts.h"
58 
/* Describe the OpenACC looping structure of a function.  The entire
   function is held in a 'NULL' loop.  */

struct oacc_loop
{
  oacc_loop *parent; /* Containing loop.  */

  oacc_loop *child; /* First inner loop.  */

  oacc_loop *sibling; /* Next loop within same parent.  */

  location_t loc; /* Location of the loop start.  */

  gcall *marker; /* Initial head marker.  */

  gcall *heads[GOMP_DIM_MAX];  /* Head marker functions, one per GOMP_DIM_* axis.  */
  gcall *tails[GOMP_DIM_MAX];  /* Tail marker functions, one per GOMP_DIM_* axis.  */

  tree routine;  /* Pseudo-loop enclosing a routine.  */

  unsigned mask;   /* Partitioning mask.  */
  unsigned e_mask; /* Partitioning of element loops (when tiling).  */
  unsigned inner;  /* Partitioning of inner loops.  */
  unsigned flags;  /* Partitioning flags.  */
  vec<gcall *> ifns;  /* Contained loop abstraction functions (IFN calls).  */
  tree chunk_size; /* Chunk size.  */
  gcall *head_end; /* Final marker of head sequence.  */
};
87 
/* Holds offload tables with decls.  Populated during discovery (see
   omp_discover_declare_target_var_r below) and consumed by
   omp_finish_file when emitting the offload tables.  */
vec<tree, va_gc> *offload_funcs, *offload_vars;
90 
91 /* Return level at which oacc routine may spawn a partitioned loop, or
92    -1 if it is not a routine (i.e. is an offload fn).  */
93 
94 int
oacc_fn_attrib_level(tree attr)95 oacc_fn_attrib_level (tree attr)
96 {
97   tree pos = TREE_VALUE (attr);
98 
99   if (!TREE_PURPOSE (pos))
100     return -1;
101 
102   int ix = 0;
103   for (ix = 0; ix != GOMP_DIM_MAX;
104        ix++, pos = TREE_CHAIN (pos))
105     if (!integer_zerop (TREE_PURPOSE (pos)))
106       break;
107 
108   return ix;
109 }
110 
/* Helper function for omp_finish_file routine.  Takes decls from V_DECLS and
   adds their addresses and sizes to constructor-vector V_CTOR.  Each
   function contributes one element (its address); each variable two
   elements (address, size).  */

static void
add_decls_addresses_to_decl_constructor (vec<tree, va_gc> *v_decls,
					 vec<constructor_elt, va_gc> *v_ctor)
{
  unsigned len = vec_safe_length (v_decls);
  for (unsigned i = 0; i < len; i++)
    {
      tree it = (*v_decls)[i];
      bool is_var = VAR_P (it);
      /* On the accelerator compiler, a "link" variable additionally
	 requires a DECL_VALUE_EXPR (the indirection slot set up for it).  */
      bool is_link_var
	= is_var
#ifdef ACCEL_COMPILER
	  && DECL_HAS_VALUE_EXPR_P (it)
#endif
	  && lookup_attribute ("omp declare target link", DECL_ATTRIBUTES (it));

      /* See also omp_finish_file and output_offload_tables in lto-cgraph.cc.  */
      if (!in_lto_p && !symtab_node::get (it))
	continue;

      tree size = NULL_TREE;
      if (is_var)
	size = fold_convert (const_ptr_type_node, DECL_SIZE_UNIT (it));

      tree addr;
      if (!is_link_var)
	addr = build_fold_addr_expr (it);
      else
	{
#ifdef ACCEL_COMPILER
	  /* For "omp declare target link" vars add address of the pointer to
	     the target table, instead of address of the var.  */
	  tree value_expr = DECL_VALUE_EXPR (it);
	  tree link_ptr_decl = TREE_OPERAND (value_expr, 0);
	  varpool_node::finalize_decl (link_ptr_decl);
	  addr = build_fold_addr_expr (link_ptr_decl);
#else
	  addr = build_fold_addr_expr (it);
#endif

	  /* Most significant bit of the size marks "omp declare target link"
	     vars in host and target tables.  */
	  unsigned HOST_WIDE_INT isize = tree_to_uhwi (size);
	  isize |= 1ULL << (int_size_in_bytes (const_ptr_type_node)
			    * BITS_PER_UNIT - 1);
	  size = wide_int_to_tree (const_ptr_type_node, isize);
	}

      CONSTRUCTOR_APPEND_ELT (v_ctor, NULL_TREE, addr);
      if (is_var)
	CONSTRUCTOR_APPEND_ELT (v_ctor, NULL_TREE, size);
    }
}
167 
168 /* Return true if DECL is a function for which its references should be
169    analyzed.  */
170 
171 static bool
omp_declare_target_fn_p(tree decl)172 omp_declare_target_fn_p (tree decl)
173 {
174   return (TREE_CODE (decl) == FUNCTION_DECL
175 	  && lookup_attribute ("omp declare target", DECL_ATTRIBUTES (decl))
176 	  && !lookup_attribute ("omp declare target host",
177 				DECL_ATTRIBUTES (decl))
178 	  && (!flag_openacc
179 	      || oacc_get_fn_attrib (decl) == NULL_TREE));
180 }
181 
182 /* Return true if DECL Is a variable for which its initializer references
183    should be analyzed.  */
184 
185 static bool
omp_declare_target_var_p(tree decl)186 omp_declare_target_var_p (tree decl)
187 {
188   return (VAR_P (decl)
189 	  && lookup_attribute ("omp declare target", DECL_ATTRIBUTES (decl))
190 	  && !lookup_attribute ("omp declare target link",
191 				DECL_ATTRIBUTES (decl)));
192 }
193 
/* Helper function for omp_discover_implicit_declare_target, called through
   walk_tree.  Mark referenced FUNCTION_DECLs implicitly as
   declare target to.  DATA points to a vec<tree> worklist; functions
   with a body that get newly marked are pushed onto it so their own
   references are scanned later.  */

static tree
omp_discover_declare_target_tgt_fn_r (tree *tp, int *walk_subtrees, void *data)
{
  if (TREE_CODE (*tp) == CALL_EXPR
      && CALL_EXPR_FN (*tp)
      && TREE_CODE (CALL_EXPR_FN (*tp)) == ADDR_EXPR
      && TREE_CODE (TREE_OPERAND (CALL_EXPR_FN (*tp), 0)) == FUNCTION_DECL
      && lookup_attribute ("omp declare variant base",
			   DECL_ATTRIBUTES (TREE_OPERAND (CALL_EXPR_FN (*tp),
							  0))))
    {
      /* Direct call to an "omp declare variant base" function: recurse
	 into every variant recorded in its attribute chain as well.  */
      tree fn = TREE_OPERAND (CALL_EXPR_FN (*tp), 0);
      for (tree attr = DECL_ATTRIBUTES (fn); attr; attr = TREE_CHAIN (attr))
	{
	  attr = lookup_attribute ("omp declare variant base", attr);
	  if (attr == NULL_TREE)
	    break;
	  tree purpose = TREE_PURPOSE (TREE_VALUE (attr));
	  if (TREE_CODE (purpose) == FUNCTION_DECL)
	    omp_discover_declare_target_tgt_fn_r (&purpose, walk_subtrees, data);
	}
    }
  else if (TREE_CODE (*tp) == FUNCTION_DECL)
    {
      tree decl = *tp;
      tree id = get_identifier ("omp declare target");
      symtab_node *node = symtab_node::get (*tp);
      if (node != NULL)
	{
	  /* First follow tree-level alias targets, marking each node on
	     the way as offloadable/declare target.  */
	  while (node->alias_target
		 && TREE_CODE (node->alias_target) == FUNCTION_DECL)
	    {
	      if (!omp_declare_target_fn_p (node->decl)
		  && !lookup_attribute ("omp declare target host",
					DECL_ATTRIBUTES (node->decl)))
		{
		  node->offloadable = 1;
		  DECL_ATTRIBUTES (node->decl)
		    = tree_cons (id, NULL_TREE, DECL_ATTRIBUTES (node->decl));
		}
	      node = symtab_node::get (node->alias_target);
	    }
	  /* Then walk symtab aliases up to the ultimate target, marking
	     every intermediate alias too.  */
	  symtab_node *new_node = node->ultimate_alias_target ();
	  decl = new_node->decl;
	  while (node != new_node)
	    {
	      if (!omp_declare_target_fn_p (node->decl)
		  && !lookup_attribute ("omp declare target host",
					DECL_ATTRIBUTES (node->decl)))
		{
		  node->offloadable = 1;
		  DECL_ATTRIBUTES (node->decl)
		    = tree_cons (id, NULL_TREE, DECL_ATTRIBUTES (node->decl));
		}
	      gcc_assert (node->alias && node->analyzed);
	      node = node->get_alias_target ();
	    }
	  node->offloadable = 1;
	  if (ENABLE_OFFLOADING)
	    g->have_offload = true;
	}
      /* Already declare target (or explicitly host-only): nothing to do.  */
      if (omp_declare_target_fn_p (decl)
	  || lookup_attribute ("omp declare target host",
			       DECL_ATTRIBUTES (decl)))
	return NULL_TREE;

      /* Queue the function's body for scanning, then mark it.  */
      if (!DECL_EXTERNAL (decl) && DECL_SAVED_TREE (decl))
	((vec<tree> *) data)->safe_push (decl);
      DECL_ATTRIBUTES (decl) = tree_cons (id, NULL_TREE,
					  DECL_ATTRIBUTES (decl));
    }
  else if (TYPE_P (*tp))
    *walk_subtrees = 0;
  /* else if (TREE_CODE (*tp) == OMP_TARGET)
       {
	 if (tree dev = omp_find_clause (OMP_TARGET_CLAUSES (*tp)))
	   if (OMP_DEVICE_ANCESTOR (dev))
	     *walk_subtrees = 0;
       } */
  return NULL_TREE;
}
279 
280 /* Similarly, but ignore references outside of OMP_TARGET regions.  */
281 
282 static tree
omp_discover_declare_target_fn_r(tree * tp,int * walk_subtrees,void * data)283 omp_discover_declare_target_fn_r (tree *tp, int *walk_subtrees, void *data)
284 {
285   if (TREE_CODE (*tp) == OMP_TARGET)
286     {
287       /* And not OMP_DEVICE_ANCESTOR.  */
288       walk_tree_without_duplicates (&OMP_TARGET_BODY (*tp),
289 				    omp_discover_declare_target_tgt_fn_r,
290 				    data);
291       *walk_subtrees = 0;
292     }
293   else if (TYPE_P (*tp))
294     *walk_subtrees = 0;
295   return NULL_TREE;
296 }
297 
/* Helper function for omp_discover_implicit_declare_target, called through
   walk_tree.  Mark referenced global VAR_DECLs (and FUNCTION_DECLs)
   implicitly as declare target to.  DATA points to the worklist vec.  */

static tree
omp_discover_declare_target_var_r (tree *tp, int *walk_subtrees, void *data)
{
  if (TREE_CODE (*tp) == FUNCTION_DECL)
    /* Function references are handled by the target-region walker.  */
    return omp_discover_declare_target_tgt_fn_r (tp, walk_subtrees, data);
  else if (VAR_P (*tp)
	   && is_global_var (*tp)
	   && !omp_declare_target_var_p (*tp))
    {
      tree id = get_identifier ("omp declare target");
      /* A variable may not be both declare target "link" and implicitly
	 "to"; diagnose and drop the "link" attribute.  */
      if (lookup_attribute ("omp declare target link", DECL_ATTRIBUTES (*tp)))
	{
	  error_at (DECL_SOURCE_LOCATION (*tp),
		    "%qD specified both in declare target %<link%> and "
		    "implicitly in %<to%> clauses", *tp);
	  DECL_ATTRIBUTES (*tp)
	    = remove_attribute ("omp declare target link", DECL_ATTRIBUTES (*tp));
	}
      /* Queue the initializer of static vars for its own scan.  */
      if (TREE_STATIC (*tp) && lang_hooks.decls.omp_get_decl_init (*tp))
	((vec<tree> *) data)->safe_push (*tp);
      DECL_ATTRIBUTES (*tp) = tree_cons (id, NULL_TREE, DECL_ATTRIBUTES (*tp));
      symtab_node *node = symtab_node::get (*tp);
      if (node != NULL && !node->offloadable)
	{
	  node->offloadable = 1;
	  if (ENABLE_OFFLOADING)
	    {
	      g->have_offload = true;
	      /* Record the variable in the offload table.  */
	      if (is_a <varpool_node *> (node))
		vec_safe_push (offload_vars, node->decl);
	    }
	}
    }
  else if (TYPE_P (*tp))
    *walk_subtrees = 0;
  return NULL_TREE;
}
339 
/* Perform the OpenMP implicit declare target to discovery.  Seed a
   worklist with functions that are declare target or contain an OpenMP
   target construct (including nested functions), plus declare target
   variables with initializers, then transitively mark and scan
   everything they reference.  */

void
omp_discover_implicit_declare_target (void)
{
  cgraph_node *node;
  varpool_node *vnode;
  auto_vec<tree> worklist;

  FOR_EACH_DEFINED_FUNCTION (node)
    if (DECL_SAVED_TREE (node->decl))
      {
	struct cgraph_node *cgn;
        if (omp_declare_target_fn_p (node->decl))
	  worklist.safe_push (node->decl);
	else if (DECL_STRUCT_FUNCTION (node->decl)
		 && DECL_STRUCT_FUNCTION (node->decl)->has_omp_target)
	  worklist.safe_push (node->decl);
	/* Nested functions are seeded by the same criteria.  */
	for (cgn = first_nested_function (node);
	     cgn; cgn = next_nested_function (cgn))
	  if (omp_declare_target_fn_p (cgn->decl))
	    worklist.safe_push (cgn->decl);
	  else if (DECL_STRUCT_FUNCTION (cgn->decl)
		   && DECL_STRUCT_FUNCTION (cgn->decl)->has_omp_target)
	    worklist.safe_push (cgn->decl);
      }
  FOR_EACH_VARIABLE (vnode)
    if (lang_hooks.decls.omp_get_decl_init (vnode->decl)
	&& omp_declare_target_var_p (vnode->decl))
      worklist.safe_push (vnode->decl);
  /* Drain the worklist.  The walkers push newly discovered decls back
     onto it, so this reaches the transitive closure.  */
  while (!worklist.is_empty ())
    {
      tree decl = worklist.pop ();
      if (VAR_P (decl))
	walk_tree_without_duplicates (lang_hooks.decls.omp_get_decl_init (decl),
				      omp_discover_declare_target_var_r,
				      &worklist);
      else if (omp_declare_target_fn_p (decl))
	walk_tree_without_duplicates (&DECL_SAVED_TREE (decl),
				      omp_discover_declare_target_tgt_fn_r,
				      &worklist);
      else
	/* Functions merely containing target regions: only references
	   inside those regions count.  */
	walk_tree_without_duplicates (&DECL_SAVED_TREE (decl),
				      omp_discover_declare_target_fn_r,
				      &worklist);
    }

  lang_hooks.decls.omp_finish_decl_inits ();
}
389 
390 
/* Create new symbols containing (address, size) pairs for global variables,
   marked with "omp declare target" attribute, as well as addresses for the
   functions, which are outlined offloading regions.  On targets with named
   sections the tables are emitted as ".offload_func_table" /
   ".offload_var_table" array variables; otherwise each symbol is handed to
   the targetm.record_offload_symbol hook.  */
void
omp_finish_file (void)
{
  unsigned num_funcs = vec_safe_length (offload_funcs);
  unsigned num_vars = vec_safe_length (offload_vars);

  if (num_funcs == 0 && num_vars == 0)
    return;

  if (targetm_common.have_named_sections)
    {
      vec<constructor_elt, va_gc> *v_f, *v_v;
      vec_alloc (v_f, num_funcs);
      /* Two table entries (address, size) per variable.  */
      vec_alloc (v_v, num_vars * 2);

      add_decls_addresses_to_decl_constructor (offload_funcs, v_f);
      add_decls_addresses_to_decl_constructor (offload_vars, v_v);

      tree vars_decl_type = build_array_type_nelts (pointer_sized_int_node,
						    vec_safe_length (v_v));
      tree funcs_decl_type = build_array_type_nelts (pointer_sized_int_node,
						     num_funcs);
      SET_TYPE_ALIGN (vars_decl_type, TYPE_ALIGN (pointer_sized_int_node));
      SET_TYPE_ALIGN (funcs_decl_type, TYPE_ALIGN (pointer_sized_int_node));
      tree ctor_v = build_constructor (vars_decl_type, v_v);
      tree ctor_f = build_constructor (funcs_decl_type, v_f);
      TREE_CONSTANT (ctor_v) = TREE_CONSTANT (ctor_f) = 1;
      TREE_STATIC (ctor_v) = TREE_STATIC (ctor_f) = 1;
      tree funcs_decl = build_decl (UNKNOWN_LOCATION, VAR_DECL,
				    get_identifier (".offload_func_table"),
				    funcs_decl_type);
      tree vars_decl = build_decl (UNKNOWN_LOCATION, VAR_DECL,
				   get_identifier (".offload_var_table"),
				   vars_decl_type);
      TREE_STATIC (funcs_decl) = TREE_STATIC (vars_decl) = 1;
      /* Do not align tables more than TYPE_ALIGN (pointer_sized_int_node),
	 otherwise a joint table in a binary will contain padding between
	 tables from multiple object files.  */
      DECL_USER_ALIGN (funcs_decl) = DECL_USER_ALIGN (vars_decl) = 1;
      SET_DECL_ALIGN (funcs_decl, TYPE_ALIGN (funcs_decl_type));
      SET_DECL_ALIGN (vars_decl, TYPE_ALIGN (vars_decl_type));
      DECL_INITIAL (funcs_decl) = ctor_f;
      DECL_INITIAL (vars_decl) = ctor_v;
      set_decl_section_name (funcs_decl, OFFLOAD_FUNC_TABLE_SECTION_NAME);
      set_decl_section_name (vars_decl, OFFLOAD_VAR_TABLE_SECTION_NAME);

      varpool_node::finalize_decl (vars_decl);
      varpool_node::finalize_decl (funcs_decl);
    }
  else
    {
      /* No named-section support: record each symbol via the target hook.  */
      for (unsigned i = 0; i < num_funcs; i++)
	{
	  tree it = (*offload_funcs)[i];
	  /* See also add_decls_addresses_to_decl_constructor
	     and output_offload_tables in lto-cgraph.cc.  */
	  if (!in_lto_p && !symtab_node::get (it))
	    continue;
	  targetm.record_offload_symbol (it);
	}
      for (unsigned i = 0; i < num_vars; i++)
	{
	  tree it = (*offload_vars)[i];
	  if (!in_lto_p && !symtab_node::get (it))
	    continue;
#ifdef ACCEL_COMPILER
	  /* For "link" variables record the indirection pointer instead
	     of the variable itself (cf. the named-sections path above).  */
	  if (DECL_HAS_VALUE_EXPR_P (it)
	      && lookup_attribute ("omp declare target link",
				   DECL_ATTRIBUTES (it)))
	    {
	      tree value_expr = DECL_VALUE_EXPR (it);
	      tree link_ptr_decl = TREE_OPERAND (value_expr, 0);
	      targetm.record_offload_symbol (link_ptr_decl);
	      varpool_node::finalize_decl (link_ptr_decl);
	    }
	  else
#endif
	    targetm.record_offload_symbol (it);
	}
    }
}
475 
476 /* Call dim_pos (POS == true) or dim_size (POS == false) builtins for
477    axis DIM.  Return a tmp var holding the result.  */
478 
479 static tree
oacc_dim_call(bool pos,int dim,gimple_seq * seq)480 oacc_dim_call (bool pos, int dim, gimple_seq *seq)
481 {
482   tree arg = build_int_cst (unsigned_type_node, dim);
483   tree size = create_tmp_var (integer_type_node);
484   enum internal_fn fn = pos ? IFN_GOACC_DIM_POS : IFN_GOACC_DIM_SIZE;
485   gimple *call = gimple_build_call_internal (fn, 1, arg);
486 
487   gimple_call_set_lhs (call, size);
488   gimple_seq_add_stmt (seq, call);
489 
490   return size;
491 }
492 
493 /* Find the number of threads (POS = false), or thread number (POS =
494    true) for an OpenACC region partitioned as MASK.  Setup code
495    required for the calculation is added to SEQ.  */
496 
497 static tree
oacc_thread_numbers(bool pos,int mask,gimple_seq * seq)498 oacc_thread_numbers (bool pos, int mask, gimple_seq *seq)
499 {
500   tree res = pos ? NULL_TREE : build_int_cst (unsigned_type_node, 1);
501   unsigned ix;
502 
503   /* Start at gang level, and examine relevant dimension indices.  */
504   for (ix = GOMP_DIM_GANG; ix != GOMP_DIM_MAX; ix++)
505     if (GOMP_DIM_MASK (ix) & mask)
506       {
507 	if (res)
508 	  {
509 	    /* We had an outer index, so scale that by the size of
510 	       this dimension.  */
511 	    tree n = oacc_dim_call (false, ix, seq);
512 	    res = fold_build2 (MULT_EXPR, integer_type_node, res, n);
513 	  }
514 	if (pos)
515 	  {
516 	    /* Determine index in this dimension.  */
517 	    tree id = oacc_dim_call (true, ix, seq);
518 	    if (res)
519 	      res = fold_build2 (PLUS_EXPR, integer_type_node, res, id);
520 	    else
521 	      res = id;
522 	  }
523       }
524 
525   if (res == NULL_TREE)
526     res = integer_zero_node;
527 
528   return res;
529 }
530 
/* Transform IFN_GOACC_LOOP calls to actual code.  See
   expand_oacc_for for where these are generated.  At the vector
   level, we stride loops, such that each member of a warp will
   operate on adjacent iterations.  At the worker and gang level,
   each gang/warp executes a set of contiguous iterations.  Chunking
   can override this such that each iteration engine executes a
   contiguous chunk, and then moves on to stride to the next chunk.

   Call arguments: 0 = loop kind, 1 = direction, 2 = range, 3 = step,
   4 = chunk size, 5 = partitioning mask, 6 = chunk no / offset
   (kind-dependent).  */

static void
oacc_xform_loop (gcall *call)
{
  gimple_stmt_iterator gsi = gsi_for_stmt (call);
  enum ifn_goacc_loop_kind code
    = (enum ifn_goacc_loop_kind) TREE_INT_CST_LOW (gimple_call_arg (call, 0));
  tree dir = gimple_call_arg (call, 1);
  tree range = gimple_call_arg (call, 2);
  tree step = gimple_call_arg (call, 3);
  tree chunk_size = NULL_TREE;
  unsigned mask = (unsigned) TREE_INT_CST_LOW (gimple_call_arg (call, 5));
  tree lhs = gimple_call_lhs (call);
  tree type = NULL_TREE;
  tree diff_type = TREE_TYPE (range);
  tree r = NULL_TREE;
  gimple_seq seq = NULL;
  bool chunking = false, striding = true;
  unsigned outer_mask = mask & (~mask + 1); // Outermost partitioning
  unsigned inner_mask = mask & ~outer_mask; // Inner partitioning (if any)

  /* Skip lowering if return value of IFN_GOACC_LOOP call is not used.  */
  if (!lhs)
    {
      gsi_replace_with_seq (&gsi, seq, true);
      return;
    }

  type = TREE_TYPE (lhs);

  /* Only the accelerator compiler consults the chunk-size argument;
     the host compiler keeps striding=true, chunking=false.  */
#ifdef ACCEL_COMPILER
  chunk_size = gimple_call_arg (call, 4);
  if (integer_minus_onep (chunk_size)  /* Force static allocation.  */
      || integer_zerop (chunk_size))   /* Default (also static).  */
    {
      /* If we're at the gang level, we want each to execute a
	 contiguous run of iterations.  Otherwise we want each element
	 to stride.  */
      striding = !(outer_mask & GOMP_DIM_MASK (GOMP_DIM_GANG));
      chunking = false;
    }
  else
    {
      /* Chunk of size 1 is striding.  */
      striding = integer_onep (chunk_size);
      chunking = !striding;
    }
#endif

  /* striding=true, chunking=true
       -> invalid.
     striding=true, chunking=false
       -> chunks=1
     striding=false,chunking=true
       -> chunks=ceil (range/(chunksize*threads*step))
     striding=false,chunking=false
       -> chunk_size=ceil(range/(threads*step)),chunks=1  */
  push_gimplify_context (true);

  switch (code)
    {
    default: gcc_unreachable ();

    case IFN_GOACC_LOOP_CHUNKS:
      if (!chunking)
	r = build_int_cst (type, 1);
      else
	{
	  /* chunk_max
	     = (range - dir) / (chunks * step * num_threads) + dir  */
	  tree per = oacc_thread_numbers (false, mask, &seq);
	  per = fold_convert (type, per);
	  chunk_size = fold_convert (type, chunk_size);
	  per = fold_build2 (MULT_EXPR, type, per, chunk_size);
	  per = fold_build2 (MULT_EXPR, type, per, step);
	  r = build2 (MINUS_EXPR, type, range, dir);
	  r = build2 (PLUS_EXPR, type, r, per);
	  r = build2 (TRUNC_DIV_EXPR, type, r, per);
	}
      break;

    case IFN_GOACC_LOOP_STEP:
      {
	/* If striding, step by the entire compute volume, otherwise
	   step by the inner volume.  */
	unsigned volume = striding ? mask : inner_mask;

	r = oacc_thread_numbers (false, volume, &seq);
	r = build2 (MULT_EXPR, type, fold_convert (type, r), step);
      }
      break;

    case IFN_GOACC_LOOP_OFFSET:
      /* Enable vectorization on non-SIMT targets.  */
      if (!targetm.simt.vf
	  && outer_mask == GOMP_DIM_MASK (GOMP_DIM_VECTOR)
	  /* If not -fno-tree-loop-vectorize, hint that we want to vectorize
	     the loop.  */
	  && (flag_tree_loop_vectorize
	      || !OPTION_SET_P (flag_tree_loop_vectorize)))
	{
	  basic_block bb = gsi_bb (gsi);
	  class loop *parent = bb->loop_father;
	  class loop *body = parent->inner;

	  parent->force_vectorize = true;
	  parent->safelen = INT_MAX;

	  /* "Chunking loops" may have inner loops.  */
	  if (parent->inner)
	    {
	      body->force_vectorize = true;
	      body->safelen = INT_MAX;
	    }

	  cfun->has_force_vectorize_loops = true;
	}
      if (striding)
	{
	  /* Striding: offset is simply the linear thread number.  */
	  r = oacc_thread_numbers (true, mask, &seq);
	  r = fold_convert (diff_type, r);
	}
      else
	{
	  /* Chunked (or single contiguous chunk): compute the offset
	     from outer position * span plus inner position.  */
	  tree inner_size = oacc_thread_numbers (false, inner_mask, &seq);
	  tree outer_size = oacc_thread_numbers (false, outer_mask, &seq);
	  tree volume = fold_build2 (MULT_EXPR, TREE_TYPE (inner_size),
				     inner_size, outer_size);

	  volume = fold_convert (diff_type, volume);
	  if (chunking)
	    chunk_size = fold_convert (diff_type, chunk_size);
	  else
	    {
	      /* chunk_size = (range + per - dir) / per, i.e. one chunk
		 covering the whole range.  */
	      tree per = fold_build2 (MULT_EXPR, diff_type, volume, step);

	      chunk_size = build2 (MINUS_EXPR, diff_type, range, dir);
	      chunk_size = build2 (PLUS_EXPR, diff_type, chunk_size, per);
	      chunk_size = build2 (TRUNC_DIV_EXPR, diff_type, chunk_size, per);
	    }

	  tree span = build2 (MULT_EXPR, diff_type, chunk_size,
			      fold_convert (diff_type, inner_size));
	  r = oacc_thread_numbers (true, outer_mask, &seq);
	  r = fold_convert (diff_type, r);
	  r = build2 (MULT_EXPR, diff_type, r, span);

	  tree inner = oacc_thread_numbers (true, inner_mask, &seq);
	  inner = fold_convert (diff_type, inner);
	  r = fold_build2 (PLUS_EXPR, diff_type, r, inner);

	  if (chunking)
	    {
	      /* Advance by whole chunks already executed (arg 6 is the
		 chunk number here).  */
	      tree chunk = fold_convert (diff_type, gimple_call_arg (call, 6));
	      tree per
		= fold_build2 (MULT_EXPR, diff_type, volume, chunk_size);
	      per = build2 (MULT_EXPR, diff_type, per, chunk);

	      r = build2 (PLUS_EXPR, diff_type, r, per);
	    }
	}
      r = fold_build2 (MULT_EXPR, diff_type, r, step);
      if (type != diff_type)
	r = fold_convert (type, r);
      break;

    case IFN_GOACC_LOOP_BOUND:
      if (striding)
	r = range;
      else
	{
	  tree inner_size = oacc_thread_numbers (false, inner_mask, &seq);
	  tree outer_size = oacc_thread_numbers (false, outer_mask, &seq);
	  tree volume = fold_build2 (MULT_EXPR, TREE_TYPE (inner_size),
				     inner_size, outer_size);

	  volume = fold_convert (diff_type, volume);
	  if (chunking)
	    chunk_size = fold_convert (diff_type, chunk_size);
	  else
	    {
	      tree per = fold_build2 (MULT_EXPR, diff_type, volume, step);

	      chunk_size = build2 (MINUS_EXPR, diff_type, range, dir);
	      chunk_size = build2 (PLUS_EXPR, diff_type, chunk_size, per);
	      chunk_size = build2 (TRUNC_DIV_EXPR, diff_type, chunk_size, per);
	    }

	  tree span = build2 (MULT_EXPR, diff_type, chunk_size,
			      fold_convert (diff_type, inner_size));

	  r = fold_build2 (MULT_EXPR, diff_type, span, step);

	  /* Bound = offset (arg 6) + span, clamped to range.  */
	  tree offset = gimple_call_arg (call, 6);
	  r = build2 (PLUS_EXPR, diff_type, r,
		      fold_convert (diff_type, offset));
	  r = build2 (integer_onep (dir) ? MIN_EXPR : MAX_EXPR,
		      diff_type, r, range);
	}
      if (diff_type != type)
	r = fold_convert (type, r);
      break;
    }

  gimplify_assign (lhs, r, &seq);

  pop_gimplify_context (NULL);

  gsi_replace_with_seq (&gsi, seq, true);
}
748 
/* Transform a GOACC_TILE call.  Determines the element loop span for
   the specified loop of the nest.  This is 1 if we're not tiling.

   GOACC_TILE (collapse_count, loop_no, tile_arg, gwv_tile, gwv_element);  */

static void
oacc_xform_tile (gcall *call)
{
  gimple_stmt_iterator gsi = gsi_for_stmt (call);
  unsigned collapse = tree_to_uhwi (gimple_call_arg (call, 0));
  /* Inner loops have higher loop_nos.  */
  unsigned loop_no = tree_to_uhwi (gimple_call_arg (call, 1));
  tree tile_size = gimple_call_arg (call, 2);
  unsigned e_mask = tree_to_uhwi (gimple_call_arg (call, 4));
  tree lhs = gimple_call_lhs (call);
  tree type = TREE_TYPE (lhs);
  gimple_seq seq = NULL;
  tree span = build_int_cst (type, 1);

  /* Element loops may only be worker and/or vector partitioned.  */
  gcc_assert (!(e_mask
		& ~(GOMP_DIM_MASK (GOMP_DIM_VECTOR)
		    | GOMP_DIM_MASK (GOMP_DIM_WORKER))));
  push_gimplify_context (!seen_error ());

#ifndef ACCEL_COMPILER
  /* Partitioning disabled on host compilers.  */
  e_mask = 0;
#endif
  if (!e_mask)
    /* Not partitioning.  */
    span = integer_one_node;
  else if (!integer_zerop (tile_size))
    /* User explicitly specified size.  */
    span = tile_size;
  else
    {
      /* Pick a size based on the partitioning of the element loop and
	 the number of loop nests.  */
      tree first_size = NULL_TREE;
      tree second_size = NULL_TREE;

      if (e_mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR))
	first_size = oacc_dim_call (false, GOMP_DIM_VECTOR, &seq);
      if (e_mask & GOMP_DIM_MASK (GOMP_DIM_WORKER))
	second_size = oacc_dim_call (false, GOMP_DIM_WORKER, &seq);

      /* Vector-only or worker-only: treat the single size as primary.  */
      if (!first_size)
	{
	  first_size = second_size;
	  second_size = NULL_TREE;
	}

      if (loop_no + 1 == collapse)
	{
	  /* Innermost loop of the nest.  */
	  span = first_size;
	  if (!loop_no && second_size)
	    span = fold_build2 (MULT_EXPR, TREE_TYPE (span),
				span, second_size);
	}
      else if (loop_no + 2 == collapse)
	span = second_size;
      else
	span = NULL_TREE;

      if (!span)
	/* There's no obvious element size for this loop.  Options
	   are 1, first_size or some non-unity constant (32 is my
	   favourite).   We should gather some statistics.  */
	span = first_size;
    }

  span = fold_convert (type, span);
  gimplify_assign (lhs, span, &seq);

  pop_gimplify_context (NULL);

  gsi_replace_with_seq (&gsi, seq, true);
}
827 
/* Default partitioned and minimum partitioned dimensions.  Initialized by
   oacc_parse_default_dims; accessed through oacc_get_default_dim and
   oacc_get_min_dim.  */

static int oacc_default_dims[GOMP_DIM_MAX];
static int oacc_min_dims[GOMP_DIM_MAX];
832 
833 int
oacc_get_default_dim(int dim)834 oacc_get_default_dim (int dim)
835 {
836   gcc_assert (0 <= dim && dim < GOMP_DIM_MAX);
837   return oacc_default_dims[dim];
838 }
839 
840 int
oacc_get_min_dim(int dim)841 oacc_get_min_dim (int dim)
842 {
843   gcc_assert (0 <= dim && dim < GOMP_DIM_MAX);
844   return oacc_min_dims[dim];
845 }
846 
/* Parse the default dimension parameter.  This is a set of
   :-separated optional compute dimensions.  Each specified dimension
   is a positive integer.  When device type support is added, it is
   planned to be a comma separated list of such compute dimensions,
   with all but the first prefixed by the colon-terminated device
   type.  */

static void
oacc_parse_default_dims (const char *dims)
{
  int ix;

  /* Reset: defaults unset (-1), minima 1.  */
  for (ix = GOMP_DIM_MAX; ix--;)
    {
      oacc_default_dims[ix] = -1;
      oacc_min_dims[ix] = 1;
    }

#ifndef ACCEL_COMPILER
  /* Cannot be overridden on the host.  */
  dims = NULL;
#endif
  if (dims)
    {
      const char *pos = dims;

      for (ix = 0; *pos && ix != GOMP_DIM_MAX; ix++)
	{
	  if (ix)
	    {
	      /* Subsequent elements must be ':'-separated.  */
	      if (*pos != ':')
		goto malformed;
	      pos++;
	    }

	  /* An empty element (next char is ':') leaves this axis at
	     its default.  */
	  if (*pos != ':')
	    {
	      long val;
	      const char *eptr;

	      errno = 0;
	      val = strtol (pos, CONST_CAST (char **, &eptr), 10);
	      /* Reject parse failures, non-positive values, and values
		 that do not fit in an int.  */
	      if (errno || val <= 0 || (int) val != val)
		goto malformed;
	      pos = eptr;
	      oacc_default_dims[ix] = (int) val;
	    }
	}
      if (*pos)
	{
	malformed:
	  error_at (UNKNOWN_LOCATION,
		    "%<-fopenacc-dim%> operand is malformed at %qs", pos);
	}
    }

  /* Allow the backend to validate the dimensions.  */
  targetm.goacc.validate_dims (NULL_TREE, oacc_default_dims, -1, 0);
  targetm.goacc.validate_dims (NULL_TREE, oacc_min_dims, -2, 0);
}
907 
/* Validate and update the dimensions for offloaded FN.  ATTRS is the
   raw attribute.  DIMS is an array of dimensions, which is filled in.
   LEVEL is the partitioning level of a routine, or -1 for an offload
   region itself.  USED is the mask of partitioned execution in the
   function.  */

static void
oacc_validate_dims (tree fn, tree attrs, int *dims, int level, unsigned used)
{
  tree purpose[GOMP_DIM_MAX];
  unsigned ix;
  tree pos = TREE_VALUE (attrs);

  /* Make sure the attribute creator attached the dimension
     information.  */
  gcc_assert (pos);

  /* Unpack the attribute's TREE_LIST into DIMS, remembering each
     TREE_PURPOSE so the list can be rebuilt below.  A missing value
     becomes -1 (to be defaulted).  */
  for (ix = 0; ix != GOMP_DIM_MAX; ix++)
    {
      purpose[ix] = TREE_PURPOSE (pos);
      tree val = TREE_VALUE (pos);
      dims[ix] = val ? TREE_INT_CST_LOW (val) : -1;
      pos = TREE_CHAIN (pos);
    }

  bool check = true;
#ifdef ACCEL_COMPILER
  check = false;
#endif
  /* Warn about suspicious dimension/usage mismatches, but not for
     'kernels' regions (and only in the host compiler).  */
  if (check
      && warn_openacc_parallelism
      && !lookup_attribute ("oacc kernels", DECL_ATTRIBUTES (fn)))
    {
      static char const *const axes[] =
      /* Must be kept in sync with GOMP_DIM enumeration.  */
	{ "gang", "worker", "vector" };
      for (ix = level >= 0 ? level : 0; ix != GOMP_DIM_MAX; ix++)
	if (dims[ix] < 0)
	  ; /* Defaulting axis.  */
	else if ((used & GOMP_DIM_MASK (ix)) && dims[ix] == 1)
	  /* There is partitioned execution, but the user requested a
	     dimension size of 1.  They're probably confused.  */
	  warning_at (DECL_SOURCE_LOCATION (fn), OPT_Wopenacc_parallelism,
		      "region contains %s partitioned code but"
		      " is not %s partitioned", axes[ix], axes[ix]);
	else if (!(used & GOMP_DIM_MASK (ix)) && dims[ix] != 1)
	  /* The dimension is explicitly partitioned to non-unity, but
	     no use is made within the region.  */
	  warning_at (DECL_SOURCE_LOCATION (fn), OPT_Wopenacc_parallelism,
		      "region is %s partitioned but"
		      " does not contain %s partitioned code",
		      axes[ix], axes[ix]);
    }

  bool changed = targetm.goacc.validate_dims (fn, dims, level, used);

  /* Default anything left to 1 or a partitioned default.  */
  for (ix = 0; ix != GOMP_DIM_MAX; ix++)
    if (dims[ix] < 0)
      {
	/* The OpenACC spec says 'If the [num_gangs] clause is not
	   specified, an implementation-defined default will be used;
	   the default may depend on the code within the construct.'
	   (2.5.6).  Thus an implementation is free to choose
	   non-unity default for a parallel region that doesn't have
	   any gang-partitioned loops.  However, it appears that there
	   is a sufficient body of user code that expects non-gang
	   partitioned regions to not execute in gang-redundant mode.
	   So we (a) don't warn about the non-portability and (b) pick
	   the minimum permissible dimension size when there is no
	   partitioned execution.  Otherwise we pick the global
	   default for the dimension, which the user can control.  The
	   same wording and logic applies to num_workers and
	   vector_length, however the worker- or vector- single
	   execution doesn't have the same impact as gang-redundant
	   execution.  (If the minimum gang-level partitioning is not 1,
	   the target is probably too confusing.)  */
	dims[ix] = (used & GOMP_DIM_MASK (ix)
		    ? oacc_default_dims[ix] : oacc_min_dims[ix]);
	changed = true;
      }

  if (changed)
    {
      /* Replace the attribute with new values.  */
      pos = NULL_TREE;
      for (ix = GOMP_DIM_MAX; ix--;)
	pos = tree_cons (purpose[ix],
			 build_int_cst (integer_type_node, dims[ix]), pos);
      oacc_replace_fn_attrib (fn, pos);
    }
}
1000 
1001 /* Create an empty OpenACC loop structure at LOC.  */
1002 
1003 static oacc_loop *
new_oacc_loop_raw(oacc_loop * parent,location_t loc)1004 new_oacc_loop_raw (oacc_loop *parent, location_t loc)
1005 {
1006   oacc_loop *loop = XCNEW (oacc_loop);
1007 
1008   loop->parent = parent;
1009 
1010   if (parent)
1011     {
1012       loop->sibling = parent->child;
1013       parent->child = loop;
1014     }
1015 
1016   loop->loc = loc;
1017   return loop;
1018 }
1019 
/* Create an outermost, dummy OpenACC loop for offloaded function
   DECL.  It has no parent and is located at DECL's source location.  */

static oacc_loop *
new_oacc_loop_outer (tree decl)
{
  return new_oacc_loop_raw (NULL, DECL_SOURCE_LOCATION (decl));
}
1028 
1029 /* Start a new OpenACC loop  structure beginning at head marker HEAD.
1030    Link into PARENT loop.  Return the new loop.  */
1031 
1032 static oacc_loop *
new_oacc_loop(oacc_loop * parent,gcall * marker)1033 new_oacc_loop (oacc_loop *parent, gcall *marker)
1034 {
1035   oacc_loop *loop = new_oacc_loop_raw (parent, gimple_location (marker));
1036 
1037   loop->marker = marker;
1038 
1039   /* TODO: This is where device_type flattening would occur for the loop
1040      flags.  */
1041 
1042   loop->flags = TREE_INT_CST_LOW (gimple_call_arg (marker, 3));
1043 
1044   tree chunk_size = integer_zero_node;
1045   if (loop->flags & OLF_GANG_STATIC)
1046     chunk_size = gimple_call_arg (marker, 4);
1047   loop->chunk_size = chunk_size;
1048 
1049   return loop;
1050 }
1051 
/* Create a dummy loop encompassing a call to a openACC routine.
   Extract the routine's partitioning requirements.  */

static void
new_oacc_loop_routine (oacc_loop *parent, gcall *call, tree decl, tree attrs)
{
  oacc_loop *loop = new_oacc_loop_raw (parent, gimple_location (call));
  int level = oacc_fn_attrib_level (attrs);

  /* Only routines (level >= 0) reach here; offload regions have
     level -1.  */
  gcc_assert (level >= 0);

  loop->marker = call;
  loop->routine = decl;
  /* Claim all axes at LEVEL and above: all-dims mask with the bits
     below LEVEL cleared.  */
  loop->mask = ((GOMP_DIM_MASK (GOMP_DIM_MAX) - 1)
		^ (GOMP_DIM_MASK (level) - 1));
}
1068 
1069 /* Finish off the current OpenACC loop ending at tail marker TAIL.
1070    Return the parent loop.  */
1071 
1072 static oacc_loop *
finish_oacc_loop(oacc_loop * loop)1073 finish_oacc_loop (oacc_loop *loop)
1074 {
1075   /* If the loop has been collapsed, don't partition it.  */
1076   if (loop->ifns.is_empty ())
1077     loop->mask = loop->flags = 0;
1078   return loop->parent;
1079 }
1080 
/* Free all OpenACC loop structures within LOOP (inclusive): the whole
   sibling chain and all descendants.  */

static void
free_oacc_loop (oacc_loop *loop)
{
  if (loop->sibling)
    free_oacc_loop (loop->sibling);
  if (loop->child)
    free_oacc_loop (loop->child);

  /* Release the vector of recorded internal-fn calls before freeing
     the node itself.  */
  loop->ifns.release ();
  free (loop);
}
1094 
/* Dump out the OpenACC loop head or tail beginning at FROM.  TITLE
   and LEVEL label the output.  The walk continues across basic-block
   boundaries (via the single successor) until another marker of the
   same kind is met.  */

static void
dump_oacc_loop_part (FILE *file, gcall *from, int depth,
		     const char *title, int level)
{
  enum ifn_unique_kind kind
    = (enum ifn_unique_kind) TREE_INT_CST_LOW (gimple_call_arg (from, 0));

  fprintf (file, "%*s%s-%d:\n", depth * 2, "", title, level);
  for (gimple_stmt_iterator gsi = gsi_for_stmt (from);;)
    {
      gimple *stmt = gsi_stmt (gsi);

      if (gimple_call_internal_p (stmt, IFN_UNIQUE))
	{
	  enum ifn_unique_kind k
	    = ((enum ifn_unique_kind) TREE_INT_CST_LOW
	       (gimple_call_arg (stmt, 0)));

	  /* Stop at the next marker of the same kind (but print FROM
	     itself).  */
	  if (k == kind && stmt != from)
	    break;
	}
      print_gimple_stmt (file, stmt, depth * 2 + 2);

      gsi_next (&gsi);
      /* Fell off the end of the block: continue in its single
	 successor.  */
      while (gsi_end_p (gsi))
	gsi = gsi_start_bb (single_succ (gsi_bb (gsi)));
    }
}
1125 
/* Dump OpenACC loop LOOP, its children, and its siblings to FILE,
   indented by DEPTH.  */

static void
dump_oacc_loop (FILE *file, oacc_loop *loop, int depth)
{
  int ix;

  fprintf (file, "%*sLoop %x(%x) %s:%u\n", depth * 2, "",
	   loop->flags, loop->mask,
	   LOCATION_FILE (loop->loc), LOCATION_LINE (loop->loc));

  if (loop->marker)
    print_gimple_stmt (file, loop->marker, depth * 2);

  if (loop->routine)
    fprintf (file, "%*sRoutine %s:%u:%s\n",
	     depth * 2, "", DECL_SOURCE_FILE (loop->routine),
	     DECL_SOURCE_LINE (loop->routine),
	     IDENTIFIER_POINTER (DECL_NAME (loop->routine)));

  /* Heads outermost-first, tails innermost-first, mirroring their
     nesting in the IL.  */
  for (ix = GOMP_DIM_GANG; ix != GOMP_DIM_MAX; ix++)
    if (loop->heads[ix])
      dump_oacc_loop_part (file, loop->heads[ix], depth, "Head", ix);
  for (ix = GOMP_DIM_MAX; ix--;)
    if (loop->tails[ix])
      dump_oacc_loop_part (file, loop->tails[ix], depth, "Tail", ix);

  if (loop->child)
    dump_oacc_loop (file, loop->child, depth + 1);
  if (loop->sibling)
    dump_oacc_loop (file, loop->sibling, depth);
}
1158 
1159 void debug_oacc_loop (oacc_loop *);
1160 
/* Dump loops to stderr.  Intended for use from the debugger.  */

DEBUG_FUNCTION void
debug_oacc_loop (oacc_loop *loop)
{
  dump_oacc_loop (stderr, loop, 0);
}
1168 
/* Provide diagnostics on OpenACC loop LOOP, its children, and its
   siblings: report (via the optimization-report machinery) which
   parallelism axes were assigned to each loop.  */

static void
inform_oacc_loop (const oacc_loop *loop)
{
  const char *gang
    = loop->mask & GOMP_DIM_MASK (GOMP_DIM_GANG) ? " gang" : "";
  const char *worker
    = loop->mask & GOMP_DIM_MASK (GOMP_DIM_WORKER) ? " worker" : "";
  const char *vector
    = loop->mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR) ? " vector" : "";
  /* No axes at all means the loop runs sequentially.  */
  const char *seq = loop->mask == 0 ? " seq" : "";
  const dump_user_location_t loc
    = dump_user_location_t::from_location_t (loop->loc);
  dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, loc,
		   "assigned OpenACC%s%s%s%s loop parallelism\n", gang, worker,
		   vector, seq);

  if (loop->child)
    inform_oacc_loop (loop->child);
  if (loop->sibling)
    inform_oacc_loop (loop->sibling);
}
1193 
/* DFS walk of basic blocks BB onwards, creating OpenACC loop
   structures as we go.  By construction these loops are properly
   nested.  */

static void
oacc_loop_discover_walk (oacc_loop *loop, basic_block bb)
{
  /* MARKER counts head/tail marker calls seen in the current
     sequence; REMAINING is how many are still expected.  */
  int marker = 0;
  int remaining = 0;

  if (bb->flags & BB_VISITED)
    return;

 follow:
  bb->flags |= BB_VISITED;

  /* Scan for loop markers.  */
  for (gimple_stmt_iterator gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
       gsi_next (&gsi))
    {
      gimple *stmt = gsi_stmt (gsi);

      if (!is_gimple_call (stmt))
	continue;

      gcall *call = as_a <gcall *> (stmt);

      /* If this is a routine, make a dummy loop for it.  */
      if (tree decl = gimple_call_fndecl (call))
	if (tree attrs = oacc_get_fn_attrib (decl))
	  {
	    gcc_assert (!marker);
	    new_oacc_loop_routine (loop, call, decl, attrs);
	  }

      if (!gimple_call_internal_p (call))
	continue;

      switch (gimple_call_internal_fn (call))
	{
	default:
	  break;

	case IFN_GOACC_LOOP:
	case IFN_GOACC_TILE:
	  /* Record the abstraction function, so we can manipulate it
	     later.  */
	  loop->ifns.safe_push (call);
	  break;

	case IFN_UNIQUE:
	  enum ifn_unique_kind kind
	    = (enum ifn_unique_kind) (TREE_INT_CST_LOW
				      (gimple_call_arg (call, 0)));
	  if (kind == IFN_UNIQUE_OACC_HEAD_MARK
	      || kind == IFN_UNIQUE_OACC_TAIL_MARK)
	    {
	      /* A 2-argument marker terminates a head/tail sequence;
		 longer markers carry the sequence length in arg 2.  */
	      if (gimple_call_num_args (call) == 2)
		{
		  gcc_assert (marker && !remaining);
		  marker = 0;
		  if (kind == IFN_UNIQUE_OACC_TAIL_MARK)
		    loop = finish_oacc_loop (loop);
		  else
		    loop->head_end = call;
		}
	      else
		{
		  int count = TREE_INT_CST_LOW (gimple_call_arg (call, 2));

		  if (!marker)
		    {
		      /* First marker of a sequence: a head marker
			 opens a new loop.  */
		      if (kind == IFN_UNIQUE_OACC_HEAD_MARK)
			loop = new_oacc_loop (loop, call);
		      remaining = count;
		    }
		  gcc_assert (count == remaining);
		  if (remaining)
		    {
		      remaining--;
		      /* Heads are filled outermost-first, tails
			 innermost-first.  */
		      if (kind == IFN_UNIQUE_OACC_HEAD_MARK)
			loop->heads[marker] = call;
		      else
			loop->tails[remaining] = call;
		    }
		  marker++;
		}
	    }
	}
    }
  if (remaining || marker)
    {
      /* A marker sequence is still open at the end of this block; it
	 must continue in the single successor.  */
      bb = single_succ (bb);
      gcc_assert (single_pred_p (bb) && !(bb->flags & BB_VISITED));
      goto follow;
    }

  /* Walk successor blocks.  */
  edge e;
  edge_iterator ei;

  FOR_EACH_EDGE (e, ei, bb->succs)
    oacc_loop_discover_walk (loop, e->dest);
}
1298 
1299 /* LOOP is the first sibling.  Reverse the order in place and return
1300    the new first sibling.  Recurse to child loops.  */
1301 
1302 static oacc_loop *
oacc_loop_sibling_nreverse(oacc_loop * loop)1303 oacc_loop_sibling_nreverse (oacc_loop *loop)
1304 {
1305   oacc_loop *last = NULL;
1306   do
1307     {
1308       if (loop->child)
1309 	loop->child = oacc_loop_sibling_nreverse (loop->child);
1310 
1311       oacc_loop *next = loop->sibling;
1312       loop->sibling = last;
1313       last = loop;
1314       loop = next;
1315     }
1316   while (loop);
1317 
1318   return last;
1319 }
1320 
/* Discover the OpenACC loops marked up by HEAD and TAIL markers for
   the current function.  Returns the root of the loop tree (a dummy
   outermost loop covering the whole function).  */

static oacc_loop *
oacc_loop_discovery ()
{
  /* Clear basic block flags, in particular BB_VISITED which we're going to use
     in the following.  */
  clear_bb_flags ();

  oacc_loop *top = new_oacc_loop_outer (current_function_decl);
  oacc_loop_discover_walk (top, ENTRY_BLOCK_PTR_FOR_FN (cfun));

  /* The siblings were constructed in reverse order, reverse them so
     that diagnostics come out in an unsurprising order.  */
  top = oacc_loop_sibling_nreverse (top);

  return top;
}
1340 
/* Transform the abstract internal function markers starting at FROM
   to be for partitioning level LEVEL.  Stop when we meet another HEAD
   or TAIL  marker.  */

static void
oacc_loop_xform_head_tail (gcall *from, int level)
{
  enum ifn_unique_kind kind
    = (enum ifn_unique_kind) TREE_INT_CST_LOW (gimple_call_arg (from, 0));
  tree replacement = build_int_cst (unsigned_type_node, level);

  for (gimple_stmt_iterator gsi = gsi_for_stmt (from);;)
    {
      gimple *stmt = gsi_stmt (gsi);

      if (gimple_call_internal_p (stmt, IFN_UNIQUE))
	{
	  enum ifn_unique_kind k
	    = ((enum ifn_unique_kind)
	       TREE_INT_CST_LOW (gimple_call_arg (stmt, 0)));

	  /* Fork/join/private markers carry the level in arg 2;
	     rewrite it.  Another marker of FROM's kind ends the
	     sequence.  */
	  if (k == IFN_UNIQUE_OACC_FORK
	      || k == IFN_UNIQUE_OACC_JOIN
	      || k == IFN_UNIQUE_OACC_PRIVATE)
	    *gimple_call_arg_ptr (stmt, 2) = replacement;
	  else if (k == kind && stmt != from)
	    break;
	}
      else if (gimple_call_internal_p (stmt, IFN_GOACC_REDUCTION))
	/* Reductions carry the level in arg 3.  */
	*gimple_call_arg_ptr (stmt, 3) = replacement;
      update_stmt (stmt);

      gsi_next (&gsi);
      /* Fell off the end of the block: continue in its single
	 successor.  */
      while (gsi_end_p (gsi))
	gsi = gsi_start_bb (single_succ (gsi_bb (gsi)));
    }
}
1378 
/* Process the discovered OpenACC loops, setting the correct
   partitioning level etc.  FN_LEVEL is the enclosing routine's
   partitioning level (GOMP_DIM_GANG for a 'routine gang').  */

static void
oacc_loop_process (oacc_loop *loop, int fn_level)
{
  if (loop->child)
    oacc_loop_process (loop->child, fn_level);

  if (loop->mask && !loop->routine)
    {
      int ix;
      tree mask_arg = build_int_cst (unsigned_type_node, loop->mask);
      tree e_mask_arg = build_int_cst (unsigned_type_node, loop->e_mask);
      tree chunk_arg = loop->chunk_size;
      gcall *call;

      /* Patch the recorded IFN_GOACC_LOOP/TILE calls with the
	 now-final masks and chunk size.  */
      for (ix = 0; loop->ifns.iterate (ix, &call); ix++)
	{
	  switch (gimple_call_internal_fn (call))
	    {
	    case IFN_GOACC_LOOP:
	      {
		/* Arg 5 == -1 flags an element-loop call.  */
		bool is_e = gimple_call_arg (call, 5) == integer_minus_one_node;
		gimple_call_set_arg (call, 5, is_e ? e_mask_arg : mask_arg);
		if (!is_e)
		  gimple_call_set_arg (call, 4, chunk_arg);
	      }
	      break;

	    case IFN_GOACC_TILE:
	      gimple_call_set_arg (call, 3, mask_arg);
	      gimple_call_set_arg (call, 4, e_mask_arg);
	      break;

	    default:
	      gcc_unreachable ();
	    }
	  update_stmt (call);
	}

      /* Assign the ix'th head/tail pair to the ix'th set bit of the
	 combined mask, outermost dimension first.  */
      unsigned dim = GOMP_DIM_GANG;
      unsigned mask = loop->mask | loop->e_mask;
      for (ix = 0; ix != GOMP_DIM_MAX && mask; ix++)
	{
	  while (!(GOMP_DIM_MASK (dim) & mask))
	    dim++;

	  oacc_loop_xform_head_tail (loop->heads[ix], dim);
	  oacc_loop_xform_head_tail (loop->tails[ix], dim);

	  mask ^= GOMP_DIM_MASK (dim);
	}
    }

  if (loop->sibling)
    oacc_loop_process (loop->sibling, fn_level);


  /* OpenACC 2.6, 2.9.11. "reduction clause" places a restriction such that
     "The 'reduction' clause may not be specified on an orphaned 'loop'
     construct with the 'gang' clause, or on an orphaned 'loop' construct that
     will generate gang parallelism in a procedure that is compiled with the
     'routine gang' clause."  */
  if (fn_level == GOMP_DIM_GANG
      && (loop->mask & GOMP_DIM_MASK (GOMP_DIM_GANG))
      && (loop->flags & OLF_REDUCTION))
    error_at (loop->loc,
	      "gang reduction on an orphan loop");
}
1449 
/* Walk the OpenACC loop hierarchy checking and assigning the
   programmer-specified partitionings.  OUTER_MASK is the partitioning
   this loop is contained within.  Return mask of partitioning
   encountered.  If any auto loops are discovered, set GOMP_DIM_MAX
   bit.  */

static unsigned
oacc_loop_fixed_partitions (oacc_loop *loop, unsigned outer_mask)
{
  unsigned this_mask = loop->mask;
  unsigned mask_all = 0;
  bool noisy = true;

#ifdef ACCEL_COMPILER
  /* When device_type is supported, we want the device compiler to be
     noisy, if the loop parameters are device_type-specific.  */
  noisy = false;
#endif

  if (!loop->routine)
    {
      bool auto_par = (loop->flags & OLF_AUTO) != 0;
      bool seq_par = (loop->flags & OLF_SEQ) != 0;
      bool tiling = (loop->flags & OLF_TILE) != 0;

      /* Extract the explicitly requested axes from the loop flags.  */
      this_mask = ((loop->flags >> OLF_DIM_BASE)
		   & (GOMP_DIM_MASK (GOMP_DIM_MAX) - 1));

      /* Apply auto partitioning if this is a non-partitioned regular
	 loop, or (no more than) single axis tiled loop.  */
      bool maybe_auto
	= !seq_par && this_mask == (tiling ? this_mask & -this_mask : 0);

      /* 'seq', 'auto' and explicit axes are mutually exclusive.  */
      if ((this_mask != 0) + auto_par + seq_par > 1)
	{
	  if (noisy)
	    error_at (loop->loc,
		      seq_par
		      ? G_("%<seq%> overrides other OpenACC loop specifiers")
		      : G_("%<auto%> conflicts with other OpenACC loop "
			   "specifiers"));
	  maybe_auto = false;
	  loop->flags &= ~OLF_AUTO;
	  if (seq_par)
	    {
	      loop->flags
		&= ~((GOMP_DIM_MASK (GOMP_DIM_MAX) - 1) << OLF_DIM_BASE);
	      this_mask = 0;
	    }
	}

      if (maybe_auto && (loop->flags & OLF_INDEPENDENT))
	{
	  /* Flag for later auto-partitioning via the GOMP_DIM_MAX
	     bit.  */
	  loop->flags |= OLF_AUTO;
	  mask_all |= GOMP_DIM_MASK (GOMP_DIM_MAX);
	}
    }

  if (this_mask & outer_mask)
    {
      /* This loop reuses an axis some containing loop already
	 claimed.  Find that loop for the diagnostic.  */
      const oacc_loop *outer;
      for (outer = loop->parent; outer; outer = outer->parent)
	if ((outer->mask | outer->e_mask) & this_mask)
	  break;

      if (noisy)
	{
	  if (outer)
	    {
	      error_at (loop->loc,
			loop->routine
			? G_("routine call uses same OpenACC parallelism"
			     " as containing loop")
			: G_("inner loop uses same OpenACC parallelism"
			     " as containing loop"));
	      inform (outer->loc, "containing loop here");
	    }
	  else
	    error_at (loop->loc,
		      loop->routine
		      ? G_("routine call uses OpenACC parallelism disallowed"
			   " by containing routine")
		      : G_("loop uses OpenACC parallelism disallowed"
			   " by containing routine"));

	  if (loop->routine)
	    inform (DECL_SOURCE_LOCATION (loop->routine),
		    "routine %qD declared here", loop->routine);
	}
      this_mask &= ~outer_mask;
    }
  else
    {
      unsigned outermost = least_bit_hwi (this_mask);

      /* The outermost axis this loop requests must be inside (i.e. a
	 higher bit than) everything the containing loops use.  */
      if (outermost && outermost <= outer_mask)
	{
	  if (noisy)
	    {
	      error_at (loop->loc,
			"incorrectly nested OpenACC loop parallelism");

	      const oacc_loop *outer;
	      for (outer = loop->parent;
		   outer->flags && outer->flags < outermost;
		   outer = outer->parent)
		continue;
	      inform (outer->loc, "containing loop here");
	    }

	  this_mask &= ~outermost;
	}
    }

  mask_all |= this_mask;

  if (loop->flags & OLF_TILE)
    {
      /* When tiling, vector goes to the element loop, and failing
	 that we put worker there.  The std doesn't contemplate
	 specifying all three.  We choose to put worker and vector on
	 the element loops in that case.  */
      unsigned this_e_mask = this_mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR);
      if (!this_e_mask || this_mask & GOMP_DIM_MASK (GOMP_DIM_GANG))
	this_e_mask |= this_mask & GOMP_DIM_MASK (GOMP_DIM_WORKER);

      loop->e_mask = this_e_mask;
      this_mask ^= this_e_mask;
    }

  loop->mask = this_mask;

  if (dump_file)
    fprintf (dump_file, "Loop %s:%d user specified %d & %d\n",
	     LOCATION_FILE (loop->loc), LOCATION_LINE (loop->loc),
	     loop->mask, loop->e_mask);

  if (loop->child)
    {
      unsigned tmp_mask = outer_mask | this_mask | loop->e_mask;
      loop->inner = oacc_loop_fixed_partitions (loop->child, tmp_mask);
      mask_all |= loop->inner;
    }

  if (loop->sibling)
    mask_all |= oacc_loop_fixed_partitions (loop->sibling, outer_mask);

  return mask_all;
}
1599 
/* Walk the OpenACC loop hierarchy to assign auto-partitioned loops.
   OUTER_MASK is the partitioning this loop is contained within.
   OUTER_ASSIGN is true if an outer loop is being auto-partitioned.
   Return the cumulative partitioning used by this loop, siblings and
   children.  */

static unsigned
oacc_loop_auto_partitions (oacc_loop *loop, unsigned outer_mask,
			   bool outer_assign)
{
  bool assign = (loop->flags & OLF_AUTO) && (loop->flags & OLF_INDEPENDENT);
  bool noisy = true;
  bool tiling = loop->flags & OLF_TILE;

#ifdef ACCEL_COMPILER
  /* When device_type is supported, we want the device compiler to be
     noisy, if the loop parameters are device_type-specific.  */
  noisy = false;
#endif

  if (assign && (!outer_assign || loop->inner))
    {
      /* Allocate outermost and non-innermost loops at the outermost
	 non-innermost available level.  */
      unsigned this_mask = GOMP_DIM_MASK (GOMP_DIM_GANG);

      /* Find the first outermost available partition. */
      while (this_mask <= outer_mask)
	this_mask <<= 1;

      /* Grab two axes if tiling, and we've not assigned anything  */
      if (tiling && !(loop->mask | loop->e_mask))
	this_mask |= this_mask << 1;

      /* Prohibit the innermost partitioning at the moment.  */
      this_mask &= GOMP_DIM_MASK (GOMP_DIM_MAX - 1) - 1;

      /* Don't use any dimension explicitly claimed by an inner loop. */
      this_mask &= ~loop->inner;

      if (tiling && !loop->e_mask)
	{
	  /* If we got two axes, allocate the inner one to the element
	     loop.  */
	  loop->e_mask = this_mask & (this_mask << 1);
	  this_mask ^= loop->e_mask;
	}

      loop->mask |= this_mask;
    }

  if (loop->child)
    {
      unsigned tmp_mask = outer_mask | loop->mask | loop->e_mask;
      loop->inner = oacc_loop_auto_partitions (loop->child, tmp_mask,
					       outer_assign | assign);
    }

  if (assign && (!loop->mask || (tiling && !loop->e_mask) || !outer_assign))
    {
      /* Allocate the loop at the innermost available level.  Note
	 that we do this even if we already assigned this loop the
	 outermost available level above.  That way we'll partition
	 this along 2 axes, if they are available.  */
      unsigned this_mask = 0;

      /* Determine the outermost partitioning used within this loop.  */
      this_mask = loop->inner | GOMP_DIM_MASK (GOMP_DIM_MAX);
      this_mask = least_bit_hwi (this_mask);

      /* Pick the partitioning just inside that one.  */
      this_mask >>= 1;

      /* And avoid picking one use by an outer loop.  */
      this_mask &= ~outer_mask;

      /* If tiling and we failed completely above, grab the next one
	 too.  Making sure it doesn't hit an outer loop.  */
      if (tiling)
	{
	  this_mask &= ~(loop->e_mask | loop->mask);
	  unsigned tile_mask = ((this_mask >> 1)
				& ~(outer_mask | loop->e_mask | loop->mask));

	  if (tile_mask || loop->mask)
	    {
	      loop->e_mask |= this_mask;
	      this_mask = tile_mask;
	    }
	  if (!loop->e_mask && noisy)
	    warning_at (loop->loc, 0,
			"insufficient partitioning available"
			" to parallelize element loop");
	}

      loop->mask |= this_mask;
      if (!loop->mask && noisy)
	warning_at (loop->loc, 0,
		    tiling
		    ? G_("insufficient partitioning available"
			 " to parallelize tile loop")
		    : G_("insufficient partitioning available"
			 " to parallelize loop"));
    }

  if (assign && dump_file)
    fprintf (dump_file, "Auto loop %s:%d assigned %d & %d\n",
	     LOCATION_FILE (loop->loc), LOCATION_LINE (loop->loc),
	     loop->mask, loop->e_mask);

  unsigned inner_mask = 0;

  if (loop->sibling)
    inner_mask |= oacc_loop_auto_partitions (loop->sibling,
					     outer_mask, outer_assign);

  inner_mask |= loop->inner | loop->mask | loop->e_mask;

  return inner_mask;
}
1720 
1721 /* Walk the OpenACC loop heirarchy to check and assign partitioning
1722    axes.  Return mask of partitioning.  */
1723 
1724 static unsigned
oacc_loop_partition(oacc_loop * loop,unsigned outer_mask)1725 oacc_loop_partition (oacc_loop *loop, unsigned outer_mask)
1726 {
1727   unsigned mask_all = oacc_loop_fixed_partitions (loop, outer_mask);
1728 
1729   if (mask_all & GOMP_DIM_MASK (GOMP_DIM_MAX))
1730     {
1731       mask_all ^= GOMP_DIM_MASK (GOMP_DIM_MAX);
1732       mask_all |= oacc_loop_auto_partitions (loop, outer_mask, false);
1733     }
1734   return mask_all;
1735 }
1736 
1737 /* Default fork/join early expander.  Delete the function calls if
1738    there is no RTL expander.  */
1739 
1740 bool
default_goacc_fork_join(gcall * ARG_UNUSED (call),const int * ARG_UNUSED (dims),bool is_fork)1741 default_goacc_fork_join (gcall *ARG_UNUSED (call),
1742 			 const int *ARG_UNUSED (dims), bool is_fork)
1743 {
1744   if (is_fork)
1745     return targetm.have_oacc_fork ();
1746   else
1747     return targetm.have_oacc_join ();
1748 }
1749 
/* Default goacc.reduction early expander.

   LHS-opt = IFN_REDUCTION (KIND, RES_PTR, VAR, LEVEL, OP, OFFSET)
   If RES_PTR is not integer-zerop:
       SETUP - emit 'LHS = *RES_PTR', LHS = NULL
       TEARDOWN - emit '*RES_PTR = VAR'
   If LHS is not NULL
       emit 'LHS = VAR'   */

void
default_goacc_reduction (gcall *call)
{
  unsigned code = (unsigned)TREE_INT_CST_LOW (gimple_call_arg (call, 0));
  gimple_stmt_iterator gsi = gsi_for_stmt (call);
  tree lhs = gimple_call_lhs (call);
  tree var = gimple_call_arg (call, 2);
  gimple_seq seq = NULL;

  if (code == IFN_GOACC_REDUCTION_SETUP
      || code == IFN_GOACC_REDUCTION_TEARDOWN)
    {
      /* Setup and Teardown need to copy from/to the receiver object,
	 if there is one.  */
      tree ref_to_res = gimple_call_arg (call, 1);

      if (!integer_zerop (ref_to_res))
	{
	  tree dst = build_simple_mem_ref (ref_to_res);
	  tree src = var;

	  if (code == IFN_GOACC_REDUCTION_SETUP)
	    {
	      /* Setup copies in the opposite direction (*RES_PTR into
		 LHS), and the final 'LHS = VAR' copy is suppressed.  */
	      src = dst;
	      dst = lhs;
	      lhs = NULL;
	    }
	  gimple_seq_add_stmt (&seq, gimple_build_assign (dst, src));
	}
    }

  /* Copy VAR to LHS, if there is an LHS.  */
  if (lhs)
    gimple_seq_add_stmt (&seq, gimple_build_assign (lhs, var));

  /* Replace the internal-fn call with the (possibly empty) sequence.  */
  gsi_replace_with_seq (&gsi, seq, true);
}
1796 
/* Context passed (via walk_stmt_info's 'info' field) to
   oacc_rewrite_var_decl while rewriting VAR_DECL references.  */

struct var_decl_rewrite_info
{
  /* Statement being walked; used as the insertion point for any new
     statements the rewrite needs.  */
  gimple *stmt;
  /* Map from original VAR_DECLs to their replacement decls.  */
  hash_map<tree, tree> *adjusted_vars;
  /* If true, rewrite ADDR_EXPRs in place rather than emitting an
     explicit pointer-conversion statement pair.  */
  bool avoid_pointer_conversion;
  /* Set when any rewrite was actually performed.  */
  bool modified;
};
1804 
1805 /* Helper function for execute_oacc_device_lower.  Rewrite VAR_DECLs (by
1806    themselves or wrapped in various other nodes) according to ADJUSTED_VARS in
1807    the var_decl_rewrite_info pointed to via DATA.  Used as part of coercing
1808    gang-private variables in OpenACC offload regions to reside in GPU shared
1809    memory.  */
1810 
static tree
oacc_rewrite_var_decl (tree *tp, int *walk_subtrees, void *data)
{
  walk_stmt_info *wi = (walk_stmt_info *) data;
  var_decl_rewrite_info *info = (var_decl_rewrite_info *) wi->info;

  if (TREE_CODE (*tp) == ADDR_EXPR)
    {
      tree arg = TREE_OPERAND (*tp, 0);
      tree *new_arg = info->adjusted_vars->get (arg);

      if (new_arg)
	{
	  if (info->avoid_pointer_conversion)
	    {
	      /* Take the address of the replacement variable directly and
		 use its (possibly different) pointer type as-is.  */
	      *tp = build_fold_addr_expr (*new_arg);
	      info->modified = true;
	      *walk_subtrees = 0;
	    }
	  else
	    {
	      /* Take the address of the replacement variable, convert it
		 back to the original pointer type via two statements
		 inserted before INFO->STMT, and substitute the converted
		 SSA name for the original ADDR_EXPR.  */
	      gimple_stmt_iterator gsi = gsi_for_stmt (info->stmt);
	      tree repl = build_fold_addr_expr (*new_arg);
	      gimple *stmt1
		= gimple_build_assign (make_ssa_name (TREE_TYPE (repl)), repl);
	      tree conv = convert_to_pointer (TREE_TYPE (*tp),
					      gimple_assign_lhs (stmt1));
	      gimple *stmt2
		= gimple_build_assign (make_ssa_name (TREE_TYPE (*tp)), conv);
	      gsi_insert_before (&gsi, stmt1, GSI_SAME_STMT);
	      gsi_insert_before (&gsi, stmt2, GSI_SAME_STMT);
	      *tp = gimple_assign_lhs (stmt2);
	      info->modified = true;
	      *walk_subtrees = 0;
	    }
	}
    }
  else if (TREE_CODE (*tp) == COMPONENT_REF || TREE_CODE (*tp) == ARRAY_REF)
    {
      /* Strip nested component/array references to find the innermost base
	 object.  */
      tree *base = &TREE_OPERAND (*tp, 0);

      while (TREE_CODE (*base) == COMPONENT_REF
	     || TREE_CODE (*base) == ARRAY_REF)
	base = &TREE_OPERAND (*base, 0);

      if (TREE_CODE (*base) != VAR_DECL)
	return NULL;

      tree *new_decl = info->adjusted_vars->get (*base);
      if (!new_decl)
	return NULL;

      int base_quals = TYPE_QUALS (TREE_TYPE (*new_decl));
      tree field = TREE_OPERAND (*tp, 1);

      /* Adjust the type of the field: propagate the replacement base's
	 qualifiers onto the field's (innermost element) type.  */
      int field_quals = TYPE_QUALS (TREE_TYPE (field));
      if (TREE_CODE (field) == FIELD_DECL && field_quals != base_quals)
	{
	  tree *field_type = &TREE_TYPE (field);
	  while (TREE_CODE (*field_type) == ARRAY_TYPE)
	    field_type = &TREE_TYPE (*field_type);
	  field_quals |= base_quals;
	  *field_type = build_qualified_type (*field_type, field_quals);
	}

      /* Adjust the type of the component ref itself.  */
      tree comp_type = TREE_TYPE (*tp);
      int comp_quals = TYPE_QUALS (comp_type);
      if (TREE_CODE (*tp) == COMPONENT_REF && comp_quals != base_quals)
	{
	  comp_quals |= base_quals;
	  TREE_TYPE (*tp)
	    = build_qualified_type (comp_type, comp_quals);
	}

      /* Substitute the replacement variable as the base object.  */
      *base = *new_decl;
      info->modified = true;
    }
  else if (TREE_CODE (*tp) == VAR_DECL)
    {
      /* A plain variable use: replace it directly.  */
      tree *new_decl = info->adjusted_vars->get (*tp);
      if (new_decl)
	{
	  *tp = *new_decl;
	  info->modified = true;
	}
    }

  return NULL_TREE;
}
1902 
1903 /* Return TRUE if CALL is a call to a builtin atomic/sync operation.  */
1904 
1905 static bool
is_sync_builtin_call(gcall * call)1906 is_sync_builtin_call (gcall *call)
1907 {
1908   tree callee = gimple_call_fndecl (call);
1909 
1910   if (callee != NULL_TREE
1911       && gimple_call_builtin_p (call, BUILT_IN_NORMAL))
1912     switch (DECL_FUNCTION_CODE (callee))
1913       {
1914 #undef DEF_SYNC_BUILTIN
1915 #define DEF_SYNC_BUILTIN(ENUM, NAME, TYPE, ATTRS) case ENUM:
1916 #include "sync-builtins.def"
1917 #undef DEF_SYNC_BUILTIN
1918 	return true;
1919 
1920       default:
1921 	;
1922       }
1923 
1924   return false;
1925 }
1926 
1927 /* Main entry point for oacc transformations which run on the device
1928    compiler after LTO, so we know what the target device is at this
1929    point (including the host fallback).  */
1930 
static unsigned int
execute_oacc_loop_designation ()
{
  tree attrs = oacc_get_fn_attrib (current_function_decl);

  if (!attrs)
    /* Not an offloaded function.  */
    return 0;

  /* Parse the default dim argument exactly once; afterwards the flag is
     replaced by a self-pointer sentinel so reparsing is skipped.  */
  if ((const void *)flag_openacc_dims != &flag_openacc_dims)
    {
      oacc_parse_default_dims (flag_openacc_dims);
      flag_openacc_dims = (char *)&flag_openacc_dims;
    }

  /* Determine which kind of OpenACC construct this offloaded function
     implements; exactly one of these (or 'routine') must hold.  */
  bool is_oacc_parallel
    = (lookup_attribute ("oacc parallel",
			 DECL_ATTRIBUTES (current_function_decl)) != NULL);
  bool is_oacc_kernels
    = (lookup_attribute ("oacc kernels",
			 DECL_ATTRIBUTES (current_function_decl)) != NULL);
  bool is_oacc_serial
    = (lookup_attribute ("oacc serial",
			 DECL_ATTRIBUTES (current_function_decl)) != NULL);
  bool is_oacc_parallel_kernels_parallelized
    = (lookup_attribute ("oacc parallel_kernels_parallelized",
			 DECL_ATTRIBUTES (current_function_decl)) != NULL);
  bool is_oacc_parallel_kernels_gang_single
    = (lookup_attribute ("oacc parallel_kernels_gang_single",
			 DECL_ATTRIBUTES (current_function_decl)) != NULL);
  int fn_level = oacc_fn_attrib_level (attrs);
  bool is_oacc_routine = (fn_level >= 0);
  gcc_checking_assert (is_oacc_parallel
		       + is_oacc_kernels
		       + is_oacc_serial
		       + is_oacc_parallel_kernels_parallelized
		       + is_oacc_parallel_kernels_gang_single
		       + is_oacc_routine
		       == 1);

  bool is_oacc_kernels_parallelized
    = (lookup_attribute ("oacc kernels parallelized",
			 DECL_ATTRIBUTES (current_function_decl)) != NULL);
  if (is_oacc_kernels_parallelized)
    gcc_checking_assert (is_oacc_kernels);

  if (dump_file)
    {
      if (is_oacc_parallel)
	fprintf (dump_file, "Function is OpenACC parallel offload\n");
      else if (is_oacc_kernels)
	fprintf (dump_file, "Function is %s OpenACC kernels offload\n",
		 (is_oacc_kernels_parallelized
		  ? "parallelized" : "unparallelized"));
      else if (is_oacc_serial)
	fprintf (dump_file, "Function is OpenACC serial offload\n");
      else if (is_oacc_parallel_kernels_parallelized)
	fprintf (dump_file, "Function is %s OpenACC kernels offload\n",
		 "parallel_kernels_parallelized");
      else if (is_oacc_parallel_kernels_gang_single)
	fprintf (dump_file, "Function is %s OpenACC kernels offload\n",
		 "parallel_kernels_gang_single");
      else if (is_oacc_routine)
	fprintf (dump_file, "Function is OpenACC routine level %d\n",
		 fn_level);
      else
	gcc_unreachable ();
    }

  /* This doesn't belong into 'pass_oacc_loop_designation' conceptually, but
     it's a convenient place, so...  */
  if (is_oacc_routine)
    {
      tree attr = lookup_attribute ("omp declare target",
				    DECL_ATTRIBUTES (current_function_decl));
      gcc_checking_assert (attr);
      tree clauses = TREE_VALUE (attr);
      gcc_checking_assert (clauses);

      /* Should this OpenACC routine be discarded?  */
      bool discard = false;

      tree clause_nohost = omp_find_clause (clauses, OMP_CLAUSE_NOHOST);
      if (dump_file)
	fprintf (dump_file,
		 "OpenACC routine '%s' %s '%s' clause.\n",
		 lang_hooks.decl_printable_name (current_function_decl, 2),
		 clause_nohost ? "has" : "doesn't have",
		 omp_clause_code_name[OMP_CLAUSE_NOHOST]);
      /* Host compiler, 'nohost' clause?  */
#ifndef ACCEL_COMPILER
      if (clause_nohost)
	discard = true;
#endif

      if (dump_file)
	fprintf (dump_file,
		 "OpenACC routine '%s' %sdiscarded.\n",
		 lang_hooks.decl_printable_name (current_function_decl, 2),
		 discard ? "" : "not ");
      if (discard)
	{
	  /* Mark as already emitted, and tell the pass manager to drop the
	     function.  */
	  TREE_ASM_WRITTEN (current_function_decl) = 1;
	  return TODO_discard_function;
	}
    }

  /* Unparallelized OpenACC kernels constructs must get launched as 1 x 1 x 1
     kernels, so remove the parallelism dimensions function attributes
     potentially set earlier on.  */
  if (is_oacc_kernels && !is_oacc_kernels_parallelized)
    {
      oacc_set_fn_attrib (current_function_decl, NULL, NULL);
      attrs = oacc_get_fn_attrib (current_function_decl);
    }

  /* Discover, partition and process the loops.  */
  oacc_loop *loops = oacc_loop_discovery ();

  /* For a routine at level FN_LEVEL, the axes outer to that level count as
     already used by the caller.  */
  unsigned outer_mask = 0;
  if (is_oacc_routine)
    outer_mask = GOMP_DIM_MASK (fn_level) - 1;
  unsigned used_mask = oacc_loop_partition (loops, outer_mask);
  /* OpenACC kernels constructs are special: they currently don't use the
     generic oacc_loop infrastructure and attribute/dimension processing.  */
  if (is_oacc_kernels && is_oacc_kernels_parallelized)
    {
      /* Parallelized OpenACC kernels constructs use gang parallelism.  See
	 also tree-parloops.cc:create_parallel_loop.  */
      used_mask |= GOMP_DIM_MASK (GOMP_DIM_GANG);
    }

  int dims[GOMP_DIM_MAX];
  oacc_validate_dims (current_function_decl, attrs, dims, fn_level, used_mask);

  if (dump_file)
    {
      const char *comma = "Compute dimensions [";
      for (int ix = 0; ix != GOMP_DIM_MAX; ix++, comma = ", ")
	fprintf (dump_file, "%s%d", comma, dims[ix]);
      fprintf (dump_file, "]\n");
    }

  /* Verify that for OpenACC 'kernels' decomposed "gang-single" parts we launch
     a single gang only.  */
  if (is_oacc_parallel_kernels_gang_single)
    gcc_checking_assert (dims[GOMP_DIM_GANG] == 1);

  oacc_loop_process (loops, fn_level);
  if (dump_file)
    {
      fprintf (dump_file, "OpenACC loops\n");
      dump_oacc_loop (dump_file, loops, 0);
      fprintf (dump_file, "\n");
    }
  if (dump_enabled_p ())
    {
      oacc_loop *l = loops;
      /* OpenACC kernels constructs are special: they currently don't use the
	 generic oacc_loop infrastructure.  */
      if (is_oacc_kernels)
	{
	  /* Create a fake oacc_loop for diagnostic purposes.  */
	  l = new_oacc_loop_raw (NULL,
				 DECL_SOURCE_LOCATION (current_function_decl));
	  l->mask = used_mask;
	}
      else
	{
	  /* Skip the outermost, dummy OpenACC loop.  */
	  l = l->child;
	}
      if (l)
	inform_oacc_loop (l);
      if (is_oacc_kernels)
	free_oacc_loop (l);
    }

  free_oacc_loop (loops);

  return 0;
}
2114 
static unsigned int
execute_oacc_device_lower ()
{
  /* Lower the OpenACC internal functions (GOACC_TILE, GOACC_LOOP,
     GOACC_REDUCTION, IFN_UNIQUE markers) to target-specific code, and
     adjust gang-private variables where the target requests it.  */
  tree attrs = oacc_get_fn_attrib (current_function_decl);

  if (!attrs)
    /* Not an offloaded function.  */
    return 0;

  /* The (validated) launch dimensions recorded on the function.  */
  int dims[GOMP_DIM_MAX];
  for (unsigned i = 0; i < GOMP_DIM_MAX; i++)
    dims[i] = oacc_get_fn_dim_size (current_function_decl, i);

  /* Maps original private VAR_DECLs to their target-adjusted versions.  */
  hash_map<tree, tree> adjusted_vars;

  /* Now lower internal loop functions to target-specific code
     sequences.  */
  basic_block bb;
  FOR_ALL_BB_FN (bb, cfun)
    for (gimple_stmt_iterator gsi = gsi_start_bb (bb); !gsi_end_p (gsi);)
      {
	gimple *stmt = gsi_stmt (gsi);
	if (!is_gimple_call (stmt))
	  {
	    gsi_next (&gsi);
	    continue;
	  }

	gcall *call = as_a <gcall *> (stmt);
	if (!gimple_call_internal_p (call))
	  {
	    gsi_next (&gsi);
	    continue;
	  }

	/* Rewind to allow rescan.  */
	gsi_prev (&gsi);
	bool rescan = false, remove = false;
	enum  internal_fn ifn_code = gimple_call_internal_fn (call);

	switch (ifn_code)
	  {
	  default: break;

	  case IFN_GOACC_TILE:
	    oacc_xform_tile (call);
	    rescan = true;
	    break;

	  case IFN_GOACC_LOOP:
	    oacc_xform_loop (call);
	    rescan = true;
	    break;

	  case IFN_GOACC_REDUCTION:
	    /* Mark the function for SSA renaming.  */
	    mark_virtual_operands_for_renaming (cfun);

	    /* If the level is -1, this ended up being an unused
	       axis.  Handle as a default.  */
	    if (integer_minus_onep (gimple_call_arg (call, 3)))
	      default_goacc_reduction (call);
	    else
	      targetm.goacc.reduction (call);
	    rescan = true;
	    break;

	  case IFN_UNIQUE:
	    {
	      enum ifn_unique_kind kind
		= ((enum ifn_unique_kind)
		   TREE_INT_CST_LOW (gimple_call_arg (call, 0)));

	      switch (kind)
		{
		default:
		  break;

		case IFN_UNIQUE_OACC_FORK:
		case IFN_UNIQUE_OACC_JOIN:
		  /* An unused axis (-1), or a target that declines to emit
		     fork/join code, means the marker can simply go.  */
		  if (integer_minus_onep (gimple_call_arg (call, 2)))
		    remove = true;
		  else if (!targetm.goacc.fork_join
			   (call, dims, kind == IFN_UNIQUE_OACC_FORK))
		    remove = true;
		  break;

		case IFN_UNIQUE_OACC_HEAD_MARK:
		case IFN_UNIQUE_OACC_TAIL_MARK:
		  remove = true;
		  break;

		case IFN_UNIQUE_OACC_PRIVATE:
		  {
		    dump_flags_t l_dump_flags
		      = get_openacc_privatization_dump_flags ();

		    location_t loc = gimple_location (stmt);
		    if (LOCATION_LOCUS (loc) == UNKNOWN_LOCATION)
		      loc = DECL_SOURCE_LOCATION (current_function_decl);
		    const dump_user_location_t d_u_loc
		      = dump_user_location_t::from_location_t (loc);

		    /* Argument 2 is the parallelism level the variables are
		       private to; -1 means unknown/unused.  */
		    HOST_WIDE_INT level
		      = TREE_INT_CST_LOW (gimple_call_arg (call, 2));
		    gcc_checking_assert (level == -1
					 || (level >= 0
					     && level < GOMP_DIM_MAX));
		    /* Arguments 3.. are the addresses of the privatized
		       variables.  */
		    for (unsigned i = 3;
			 i < gimple_call_num_args (call);
			 i++)
		      {
			static char const *const axes[] =
			/* Must be kept in sync with GOMP_DIM enumeration.  */
			  { "gang", "worker", "vector" };

			tree arg = gimple_call_arg (call, i);
			gcc_checking_assert (TREE_CODE (arg) == ADDR_EXPR);
			tree decl = TREE_OPERAND (arg, 0);
			if (dump_enabled_p ())
/* PR100695 "Format decoder, quoting in 'dump_printf' etc." */
#if __GNUC__ >= 10
# pragma GCC diagnostic push
# pragma GCC diagnostic ignored "-Wformat"
#endif
			  dump_printf_loc (l_dump_flags, d_u_loc,
					   "variable %<%T%> ought to be"
					   " adjusted for OpenACC"
					   " privatization level: %qs\n",
					   decl,
					   (level == -1
					    ? "UNKNOWN" : axes[level]));
#if __GNUC__ >= 10
# pragma GCC diagnostic pop
#endif
			bool adjusted;
			if (level == -1)
			  adjusted = false;
			else if (!targetm.goacc.adjust_private_decl)
			  adjusted = false;
			else if (level == GOMP_DIM_VECTOR)
			  {
			    /* That's the default behavior.  */
			    adjusted = true;
			  }
			else
			  {
			    /* Let the target adjust the decl; record it for
			       the rewriting walk below if anything
			       changed.  */
			    tree oldtype = TREE_TYPE (decl);
			    tree newdecl
			      = targetm.goacc.adjust_private_decl (loc, decl,
								   level);
			    adjusted = (TREE_TYPE (newdecl) != oldtype
					|| newdecl != decl);
			    if (adjusted)
			      adjusted_vars.put (decl, newdecl);
			  }
			if (adjusted
			    && dump_enabled_p ())
/* PR100695 "Format decoder, quoting in 'dump_printf' etc." */
#if __GNUC__ >= 10
# pragma GCC diagnostic push
# pragma GCC diagnostic ignored "-Wformat"
#endif
			  dump_printf_loc (l_dump_flags, d_u_loc,
					   "variable %<%T%> adjusted for"
					   " OpenACC privatization level:"
					   " %qs\n",
					   decl, axes[level]);
#if __GNUC__ >= 10
# pragma GCC diagnostic pop
#endif
		      }
		    remove = true;
		  }
		  break;
		}
	      break;
	    }
	  }

	if (gsi_end_p (gsi))
	  /* We rewound past the beginning of the BB.  */
	  gsi = gsi_start_bb (bb);
	else
	  /* Undo the rewind.  */
	  gsi_next (&gsi);

	if (remove)
	  {
	    if (gimple_vdef (call))
	      replace_uses_by (gimple_vdef (call), gimple_vuse (call));
	    if (gimple_call_lhs (call))
	      {
		/* Propagate the data dependency var.  */
		gimple *ass = gimple_build_assign (gimple_call_lhs (call),
						   gimple_call_arg (call, 1));
		gsi_replace (&gsi, ass,  false);
	      }
	    else
	      gsi_remove (&gsi, true);
	  }
	else if (!rescan)
	  /* If not rescanning, advance over the call.  */
	  gsi_next (&gsi);
      }

  /* Regarding the OpenACC privatization level, we're currently only looking at
     making the gang-private level work.  Regarding that, we have the following
     configurations:

       - GCN offloading: 'targetm.goacc.adjust_private_decl' does the work (in
	 particular, change 'TREE_TYPE', etc.) and there is no
	 'targetm.goacc.expand_var_decl'.

       - nvptx offloading: 'targetm.goacc.adjust_private_decl' only sets a
	 marker and then 'targetm.goacc.expand_var_decl' does the work.

     Eventually (in particular, for worker-private level?), both
     'targetm.goacc.adjust_private_decl' and 'targetm.goacc.expand_var_decl'
     may need to do things, but that's currently not meant to be addressed, and
     thus not fully worked out and implemented, and thus untested.  Hence,
     'assert' what currently is implemented/tested, only.  */

  if (targetm.goacc.expand_var_decl)
    gcc_assert (adjusted_vars.is_empty ());

  /* Make adjustments to gang-private local variables if required by the
     target, e.g. forcing them into a particular address space.  Afterwards,
     ADDR_EXPR nodes which have adjusted variables as their argument need to
     be modified in one of two ways:

       1. They can be recreated, making a pointer to the variable in the new
	  address space, or

       2. The address of the variable in the new address space can be taken,
	  converted to the default (original) address space, and the result of
	  that conversion subsituted in place of the original ADDR_EXPR node.

     Which of these is done depends on the gimple statement being processed.
     At present atomic operations and inline asms use (1), and everything else
     uses (2).  At least on AMD GCN, there are atomic operations that work
     directly in the LDS address space.

     COMPONENT_REFS, ARRAY_REFS and plain VAR_DECLs are also rewritten to use
     the new decl, adjusting types of appropriate tree nodes as necessary.  */

  if (targetm.goacc.adjust_private_decl
      && !adjusted_vars.is_empty ())
    {
      FOR_ALL_BB_FN (bb, cfun)
	for (gimple_stmt_iterator gsi = gsi_start_bb (bb);
	     !gsi_end_p (gsi);
	     gsi_next (&gsi))
	  {
	    gimple *stmt = gsi_stmt (gsi);
	    walk_stmt_info wi;
	    var_decl_rewrite_info info;

	    /* Sync builtins and asms use method (1) above; everything else
	       uses method (2).  */
	    info.avoid_pointer_conversion
	      = (is_gimple_call (stmt)
		 && is_sync_builtin_call (as_a <gcall *> (stmt)))
		|| gimple_code (stmt) == GIMPLE_ASM;
	    info.stmt = stmt;
	    info.modified = false;
	    info.adjusted_vars = &adjusted_vars;

	    memset (&wi, 0, sizeof (wi));
	    wi.info = &info;

	    walk_gimple_op (stmt, oacc_rewrite_var_decl, &wi);

	    if (info.modified)
	      update_stmt (stmt);
	  }
    }

  return 0;
}
2393 
2394 /* Default launch dimension validator.  Force everything to 1.  A
2395    backend that wants to provide larger dimensions must override this
2396    hook.  */
2397 
2398 bool
default_goacc_validate_dims(tree ARG_UNUSED (decl),int * dims,int ARG_UNUSED (fn_level),unsigned ARG_UNUSED (used))2399 default_goacc_validate_dims (tree ARG_UNUSED (decl), int *dims,
2400 			     int ARG_UNUSED (fn_level),
2401 			     unsigned ARG_UNUSED (used))
2402 {
2403   bool changed = false;
2404 
2405   for (unsigned ix = 0; ix != GOMP_DIM_MAX; ix++)
2406     {
2407       if (dims[ix] != 1)
2408 	{
2409 	  dims[ix] = 1;
2410 	  changed = true;
2411 	}
2412     }
2413 
2414   return changed;
2415 }
2416 
2417 /* Default dimension bound is unknown on accelerator and 1 on host.  */
2418 
int
default_goacc_dim_limit (int ARG_UNUSED (axis))
{
#ifdef ACCEL_COMPILER
  /* On the accelerator the bound is unknown; 0 signals that.  */
  return 0;
#else
  /* The host fallback executes a single instance per dimension.  */
  return 1;
#endif
}
2428 
2429 namespace {
2430 
/* Pass descriptor for the OpenACC loop designation pass.  */

const pass_data pass_data_oacc_loop_designation =
{
  GIMPLE_PASS, /* type */
  "oaccloops", /* name */
  OPTGROUP_OMP, /* optinfo_flags */
  TV_NONE, /* tv_id */
  PROP_cfg, /* properties_required */
  0 /* Possibly PROP_gimple_eomp.  */, /* properties_provided */
  0, /* properties_destroyed */
  0, /* todo_flags_start */
  TODO_update_ssa | TODO_cleanup_cfg, /* todo_flags_finish */
};
2443 
/* Pass wrapper running execute_oacc_loop_designation; gated on
   -fopenacc.  */

class pass_oacc_loop_designation : public gimple_opt_pass
{
public:
  pass_oacc_loop_designation (gcc::context *ctxt)
    : gimple_opt_pass (pass_data_oacc_loop_designation, ctxt)
  {}

  /* opt_pass methods: */
  virtual bool gate (function *) { return flag_openacc; };

  virtual unsigned int execute (function *)
    {
      return execute_oacc_loop_designation ();
    }

}; // class pass_oacc_loop_designation
2460 
/* Pass descriptor for the OpenACC device lowering pass.  */

const pass_data pass_data_oacc_device_lower =
{
  GIMPLE_PASS, /* type */
  "oaccdevlow", /* name */
  OPTGROUP_OMP, /* optinfo_flags */
  TV_NONE, /* tv_id */
  PROP_cfg, /* properties_required */
  0 /* Possibly PROP_gimple_eomp.  */, /* properties_provided */
  0, /* properties_destroyed */
  0, /* todo_flags_start */
  TODO_update_ssa | TODO_cleanup_cfg, /* todo_flags_finish */
};
2473 
/* Pass wrapper running execute_oacc_device_lower; gated on -fopenacc.  */

class pass_oacc_device_lower : public gimple_opt_pass
{
public:
  pass_oacc_device_lower (gcc::context *ctxt)
    : gimple_opt_pass (pass_data_oacc_device_lower, ctxt)
  {}

  /* opt_pass methods: */
  virtual bool gate (function *) { return flag_openacc; };

  virtual unsigned int execute (function *)
    {
      return execute_oacc_device_lower ();
    }

}; // class pass_oacc_device_lower
2490 
2491 } // anon namespace
2492 
2493 gimple_opt_pass *
make_pass_oacc_loop_designation(gcc::context * ctxt)2494 make_pass_oacc_loop_designation (gcc::context *ctxt)
2495 {
2496   return new pass_oacc_loop_designation (ctxt);
2497 }
2498 
2499 gimple_opt_pass *
make_pass_oacc_device_lower(gcc::context * ctxt)2500 make_pass_oacc_device_lower (gcc::context *ctxt)
2501 {
2502   return new pass_oacc_device_lower (ctxt);
2503 }
2504 
2505 
2506 /* Rewrite GOMP_SIMT_ENTER_ALLOC call given by GSI and remove the preceding
2507    GOMP_SIMT_ENTER call identifying the privatized variables, which are
2508    turned to structure fields and receive a DECL_VALUE_EXPR accordingly.
2509    Set *REGIMPLIFY to true, except if no privatized variables were seen.  */
2510 
static void
ompdevlow_adjust_simt_enter (gimple_stmt_iterator *gsi, bool *regimplify)
{
  gimple *alloc_stmt = gsi_stmt (*gsi);
  tree simtrec = gimple_call_lhs (alloc_stmt);
  tree simduid = gimple_call_arg (alloc_stmt, 0);
  gimple *enter_stmt = SSA_NAME_DEF_STMT (simduid);
  gcc_assert (gimple_call_internal_p (enter_stmt, IFN_GOMP_SIMT_ENTER));
  /* Build an artificial record type to hold all privatized variables; the
     allocation result SIMTREC points to an instance of it.  */
  tree rectype = lang_hooks.types.make_type (RECORD_TYPE);
  TYPE_ARTIFICIAL (rectype) = TYPE_NAMELESS (rectype) = 1;
  TREE_ADDRESSABLE (rectype) = 1;
  TREE_TYPE (simtrec) = build_pointer_type (rectype);
  for (unsigned i = 1; i < gimple_call_num_args (enter_stmt); i++)
    {
      tree *argp = gimple_call_arg_ptr (enter_stmt, i);
      if (*argp == null_pointer_node)
	continue;
      gcc_assert (TREE_CODE (*argp) == ADDR_EXPR
		  && VAR_P (TREE_OPERAND (*argp, 0)));
      tree var = TREE_OPERAND (*argp, 0);

      /* Turn the privatized variable into a record field, preserving
	 alignment and volatility.  */
      tree field = build_decl (DECL_SOURCE_LOCATION (var), FIELD_DECL,
			       DECL_NAME (var), TREE_TYPE (var));
      SET_DECL_ALIGN (field, DECL_ALIGN (var));
      DECL_USER_ALIGN (field) = DECL_USER_ALIGN (var);
      TREE_THIS_VOLATILE (field) = TREE_THIS_VOLATILE (var);

      insert_field_into_struct (rectype, field);

      /* Redirect all uses of VAR to the corresponding field via a
	 DECL_VALUE_EXPR; the caller must regimplify affected statements.  */
      tree t = build_simple_mem_ref (simtrec);
      t = build3 (COMPONENT_REF, TREE_TYPE (var), t, field, NULL);
      TREE_THIS_VOLATILE (t) = TREE_THIS_VOLATILE (var);
      SET_DECL_VALUE_EXPR (var, t);
      DECL_HAS_VALUE_EXPR_P (var) = 1;
      *regimplify = true;
    }
  layout_type (rectype);
  tree size = TYPE_SIZE_UNIT (rectype);
  tree align = build_int_cst (TREE_TYPE (size), TYPE_ALIGN_UNIT (rectype));

  /* Replace the allocation call with one carrying the now-known record size
     and alignment.  */
  alloc_stmt
    = gimple_build_call_internal (IFN_GOMP_SIMT_ENTER_ALLOC, 2, size, align);
  gimple_call_set_lhs (alloc_stmt, simtrec);
  gsi_replace (gsi, alloc_stmt, false);
  gimple_stmt_iterator enter_gsi = gsi_for_stmt (enter_stmt);
  /* The GOMP_SIMT_ENTER call is no longer needed: just forward its first
     argument to SIMDUID.  */
  enter_stmt = gimple_build_assign (simduid, gimple_call_arg (enter_stmt, 0));
  gsi_replace (&enter_gsi, enter_stmt, false);

  use_operand_p use;
  gimple *exit_stmt;
  if (single_imm_use (simtrec, &use, &exit_stmt))
    {
      /* Clobber the record before the matching GOMP_SIMT_EXIT to mark the
	 end of its lifetime.  */
      gcc_assert (gimple_call_internal_p (exit_stmt, IFN_GOMP_SIMT_EXIT));
      gimple_stmt_iterator exit_gsi = gsi_for_stmt (exit_stmt);
      tree clobber = build_clobber (rectype);
      exit_stmt = gimple_build_assign (build_simple_mem_ref (simtrec), clobber);
      gsi_insert_before (&exit_gsi, exit_stmt, GSI_SAME_STMT);
    }
  else
    gcc_checking_assert (has_zero_uses (simtrec));
}
2572 
2573 /* Callback for walk_gimple_stmt used to scan for SIMT-privatized variables.  */
2574 
2575 static tree
find_simtpriv_var_op(tree * tp,int * walk_subtrees,void *)2576 find_simtpriv_var_op (tree *tp, int *walk_subtrees, void *)
2577 {
2578   tree t = *tp;
2579 
2580   if (VAR_P (t)
2581       && DECL_HAS_VALUE_EXPR_P (t)
2582       && lookup_attribute ("omp simt private", DECL_ATTRIBUTES (t)))
2583     {
2584       *walk_subtrees = 0;
2585       return t;
2586     }
2587   return NULL_TREE;
2588 }
2589 
2590 /* Cleanup uses of SIMT placeholder internal functions: on non-SIMT targets,
2591    VF is 1 and LANE is 0; on SIMT targets, VF is folded to a constant, and
2592    LANE is kept to be expanded to RTL later on.  Also cleanup all other SIMT
2593    internal functions on non-SIMT targets, and likewise some SIMD internal
2594    functions on SIMT targets.  */
2595 
static unsigned int
execute_omp_device_lower ()
{
  /* VF == 1 means the target is not SIMT; the SIMT internal functions then
     fold to trivial values and the SIMD ones survive (and vice versa).  */
  int vf = targetm.simt.vf ? targetm.simt.vf () : 1;
  bool regimplify = false;
  basic_block bb;
  gimple_stmt_iterator gsi;
  bool calls_declare_variant_alt
    = cgraph_node::get (cfun->decl)->calls_declare_variant_alt;
  FOR_EACH_BB_FN (bb, cfun)
    for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
      {
	gimple *stmt = gsi_stmt (gsi);
	if (!is_gimple_call (stmt))
	  continue;
	if (!gimple_call_internal_p (stmt))
	  {
	    /* Resolve 'declare variant' indirections now that the target is
	       known.  */
	    if (calls_declare_variant_alt)
	      if (tree fndecl = gimple_call_fndecl (stmt))
		{
		  tree new_fndecl = omp_resolve_declare_variant (fndecl);
		  if (new_fndecl != fndecl)
		    {
		      gimple_call_set_fndecl (stmt, new_fndecl);
		      update_stmt (stmt);
		    }
		}
	    continue;
	  }
	/* RHS, if set below, replaces the call; a NULL RHS together with a
	   used LHS means the call must be kept for RTL expansion.  */
	tree lhs = gimple_call_lhs (stmt), rhs = NULL_TREE;
	tree type = lhs ? TREE_TYPE (lhs) : integer_type_node;
	switch (gimple_call_internal_fn (stmt))
	  {
	  case IFN_GOMP_USE_SIMT:
	    rhs = vf == 1 ? integer_zero_node : integer_one_node;
	    break;
	  case IFN_GOMP_SIMT_ENTER:
	    rhs = vf == 1 ? gimple_call_arg (stmt, 0) : NULL_TREE;
	    goto simtreg_enter_exit;
	  case IFN_GOMP_SIMT_ENTER_ALLOC:
	    /* On SIMT targets, fill in the privatization record now (see
	       ompdevlow_adjust_simt_enter).  */
	    if (vf != 1)
	      ompdevlow_adjust_simt_enter (&gsi, &regimplify);
	    rhs = vf == 1 ? null_pointer_node : NULL_TREE;
	    goto simtreg_enter_exit;
	  case IFN_GOMP_SIMT_EXIT:
	  simtreg_enter_exit:
	    if (vf != 1)
	      continue;
	    unlink_stmt_vdef (stmt);
	    break;
	  case IFN_GOMP_SIMT_LANE:
	  case IFN_GOMP_SIMT_LAST_LANE:
	    rhs = vf == 1 ? build_zero_cst (type) : NULL_TREE;
	    break;
	  case IFN_GOMP_SIMT_VF:
	    rhs = build_int_cst (type, vf);
	    break;
	  case IFN_GOMP_SIMT_ORDERED_PRED:
	    rhs = vf == 1 ? integer_zero_node : NULL_TREE;
	    if (rhs || !lhs)
	      unlink_stmt_vdef (stmt);
	    break;
	  case IFN_GOMP_SIMT_VOTE_ANY:
	  case IFN_GOMP_SIMT_XCHG_BFLY:
	  case IFN_GOMP_SIMT_XCHG_IDX:
	    rhs = vf == 1 ? gimple_call_arg (stmt, 0) : NULL_TREE;
	    break;
	  case IFN_GOMP_SIMD_LANE:
	  case IFN_GOMP_SIMD_LAST_LANE:
	    rhs = vf != 1 ? build_zero_cst (type) : NULL_TREE;
	    break;
	  case IFN_GOMP_SIMD_VF:
	    rhs = vf != 1 ? build_one_cst (type) : NULL_TREE;
	    break;
	  default:
	    continue;
	  }
	if (lhs && !rhs)
	  continue;
	/* Fold the call into a plain assignment (or nothing).  */
	stmt = lhs ? gimple_build_assign (lhs, rhs) : gimple_build_nop ();
	gsi_replace (&gsi, stmt, false);
      }
  /* Regimplify statements mentioning SIMT-privatized variables, whose
     DECL_VALUE_EXPRs were installed by ompdevlow_adjust_simt_enter; drop
     clobbers of them.  */
  if (regimplify)
    FOR_EACH_BB_REVERSE_FN (bb, cfun)
      for (gsi = gsi_last_bb (bb); !gsi_end_p (gsi); gsi_prev (&gsi))
	if (walk_gimple_stmt (&gsi, NULL, find_simtpriv_var_op, NULL))
	  {
	    if (gimple_clobber_p (gsi_stmt (gsi)))
	      gsi_remove (&gsi, true);
	    else
	      gimple_regimplify_operands (gsi_stmt (gsi), &gsi);
	  }
  if (vf != 1)
    cfun->has_force_vectorize_loops = false;
  return 0;
}
2692 
2693 namespace {
2694 
/* Pass descriptor for the OpenMP device lowering pass.  */

const pass_data pass_data_omp_device_lower =
{
  GIMPLE_PASS, /* type */
  "ompdevlow", /* name */
  OPTGROUP_OMP, /* optinfo_flags */
  TV_NONE, /* tv_id */
  PROP_cfg, /* properties_required */
  PROP_gimple_lomp_dev, /* properties_provided */
  0, /* properties_destroyed */
  0, /* todo_flags_start */
  TODO_update_ssa, /* todo_flags_finish */
};
2707 
/* Pass wrapper running execute_omp_device_lower.  */

class pass_omp_device_lower : public gimple_opt_pass
{
public:
  pass_omp_device_lower (gcc::context *ctxt)
    : gimple_opt_pass (pass_data_omp_device_lower, ctxt)
  {}

  /* opt_pass methods: */
  /* Run if SIMT lowering hasn't been done for this function yet, or if
     'declare variant' calls still need resolving under OpenMP.  */
  virtual bool gate (function *fun)
    {
      return (!(fun->curr_properties & PROP_gimple_lomp_dev)
	      || (flag_openmp
		  && cgraph_node::get (fun->decl)->calls_declare_variant_alt));
    }
  virtual unsigned int execute (function *)
    {
      return execute_omp_device_lower ();
    }

}; // class pass_omp_device_lower
2728 
2729 } // anon namespace
2730 
2731 gimple_opt_pass *
make_pass_omp_device_lower(gcc::context * ctxt)2732 make_pass_omp_device_lower (gcc::context *ctxt)
2733 {
2734   return new pass_omp_device_lower (ctxt);
2735 }
2736 
2737 /* "omp declare target link" handling pass.  */
2738 
2739 namespace {
2740 
/* Pass descriptor for the "omp declare target link" handling pass.  */

const pass_data pass_data_omp_target_link =
{
  GIMPLE_PASS,			/* type */
  "omptargetlink",		/* name */
  OPTGROUP_OMP,			/* optinfo_flags */
  TV_NONE,			/* tv_id */
  PROP_ssa,			/* properties_required */
  0,				/* properties_provided */
  0,				/* properties_destroyed */
  0,				/* todo_flags_start */
  TODO_update_ssa,		/* todo_flags_finish */
};
2753 
/* GIMPLE pass that handles "omp declare target link" variables in
   offloaded code; see the execute method for what it rewrites.  */
class pass_omp_target_link : public gimple_opt_pass
{
public:
  pass_omp_target_link (gcc::context *ctxt)
    : gimple_opt_pass (pass_data_omp_target_link, ctxt)
  {}

  /* opt_pass methods: */
  /* Only active in the accelerator (offload target) compiler, and
     there only for functions that are actually offloaded; always
     skipped in the host compiler.  */
  virtual bool gate (function *fun)
    {
#ifdef ACCEL_COMPILER
      return offloading_function_p (fun->decl);
#else
      (void) fun;
      return false;
#endif
    }

  virtual unsigned execute (function *);
};
2774 
2775 /* Callback for walk_gimple_stmt used to scan for link var operands.  */
2776 
2777 static tree
find_link_var_op(tree * tp,int * walk_subtrees,void *)2778 find_link_var_op (tree *tp, int *walk_subtrees, void *)
2779 {
2780   tree t = *tp;
2781 
2782   if (VAR_P (t)
2783       && DECL_HAS_VALUE_EXPR_P (t)
2784       && is_global_var (t)
2785       && lookup_attribute ("omp declare target link", DECL_ATTRIBUTES (t)))
2786     {
2787       *walk_subtrees = 0;
2788       return t;
2789     }
2790 
2791   return NULL_TREE;
2792 }
2793 
2794 unsigned
execute(function * fun)2795 pass_omp_target_link::execute (function *fun)
2796 {
2797   basic_block bb;
2798   FOR_EACH_BB_FN (bb, fun)
2799     {
2800       gimple_stmt_iterator gsi;
2801       for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
2802 	{
2803 	  if (gimple_call_builtin_p (gsi_stmt (gsi), BUILT_IN_GOMP_TARGET))
2804 	    {
2805 	      /* Nullify the second argument of __builtin_GOMP_target_ext.  */
2806 	      gimple_call_set_arg (gsi_stmt (gsi), 1, null_pointer_node);
2807 	      update_stmt (gsi_stmt (gsi));
2808 	    }
2809 	  if (walk_gimple_stmt (&gsi, NULL, find_link_var_op, NULL))
2810 	    gimple_regimplify_operands (gsi_stmt (gsi), &gsi);
2811 	}
2812     }
2813 
2814   return 0;
2815 }
2816 
2817 } // anon namespace
2818 
/* Factory: create an instance of the "omp declare target link"
   handling pass for the pass manager.  */

gimple_opt_pass *
make_pass_omp_target_link (gcc::context *ctxt)
{
  return new pass_omp_target_link (ctxt);
}
2824