1 /* Bits of OpenMP and OpenACC handling that is specific to device offloading
2    and a lowering pass for OpenACC device directives.
3 
4    Copyright (C) 2005-2018 Free Software Foundation, Inc.
5 
6 This file is part of GCC.
7 
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
12 
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
16 for more details.
17 
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3.  If not see
20 <http://www.gnu.org/licenses/>.  */
21 
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "tree.h"
28 #include "gimple.h"
29 #include "tree-pass.h"
30 #include "ssa.h"
31 #include "cgraph.h"
32 #include "pretty-print.h"
33 #include "diagnostic-core.h"
34 #include "fold-const.h"
35 #include "internal-fn.h"
36 #include "langhooks.h"
37 #include "gimplify.h"
38 #include "gimple-iterator.h"
39 #include "gimplify-me.h"
40 #include "gimple-walk.h"
41 #include "tree-cfg.h"
42 #include "tree-into-ssa.h"
43 #include "tree-nested.h"
44 #include "stor-layout.h"
45 #include "common/common-target.h"
46 #include "omp-general.h"
47 #include "omp-offload.h"
48 #include "lto-section-names.h"
49 #include "gomp-constants.h"
50 #include "gimple-pretty-print.h"
51 #include "intl.h"
52 #include "stringpool.h"
53 #include "attribs.h"
54 #include "cfgloop.h"
55 
56 /* Describe the OpenACC looping structure of a function.  The entire
57    function is held in a 'NULL' loop.  */
58 
59 struct oacc_loop
60 {
61   oacc_loop *parent; /* Containing loop.  */
62 
63   oacc_loop *child; /* First inner loop.  */
64 
65   oacc_loop *sibling; /* Next loop within same parent.  */
66 
67   location_t loc; /* Location of the loop start.  */
68 
69   gcall *marker; /* Initial head marker.  */
70 
71   gcall *heads[GOMP_DIM_MAX];  /* Head marker functions.  */
72   gcall *tails[GOMP_DIM_MAX];  /* Tail marker functions.  */
73 
74   tree routine;  /* Pseudo-loop enclosing a routine.  */
75 
76   unsigned mask;   /* Partitioning mask.  */
77   unsigned e_mask; /* Partitioning of element loops (when tiling).  */
78   unsigned inner;  /* Partitioning of inner loops.  */
79   unsigned flags;  /* Partitioning flags.  */
80   vec<gcall *> ifns;  /* Contained loop abstraction functions.  */
81   tree chunk_size; /* Chunk size.  */
82   gcall *head_end; /* Final marker of head sequence.  */
83 };
84 
85 /* Holds offload tables with decls.  */
86 vec<tree, va_gc> *offload_funcs, *offload_vars;
87 
88 /* Return level at which oacc routine may spawn a partitioned loop, or
89    -1 if it is not a routine (i.e. is an offload fn).  */
90 
91 static int
oacc_fn_attrib_level(tree attr)92 oacc_fn_attrib_level (tree attr)
93 {
94   tree pos = TREE_VALUE (attr);
95 
96   if (!TREE_PURPOSE (pos))
97     return -1;
98 
99   int ix = 0;
100   for (ix = 0; ix != GOMP_DIM_MAX;
101        ix++, pos = TREE_CHAIN (pos))
102     if (!integer_zerop (TREE_PURPOSE (pos)))
103       break;
104 
105   return ix;
106 }
107 
108 /* Helper function for omp_finish_file routine.  Takes decls from V_DECLS and
109    adds their addresses and sizes to constructor-vector V_CTOR.  */
110 
111 static void
add_decls_addresses_to_decl_constructor(vec<tree,va_gc> * v_decls,vec<constructor_elt,va_gc> * v_ctor)112 add_decls_addresses_to_decl_constructor (vec<tree, va_gc> *v_decls,
113 					 vec<constructor_elt, va_gc> *v_ctor)
114 {
115   unsigned len = vec_safe_length (v_decls);
116   for (unsigned i = 0; i < len; i++)
117     {
118       tree it = (*v_decls)[i];
119       bool is_var = VAR_P (it);
120       bool is_link_var
121 	= is_var
122 #ifdef ACCEL_COMPILER
123 	  && DECL_HAS_VALUE_EXPR_P (it)
124 #endif
125 	  && lookup_attribute ("omp declare target link", DECL_ATTRIBUTES (it));
126 
127       tree size = NULL_TREE;
128       if (is_var)
129 	size = fold_convert (const_ptr_type_node, DECL_SIZE_UNIT (it));
130 
131       tree addr;
132       if (!is_link_var)
133 	addr = build_fold_addr_expr (it);
134       else
135 	{
136 #ifdef ACCEL_COMPILER
137 	  /* For "omp declare target link" vars add address of the pointer to
138 	     the target table, instead of address of the var.  */
139 	  tree value_expr = DECL_VALUE_EXPR (it);
140 	  tree link_ptr_decl = TREE_OPERAND (value_expr, 0);
141 	  varpool_node::finalize_decl (link_ptr_decl);
142 	  addr = build_fold_addr_expr (link_ptr_decl);
143 #else
144 	  addr = build_fold_addr_expr (it);
145 #endif
146 
147 	  /* Most significant bit of the size marks "omp declare target link"
148 	     vars in host and target tables.  */
149 	  unsigned HOST_WIDE_INT isize = tree_to_uhwi (size);
150 	  isize |= 1ULL << (int_size_in_bytes (const_ptr_type_node)
151 			    * BITS_PER_UNIT - 1);
152 	  size = wide_int_to_tree (const_ptr_type_node, isize);
153 	}
154 
155       CONSTRUCTOR_APPEND_ELT (v_ctor, NULL_TREE, addr);
156       if (is_var)
157 	CONSTRUCTOR_APPEND_ELT (v_ctor, NULL_TREE, size);
158     }
159 }
160 
161 /* Create new symbols containing (address, size) pairs for global variables,
162    marked with "omp declare target" attribute, as well as addresses for the
163    functions, which are outlined offloading regions.  */
164 void
omp_finish_file(void)165 omp_finish_file (void)
166 {
167   unsigned num_funcs = vec_safe_length (offload_funcs);
168   unsigned num_vars = vec_safe_length (offload_vars);
169 
170   if (num_funcs == 0 && num_vars == 0)
171     return;
172 
173   if (targetm_common.have_named_sections)
174     {
175       vec<constructor_elt, va_gc> *v_f, *v_v;
176       vec_alloc (v_f, num_funcs);
177       vec_alloc (v_v, num_vars * 2);
178 
179       add_decls_addresses_to_decl_constructor (offload_funcs, v_f);
180       add_decls_addresses_to_decl_constructor (offload_vars, v_v);
181 
182       tree vars_decl_type = build_array_type_nelts (pointer_sized_int_node,
183 						    num_vars * 2);
184       tree funcs_decl_type = build_array_type_nelts (pointer_sized_int_node,
185 						     num_funcs);
186       SET_TYPE_ALIGN (vars_decl_type, TYPE_ALIGN (pointer_sized_int_node));
187       SET_TYPE_ALIGN (funcs_decl_type, TYPE_ALIGN (pointer_sized_int_node));
188       tree ctor_v = build_constructor (vars_decl_type, v_v);
189       tree ctor_f = build_constructor (funcs_decl_type, v_f);
190       TREE_CONSTANT (ctor_v) = TREE_CONSTANT (ctor_f) = 1;
191       TREE_STATIC (ctor_v) = TREE_STATIC (ctor_f) = 1;
192       tree funcs_decl = build_decl (UNKNOWN_LOCATION, VAR_DECL,
193 				    get_identifier (".offload_func_table"),
194 				    funcs_decl_type);
195       tree vars_decl = build_decl (UNKNOWN_LOCATION, VAR_DECL,
196 				   get_identifier (".offload_var_table"),
197 				   vars_decl_type);
198       TREE_STATIC (funcs_decl) = TREE_STATIC (vars_decl) = 1;
199       /* Do not align tables more than TYPE_ALIGN (pointer_sized_int_node),
200 	 otherwise a joint table in a binary will contain padding between
201 	 tables from multiple object files.  */
202       DECL_USER_ALIGN (funcs_decl) = DECL_USER_ALIGN (vars_decl) = 1;
203       SET_DECL_ALIGN (funcs_decl, TYPE_ALIGN (funcs_decl_type));
204       SET_DECL_ALIGN (vars_decl, TYPE_ALIGN (vars_decl_type));
205       DECL_INITIAL (funcs_decl) = ctor_f;
206       DECL_INITIAL (vars_decl) = ctor_v;
207       set_decl_section_name (funcs_decl, OFFLOAD_FUNC_TABLE_SECTION_NAME);
208       set_decl_section_name (vars_decl, OFFLOAD_VAR_TABLE_SECTION_NAME);
209 
210       varpool_node::finalize_decl (vars_decl);
211       varpool_node::finalize_decl (funcs_decl);
212     }
213   else
214     {
215       for (unsigned i = 0; i < num_funcs; i++)
216 	{
217 	  tree it = (*offload_funcs)[i];
218 	  targetm.record_offload_symbol (it);
219 	}
220       for (unsigned i = 0; i < num_vars; i++)
221 	{
222 	  tree it = (*offload_vars)[i];
223 	  targetm.record_offload_symbol (it);
224 	}
225     }
226 }
227 
228 /* Call dim_pos (POS == true) or dim_size (POS == false) builtins for
229    axis DIM.  Return a tmp var holding the result.  */
230 
231 static tree
oacc_dim_call(bool pos,int dim,gimple_seq * seq)232 oacc_dim_call (bool pos, int dim, gimple_seq *seq)
233 {
234   tree arg = build_int_cst (unsigned_type_node, dim);
235   tree size = create_tmp_var (integer_type_node);
236   enum internal_fn fn = pos ? IFN_GOACC_DIM_POS : IFN_GOACC_DIM_SIZE;
237   gimple *call = gimple_build_call_internal (fn, 1, arg);
238 
239   gimple_call_set_lhs (call, size);
240   gimple_seq_add_stmt (seq, call);
241 
242   return size;
243 }
244 
245 /* Find the number of threads (POS = false), or thread number (POS =
246    true) for an OpenACC region partitioned as MASK.  Setup code
247    required for the calculation is added to SEQ.  */
248 
249 static tree
oacc_thread_numbers(bool pos,int mask,gimple_seq * seq)250 oacc_thread_numbers (bool pos, int mask, gimple_seq *seq)
251 {
252   tree res = pos ? NULL_TREE : build_int_cst (unsigned_type_node, 1);
253   unsigned ix;
254 
255   /* Start at gang level, and examine relevant dimension indices.  */
256   for (ix = GOMP_DIM_GANG; ix != GOMP_DIM_MAX; ix++)
257     if (GOMP_DIM_MASK (ix) & mask)
258       {
259 	if (res)
260 	  {
261 	    /* We had an outer index, so scale that by the size of
262 	       this dimension.  */
263 	    tree n = oacc_dim_call (false, ix, seq);
264 	    res = fold_build2 (MULT_EXPR, integer_type_node, res, n);
265 	  }
266 	if (pos)
267 	  {
268 	    /* Determine index in this dimension.  */
269 	    tree id = oacc_dim_call (true, ix, seq);
270 	    if (res)
271 	      res = fold_build2 (PLUS_EXPR, integer_type_node, res, id);
272 	    else
273 	      res = id;
274 	  }
275       }
276 
277   if (res == NULL_TREE)
278     res = integer_zero_node;
279 
280   return res;
281 }
282 
283 /* Transform IFN_GOACC_LOOP calls to actual code.  See
284    expand_oacc_for for where these are generated.  At the vector
285    level, we stride loops, such that each member of a warp will
286    operate on adjacent iterations.  At the worker and gang level,
287    each gang/warp executes a set of contiguous iterations.  Chunking
288    can override this such that each iteration engine executes a
289    contiguous chunk, and then moves on to stride to the next chunk.  */
290 
291 static void
oacc_xform_loop(gcall * call)292 oacc_xform_loop (gcall *call)
293 {
294   gimple_stmt_iterator gsi = gsi_for_stmt (call);
295   enum ifn_goacc_loop_kind code
296     = (enum ifn_goacc_loop_kind) TREE_INT_CST_LOW (gimple_call_arg (call, 0));
297   tree dir = gimple_call_arg (call, 1);
298   tree range = gimple_call_arg (call, 2);
299   tree step = gimple_call_arg (call, 3);
300   tree chunk_size = NULL_TREE;
301   unsigned mask = (unsigned) TREE_INT_CST_LOW (gimple_call_arg (call, 5));
302   tree lhs = gimple_call_lhs (call);
303   tree type = TREE_TYPE (lhs);
304   tree diff_type = TREE_TYPE (range);
305   tree r = NULL_TREE;
306   gimple_seq seq = NULL;
307   bool chunking = false, striding = true;
308   unsigned outer_mask = mask & (~mask + 1); // Outermost partitioning
309   unsigned inner_mask = mask & ~outer_mask; // Inner partitioning (if any)
310 
311 #ifdef ACCEL_COMPILER
312   chunk_size = gimple_call_arg (call, 4);
313   if (integer_minus_onep (chunk_size)  /* Force static allocation.  */
314       || integer_zerop (chunk_size))   /* Default (also static).  */
315     {
316       /* If we're at the gang level, we want each to execute a
317 	 contiguous run of iterations.  Otherwise we want each element
318 	 to stride.  */
319       striding = !(outer_mask & GOMP_DIM_MASK (GOMP_DIM_GANG));
320       chunking = false;
321     }
322   else
323     {
324       /* Chunk of size 1 is striding.  */
325       striding = integer_onep (chunk_size);
326       chunking = !striding;
327     }
328 #endif
329 
330   /* striding=true, chunking=true
331        -> invalid.
332      striding=true, chunking=false
333        -> chunks=1
334      striding=false,chunking=true
335        -> chunks=ceil (range/(chunksize*threads*step))
336      striding=false,chunking=false
337        -> chunk_size=ceil(range/(threads*step)),chunks=1  */
338   push_gimplify_context (true);
339 
340   switch (code)
341     {
342     default: gcc_unreachable ();
343 
344     case IFN_GOACC_LOOP_CHUNKS:
345       if (!chunking)
346 	r = build_int_cst (type, 1);
347       else
348 	{
349 	  /* chunk_max
350 	     = (range - dir) / (chunks * step * num_threads) + dir  */
351 	  tree per = oacc_thread_numbers (false, mask, &seq);
352 	  per = fold_convert (type, per);
353 	  chunk_size = fold_convert (type, chunk_size);
354 	  per = fold_build2 (MULT_EXPR, type, per, chunk_size);
355 	  per = fold_build2 (MULT_EXPR, type, per, step);
356 	  r = build2 (MINUS_EXPR, type, range, dir);
357 	  r = build2 (PLUS_EXPR, type, r, per);
358 	  r = build2 (TRUNC_DIV_EXPR, type, r, per);
359 	}
360       break;
361 
362     case IFN_GOACC_LOOP_STEP:
363       {
364 	/* If striding, step by the entire compute volume, otherwise
365 	   step by the inner volume.  */
366 	unsigned volume = striding ? mask : inner_mask;
367 
368 	r = oacc_thread_numbers (false, volume, &seq);
369 	r = build2 (MULT_EXPR, type, fold_convert (type, r), step);
370       }
371       break;
372 
373     case IFN_GOACC_LOOP_OFFSET:
374       /* Enable vectorization on non-SIMT targets.  */
375       if (!targetm.simt.vf
376 	  && outer_mask == GOMP_DIM_MASK (GOMP_DIM_VECTOR)
377 	  /* If not -fno-tree-loop-vectorize, hint that we want to vectorize
378 	     the loop.  */
379 	  && (flag_tree_loop_vectorize
380 	      || !global_options_set.x_flag_tree_loop_vectorize))
381 	{
382 	  basic_block bb = gsi_bb (gsi);
383 	  struct loop *parent = bb->loop_father;
384 	  struct loop *body = parent->inner;
385 
386 	  parent->force_vectorize = true;
387 	  parent->safelen = INT_MAX;
388 
389 	  /* "Chunking loops" may have inner loops.  */
390 	  if (parent->inner)
391 	    {
392 	      body->force_vectorize = true;
393 	      body->safelen = INT_MAX;
394 	    }
395 
396 	  cfun->has_force_vectorize_loops = true;
397 	}
398       if (striding)
399 	{
400 	  r = oacc_thread_numbers (true, mask, &seq);
401 	  r = fold_convert (diff_type, r);
402 	}
403       else
404 	{
405 	  tree inner_size = oacc_thread_numbers (false, inner_mask, &seq);
406 	  tree outer_size = oacc_thread_numbers (false, outer_mask, &seq);
407 	  tree volume = fold_build2 (MULT_EXPR, TREE_TYPE (inner_size),
408 				     inner_size, outer_size);
409 
410 	  volume = fold_convert (diff_type, volume);
411 	  if (chunking)
412 	    chunk_size = fold_convert (diff_type, chunk_size);
413 	  else
414 	    {
415 	      tree per = fold_build2 (MULT_EXPR, diff_type, volume, step);
416 
417 	      chunk_size = build2 (MINUS_EXPR, diff_type, range, dir);
418 	      chunk_size = build2 (PLUS_EXPR, diff_type, chunk_size, per);
419 	      chunk_size = build2 (TRUNC_DIV_EXPR, diff_type, chunk_size, per);
420 	    }
421 
422 	  tree span = build2 (MULT_EXPR, diff_type, chunk_size,
423 			      fold_convert (diff_type, inner_size));
424 	  r = oacc_thread_numbers (true, outer_mask, &seq);
425 	  r = fold_convert (diff_type, r);
426 	  r = build2 (MULT_EXPR, diff_type, r, span);
427 
428 	  tree inner = oacc_thread_numbers (true, inner_mask, &seq);
429 	  inner = fold_convert (diff_type, inner);
430 	  r = fold_build2 (PLUS_EXPR, diff_type, r, inner);
431 
432 	  if (chunking)
433 	    {
434 	      tree chunk = fold_convert (diff_type, gimple_call_arg (call, 6));
435 	      tree per
436 		= fold_build2 (MULT_EXPR, diff_type, volume, chunk_size);
437 	      per = build2 (MULT_EXPR, diff_type, per, chunk);
438 
439 	      r = build2 (PLUS_EXPR, diff_type, r, per);
440 	    }
441 	}
442       r = fold_build2 (MULT_EXPR, diff_type, r, step);
443       if (type != diff_type)
444 	r = fold_convert (type, r);
445       break;
446 
447     case IFN_GOACC_LOOP_BOUND:
448       if (striding)
449 	r = range;
450       else
451 	{
452 	  tree inner_size = oacc_thread_numbers (false, inner_mask, &seq);
453 	  tree outer_size = oacc_thread_numbers (false, outer_mask, &seq);
454 	  tree volume = fold_build2 (MULT_EXPR, TREE_TYPE (inner_size),
455 				     inner_size, outer_size);
456 
457 	  volume = fold_convert (diff_type, volume);
458 	  if (chunking)
459 	    chunk_size = fold_convert (diff_type, chunk_size);
460 	  else
461 	    {
462 	      tree per = fold_build2 (MULT_EXPR, diff_type, volume, step);
463 
464 	      chunk_size = build2 (MINUS_EXPR, diff_type, range, dir);
465 	      chunk_size = build2 (PLUS_EXPR, diff_type, chunk_size, per);
466 	      chunk_size = build2 (TRUNC_DIV_EXPR, diff_type, chunk_size, per);
467 	    }
468 
469 	  tree span = build2 (MULT_EXPR, diff_type, chunk_size,
470 			      fold_convert (diff_type, inner_size));
471 
472 	  r = fold_build2 (MULT_EXPR, diff_type, span, step);
473 
474 	  tree offset = gimple_call_arg (call, 6);
475 	  r = build2 (PLUS_EXPR, diff_type, r,
476 		      fold_convert (diff_type, offset));
477 	  r = build2 (integer_onep (dir) ? MIN_EXPR : MAX_EXPR,
478 		      diff_type, r, range);
479 	}
480       if (diff_type != type)
481 	r = fold_convert (type, r);
482       break;
483     }
484 
485   gimplify_assign (lhs, r, &seq);
486 
487   pop_gimplify_context (NULL);
488 
489   gsi_replace_with_seq (&gsi, seq, true);
490 }
491 
492 /* Transform a GOACC_TILE call.  Determines the element loop span for
493    the specified loop of the nest.  This is 1 if we're not tiling.
494 
495    GOACC_TILE (collapse_count, loop_no, tile_arg, gwv_tile, gwv_element);  */
496 
497 static void
oacc_xform_tile(gcall * call)498 oacc_xform_tile (gcall *call)
499 {
500   gimple_stmt_iterator gsi = gsi_for_stmt (call);
501   unsigned collapse = tree_to_uhwi (gimple_call_arg (call, 0));
502   /* Inner loops have higher loop_nos.  */
503   unsigned loop_no = tree_to_uhwi (gimple_call_arg (call, 1));
504   tree tile_size = gimple_call_arg (call, 2);
505   unsigned e_mask = tree_to_uhwi (gimple_call_arg (call, 4));
506   tree lhs = gimple_call_lhs (call);
507   tree type = TREE_TYPE (lhs);
508   gimple_seq seq = NULL;
509   tree span = build_int_cst (type, 1);
510 
511   gcc_assert (!(e_mask
512 		& ~(GOMP_DIM_MASK (GOMP_DIM_VECTOR)
513 		    | GOMP_DIM_MASK (GOMP_DIM_WORKER))));
514   push_gimplify_context (!seen_error ());
515 
516 #ifndef ACCEL_COMPILER
517   /* Partitioning disabled on host compilers.  */
518   e_mask = 0;
519 #endif
520   if (!e_mask)
521     /* Not paritioning.  */
522     span = integer_one_node;
523   else if (!integer_zerop (tile_size))
524     /* User explicitly specified size.  */
525     span = tile_size;
526   else
527     {
528       /* Pick a size based on the paritioning of the element loop and
529 	 the number of loop nests.  */
530       tree first_size = NULL_TREE;
531       tree second_size = NULL_TREE;
532 
533       if (e_mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR))
534 	first_size = oacc_dim_call (false, GOMP_DIM_VECTOR, &seq);
535       if (e_mask & GOMP_DIM_MASK (GOMP_DIM_WORKER))
536 	second_size = oacc_dim_call (false, GOMP_DIM_WORKER, &seq);
537 
538       if (!first_size)
539 	{
540 	  first_size = second_size;
541 	  second_size = NULL_TREE;
542 	}
543 
544       if (loop_no + 1 == collapse)
545 	{
546 	  span = first_size;
547 	  if (!loop_no && second_size)
548 	    span = fold_build2 (MULT_EXPR, TREE_TYPE (span),
549 				span, second_size);
550 	}
551       else if (loop_no + 2 == collapse)
552 	span = second_size;
553       else
554 	span = NULL_TREE;
555 
556       if (!span)
557 	/* There's no obvious element size for this loop.  Options
558 	   are 1, first_size or some non-unity constant (32 is my
559 	   favourite).   We should gather some statistics.  */
560 	span = first_size;
561     }
562 
563   span = fold_convert (type, span);
564   gimplify_assign (lhs, span, &seq);
565 
566   pop_gimplify_context (NULL);
567 
568   gsi_replace_with_seq (&gsi, seq, true);
569 }
570 
571 /* Default partitioned and minimum partitioned dimensions.  */
572 
573 static int oacc_default_dims[GOMP_DIM_MAX];
574 static int oacc_min_dims[GOMP_DIM_MAX];
575 
576 /* Parse the default dimension parameter.  This is a set of
577    :-separated optional compute dimensions.  Each specified dimension
578    is a positive integer.  When device type support is added, it is
579    planned to be a comma separated list of such compute dimensions,
580    with all but the first prefixed by the colon-terminated device
581    type.  */
582 
583 static void
oacc_parse_default_dims(const char * dims)584 oacc_parse_default_dims (const char *dims)
585 {
586   int ix;
587 
588   for (ix = GOMP_DIM_MAX; ix--;)
589     {
590       oacc_default_dims[ix] = -1;
591       oacc_min_dims[ix] = 1;
592     }
593 
594 #ifndef ACCEL_COMPILER
595   /* Cannot be overridden on the host.  */
596   dims = NULL;
597 #endif
598   if (dims)
599     {
600       const char *pos = dims;
601 
602       for (ix = 0; *pos && ix != GOMP_DIM_MAX; ix++)
603 	{
604 	  if (ix)
605 	    {
606 	      if (*pos != ':')
607 		goto malformed;
608 	      pos++;
609 	    }
610 
611 	  if (*pos != ':')
612 	    {
613 	      long val;
614 	      const char *eptr;
615 
616 	      errno = 0;
617 	      val = strtol (pos, CONST_CAST (char **, &eptr), 10);
618 	      if (errno || val <= 0 || (int) val != val)
619 		goto malformed;
620 	      pos = eptr;
621 	      oacc_default_dims[ix] = (int) val;
622 	    }
623 	}
624       if (*pos)
625 	{
626 	malformed:
627 	  error_at (UNKNOWN_LOCATION,
628 		    "-fopenacc-dim operand is malformed at '%s'", pos);
629 	}
630     }
631 
632   /* Allow the backend to validate the dimensions.  */
633   targetm.goacc.validate_dims (NULL_TREE, oacc_default_dims, -1);
634   targetm.goacc.validate_dims (NULL_TREE, oacc_min_dims, -2);
635 }
636 
637 /* Validate and update the dimensions for offloaded FN.  ATTRS is the
638    raw attribute.  DIMS is an array of dimensions, which is filled in.
639    LEVEL is the partitioning level of a routine, or -1 for an offload
640    region itself.  USED is the mask of partitioned execution in the
641    function.  */
642 
643 static void
oacc_validate_dims(tree fn,tree attrs,int * dims,int level,unsigned used)644 oacc_validate_dims (tree fn, tree attrs, int *dims, int level, unsigned used)
645 {
646   tree purpose[GOMP_DIM_MAX];
647   unsigned ix;
648   tree pos = TREE_VALUE (attrs);
649 
650   /* Make sure the attribute creator attached the dimension
651      information.  */
652   gcc_assert (pos);
653 
654   for (ix = 0; ix != GOMP_DIM_MAX; ix++)
655     {
656       purpose[ix] = TREE_PURPOSE (pos);
657       tree val = TREE_VALUE (pos);
658       dims[ix] = val ? TREE_INT_CST_LOW (val) : -1;
659       pos = TREE_CHAIN (pos);
660     }
661 
662   bool changed = targetm.goacc.validate_dims (fn, dims, level);
663 
664   /* Default anything left to 1 or a partitioned default.  */
665   for (ix = 0; ix != GOMP_DIM_MAX; ix++)
666     if (dims[ix] < 0)
667       {
668 	/* The OpenACC spec says 'If the [num_gangs] clause is not
669 	   specified, an implementation-defined default will be used;
670 	   the default may depend on the code within the construct.'
671 	   (2.5.6).  Thus an implementation is free to choose
672 	   non-unity default for a parallel region that doesn't have
673 	   any gang-partitioned loops.  However, it appears that there
674 	   is a sufficient body of user code that expects non-gang
675 	   partitioned regions to not execute in gang-redundant mode.
676 	   So we (a) don't warn about the non-portability and (b) pick
677 	   the minimum permissible dimension size when there is no
678 	   partitioned execution.  Otherwise we pick the global
679 	   default for the dimension, which the user can control.  The
680 	   same wording and logic applies to num_workers and
681 	   vector_length, however the worker- or vector- single
682 	   execution doesn't have the same impact as gang-redundant
683 	   execution.  (If the minimum gang-level partioning is not 1,
684 	   the target is probably too confusing.)  */
685 	dims[ix] = (used & GOMP_DIM_MASK (ix)
686 		    ? oacc_default_dims[ix] : oacc_min_dims[ix]);
687 	changed = true;
688       }
689 
690   if (changed)
691     {
692       /* Replace the attribute with new values.  */
693       pos = NULL_TREE;
694       for (ix = GOMP_DIM_MAX; ix--;)
695 	pos = tree_cons (purpose[ix],
696 			 build_int_cst (integer_type_node, dims[ix]), pos);
697       oacc_replace_fn_attrib (fn, pos);
698     }
699 }
700 
701 /* Create an empty OpenACC loop structure at LOC.  */
702 
703 static oacc_loop *
new_oacc_loop_raw(oacc_loop * parent,location_t loc)704 new_oacc_loop_raw (oacc_loop *parent, location_t loc)
705 {
706   oacc_loop *loop = XCNEW (oacc_loop);
707 
708   loop->parent = parent;
709 
710   if (parent)
711     {
712       loop->sibling = parent->child;
713       parent->child = loop;
714     }
715 
716   loop->loc = loc;
717   return loop;
718 }
719 
720 /* Create an outermost, dummy OpenACC loop for offloaded function
721    DECL.  */
722 
723 static oacc_loop *
new_oacc_loop_outer(tree decl)724 new_oacc_loop_outer (tree decl)
725 {
726   return new_oacc_loop_raw (NULL, DECL_SOURCE_LOCATION (decl));
727 }
728 
729 /* Start a new OpenACC loop  structure beginning at head marker HEAD.
730    Link into PARENT loop.  Return the new loop.  */
731 
732 static oacc_loop *
new_oacc_loop(oacc_loop * parent,gcall * marker)733 new_oacc_loop (oacc_loop *parent, gcall *marker)
734 {
735   oacc_loop *loop = new_oacc_loop_raw (parent, gimple_location (marker));
736 
737   loop->marker = marker;
738 
739   /* TODO: This is where device_type flattening would occur for the loop
740      flags.  */
741 
742   loop->flags = TREE_INT_CST_LOW (gimple_call_arg (marker, 3));
743 
744   tree chunk_size = integer_zero_node;
745   if (loop->flags & OLF_GANG_STATIC)
746     chunk_size = gimple_call_arg (marker, 4);
747   loop->chunk_size = chunk_size;
748 
749   return loop;
750 }
751 
752 /* Create a dummy loop encompassing a call to a openACC routine.
753    Extract the routine's partitioning requirements.  */
754 
755 static void
new_oacc_loop_routine(oacc_loop * parent,gcall * call,tree decl,tree attrs)756 new_oacc_loop_routine (oacc_loop *parent, gcall *call, tree decl, tree attrs)
757 {
758   oacc_loop *loop = new_oacc_loop_raw (parent, gimple_location (call));
759   int level = oacc_fn_attrib_level (attrs);
760 
761   gcc_assert (level >= 0);
762 
763   loop->marker = call;
764   loop->routine = decl;
765   loop->mask = ((GOMP_DIM_MASK (GOMP_DIM_MAX) - 1)
766 		^ (GOMP_DIM_MASK (level) - 1));
767 }
768 
769 /* Finish off the current OpenACC loop ending at tail marker TAIL.
770    Return the parent loop.  */
771 
772 static oacc_loop *
finish_oacc_loop(oacc_loop * loop)773 finish_oacc_loop (oacc_loop *loop)
774 {
775   /* If the loop has been collapsed, don't partition it.  */
776   if (loop->ifns.is_empty ())
777     loop->mask = loop->flags = 0;
778   return loop->parent;
779 }
780 
781 /* Free all OpenACC loop structures within LOOP (inclusive).  */
782 
783 static void
free_oacc_loop(oacc_loop * loop)784 free_oacc_loop (oacc_loop *loop)
785 {
786   if (loop->sibling)
787     free_oacc_loop (loop->sibling);
788   if (loop->child)
789     free_oacc_loop (loop->child);
790 
791   loop->ifns.release ();
792   free (loop);
793 }
794 
795 /* Dump out the OpenACC loop head or tail beginning at FROM.  */
796 
797 static void
dump_oacc_loop_part(FILE * file,gcall * from,int depth,const char * title,int level)798 dump_oacc_loop_part (FILE *file, gcall *from, int depth,
799 		     const char *title, int level)
800 {
801   enum ifn_unique_kind kind
802     = (enum ifn_unique_kind) TREE_INT_CST_LOW (gimple_call_arg (from, 0));
803 
804   fprintf (file, "%*s%s-%d:\n", depth * 2, "", title, level);
805   for (gimple_stmt_iterator gsi = gsi_for_stmt (from);;)
806     {
807       gimple *stmt = gsi_stmt (gsi);
808 
809       if (gimple_call_internal_p (stmt, IFN_UNIQUE))
810 	{
811 	  enum ifn_unique_kind k
812 	    = ((enum ifn_unique_kind) TREE_INT_CST_LOW
813 	       (gimple_call_arg (stmt, 0)));
814 
815 	  if (k == kind && stmt != from)
816 	    break;
817 	}
818       print_gimple_stmt (file, stmt, depth * 2 + 2);
819 
820       gsi_next (&gsi);
821       while (gsi_end_p (gsi))
822 	gsi = gsi_start_bb (single_succ (gsi_bb (gsi)));
823     }
824 }
825 
826 /* Dump OpenACC loops LOOP, its siblings and its children.  */
827 
828 static void
dump_oacc_loop(FILE * file,oacc_loop * loop,int depth)829 dump_oacc_loop (FILE *file, oacc_loop *loop, int depth)
830 {
831   int ix;
832 
833   fprintf (file, "%*sLoop %x(%x) %s:%u\n", depth * 2, "",
834 	   loop->flags, loop->mask,
835 	   LOCATION_FILE (loop->loc), LOCATION_LINE (loop->loc));
836 
837   if (loop->marker)
838     print_gimple_stmt (file, loop->marker, depth * 2);
839 
840   if (loop->routine)
841     fprintf (file, "%*sRoutine %s:%u:%s\n",
842 	     depth * 2, "", DECL_SOURCE_FILE (loop->routine),
843 	     DECL_SOURCE_LINE (loop->routine),
844 	     IDENTIFIER_POINTER (DECL_NAME (loop->routine)));
845 
846   for (ix = GOMP_DIM_GANG; ix != GOMP_DIM_MAX; ix++)
847     if (loop->heads[ix])
848       dump_oacc_loop_part (file, loop->heads[ix], depth, "Head", ix);
849   for (ix = GOMP_DIM_MAX; ix--;)
850     if (loop->tails[ix])
851       dump_oacc_loop_part (file, loop->tails[ix], depth, "Tail", ix);
852 
853   if (loop->child)
854     dump_oacc_loop (file, loop->child, depth + 1);
855   if (loop->sibling)
856     dump_oacc_loop (file, loop->sibling, depth);
857 }
858 
859 void debug_oacc_loop (oacc_loop *);
860 
861 /* Dump loops to stderr.  */
862 
863 DEBUG_FUNCTION void
debug_oacc_loop(oacc_loop * loop)864 debug_oacc_loop (oacc_loop *loop)
865 {
866   dump_oacc_loop (stderr, loop, 0);
867 }
868 
869 /* DFS walk of basic blocks BB onwards, creating OpenACC loop
870    structures as we go.  By construction these loops are properly
871    nested.  */
872 
873 static void
oacc_loop_discover_walk(oacc_loop * loop,basic_block bb)874 oacc_loop_discover_walk (oacc_loop *loop, basic_block bb)
875 {
876   int marker = 0;
877   int remaining = 0;
878 
879   if (bb->flags & BB_VISITED)
880     return;
881 
882  follow:
883   bb->flags |= BB_VISITED;
884 
885   /* Scan for loop markers.  */
886   for (gimple_stmt_iterator gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
887        gsi_next (&gsi))
888     {
889       gimple *stmt = gsi_stmt (gsi);
890 
891       if (!is_gimple_call (stmt))
892 	continue;
893 
894       gcall *call = as_a <gcall *> (stmt);
895 
896       /* If this is a routine, make a dummy loop for it.  */
897       if (tree decl = gimple_call_fndecl (call))
898 	if (tree attrs = oacc_get_fn_attrib (decl))
899 	  {
900 	    gcc_assert (!marker);
901 	    new_oacc_loop_routine (loop, call, decl, attrs);
902 	  }
903 
904       if (!gimple_call_internal_p (call))
905 	continue;
906 
907       switch (gimple_call_internal_fn (call))
908 	{
909 	default:
910 	  break;
911 
912 	case IFN_GOACC_LOOP:
913 	case IFN_GOACC_TILE:
914 	  /* Record the abstraction function, so we can manipulate it
915 	     later.  */
916 	  loop->ifns.safe_push (call);
917 	  break;
918 
919 	case IFN_UNIQUE:
920 	  enum ifn_unique_kind kind
921 	    = (enum ifn_unique_kind) (TREE_INT_CST_LOW
922 				      (gimple_call_arg (call, 0)));
923 	  if (kind == IFN_UNIQUE_OACC_HEAD_MARK
924 	      || kind == IFN_UNIQUE_OACC_TAIL_MARK)
925 	    {
926 	      if (gimple_call_num_args (call) == 2)
927 		{
928 		  gcc_assert (marker && !remaining);
929 		  marker = 0;
930 		  if (kind == IFN_UNIQUE_OACC_TAIL_MARK)
931 		    loop = finish_oacc_loop (loop);
932 		  else
933 		    loop->head_end = call;
934 		}
935 	      else
936 		{
937 		  int count = TREE_INT_CST_LOW (gimple_call_arg (call, 2));
938 
939 		  if (!marker)
940 		    {
941 		      if (kind == IFN_UNIQUE_OACC_HEAD_MARK)
942 			loop = new_oacc_loop (loop, call);
943 		      remaining = count;
944 		    }
945 		  gcc_assert (count == remaining);
946 		  if (remaining)
947 		    {
948 		      remaining--;
949 		      if (kind == IFN_UNIQUE_OACC_HEAD_MARK)
950 			loop->heads[marker] = call;
951 		      else
952 			loop->tails[remaining] = call;
953 		    }
954 		  marker++;
955 		}
956 	    }
957 	}
958     }
959   if (remaining || marker)
960     {
961       bb = single_succ (bb);
962       gcc_assert (single_pred_p (bb) && !(bb->flags & BB_VISITED));
963       goto follow;
964     }
965 
966   /* Walk successor blocks.  */
967   edge e;
968   edge_iterator ei;
969 
970   FOR_EACH_EDGE (e, ei, bb->succs)
971     oacc_loop_discover_walk (loop, e->dest);
972 }
973 
974 /* LOOP is the first sibling.  Reverse the order in place and return
975    the new first sibling.  Recurse to child loops.  */
976 
977 static oacc_loop *
oacc_loop_sibling_nreverse(oacc_loop * loop)978 oacc_loop_sibling_nreverse (oacc_loop *loop)
979 {
980   oacc_loop *last = NULL;
981   do
982     {
983       if (loop->child)
984 	loop->child = oacc_loop_sibling_nreverse (loop->child);
985 
986       oacc_loop *next = loop->sibling;
987       loop->sibling = last;
988       last = loop;
989       loop = next;
990     }
991   while (loop);
992 
993   return last;
994 }
995 
996 /* Discover the OpenACC loops marked up by HEAD and TAIL markers for
997    the current function.  */
998 
999 static oacc_loop *
oacc_loop_discovery()1000 oacc_loop_discovery ()
1001 {
1002   /* Clear basic block flags, in particular BB_VISITED which we're going to use
1003      in the following.  */
1004   clear_bb_flags ();
1005 
1006   oacc_loop *top = new_oacc_loop_outer (current_function_decl);
1007   oacc_loop_discover_walk (top, ENTRY_BLOCK_PTR_FOR_FN (cfun));
1008 
1009   /* The siblings were constructed in reverse order, reverse them so
1010      that diagnostics come out in an unsurprising order.  */
1011   top = oacc_loop_sibling_nreverse (top);
1012 
1013   return top;
1014 }
1015 
1016 /* Transform the abstract internal function markers starting at FROM
1017    to be for partitioning level LEVEL.  Stop when we meet another HEAD
1018    or TAIL  marker.  */
1019 
1020 static void
oacc_loop_xform_head_tail(gcall * from,int level)1021 oacc_loop_xform_head_tail (gcall *from, int level)
1022 {
1023   enum ifn_unique_kind kind
1024     = (enum ifn_unique_kind) TREE_INT_CST_LOW (gimple_call_arg (from, 0));
1025   tree replacement = build_int_cst (unsigned_type_node, level);
1026 
1027   for (gimple_stmt_iterator gsi = gsi_for_stmt (from);;)
1028     {
1029       gimple *stmt = gsi_stmt (gsi);
1030 
1031       if (gimple_call_internal_p (stmt, IFN_UNIQUE))
1032 	{
1033 	  enum ifn_unique_kind k
1034 	    = ((enum ifn_unique_kind)
1035 	       TREE_INT_CST_LOW (gimple_call_arg (stmt, 0)));
1036 
1037 	  if (k == IFN_UNIQUE_OACC_FORK || k == IFN_UNIQUE_OACC_JOIN)
1038 	    *gimple_call_arg_ptr (stmt, 2) = replacement;
1039 	  else if (k == kind && stmt != from)
1040 	    break;
1041 	}
1042       else if (gimple_call_internal_p (stmt, IFN_GOACC_REDUCTION))
1043 	*gimple_call_arg_ptr (stmt, 3) = replacement;
1044 
1045       gsi_next (&gsi);
1046       while (gsi_end_p (gsi))
1047 	gsi = gsi_start_bb (single_succ (gsi_bb (gsi)));
1048     }
1049 }
1050 
1051 /* Process the discovered OpenACC loops, setting the correct
1052    partitioning level etc.  */
1053 
1054 static void
oacc_loop_process(oacc_loop * loop)1055 oacc_loop_process (oacc_loop *loop)
1056 {
1057   if (loop->child)
1058     oacc_loop_process (loop->child);
1059 
1060   if (loop->mask && !loop->routine)
1061     {
1062       int ix;
1063       tree mask_arg = build_int_cst (unsigned_type_node, loop->mask);
1064       tree e_mask_arg = build_int_cst (unsigned_type_node, loop->e_mask);
1065       tree chunk_arg = loop->chunk_size;
1066       gcall *call;
1067 
1068       for (ix = 0; loop->ifns.iterate (ix, &call); ix++)
1069 	switch (gimple_call_internal_fn (call))
1070 	  {
1071 	  case IFN_GOACC_LOOP:
1072 	    {
1073 	      bool is_e = gimple_call_arg (call, 5) == integer_minus_one_node;
1074 	      gimple_call_set_arg (call, 5, is_e ? e_mask_arg : mask_arg);
1075 	      if (!is_e)
1076 		gimple_call_set_arg (call, 4, chunk_arg);
1077 	    }
1078 	    break;
1079 
1080 	  case IFN_GOACC_TILE:
1081 	    gimple_call_set_arg (call, 3, mask_arg);
1082 	    gimple_call_set_arg (call, 4, e_mask_arg);
1083 	    break;
1084 
1085 	  default:
1086 	    gcc_unreachable ();
1087 	  }
1088 
1089       unsigned dim = GOMP_DIM_GANG;
1090       unsigned mask = loop->mask | loop->e_mask;
1091       for (ix = 0; ix != GOMP_DIM_MAX && mask; ix++)
1092 	{
1093 	  while (!(GOMP_DIM_MASK (dim) & mask))
1094 	    dim++;
1095 
1096 	  oacc_loop_xform_head_tail (loop->heads[ix], dim);
1097 	  oacc_loop_xform_head_tail (loop->tails[ix], dim);
1098 
1099 	  mask ^= GOMP_DIM_MASK (dim);
1100 	}
1101     }
1102 
1103   if (loop->sibling)
1104     oacc_loop_process (loop->sibling);
1105 }
1106 
1107 /* Walk the OpenACC loop heirarchy checking and assigning the
1108    programmer-specified partitionings.  OUTER_MASK is the partitioning
1109    this loop is contained within.  Return mask of partitioning
1110    encountered.  If any auto loops are discovered, set GOMP_DIM_MAX
1111    bit.  */
1112 
1113 static unsigned
oacc_loop_fixed_partitions(oacc_loop * loop,unsigned outer_mask)1114 oacc_loop_fixed_partitions (oacc_loop *loop, unsigned outer_mask)
1115 {
1116   unsigned this_mask = loop->mask;
1117   unsigned mask_all = 0;
1118   bool noisy = true;
1119 
1120 #ifdef ACCEL_COMPILER
1121   /* When device_type is supported, we want the device compiler to be
1122      noisy, if the loop parameters are device_type-specific.  */
1123   noisy = false;
1124 #endif
1125 
1126   if (!loop->routine)
1127     {
1128       bool auto_par = (loop->flags & OLF_AUTO) != 0;
1129       bool seq_par = (loop->flags & OLF_SEQ) != 0;
1130       bool tiling = (loop->flags & OLF_TILE) != 0;
1131 
1132       this_mask = ((loop->flags >> OLF_DIM_BASE)
1133 		   & (GOMP_DIM_MASK (GOMP_DIM_MAX) - 1));
1134 
1135       /* Apply auto partitioning if this is a non-partitioned regular
1136 	 loop, or (no more than) single axis tiled loop.  */
1137       bool maybe_auto
1138 	= !seq_par && this_mask == (tiling ? this_mask & -this_mask : 0);
1139 
1140       if ((this_mask != 0) + auto_par + seq_par > 1)
1141 	{
1142 	  if (noisy)
1143 	    error_at (loop->loc,
1144 		      seq_par
1145 		      ? G_("%<seq%> overrides other OpenACC loop specifiers")
1146 		      : G_("%<auto%> conflicts with other OpenACC loop "
1147 			   "specifiers"));
1148 	  maybe_auto = false;
1149 	  loop->flags &= ~OLF_AUTO;
1150 	  if (seq_par)
1151 	    {
1152 	      loop->flags
1153 		&= ~((GOMP_DIM_MASK (GOMP_DIM_MAX) - 1) << OLF_DIM_BASE);
1154 	      this_mask = 0;
1155 	    }
1156 	}
1157 
1158       if (maybe_auto && (loop->flags & OLF_INDEPENDENT))
1159 	{
1160 	  loop->flags |= OLF_AUTO;
1161 	  mask_all |= GOMP_DIM_MASK (GOMP_DIM_MAX);
1162 	}
1163     }
1164 
1165   if (this_mask & outer_mask)
1166     {
1167       const oacc_loop *outer;
1168       for (outer = loop->parent; outer; outer = outer->parent)
1169 	if ((outer->mask | outer->e_mask) & this_mask)
1170 	  break;
1171 
1172       if (noisy)
1173 	{
1174 	  if (outer)
1175 	    {
1176 	      error_at (loop->loc,
1177 			loop->routine
1178 			? G_("routine call uses same OpenACC parallelism"
1179 			     " as containing loop")
1180 			: G_("inner loop uses same OpenACC parallelism"
1181 			     " as containing loop"));
1182 	      inform (outer->loc, "containing loop here");
1183 	    }
1184 	  else
1185 	    error_at (loop->loc,
1186 		      loop->routine
1187 		      ? G_("routine call uses OpenACC parallelism disallowed"
1188 			   " by containing routine")
1189 		      : G_("loop uses OpenACC parallelism disallowed"
1190 			   " by containing routine"));
1191 
1192 	  if (loop->routine)
1193 	    inform (DECL_SOURCE_LOCATION (loop->routine),
1194 		    "routine %qD declared here", loop->routine);
1195 	}
1196       this_mask &= ~outer_mask;
1197     }
1198   else
1199     {
1200       unsigned outermost = least_bit_hwi (this_mask);
1201 
1202       if (outermost && outermost <= outer_mask)
1203 	{
1204 	  if (noisy)
1205 	    {
1206 	      error_at (loop->loc,
1207 			"incorrectly nested OpenACC loop parallelism");
1208 
1209 	      const oacc_loop *outer;
1210 	      for (outer = loop->parent;
1211 		   outer->flags && outer->flags < outermost;
1212 		   outer = outer->parent)
1213 		continue;
1214 	      inform (outer->loc, "containing loop here");
1215 	    }
1216 
1217 	  this_mask &= ~outermost;
1218 	}
1219     }
1220 
1221   mask_all |= this_mask;
1222 
1223   if (loop->flags & OLF_TILE)
1224     {
1225       /* When tiling, vector goes to the element loop, and failing
1226 	 that we put worker there.  The std doesn't contemplate
1227 	 specifying all three.  We choose to put worker and vector on
1228 	 the element loops in that case.  */
1229       unsigned this_e_mask = this_mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR);
1230       if (!this_e_mask || this_mask & GOMP_DIM_MASK (GOMP_DIM_GANG))
1231 	this_e_mask |= this_mask & GOMP_DIM_MASK (GOMP_DIM_WORKER);
1232 
1233       loop->e_mask = this_e_mask;
1234       this_mask ^= this_e_mask;
1235     }
1236 
1237   loop->mask = this_mask;
1238 
1239   if (dump_file)
1240     fprintf (dump_file, "Loop %s:%d user specified %d & %d\n",
1241 	     LOCATION_FILE (loop->loc), LOCATION_LINE (loop->loc),
1242 	     loop->mask, loop->e_mask);
1243 
1244   if (loop->child)
1245     {
1246       unsigned tmp_mask = outer_mask | this_mask | loop->e_mask;
1247       loop->inner = oacc_loop_fixed_partitions (loop->child, tmp_mask);
1248       mask_all |= loop->inner;
1249     }
1250 
1251   if (loop->sibling)
1252     mask_all |= oacc_loop_fixed_partitions (loop->sibling, outer_mask);
1253 
1254   return mask_all;
1255 }
1256 
1257 /* Walk the OpenACC loop heirarchy to assign auto-partitioned loops.
1258    OUTER_MASK is the partitioning this loop is contained within.
1259    OUTER_ASSIGN is true if an outer loop is being auto-partitioned.
1260    Return the cumulative partitioning used by this loop, siblings and
1261    children.  */
1262 
1263 static unsigned
oacc_loop_auto_partitions(oacc_loop * loop,unsigned outer_mask,bool outer_assign)1264 oacc_loop_auto_partitions (oacc_loop *loop, unsigned outer_mask,
1265 			   bool outer_assign)
1266 {
1267   bool assign = (loop->flags & OLF_AUTO) && (loop->flags & OLF_INDEPENDENT);
1268   bool noisy = true;
1269   bool tiling = loop->flags & OLF_TILE;
1270 
1271 #ifdef ACCEL_COMPILER
1272   /* When device_type is supported, we want the device compiler to be
1273      noisy, if the loop parameters are device_type-specific.  */
1274   noisy = false;
1275 #endif
1276 
1277   if (assign && (!outer_assign || loop->inner))
1278     {
1279       /* Allocate outermost and non-innermost loops at the outermost
1280 	 non-innermost available level.  */
1281       unsigned this_mask = GOMP_DIM_MASK (GOMP_DIM_GANG);
1282 
1283       /* Find the first outermost available partition. */
1284       while (this_mask <= outer_mask)
1285 	this_mask <<= 1;
1286 
1287       /* Grab two axes if tiling, and we've not assigned anything  */
1288       if (tiling && !(loop->mask | loop->e_mask))
1289 	this_mask |= this_mask << 1;
1290 
1291       /* Prohibit the innermost partitioning at the moment.  */
1292       this_mask &= GOMP_DIM_MASK (GOMP_DIM_MAX - 1) - 1;
1293 
1294       /* Don't use any dimension explicitly claimed by an inner loop. */
1295       this_mask &= ~loop->inner;
1296 
1297       if (tiling && !loop->e_mask)
1298 	{
1299 	  /* If we got two axes, allocate the inner one to the element
1300 	     loop.  */
1301 	  loop->e_mask = this_mask & (this_mask << 1);
1302 	  this_mask ^= loop->e_mask;
1303 	}
1304 
1305       loop->mask |= this_mask;
1306     }
1307 
1308   if (loop->child)
1309     {
1310       unsigned tmp_mask = outer_mask | loop->mask | loop->e_mask;
1311       loop->inner = oacc_loop_auto_partitions (loop->child, tmp_mask,
1312 					       outer_assign | assign);
1313     }
1314 
1315   if (assign && (!loop->mask || (tiling && !loop->e_mask) || !outer_assign))
1316     {
1317       /* Allocate the loop at the innermost available level.  Note
1318 	 that we do this even if we already assigned this loop the
1319 	 outermost available level above.  That way we'll partition
1320 	 this along 2 axes, if they are available.  */
1321       unsigned this_mask = 0;
1322 
1323       /* Determine the outermost partitioning used within this loop.  */
1324       this_mask = loop->inner | GOMP_DIM_MASK (GOMP_DIM_MAX);
1325       this_mask = least_bit_hwi (this_mask);
1326 
1327       /* Pick the partitioning just inside that one.  */
1328       this_mask >>= 1;
1329 
1330       /* And avoid picking one use by an outer loop.  */
1331       this_mask &= ~outer_mask;
1332 
1333       /* If tiling and we failed completely above, grab the next one
1334 	 too.  Making sure it doesn't hit an outer loop.  */
1335       if (tiling)
1336 	{
1337 	  this_mask &= ~(loop->e_mask | loop->mask);
1338 	  unsigned tile_mask = ((this_mask >> 1)
1339 				& ~(outer_mask | loop->e_mask | loop->mask));
1340 
1341 	  if (tile_mask || loop->mask)
1342 	    {
1343 	      loop->e_mask |= this_mask;
1344 	      this_mask = tile_mask;
1345 	    }
1346 	  if (!loop->e_mask && noisy)
1347 	    warning_at (loop->loc, 0,
1348 			"insufficient partitioning available"
1349 			" to parallelize element loop");
1350 	}
1351 
1352       loop->mask |= this_mask;
1353       if (!loop->mask && noisy)
1354 	warning_at (loop->loc, 0,
1355 		    tiling
1356 		    ? G_("insufficient partitioning available"
1357 			 " to parallelize tile loop")
1358 		    : G_("insufficient partitioning available"
1359 			 " to parallelize loop"));
1360     }
1361 
1362   if (assign && dump_file)
1363     fprintf (dump_file, "Auto loop %s:%d assigned %d & %d\n",
1364 	     LOCATION_FILE (loop->loc), LOCATION_LINE (loop->loc),
1365 	     loop->mask, loop->e_mask);
1366 
1367   unsigned inner_mask = 0;
1368 
1369   if (loop->sibling)
1370     inner_mask |= oacc_loop_auto_partitions (loop->sibling,
1371 					     outer_mask, outer_assign);
1372 
1373   inner_mask |= loop->inner | loop->mask | loop->e_mask;
1374 
1375   return inner_mask;
1376 }
1377 
1378 /* Walk the OpenACC loop heirarchy to check and assign partitioning
1379    axes.  Return mask of partitioning.  */
1380 
1381 static unsigned
oacc_loop_partition(oacc_loop * loop,unsigned outer_mask)1382 oacc_loop_partition (oacc_loop *loop, unsigned outer_mask)
1383 {
1384   unsigned mask_all = oacc_loop_fixed_partitions (loop, outer_mask);
1385 
1386   if (mask_all & GOMP_DIM_MASK (GOMP_DIM_MAX))
1387     {
1388       mask_all ^= GOMP_DIM_MASK (GOMP_DIM_MAX);
1389       mask_all |= oacc_loop_auto_partitions (loop, outer_mask, false);
1390     }
1391   return mask_all;
1392 }
1393 
1394 /* Default fork/join early expander.  Delete the function calls if
1395    there is no RTL expander.  */
1396 
1397 bool
default_goacc_fork_join(gcall * ARG_UNUSED (call),const int * ARG_UNUSED (dims),bool is_fork)1398 default_goacc_fork_join (gcall *ARG_UNUSED (call),
1399 			 const int *ARG_UNUSED (dims), bool is_fork)
1400 {
1401   if (is_fork)
1402     return targetm.have_oacc_fork ();
1403   else
1404     return targetm.have_oacc_join ();
1405 }
1406 
1407 /* Default goacc.reduction early expander.
1408 
1409    LHS-opt = IFN_REDUCTION (KIND, RES_PTR, VAR, LEVEL, OP, OFFSET)
1410    If RES_PTR is not integer-zerop:
1411        SETUP - emit 'LHS = *RES_PTR', LHS = NULL
1412        TEARDOWN - emit '*RES_PTR = VAR'
1413    If LHS is not NULL
1414        emit 'LHS = VAR'   */
1415 
1416 void
default_goacc_reduction(gcall * call)1417 default_goacc_reduction (gcall *call)
1418 {
1419   unsigned code = (unsigned)TREE_INT_CST_LOW (gimple_call_arg (call, 0));
1420   gimple_stmt_iterator gsi = gsi_for_stmt (call);
1421   tree lhs = gimple_call_lhs (call);
1422   tree var = gimple_call_arg (call, 2);
1423   gimple_seq seq = NULL;
1424 
1425   if (code == IFN_GOACC_REDUCTION_SETUP
1426       || code == IFN_GOACC_REDUCTION_TEARDOWN)
1427     {
1428       /* Setup and Teardown need to copy from/to the receiver object,
1429 	 if there is one.  */
1430       tree ref_to_res = gimple_call_arg (call, 1);
1431 
1432       if (!integer_zerop (ref_to_res))
1433 	{
1434 	  tree dst = build_simple_mem_ref (ref_to_res);
1435 	  tree src = var;
1436 
1437 	  if (code == IFN_GOACC_REDUCTION_SETUP)
1438 	    {
1439 	      src = dst;
1440 	      dst = lhs;
1441 	      lhs = NULL;
1442 	    }
1443 	  gimple_seq_add_stmt (&seq, gimple_build_assign (dst, src));
1444 	}
1445     }
1446 
1447   /* Copy VAR to LHS, if there is an LHS.  */
1448   if (lhs)
1449     gimple_seq_add_stmt (&seq, gimple_build_assign (lhs, var));
1450 
1451   gsi_replace_with_seq (&gsi, seq, true);
1452 }
1453 
1454 /* Main entry point for oacc transformations which run on the device
1455    compiler after LTO, so we know what the target device is at this
1456    point (including the host fallback).  */
1457 
1458 static unsigned int
execute_oacc_device_lower()1459 execute_oacc_device_lower ()
1460 {
1461   tree attrs = oacc_get_fn_attrib (current_function_decl);
1462 
1463   if (!attrs)
1464     /* Not an offloaded function.  */
1465     return 0;
1466 
1467   /* Parse the default dim argument exactly once.  */
1468   if ((const void *)flag_openacc_dims != &flag_openacc_dims)
1469     {
1470       oacc_parse_default_dims (flag_openacc_dims);
1471       flag_openacc_dims = (char *)&flag_openacc_dims;
1472     }
1473 
1474   bool is_oacc_kernels
1475     = (lookup_attribute ("oacc kernels",
1476 			 DECL_ATTRIBUTES (current_function_decl)) != NULL);
1477   bool is_oacc_kernels_parallelized
1478     = (lookup_attribute ("oacc kernels parallelized",
1479 			 DECL_ATTRIBUTES (current_function_decl)) != NULL);
1480 
1481   /* Unparallelized OpenACC kernels constructs must get launched as 1 x 1 x 1
1482      kernels, so remove the parallelism dimensions function attributes
1483      potentially set earlier on.  */
1484   if (is_oacc_kernels && !is_oacc_kernels_parallelized)
1485     {
1486       oacc_set_fn_attrib (current_function_decl, NULL, NULL);
1487       attrs = oacc_get_fn_attrib (current_function_decl);
1488     }
1489 
1490   /* Discover, partition and process the loops.  */
1491   oacc_loop *loops = oacc_loop_discovery ();
1492   int fn_level = oacc_fn_attrib_level (attrs);
1493 
1494   if (dump_file)
1495     {
1496       if (fn_level >= 0)
1497 	fprintf (dump_file, "Function is OpenACC routine level %d\n",
1498 		 fn_level);
1499       else if (is_oacc_kernels)
1500 	fprintf (dump_file, "Function is %s OpenACC kernels offload\n",
1501 		 (is_oacc_kernels_parallelized
1502 		  ? "parallelized" : "unparallelized"));
1503       else
1504 	fprintf (dump_file, "Function is OpenACC parallel offload\n");
1505     }
1506 
1507   unsigned outer_mask = fn_level >= 0 ? GOMP_DIM_MASK (fn_level) - 1 : 0;
1508   unsigned used_mask = oacc_loop_partition (loops, outer_mask);
1509   /* OpenACC kernels constructs are special: they currently don't use the
1510      generic oacc_loop infrastructure and attribute/dimension processing.  */
1511   if (is_oacc_kernels && is_oacc_kernels_parallelized)
1512     {
1513       /* Parallelized OpenACC kernels constructs use gang parallelism.  See
1514 	 also tree-parloops.c:create_parallel_loop.  */
1515       used_mask |= GOMP_DIM_MASK (GOMP_DIM_GANG);
1516     }
1517 
1518   int dims[GOMP_DIM_MAX];
1519   oacc_validate_dims (current_function_decl, attrs, dims, fn_level, used_mask);
1520 
1521   if (dump_file)
1522     {
1523       const char *comma = "Compute dimensions [";
1524       for (int ix = 0; ix != GOMP_DIM_MAX; ix++, comma = ", ")
1525 	fprintf (dump_file, "%s%d", comma, dims[ix]);
1526       fprintf (dump_file, "]\n");
1527     }
1528 
1529   oacc_loop_process (loops);
1530   if (dump_file)
1531     {
1532       fprintf (dump_file, "OpenACC loops\n");
1533       dump_oacc_loop (dump_file, loops, 0);
1534       fprintf (dump_file, "\n");
1535     }
1536 
1537   /* Offloaded targets may introduce new basic blocks, which require
1538      dominance information to update SSA.  */
1539   calculate_dominance_info (CDI_DOMINATORS);
1540 
1541   /* Now lower internal loop functions to target-specific code
1542      sequences.  */
1543   basic_block bb;
1544   FOR_ALL_BB_FN (bb, cfun)
1545     for (gimple_stmt_iterator gsi = gsi_start_bb (bb); !gsi_end_p (gsi);)
1546       {
1547 	gimple *stmt = gsi_stmt (gsi);
1548 	if (!is_gimple_call (stmt))
1549 	  {
1550 	    gsi_next (&gsi);
1551 	    continue;
1552 	  }
1553 
1554 	gcall *call = as_a <gcall *> (stmt);
1555 	if (!gimple_call_internal_p (call))
1556 	  {
1557 	    gsi_next (&gsi);
1558 	    continue;
1559 	  }
1560 
1561 	/* Rewind to allow rescan.  */
1562 	gsi_prev (&gsi);
1563 	bool rescan = false, remove = false;
1564 	enum  internal_fn ifn_code = gimple_call_internal_fn (call);
1565 
1566 	switch (ifn_code)
1567 	  {
1568 	  default: break;
1569 
1570 	  case IFN_GOACC_TILE:
1571 	    oacc_xform_tile (call);
1572 	    rescan = true;
1573 	    break;
1574 
1575 	  case IFN_GOACC_LOOP:
1576 	    oacc_xform_loop (call);
1577 	    rescan = true;
1578 	    break;
1579 
1580 	  case IFN_GOACC_REDUCTION:
1581 	    /* Mark the function for SSA renaming.  */
1582 	    mark_virtual_operands_for_renaming (cfun);
1583 
1584 	    /* If the level is -1, this ended up being an unused
1585 	       axis.  Handle as a default.  */
1586 	    if (integer_minus_onep (gimple_call_arg (call, 3)))
1587 	      default_goacc_reduction (call);
1588 	    else
1589 	      targetm.goacc.reduction (call);
1590 	    rescan = true;
1591 	    break;
1592 
1593 	  case IFN_UNIQUE:
1594 	    {
1595 	      enum ifn_unique_kind kind
1596 		= ((enum ifn_unique_kind)
1597 		   TREE_INT_CST_LOW (gimple_call_arg (call, 0)));
1598 
1599 	      switch (kind)
1600 		{
1601 		default:
1602 		  break;
1603 
1604 		case IFN_UNIQUE_OACC_FORK:
1605 		case IFN_UNIQUE_OACC_JOIN:
1606 		  if (integer_minus_onep (gimple_call_arg (call, 2)))
1607 		    remove = true;
1608 		  else if (!targetm.goacc.fork_join
1609 			   (call, dims, kind == IFN_UNIQUE_OACC_FORK))
1610 		    remove = true;
1611 		  break;
1612 
1613 		case IFN_UNIQUE_OACC_HEAD_MARK:
1614 		case IFN_UNIQUE_OACC_TAIL_MARK:
1615 		  remove = true;
1616 		  break;
1617 		}
1618 	      break;
1619 	    }
1620 	  }
1621 
1622 	if (gsi_end_p (gsi))
1623 	  /* We rewound past the beginning of the BB.  */
1624 	  gsi = gsi_start_bb (bb);
1625 	else
1626 	  /* Undo the rewind.  */
1627 	  gsi_next (&gsi);
1628 
1629 	if (remove)
1630 	  {
1631 	    if (gimple_vdef (call))
1632 	      replace_uses_by (gimple_vdef (call), gimple_vuse (call));
1633 	    if (gimple_call_lhs (call))
1634 	      {
1635 		/* Propagate the data dependency var.  */
1636 		gimple *ass = gimple_build_assign (gimple_call_lhs (call),
1637 						   gimple_call_arg (call, 1));
1638 		gsi_replace (&gsi, ass,  false);
1639 	      }
1640 	    else
1641 	      gsi_remove (&gsi, true);
1642 	  }
1643 	else if (!rescan)
1644 	  /* If not rescanning, advance over the call.  */
1645 	  gsi_next (&gsi);
1646       }
1647 
1648   free_oacc_loop (loops);
1649 
1650   return 0;
1651 }
1652 
1653 /* Default launch dimension validator.  Force everything to 1.  A
1654    backend that wants to provide larger dimensions must override this
1655    hook.  */
1656 
1657 bool
default_goacc_validate_dims(tree ARG_UNUSED (decl),int * dims,int ARG_UNUSED (fn_level))1658 default_goacc_validate_dims (tree ARG_UNUSED (decl), int *dims,
1659 			     int ARG_UNUSED (fn_level))
1660 {
1661   bool changed = false;
1662 
1663   for (unsigned ix = 0; ix != GOMP_DIM_MAX; ix++)
1664     {
1665       if (dims[ix] != 1)
1666 	{
1667 	  dims[ix] = 1;
1668 	  changed = true;
1669 	}
1670     }
1671 
1672   return changed;
1673 }
1674 
1675 /* Default dimension bound is unknown on accelerator and 1 on host.  */
1676 
1677 int
default_goacc_dim_limit(int ARG_UNUSED (axis))1678 default_goacc_dim_limit (int ARG_UNUSED (axis))
1679 {
1680 #ifdef ACCEL_COMPILER
1681   return 0;
1682 #else
1683   return 1;
1684 #endif
1685 }
1686 
1687 namespace {
1688 
1689 const pass_data pass_data_oacc_device_lower =
1690 {
1691   GIMPLE_PASS, /* type */
1692   "oaccdevlow", /* name */
1693   OPTGROUP_OMP, /* optinfo_flags */
1694   TV_NONE, /* tv_id */
1695   PROP_cfg, /* properties_required */
1696   0 /* Possibly PROP_gimple_eomp.  */, /* properties_provided */
1697   0, /* properties_destroyed */
1698   0, /* todo_flags_start */
1699   TODO_update_ssa | TODO_cleanup_cfg, /* todo_flags_finish */
1700 };
1701 
1702 class pass_oacc_device_lower : public gimple_opt_pass
1703 {
1704 public:
pass_oacc_device_lower(gcc::context * ctxt)1705   pass_oacc_device_lower (gcc::context *ctxt)
1706     : gimple_opt_pass (pass_data_oacc_device_lower, ctxt)
1707   {}
1708 
1709   /* opt_pass methods: */
gate(function *)1710   virtual bool gate (function *) { return flag_openacc; };
1711 
execute(function *)1712   virtual unsigned int execute (function *)
1713     {
1714       return execute_oacc_device_lower ();
1715     }
1716 
1717 }; // class pass_oacc_device_lower
1718 
1719 } // anon namespace
1720 
1721 gimple_opt_pass *
make_pass_oacc_device_lower(gcc::context * ctxt)1722 make_pass_oacc_device_lower (gcc::context *ctxt)
1723 {
1724   return new pass_oacc_device_lower (ctxt);
1725 }
1726 
1727 
1728 /* Rewrite GOMP_SIMT_ENTER_ALLOC call given by GSI and remove the preceding
1729    GOMP_SIMT_ENTER call identifying the privatized variables, which are
1730    turned to structure fields and receive a DECL_VALUE_EXPR accordingly.
1731    Set *REGIMPLIFY to true, except if no privatized variables were seen.  */
1732 
1733 static void
ompdevlow_adjust_simt_enter(gimple_stmt_iterator * gsi,bool * regimplify)1734 ompdevlow_adjust_simt_enter (gimple_stmt_iterator *gsi, bool *regimplify)
1735 {
1736   gimple *alloc_stmt = gsi_stmt (*gsi);
1737   tree simtrec = gimple_call_lhs (alloc_stmt);
1738   tree simduid = gimple_call_arg (alloc_stmt, 0);
1739   gimple *enter_stmt = SSA_NAME_DEF_STMT (simduid);
1740   gcc_assert (gimple_call_internal_p (enter_stmt, IFN_GOMP_SIMT_ENTER));
1741   tree rectype = lang_hooks.types.make_type (RECORD_TYPE);
1742   TYPE_ARTIFICIAL (rectype) = TYPE_NAMELESS (rectype) = 1;
1743   TREE_ADDRESSABLE (rectype) = 1;
1744   TREE_TYPE (simtrec) = build_pointer_type (rectype);
1745   for (unsigned i = 1; i < gimple_call_num_args (enter_stmt); i++)
1746     {
1747       tree *argp = gimple_call_arg_ptr (enter_stmt, i);
1748       if (*argp == null_pointer_node)
1749 	continue;
1750       gcc_assert (TREE_CODE (*argp) == ADDR_EXPR
1751 		  && VAR_P (TREE_OPERAND (*argp, 0)));
1752       tree var = TREE_OPERAND (*argp, 0);
1753 
1754       tree field = build_decl (DECL_SOURCE_LOCATION (var), FIELD_DECL,
1755 			       DECL_NAME (var), TREE_TYPE (var));
1756       SET_DECL_ALIGN (field, DECL_ALIGN (var));
1757       DECL_USER_ALIGN (field) = DECL_USER_ALIGN (var);
1758       TREE_THIS_VOLATILE (field) = TREE_THIS_VOLATILE (var);
1759 
1760       insert_field_into_struct (rectype, field);
1761 
1762       tree t = build_simple_mem_ref (simtrec);
1763       t = build3 (COMPONENT_REF, TREE_TYPE (var), t, field, NULL);
1764       TREE_THIS_VOLATILE (t) = TREE_THIS_VOLATILE (var);
1765       SET_DECL_VALUE_EXPR (var, t);
1766       DECL_HAS_VALUE_EXPR_P (var) = 1;
1767       *regimplify = true;
1768     }
1769   layout_type (rectype);
1770   tree size = TYPE_SIZE_UNIT (rectype);
1771   tree align = build_int_cst (TREE_TYPE (size), TYPE_ALIGN_UNIT (rectype));
1772 
1773   alloc_stmt
1774     = gimple_build_call_internal (IFN_GOMP_SIMT_ENTER_ALLOC, 2, size, align);
1775   gimple_call_set_lhs (alloc_stmt, simtrec);
1776   gsi_replace (gsi, alloc_stmt, false);
1777   gimple_stmt_iterator enter_gsi = gsi_for_stmt (enter_stmt);
1778   enter_stmt = gimple_build_assign (simduid, gimple_call_arg (enter_stmt, 0));
1779   gsi_replace (&enter_gsi, enter_stmt, false);
1780 
1781   use_operand_p use;
1782   gimple *exit_stmt;
1783   if (single_imm_use (simtrec, &use, &exit_stmt))
1784     {
1785       gcc_assert (gimple_call_internal_p (exit_stmt, IFN_GOMP_SIMT_EXIT));
1786       gimple_stmt_iterator exit_gsi = gsi_for_stmt (exit_stmt);
1787       tree clobber = build_constructor (rectype, NULL);
1788       TREE_THIS_VOLATILE (clobber) = 1;
1789       exit_stmt = gimple_build_assign (build_simple_mem_ref (simtrec), clobber);
1790       gsi_insert_before (&exit_gsi, exit_stmt, GSI_SAME_STMT);
1791     }
1792   else
1793     gcc_checking_assert (has_zero_uses (simtrec));
1794 }
1795 
1796 /* Callback for walk_gimple_stmt used to scan for SIMT-privatized variables.  */
1797 
1798 static tree
find_simtpriv_var_op(tree * tp,int * walk_subtrees,void *)1799 find_simtpriv_var_op (tree *tp, int *walk_subtrees, void *)
1800 {
1801   tree t = *tp;
1802 
1803   if (VAR_P (t)
1804       && DECL_HAS_VALUE_EXPR_P (t)
1805       && lookup_attribute ("omp simt private", DECL_ATTRIBUTES (t)))
1806     {
1807       *walk_subtrees = 0;
1808       return t;
1809     }
1810   return NULL_TREE;
1811 }
1812 
1813 /* Cleanup uses of SIMT placeholder internal functions: on non-SIMT targets,
1814    VF is 1 and LANE is 0; on SIMT targets, VF is folded to a constant, and
1815    LANE is kept to be expanded to RTL later on.  Also cleanup all other SIMT
1816    internal functions on non-SIMT targets, and likewise some SIMD internal
1817    functions on SIMT targets.  */
1818 
1819 static unsigned int
execute_omp_device_lower()1820 execute_omp_device_lower ()
1821 {
1822   int vf = targetm.simt.vf ? targetm.simt.vf () : 1;
1823   bool regimplify = false;
1824   basic_block bb;
1825   gimple_stmt_iterator gsi;
1826   FOR_EACH_BB_FN (bb, cfun)
1827     for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
1828       {
1829 	gimple *stmt = gsi_stmt (gsi);
1830 	if (!is_gimple_call (stmt) || !gimple_call_internal_p (stmt))
1831 	  continue;
1832 	tree lhs = gimple_call_lhs (stmt), rhs = NULL_TREE;
1833 	tree type = lhs ? TREE_TYPE (lhs) : integer_type_node;
1834 	switch (gimple_call_internal_fn (stmt))
1835 	  {
1836 	  case IFN_GOMP_USE_SIMT:
1837 	    rhs = vf == 1 ? integer_zero_node : integer_one_node;
1838 	    break;
1839 	  case IFN_GOMP_SIMT_ENTER:
1840 	    rhs = vf == 1 ? gimple_call_arg (stmt, 0) : NULL_TREE;
1841 	    goto simtreg_enter_exit;
1842 	  case IFN_GOMP_SIMT_ENTER_ALLOC:
1843 	    if (vf != 1)
1844 	      ompdevlow_adjust_simt_enter (&gsi, &regimplify);
1845 	    rhs = vf == 1 ? null_pointer_node : NULL_TREE;
1846 	    goto simtreg_enter_exit;
1847 	  case IFN_GOMP_SIMT_EXIT:
1848 	  simtreg_enter_exit:
1849 	    if (vf != 1)
1850 	      continue;
1851 	    unlink_stmt_vdef (stmt);
1852 	    break;
1853 	  case IFN_GOMP_SIMT_LANE:
1854 	  case IFN_GOMP_SIMT_LAST_LANE:
1855 	    rhs = vf == 1 ? build_zero_cst (type) : NULL_TREE;
1856 	    break;
1857 	  case IFN_GOMP_SIMT_VF:
1858 	    rhs = build_int_cst (type, vf);
1859 	    break;
1860 	  case IFN_GOMP_SIMT_ORDERED_PRED:
1861 	    rhs = vf == 1 ? integer_zero_node : NULL_TREE;
1862 	    if (rhs || !lhs)
1863 	      unlink_stmt_vdef (stmt);
1864 	    break;
1865 	  case IFN_GOMP_SIMT_VOTE_ANY:
1866 	  case IFN_GOMP_SIMT_XCHG_BFLY:
1867 	  case IFN_GOMP_SIMT_XCHG_IDX:
1868 	    rhs = vf == 1 ? gimple_call_arg (stmt, 0) : NULL_TREE;
1869 	    break;
1870 	  case IFN_GOMP_SIMD_LANE:
1871 	  case IFN_GOMP_SIMD_LAST_LANE:
1872 	    rhs = vf != 1 ? build_zero_cst (type) : NULL_TREE;
1873 	    break;
1874 	  case IFN_GOMP_SIMD_VF:
1875 	    rhs = vf != 1 ? build_one_cst (type) : NULL_TREE;
1876 	    break;
1877 	  default:
1878 	    continue;
1879 	  }
1880 	if (lhs && !rhs)
1881 	  continue;
1882 	stmt = lhs ? gimple_build_assign (lhs, rhs) : gimple_build_nop ();
1883 	gsi_replace (&gsi, stmt, false);
1884       }
1885   if (regimplify)
1886     FOR_EACH_BB_REVERSE_FN (bb, cfun)
1887       for (gsi = gsi_last_bb (bb); !gsi_end_p (gsi); gsi_prev (&gsi))
1888 	if (walk_gimple_stmt (&gsi, NULL, find_simtpriv_var_op, NULL))
1889 	  {
1890 	    if (gimple_clobber_p (gsi_stmt (gsi)))
1891 	      gsi_remove (&gsi, true);
1892 	    else
1893 	      gimple_regimplify_operands (gsi_stmt (gsi), &gsi);
1894 	  }
1895   if (vf != 1)
1896     cfun->has_force_vectorize_loops = false;
1897   return 0;
1898 }
1899 
1900 namespace {
1901 
1902 const pass_data pass_data_omp_device_lower =
1903 {
1904   GIMPLE_PASS, /* type */
1905   "ompdevlow", /* name */
1906   OPTGROUP_OMP, /* optinfo_flags */
1907   TV_NONE, /* tv_id */
1908   PROP_cfg, /* properties_required */
1909   PROP_gimple_lomp_dev, /* properties_provided */
1910   0, /* properties_destroyed */
1911   0, /* todo_flags_start */
1912   TODO_update_ssa, /* todo_flags_finish */
1913 };
1914 
1915 class pass_omp_device_lower : public gimple_opt_pass
1916 {
1917 public:
pass_omp_device_lower(gcc::context * ctxt)1918   pass_omp_device_lower (gcc::context *ctxt)
1919     : gimple_opt_pass (pass_data_omp_device_lower, ctxt)
1920   {}
1921 
1922   /* opt_pass methods: */
gate(function * fun)1923   virtual bool gate (function *fun)
1924     {
1925       return !(fun->curr_properties & PROP_gimple_lomp_dev);
1926     }
execute(function *)1927   virtual unsigned int execute (function *)
1928     {
1929       return execute_omp_device_lower ();
1930     }
1931 
1932 }; // class pass_expand_omp_ssa
1933 
1934 } // anon namespace
1935 
1936 gimple_opt_pass *
make_pass_omp_device_lower(gcc::context * ctxt)1937 make_pass_omp_device_lower (gcc::context *ctxt)
1938 {
1939   return new pass_omp_device_lower (ctxt);
1940 }
1941 
1942 /* "omp declare target link" handling pass.  */
1943 
1944 namespace {
1945 
1946 const pass_data pass_data_omp_target_link =
1947 {
1948   GIMPLE_PASS,			/* type */
1949   "omptargetlink",		/* name */
1950   OPTGROUP_OMP,			/* optinfo_flags */
1951   TV_NONE,			/* tv_id */
1952   PROP_ssa,			/* properties_required */
1953   0,				/* properties_provided */
1954   0,				/* properties_destroyed */
1955   0,				/* todo_flags_start */
1956   TODO_update_ssa,		/* todo_flags_finish */
1957 };
1958 
1959 class pass_omp_target_link : public gimple_opt_pass
1960 {
1961 public:
pass_omp_target_link(gcc::context * ctxt)1962   pass_omp_target_link (gcc::context *ctxt)
1963     : gimple_opt_pass (pass_data_omp_target_link, ctxt)
1964   {}
1965 
1966   /* opt_pass methods: */
gate(function * fun)1967   virtual bool gate (function *fun)
1968     {
1969 #ifdef ACCEL_COMPILER
1970       return offloading_function_p (fun->decl);
1971 #else
1972       (void) fun;
1973       return false;
1974 #endif
1975     }
1976 
1977   virtual unsigned execute (function *);
1978 };
1979 
1980 /* Callback for walk_gimple_stmt used to scan for link var operands.  */
1981 
1982 static tree
find_link_var_op(tree * tp,int * walk_subtrees,void *)1983 find_link_var_op (tree *tp, int *walk_subtrees, void *)
1984 {
1985   tree t = *tp;
1986 
1987   if (VAR_P (t)
1988       && DECL_HAS_VALUE_EXPR_P (t)
1989       && is_global_var (t)
1990       && lookup_attribute ("omp declare target link", DECL_ATTRIBUTES (t)))
1991     {
1992       *walk_subtrees = 0;
1993       return t;
1994     }
1995 
1996   return NULL_TREE;
1997 }
1998 
1999 unsigned
execute(function * fun)2000 pass_omp_target_link::execute (function *fun)
2001 {
2002   basic_block bb;
2003   FOR_EACH_BB_FN (bb, fun)
2004     {
2005       gimple_stmt_iterator gsi;
2006       for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
2007 	if (walk_gimple_stmt (&gsi, NULL, find_link_var_op, NULL))
2008 	  gimple_regimplify_operands (gsi_stmt (gsi), &gsi);
2009     }
2010 
2011   return 0;
2012 }
2013 
2014 } // anon namespace
2015 
2016 gimple_opt_pass *
make_pass_omp_target_link(gcc::context * ctxt)2017 make_pass_omp_target_link (gcc::context *ctxt)
2018 {
2019   return new pass_omp_target_link (ctxt);
2020 }
2021