1 /* Lowering and expansion of OpenMP directives for HSA GPU agents.
2 
3    Copyright (C) 2013-2020 Free Software Foundation, Inc.
4 
5 This file is part of GCC.
6 
7 GCC is free software; you can redistribute it and/or modify it under
8 the terms of the GNU General Public License as published by the Free
9 Software Foundation; either version 3, or (at your option) any later
10 version.
11 
12 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
13 WARRANTY; without even the implied warranty of MERCHANTABILITY or
14 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
15 for more details.
16 
17 You should have received a copy of the GNU General Public License
18 along with GCC; see the file COPYING3.  If not see
19 <http://www.gnu.org/licenses/>.  */
20 
21 #include "config.h"
22 #include "system.h"
23 #include "coretypes.h"
24 #include "backend.h"
25 #include "tree.h"
26 #include "gimple.h"
27 #include "tree-pass.h"
28 #include "ssa.h"
29 #include "cgraph.h"
30 #include "pretty-print.h"
31 #include "fold-const.h"
32 #include "gimplify.h"
33 #include "gimple-iterator.h"
34 #include "gimple-walk.h"
35 #include "tree-inline.h"
36 #include "langhooks.h"
37 #include "omp-general.h"
38 #include "omp-low.h"
39 #include "omp-grid.h"
40 #include "gimple-pretty-print.h"
41 
42 /* Return the lastprivate predicate for a given gridified loop described by
43    FD.  */
44 
45 tree
46 omp_grid_lastprivate_predicate (struct omp_for_data *fd)
47 {
48   /* When dealing with a gridified loop, we need to check up to three collapsed
49      iteration variables but they are not actually captured in FD.
50      Fortunately, we can easily rely on HSA builtins to get this
51      information.  */
52 
53   tree id, size;
54   if (gimple_omp_for_kind (fd->for_stmt) == GF_OMP_FOR_KIND_GRID_LOOP
55       && gimple_omp_for_grid_intra_group (fd->for_stmt))
56     {
57       id = builtin_decl_explicit (BUILT_IN_HSA_WORKITEMID);
58       size = builtin_decl_explicit (BUILT_IN_HSA_CURRENTWORKGROUPSIZE);
59     }
60   else
61     {
62       id = builtin_decl_explicit (BUILT_IN_HSA_WORKITEMABSID);
63       size = builtin_decl_explicit (BUILT_IN_HSA_GRIDSIZE);
64     }
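  /* Build the conjunction over all collapsed dimensions of the test
     "id (dim) + 1 == size (dim)", which holds exactly for the work-item that
     executes the last iteration in every dimension.  */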
65   tree cond = NULL;
66   for (int dim = 0; dim < fd->collapse; dim++)
67     {
68       tree dim_tree = build_int_cstu (unsigned_type_node, dim);
69       tree u1 = build_int_cstu (unsigned_type_node, 1);
70       tree c2
71 	= build2 (EQ_EXPR, boolean_type_node,
72 		  build2 (PLUS_EXPR, unsigned_type_node,
73 			  build_call_expr (id, 1, dim_tree), u1),
74 		  build_call_expr (size, 1, dim_tree));
75       if (cond)
76 	cond = build2 (TRUTH_AND_EXPR, boolean_type_node, cond, c2);
77       else
78 	cond = c2;
79     }
80   return cond;
81 }
82 
83 /* Structure describing the basic properties of the loop we are analyzing,
84    both when checking whether it can be gridified and when gridifying it.  */
85 
86 class grid_prop
87 {
88 public:
89   /* True when we are doing tiling gridification, i.e. when there is a distinct
90      distribute loop over groups and a loop construct over work-items.  False
91      when distribute and parallel for loops form a combined construct.  */
92   bool tiling;
93   /* Location of the target construct for optimization information
94      messages.  */
95   dump_user_location_t target_loc;
96   /* The value of the collapse clause of the involved loops.  It must be the
97      same for all of them for gridification to take place.  */
98   size_t collapse;
99   /* Group sizes, if requested by the user or NULL if not requested.  */
100   tree group_sizes[3];
101 };
102 
103 #define GRID_MISSED_MSG_PREFIX "Will not turn target construct into a " \
104   "gridified HSA kernel because "
105 
106 /* Return true if STMT is an assignment of a register-type value into a local
107    VAR_DECL.  If GRID is non-NULL, the assignment additionally must not be to
108    any of the trees specifying group sizes there.  */
109 
110 static bool
111 grid_safe_assignment_p (gimple *stmt, grid_prop *grid)
112 {
113   gassign *assign = dyn_cast <gassign *> (stmt);
114   if (!assign)
115     return false;
116   if (gimple_clobber_p (assign))
117     return true;
118   tree lhs = gimple_assign_lhs (assign);
119   if (!VAR_P (lhs)
120       || !is_gimple_reg_type (TREE_TYPE (lhs))
121       || is_global_var (lhs))
122     return false;
123   if (grid)
124     for (unsigned i = 0; i < grid->collapse; i++)
125       if (lhs == grid->group_sizes[i])
126 	return false;
127   return true;
128 }
129 
130 /* Return true if all statements in SEQ are assignments to local register-type
131    variables that do not hold group size information.  */
132 
133 static bool
134 grid_seq_only_contains_local_assignments (gimple_seq seq, grid_prop *grid)
135 {
136   if (!seq)
137     return true;
138 
139   gimple_stmt_iterator gsi;
140   for (gsi = gsi_start (seq); !gsi_end_p (gsi); gsi_next (&gsi))
141     if (!grid_safe_assignment_p (gsi_stmt (gsi), grid))
142       return false;
143   return true;
144 }
145 
146 /* Scan statements in SEQ and recurse into any bind found there.  GRID
147    describes hitherto discovered properties of the loop that is evaluated for
148    possible gridification.  If during the whole search only assignments to
149    register-type local variables (that do not overwrite group size information)
150    and at most one OMP statement are encountered, return true, otherwise return
151    false.  RET is where we store any OMP statement encountered.  */
152 
153 static bool
154 grid_find_single_omp_among_assignments_1 (gimple_seq seq, grid_prop *grid,
155 					  const char *name, gimple **ret)
156 {
157   gimple_stmt_iterator gsi;
158   for (gsi = gsi_start (seq); !gsi_end_p (gsi); gsi_next (&gsi))
159     {
160       gimple *stmt = gsi_stmt (gsi);
161 
162       if (grid_safe_assignment_p (stmt, grid))
163 	continue;
164       if (gbind *bind = dyn_cast <gbind *> (stmt))
165 	{
166 	  gimple_seq bind_body = gimple_bind_body (bind);
167 	  if (!grid_find_single_omp_among_assignments_1 (bind_body, grid, name,
168 							 ret))
169 	      return false;
170 	}
171       else if (is_gimple_omp (stmt))
172 	{
173 	  if (*ret)
174 	    {
175 	      if (dump_enabled_p ())
176 		{
177 		  dump_printf_loc (MSG_MISSED_OPTIMIZATION, grid->target_loc,
178 				   GRID_MISSED_MSG_PREFIX "%s construct "
179 				   "contains multiple OpenMP constructs\n",
180 				   name);
181 		  dump_printf_loc (MSG_NOTE, *ret,
182 				   "The first OpenMP construct within "
183 				   "a parallel\n");
184 		  dump_printf_loc (MSG_NOTE, stmt,
185 				   "The second OpenMP construct within "
186 				   "a parallel\n");
187 		}
188 	      return false;
189 	    }
190 	  *ret = stmt;
191 	}
192       else
193 	{
194 	  if (dump_enabled_p ())
195 	    {
196 	      dump_printf_loc (MSG_MISSED_OPTIMIZATION, grid->target_loc,
197 			       GRID_MISSED_MSG_PREFIX "%s construct contains "
198 			       "a complex statement\n", name);
199 	      dump_printf_loc (MSG_NOTE, stmt,
200 			       "This statement cannot be analyzed for "
201 			       "gridification\n");
202 	    }
203 	  return false;
204 	}
205     }
206   return true;
207 }
208 
209 /* Scan statements in SEQ and make sure that it and any binds in it contain
210    only assignments to local register-type variables (that do not overwrite
211    group size information) and one OMP construct.  If so, return that
212    construct, otherwise return NULL.  GRID describes hitherto discovered
213    properties of the loop that is evaluated for possible gridification.  If
214    dumping is enabled and the function fails, use NAME to dump a note with the
215    reason for failure.  */
216 
217 static gimple *
218 grid_find_single_omp_among_assignments (gimple_seq seq, grid_prop *grid,
219 					const char *name)
220 {
221   if (!seq)
222     {
223       if (dump_enabled_p ())
224 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, grid->target_loc,
225 			 GRID_MISSED_MSG_PREFIX "%s construct has empty body\n",
226 			 name);
227       return NULL;
228     }
229 
230   gimple *ret = NULL;
231   if (grid_find_single_omp_among_assignments_1 (seq, grid, name, &ret))
232     {
233       if (!ret && dump_enabled_p ())
234 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, grid->target_loc,
235 			 GRID_MISSED_MSG_PREFIX "%s construct does not contain"
236 			 " any other OpenMP construct\n", name);
237       return ret;
238     }
239   else
240     return NULL;
241 }
242 
243 /* Walker function looking for statements that there is no point in gridifying
244    (and for noreturn function calls, which we cannot handle).  Return non-NULL
245    if such a statement is found.  */
246 
247 static tree
248 grid_find_ungridifiable_statement (gimple_stmt_iterator *gsi,
249 				   bool *handled_ops_p,
250 				   struct walk_stmt_info *wi)
251 {
252   *handled_ops_p = false;
253   gimple *stmt = gsi_stmt (*gsi);
254   switch (gimple_code (stmt))
255     {
256     case GIMPLE_CALL:
257       if (gimple_call_noreturn_p (as_a <gcall *> (stmt)))
258 	{
259 	  *handled_ops_p = true;
260 	  wi->info = stmt;
261 	  return error_mark_node;
262 	}
263       break;
264 
265     /* We may reduce the following list if we find a way to implement the
266        clauses, but for now there is no point in trying further.  */
267     case GIMPLE_OMP_CRITICAL:
268     case GIMPLE_OMP_TASKGROUP:
269     case GIMPLE_OMP_TASK:
270     case GIMPLE_OMP_SECTION:
271     case GIMPLE_OMP_SECTIONS:
272     case GIMPLE_OMP_SECTIONS_SWITCH:
273     case GIMPLE_OMP_TARGET:
274     case GIMPLE_OMP_ORDERED:
275       *handled_ops_p = true;
276       wi->info = stmt;
277       return error_mark_node;
278     default:
279       break;
280     }
281   return NULL;
282 }
283 
284 /* Examine the clauses of omp parallel statement PAR and if any of them
285    prevents gridification, issue a missed-optimization diagnostic and return
286    false, otherwise return true.  TLOC is the location of the target construct
287    used for optimization information messages.  */
288 
289 static bool
290 grid_parallel_clauses_gridifiable (gomp_parallel *par, dump_user_location_t tloc)
291 {
292   tree clauses = gimple_omp_parallel_clauses (par);
293   while (clauses)
294     {
295       switch (OMP_CLAUSE_CODE (clauses))
296 	{
297 	case OMP_CLAUSE_NUM_THREADS:
298 	  if (dump_enabled_p ())
299 	    {
300 	      dump_printf_loc (MSG_MISSED_OPTIMIZATION, tloc,
301 			       GRID_MISSED_MSG_PREFIX "there is "
302 			       "a num_threads clause of the parallel "
303 			       "construct\n");
304 	      dump_printf_loc (MSG_NOTE, par,
305 			       "Parallel construct has a num_threads clause\n");
306 	    }
307 	  return false;
308 
309 	case OMP_CLAUSE_REDUCTION:
310 	  if (dump_enabled_p ())
311 	    {
312 	      dump_printf_loc (MSG_MISSED_OPTIMIZATION, tloc,
313 			       GRID_MISSED_MSG_PREFIX "a reduction clause "
314 			       "is present\n");
315 	      dump_printf_loc (MSG_NOTE, par,
316 			       "Parallel construct has a reduction clause\n");
317 	    }
318 	  return false;
319 
320 	default:
321 	  break;
322 	}
323       clauses = OMP_CLAUSE_CHAIN (clauses);
324     }
325   return true;
326 }
327 
328 /* Examine the clauses and body of omp loop statement GFOR and if something
329    prevents gridification, issue a missed-optimization diagnostic and return
330    false, otherwise return true.  GRID describes hitherto discovered properties
331    of the loop that is evaluated for possible gridification.  */
332 
333 static bool
334 grid_inner_loop_gridifiable_p (gomp_for *gfor, grid_prop *grid)
335 {
336   if (!grid_seq_only_contains_local_assignments (gimple_omp_for_pre_body (gfor),
337 						 grid))
338     {
339       if (dump_enabled_p ())
340 	{
341 	  dump_printf_loc (MSG_MISSED_OPTIMIZATION, grid->target_loc,
342 			   GRID_MISSED_MSG_PREFIX "the inner loop "
343 			   "bounds computation contains a complex "
344 			   "statement\n");
345 	  dump_printf_loc (MSG_NOTE, gfor,
346 			   "Loop construct cannot be analyzed for "
347 			   "gridification\n");
348 	}
349       return false;
350     }
351 
352   tree clauses = gimple_omp_for_clauses (gfor);
353   while (clauses)
354     {
355       switch (OMP_CLAUSE_CODE (clauses))
356 	{
357 	case OMP_CLAUSE_SCHEDULE:
358 	  if (OMP_CLAUSE_SCHEDULE_KIND (clauses) != OMP_CLAUSE_SCHEDULE_AUTO)
359 	    {
360 	      if (dump_enabled_p ())
361 		{
362 		  dump_printf_loc (MSG_MISSED_OPTIMIZATION, grid->target_loc,
363 				   GRID_MISSED_MSG_PREFIX "the inner loop "
364 				   "has a non-automatic schedule clause\n");
365 		  dump_printf_loc (MSG_NOTE, gfor,
366 				   "Loop construct has a non-automatic "
367 				   "schedule clause\n");
368 		}
369 	      return false;
370 	    }
371 	  break;
372 
373 	case OMP_CLAUSE_REDUCTION:
374 	  if (dump_enabled_p ())
375 	    {
376 	      dump_printf_loc (MSG_MISSED_OPTIMIZATION, grid->target_loc,
377 			       GRID_MISSED_MSG_PREFIX "a reduction "
378 			       "clause is present\n");
379 	      dump_printf_loc (MSG_NOTE, gfor,
380 			       "Loop construct has a reduction "
381 			       "clause\n");
382 	    }
383 	  return false;
384 
385 	default:
386 	  break;
387 	}
388       clauses = OMP_CLAUSE_CHAIN (clauses);
389     }
390   struct walk_stmt_info wi;
391   memset (&wi, 0, sizeof (wi));
392   if (walk_gimple_seq (gimple_omp_body (gfor),
393 		       grid_find_ungridifiable_statement,
394 		       NULL, &wi))
395     {
396       gimple *bad = (gimple *) wi.info;
397       if (dump_enabled_p ())
398 	{
399 	  if (is_gimple_call (bad))
400 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, grid->target_loc,
401 			       GRID_MISSED_MSG_PREFIX "the inner loop contains "
402 			       "call to a noreturn function\n");
403 	  else
404 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, grid->target_loc,
405 			     GRID_MISSED_MSG_PREFIX "the inner loop contains "
406 			     "statement %s which cannot be transformed\n",
407 			     gimple_code_name[(int) gimple_code (bad)]);
408 	  dump_printf_loc (MSG_NOTE, bad,
409 			   "This statement cannot be analyzed for "
410 			   "gridification\n");
411 	}
412       return false;
413     }
414   return true;
415 }
416 
417 /* Given a distribute omp construct represented by DIST, which in the original
418    source forms a compound construct with a looping construct, return true if it
419    can be turned into a gridified HSA kernel.  Otherwise return false.  GRID
420    describes hitherto discovered properties of the loop that is evaluated for
421    possible gridification.  */
422 
423 static bool
424 grid_dist_follows_simple_pattern (gomp_for *dist, grid_prop *grid)
425 {
426   dump_user_location_t tloc = grid->target_loc;
427   gimple *stmt = grid_find_single_omp_among_assignments (gimple_omp_body (dist),
428 							 grid, "distribute");
429   gomp_parallel *par;
430   if (!stmt
431       || !(par = dyn_cast <gomp_parallel *> (stmt))
432       || !grid_parallel_clauses_gridifiable (par, tloc))
433     return false;
434 
435   stmt = grid_find_single_omp_among_assignments (gimple_omp_body (par), grid,
436 						 "parallel");
437   gomp_for *gfor;
438   if (!stmt || !(gfor = dyn_cast <gomp_for *> (stmt)))
439     return false;
440 
441   if (gimple_omp_for_kind (gfor) != GF_OMP_FOR_KIND_FOR)
442     {
443       if (dump_enabled_p ())
444 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, tloc,
445 			 GRID_MISSED_MSG_PREFIX "the inner loop is not "
446 			 "a simple for loop\n");
447       return false;
448     }
449   gcc_assert (gimple_omp_for_collapse (gfor) == grid->collapse);
450 
451   if (!grid_inner_loop_gridifiable_p (gfor, grid))
452     return false;
453 
454   return true;
455 }
456 
457 /* Given an omp loop statement GFOR, return true if it can participate in
458    tiling gridification, i.e. in one where the distribute and parallel for
459    loops do not form a compound statement.  GRID describes hitherto discovered
460    properties of the loop that is evaluated for possible gridification.  */
461 
462 static bool
463 grid_gfor_follows_tiling_pattern (gomp_for *gfor, grid_prop *grid)
464 {
465   if (gimple_omp_for_kind (gfor) != GF_OMP_FOR_KIND_FOR)
466     {
467       if (dump_enabled_p ())
468 	{
469 	  dump_printf_loc (MSG_MISSED_OPTIMIZATION, grid->target_loc,
470 			   GRID_MISSED_MSG_PREFIX "an inner loop is not "
471 			   "a simple for loop\n");
472 	  dump_printf_loc (MSG_NOTE, gfor,
473 			   "This statement is not a simple for loop\n");
474 	}
475       return false;
476     }
477 
478   if (!grid_inner_loop_gridifiable_p (gfor, grid))
479     return false;
480 
481   if (gimple_omp_for_collapse (gfor) != grid->collapse)
482     {
483       if (dump_enabled_p ())
484 	{
485 	  dump_printf_loc (MSG_MISSED_OPTIMIZATION, grid->target_loc,
486 			   GRID_MISSED_MSG_PREFIX "an inner loop does not "
487 			   "use the same collapse clause\n");
488 	  dump_printf_loc (MSG_NOTE, gfor,
489 			   "Loop construct uses a different collapse clause\n");
490 	}
491       return false;
492     }
493 
494   struct omp_for_data fd;
495   struct omp_for_data_loop *loops
496     = (struct omp_for_data_loop *)alloca (grid->collapse
497 					  * sizeof (struct omp_for_data_loop));
498   omp_extract_for_data (gfor, &fd, loops);
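  /* For each dimension, compute the number of iterations of the inner loop,
     i.e. (n2 - n1 + step - 1) / step for an increasing loop and the analogous
     expression for a decreasing one, and require that it matches the tile
     size recorded in grid->group_sizes by the distribute analysis.  */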
499   for (unsigned i = 0; i < grid->collapse; i++)
500     {
501       tree itype, type = TREE_TYPE (fd.loops[i].v);
502       if (POINTER_TYPE_P (type))
503 	itype = signed_type_for (type);
504       else
505 	itype = type;
506 
507       tree n1 = fold_convert (itype, fd.loops[i].n1);
508       tree n2 = fold_convert (itype, fd.loops[i].n2);
509       tree t = build_int_cst (itype,
510 			      (fd.loops[i].cond_code == LT_EXPR ? -1 : 1));
511       t = fold_build2 (PLUS_EXPR, itype, fd.loops[i].step, t);
512       t = fold_build2 (PLUS_EXPR, itype, t, n2);
513       t = fold_build2 (MINUS_EXPR, itype, t, n1);
514       if (TYPE_UNSIGNED (itype) && fd.loops[i].cond_code == GT_EXPR)
515 	t = fold_build2 (TRUNC_DIV_EXPR, itype,
516 			 fold_build1 (NEGATE_EXPR, itype, t),
517 			 fold_build1 (NEGATE_EXPR, itype, fd.loops[i].step));
518       else
519 	t = fold_build2 (TRUNC_DIV_EXPR, itype, t, fd.loops[i].step);
520 
521       if (!operand_equal_p (grid->group_sizes[i], t, 0))
522 	{
523 	  if (dump_enabled_p ())
524 	    {
525 	      dump_printf_loc (MSG_MISSED_OPTIMIZATION, grid->target_loc,
526 			       GRID_MISSED_MSG_PREFIX "the distribute and "
527 			       "an internal loop do not agree on tile size\n");
528 	      dump_printf_loc (MSG_NOTE, gfor,
529 			       "Loop construct does not seem to loop over "
530 			       "a tile size\n");
531 	    }
532 	  return false;
533 	}
534     }
535   return true;
536 }
537 
538 /* Facing a call to FNDECL in the body of a distribute construct, return true
539    if we can handle it or false if it precludes gridification.  */
540 
541 static bool
542 grid_call_permissible_in_distribute_p (tree fndecl)
543 {
544   if (DECL_PURE_P (fndecl) || TREE_READONLY (fndecl))
545     return true;
546 
547   const char *name = IDENTIFIER_POINTER (DECL_NAME (fndecl));
548   if (strstr (name, "omp_") != name)
549     return false;
550 
551   if ((strcmp (name, "omp_get_thread_num") == 0)
552       || (strcmp (name, "omp_get_num_threads") == 0)
553       || (strcmp (name, "omp_get_num_teams") == 0)
554       || (strcmp (name, "omp_get_team_num") == 0)
555       || (strcmp (name, "omp_get_level") == 0)
556       || (strcmp (name, "omp_get_active_level") == 0)
557       || (strcmp (name, "omp_in_parallel") == 0))
558     return true;
559 
560   return false;
561 }
562 
563 /* Facing a call satisfying grid_call_permissible_in_distribute_p in the body
564    of a distribute construct that is pointed at by GSI, modify it as necessary
565    for gridification.  If the statement itself got removed, return true.  */
566 
567 static bool
568 grid_handle_call_in_distribute (gimple_stmt_iterator *gsi)
569 {
570   gimple *stmt = gsi_stmt (*gsi);
571   tree fndecl = gimple_call_fndecl (stmt);
572   gcc_checking_assert (fndecl);
573   if (DECL_PURE_P (fndecl) || TREE_READONLY (fndecl))
574     return false;
575 
576   const char *name = IDENTIFIER_POINTER (DECL_NAME (fndecl));
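  /* These queries have known constant results in the distribute part of a
     gridified kernel (a single thread, not inside a parallel region), so
     replace the call with an assignment of zero to its LHS, if there is
     one.  */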
577   if ((strcmp (name, "omp_get_thread_num") == 0)
578       || (strcmp (name, "omp_get_level") == 0)
579       || (strcmp (name, "omp_get_active_level") == 0)
580       || (strcmp (name, "omp_in_parallel") == 0))
581     {
582       tree lhs = gimple_call_lhs (stmt);
583       if (lhs)
584 	{
585 	  gassign *assign
586 	    = gimple_build_assign (lhs, build_zero_cst (TREE_TYPE (lhs)));
587 	  gsi_insert_before (gsi, assign, GSI_SAME_STMT);
588 	}
589       gsi_remove (gsi, true);
590       return true;
591     }
592 
593   /* The rest of the omp functions can stay as they are, HSA back-end will
594      handle them correctly.  */
595   gcc_checking_assert ((strcmp (name, "omp_get_num_threads") == 0)
596 		       || (strcmp (name, "omp_get_num_teams") == 0)
597 		       || (strcmp (name, "omp_get_team_num") == 0));
598   return false;
599 }
600 
601 /* Given a sequence of statements within a distribute omp construct or a
602    parallel construct, which in the original source does not form a compound
603    construct with a looping construct, return true if it does not prevent us
604    from turning it into a gridified HSA kernel.  Otherwise return false.  GRID
605    describes hitherto discovered properties of the loop that is evaluated for
606    possible gridification.  IN_PARALLEL must be true if SEQ is within a
607    parallel construct and false if it is only within a distribute
608    construct.  */
609 
610 static bool
611 grid_dist_follows_tiling_pattern (gimple_seq seq, grid_prop *grid,
612 				  bool in_parallel)
613 {
614   gimple_stmt_iterator gsi;
615   for (gsi = gsi_start (seq); !gsi_end_p (gsi); gsi_next (&gsi))
616     {
617       gimple *stmt = gsi_stmt (gsi);
618 
619       if (grid_safe_assignment_p (stmt, grid)
620 	  || gimple_code (stmt) == GIMPLE_GOTO
621 	  || gimple_code (stmt) == GIMPLE_LABEL
622 	  || gimple_code (stmt) == GIMPLE_COND)
623 	continue;
624       else if (gbind *bind = dyn_cast <gbind *> (stmt))
625 	{
626 	  if (!grid_dist_follows_tiling_pattern (gimple_bind_body (bind),
627 						 grid, in_parallel))
628 	    return false;
629 	  continue;
630 	}
631       else if (gtry *try_stmt = dyn_cast <gtry *> (stmt))
632 	{
633 	  if (gimple_try_kind (try_stmt) == GIMPLE_TRY_CATCH)
634 	    {
635 	      if (dump_enabled_p ())
636 		{
637 		  dump_printf_loc (MSG_MISSED_OPTIMIZATION, grid->target_loc,
638 				   GRID_MISSED_MSG_PREFIX "the distribute "
639 				   "construct contains a try..catch region\n");
640 		  dump_printf_loc (MSG_NOTE, try_stmt,
641 				   "This statement cannot be analyzed for "
642 				   "tiled gridification\n");
643 		}
644 	      return false;
645 	    }
646 	  if (!grid_dist_follows_tiling_pattern (gimple_try_eval (try_stmt),
647 						 grid, in_parallel))
648 	    return false;
649 	  if (!grid_dist_follows_tiling_pattern (gimple_try_cleanup (try_stmt),
650 						 grid, in_parallel))
651 	    return false;
652 	  continue;
653 	}
654       else if (is_gimple_call (stmt))
655 	{
656 	  tree fndecl = gimple_call_fndecl (stmt);
657 	  if (fndecl && grid_call_permissible_in_distribute_p (fndecl))
658 	    continue;
659 
660 	  if (dump_enabled_p ())
661 	    {
662 	      dump_printf_loc (MSG_MISSED_OPTIMIZATION, grid->target_loc,
663 			       GRID_MISSED_MSG_PREFIX "the distribute "
664 			       "construct contains a call\n");
665 	      dump_printf_loc (MSG_NOTE, stmt,
666 			       "This statement cannot be analyzed for "
667 			       "tiled gridification\n");
668 	    }
669 	  return false;
670 	}
671       else if (gomp_parallel *par = dyn_cast <gomp_parallel *> (stmt))
672 	{
673 	  if (in_parallel)
674 	    {
675 	      if (dump_enabled_p ())
676 		{
677 		  dump_printf_loc (MSG_MISSED_OPTIMIZATION, grid->target_loc,
678 				   GRID_MISSED_MSG_PREFIX "a parallel "
679 				   "construct contains another parallel "
680 				   "construct\n");
681 		  dump_printf_loc (MSG_NOTE, stmt,
682 				   "This parallel construct is nested in "
683 				   "another one\n");
684 		}
685 	      return false;
686 	    }
687 	  if (!grid_parallel_clauses_gridifiable (par, grid->target_loc)
688 	      || !grid_dist_follows_tiling_pattern (gimple_omp_body (par),
689 						    grid, true))
690 	    return false;
691 	}
692       else if (gomp_for *gfor = dyn_cast <gomp_for *> (stmt))
693 	{
694 	  if (!in_parallel)
695 	    {
696 	      if (dump_enabled_p ())
697 		{
698 		  dump_printf_loc (MSG_MISSED_OPTIMIZATION, grid->target_loc,
699 				   GRID_MISSED_MSG_PREFIX "a loop "
700 				   "construct is not nested within a parallel "
701 				   "construct\n");
702 		  dump_printf_loc (MSG_NOTE, stmt,
703 				   "This loop construct is not nested in "
704 				   "a parallel construct\n");
705 		}
706 	      return false;
707 	    }
708 	  if (!grid_gfor_follows_tiling_pattern (gfor, grid))
709 	    return false;
710 	}
711       else
712 	{
713 	  if (dump_enabled_p ())
714 	    {
715 	      dump_printf_loc (MSG_MISSED_OPTIMIZATION, grid->target_loc,
716 			       GRID_MISSED_MSG_PREFIX "the distribute "
717 			       "construct contains a complex statement\n");
718 	      dump_printf_loc (MSG_NOTE, stmt,
719 			       "This statement cannot be analyzed for "
720 			       "tiled gridification\n");
721 	    }
722 	  return false;
723 	}
724     }
725   return true;
726 }
727 
728 /* If TARGET follows a pattern that can be turned into a gridified HSA kernel,
729    return true, otherwise return false.  In the case of success, also fill in
730    GRID with information describing the kernel grid.  */
731 
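/* For example, a target region of the following overall shape (a minimal
   sketch; default data-sharing clauses are assumed) matches the simple,
   combined pattern:

     #pragma omp target
     #pragma omp teams
     #pragma omp distribute parallel for
     for (int i = 0; i < n; i++)
       a[i] = b[i] + c[i];

   while a separate distribute loop whose body contains a parallel construct
   with a for loop iterating over one tile of the distribute chunk matches
   the tiling pattern.  */
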
732 static bool
733 grid_target_follows_gridifiable_pattern (gomp_target *target, grid_prop *grid)
734 {
735   if (gimple_omp_target_kind (target) != GF_OMP_TARGET_KIND_REGION)
736     return false;
737 
738   dump_user_location_t tloc = target;
739   grid->target_loc = tloc;
740   gimple *stmt
741     = grid_find_single_omp_among_assignments (gimple_omp_body (target),
742 					      grid, "target");
743   if (!stmt)
744     return false;
745   gomp_teams *teams = dyn_cast <gomp_teams *> (stmt);
746   tree group_size = NULL;
747   if (!teams)
748     {
749       if (dump_enabled_p ())
750 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, tloc,
751 			 GRID_MISSED_MSG_PREFIX "it does not have a sole "
752 			 "teams construct in it.\n");
753       return false;
754     }
755 
756   tree clauses = gimple_omp_teams_clauses (teams);
757   while (clauses)
758     {
759       switch (OMP_CLAUSE_CODE (clauses))
760 	{
761 	case OMP_CLAUSE_NUM_TEAMS:
762 	  if (dump_enabled_p ())
763 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, tloc,
764 			     GRID_MISSED_MSG_PREFIX "the teams construct "
765 			     "contains a num_teams clause\n");
766 	  return false;
767 
768 	case OMP_CLAUSE_REDUCTION:
769 	  if (dump_enabled_p ())
770 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, tloc,
771 			     GRID_MISSED_MSG_PREFIX "a reduction "
772 			     "clause is present\n");
773 	  return false;
774 
775 	case OMP_CLAUSE_THREAD_LIMIT:
776 	  if (!integer_zerop (OMP_CLAUSE_OPERAND (clauses, 0)))
777 	    group_size = OMP_CLAUSE_OPERAND (clauses, 0);
778 	  break;
779 
780 	default:
781 	  break;
782 	}
783       clauses = OMP_CLAUSE_CHAIN (clauses);
784     }
785 
786   stmt = grid_find_single_omp_among_assignments (gimple_omp_body (teams), grid,
787 						 "teams");
788   if (!stmt)
789     return false;
790   gomp_for *dist = dyn_cast <gomp_for *> (stmt);
791   if (!dist)
792     {
793       if (dump_enabled_p ())
794 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, tloc,
795 			 GRID_MISSED_MSG_PREFIX "the teams construct does not "
796 			 "have a single distribute construct in it.\n");
797       return false;
798     }
799 
800   gcc_assert (gimple_omp_for_kind (dist) == GF_OMP_FOR_KIND_DISTRIBUTE);
801 
802   grid->collapse = gimple_omp_for_collapse (dist);
803   if (grid->collapse > 3)
804     {
805       if (dump_enabled_p ())
806 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, tloc,
807 			 GRID_MISSED_MSG_PREFIX "the distribute construct "
808 			 "contains collapse clause with parameter greater "
809 			 "than 3\n");
810       return false;
811     }
812 
813   struct omp_for_data fd;
814   struct omp_for_data_loop *dist_loops
815     = (struct omp_for_data_loop *)alloca (grid->collapse
816 					  * sizeof (struct omp_for_data_loop));
817   omp_extract_for_data (dist, &fd, dist_loops);
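  /* A dist_schedule chunk size, if present, must agree with any thread_limit
     clause because both would determine the work-group size.  */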
818   if (fd.chunk_size)
819     {
820       if (group_size && !operand_equal_p (group_size, fd.chunk_size, 0))
821 	{
822 	  if (dump_enabled_p ())
823 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, tloc,
824 			     GRID_MISSED_MSG_PREFIX "the teams "
825 			     "thread limit is different from distribute "
826 			     "schedule chunk\n");
827 	  return false;
828 	}
829       group_size = fd.chunk_size;
830     }
831   if (group_size && grid->collapse > 1)
832     {
833       if (dump_enabled_p ())
834 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, tloc,
835 			 GRID_MISSED_MSG_PREFIX "group size cannot be "
836 			 "set using thread_limit or schedule clauses "
837 			 "when also using a collapse clause greater than 1\n");
838       return false;
839     }
840 
841   if (gimple_omp_for_combined_p (dist))
842     {
843       grid->tiling = false;
844       grid->group_sizes[0] = group_size;
845       for (unsigned i = 1; i < grid->collapse; i++)
846 	grid->group_sizes[i] = NULL;
847       return grid_dist_follows_simple_pattern (dist, grid);
848     }
849   else
850     {
851       grid->tiling = true;
852       if (group_size)
853 	{
854 	  if (dump_enabled_p ())
855 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, tloc,
856 			     GRID_MISSED_MSG_PREFIX "group size cannot be set "
857 			     "using thread_limit or schedule clauses when "
858 			     "distribute and loop constructs do not form "
859 			     "one combined construct\n");
860 	  return false;
861 	}
862       for (unsigned i = 0; i < grid->collapse; i++)
863 	{
864 	  if (fd.loops[i].cond_code == GT_EXPR)
865 	    grid->group_sizes[i] = fold_build1 (NEGATE_EXPR,
866 						TREE_TYPE (fd.loops[i].step),
867 						fd.loops[i].step);
868 	  else
869 	    grid->group_sizes[i] = fd.loops[i].step;
870 	}
871       return grid_dist_follows_tiling_pattern (gimple_omp_body (dist), grid,
872 					       false);
873     }
874 }
875 
876 /* Operand walker, used to remap pre-body declarations according to a hash map
877    provided in DATA.  */
878 
879 static tree
880 grid_remap_prebody_decls (tree *tp, int *walk_subtrees, void *data)
881 {
882   tree t = *tp;
883 
884   if (DECL_P (t) || TYPE_P (t))
885     *walk_subtrees = 0;
886   else
887     *walk_subtrees = 1;
888 
889   if (VAR_P (t))
890     {
891       struct walk_stmt_info *wi = (struct walk_stmt_info *) data;
892       hash_map<tree, tree> *declmap = (hash_map<tree, tree> *) wi->info;
893       tree *repl = declmap->get (t);
894       if (repl)
895 	*tp = *repl;
896     }
897   return NULL_TREE;
898 }
899 
900 /* Identifiers of segments into which a particular variable should be placed
901    when gridifying.  */
902 
903 enum grid_var_segment {GRID_SEGMENT_PRIVATE, GRID_SEGMENT_GROUP,
904 		       GRID_SEGMENT_GLOBAL};
905 
906 /* Mark VAR so that it is eventually placed into SEGMENT.  Variables that are
907    not addressable are left alone because they can simply remain private to
908    each work-item.  */
909 
910 static void
911 grid_mark_variable_segment (tree var, enum grid_var_segment segment)
912 {
913   /* Moving a non-addressable variable into another segment would require that
914      we re-gimplify all its uses.  Fortunately, we do not have to do this
915      because if it is not addressable, it is not used in atomic or parallel
916      statements and so relaxed GPU consistency rules mean we can just keep it
917      private.  */
918   if (!TREE_ADDRESSABLE (var))
919     return;
920 
921   switch (segment)
922     {
923     case GRID_SEGMENT_GROUP:
924       DECL_ATTRIBUTES (var) = tree_cons (get_identifier ("hsa_group_segment"),
925 					 NULL, DECL_ATTRIBUTES (var));
926       break;
927     case GRID_SEGMENT_GLOBAL:
928       DECL_ATTRIBUTES (var) = tree_cons (get_identifier ("hsa_global_segment"),
929 					 NULL, DECL_ATTRIBUTES (var));
930       break;
931     default:
932       gcc_unreachable ();
933     }
934 
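  /* Variables placed in a non-private segment must become static so that the
     HSA back end can allocate them in the requested segment; give them a
     unique assembler name and register them with the symbol table.  */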
935   if (!TREE_STATIC (var))
936     {
937       TREE_STATIC (var) = 1;
938       const char *prefix = IDENTIFIER_POINTER (DECL_NAME (var));
939       SET_DECL_ASSEMBLER_NAME (var, create_tmp_var_name (prefix));
940       varpool_node::finalize_decl (var);
941     }
942 
943 }
944 
945 /* Copy leading register-type assignments to local variables in SRC to just
946    before DST, creating temporaries, adjusting the mapping of operands in WI
947    and remapping operands as necessary.  Add any new temporaries to TGT_BIND.
948    Return the first statement that does not conform to grid_safe_assignment_p
949    or NULL.  If VAR_SEGMENT is not GRID_SEGMENT_PRIVATE, also mark all
950    variables in traversed bind statements so that they are put into the
951    appropriate segment.  */
952 
953 static gimple *
954 grid_copy_leading_local_assignments (gimple_seq src, gimple_stmt_iterator *dst,
955 				     gbind *tgt_bind,
956 				     enum grid_var_segment var_segment,
957 				     struct walk_stmt_info *wi)
958 {
959   hash_map<tree, tree> *declmap = (hash_map<tree, tree> *) wi->info;
960   gimple_stmt_iterator gsi;
961   for (gsi = gsi_start (src); !gsi_end_p (gsi); gsi_next (&gsi))
962     {
963       gimple *stmt = gsi_stmt (gsi);
964       if (gbind *bind = dyn_cast <gbind *> (stmt))
965 	{
966 	  gimple *r = grid_copy_leading_local_assignments
967 	    (gimple_bind_body (bind), dst, tgt_bind, var_segment, wi);
968 
969 	  if (var_segment != GRID_SEGMENT_PRIVATE)
970 	    for (tree var = gimple_bind_vars (bind);
971 		 var;
972 		 var = DECL_CHAIN (var))
973 	      grid_mark_variable_segment (var, var_segment);
974 	  if (r)
975 	    return r;
976 	  else
977 	    continue;
978 	}
979       if (!grid_safe_assignment_p (stmt, NULL))
980 	return stmt;
981       tree lhs = gimple_assign_lhs (as_a <gassign *> (stmt));
982       tree repl = copy_var_decl (lhs, create_tmp_var_name (NULL),
983 				 TREE_TYPE (lhs));
984       DECL_CONTEXT (repl) = current_function_decl;
985       gimple_bind_append_vars (tgt_bind, repl);
986 
987       declmap->put (lhs, repl);
988       gassign *copy = as_a <gassign *> (gimple_copy (stmt));
989       walk_gimple_op (copy, grid_remap_prebody_decls, wi);
990       gsi_insert_before (dst, copy, GSI_SAME_STMT);
991     }
992   return NULL;
993 }
994 
995 /* Statement walker function to make adjustments to statements within the
996    gridified kernel copy.  */
997 
998 static tree
999 grid_process_grid_body (gimple_stmt_iterator *gsi, bool *handled_ops_p,
1000 			struct walk_stmt_info *)
1001 {
1002   *handled_ops_p = false;
1003   gimple *stmt = gsi_stmt (*gsi);
1004   if (gimple_code (stmt) == GIMPLE_OMP_FOR
1005       && gimple_omp_for_kind (stmt) == GF_OMP_FOR_KIND_SIMD)
1006   {
1007     gomp_for *loop = as_a <gomp_for *> (stmt);
1008     tree clauses = gimple_omp_for_clauses (loop);
1009     tree cl = omp_find_clause (clauses, OMP_CLAUSE_SAFELEN);
1010     if (cl)
1011       OMP_CLAUSE_SAFELEN_EXPR (cl) = integer_one_node;
1012     else
1013       {
1014 	tree c = build_omp_clause (UNKNOWN_LOCATION, OMP_CLAUSE_SAFELEN);
1015 	OMP_CLAUSE_SAFELEN_EXPR (c) = integer_one_node;
1016 	OMP_CLAUSE_CHAIN (c) = clauses;
1017 	gimple_omp_for_set_clauses (loop, c);
1018       }
1019   }
1020   return NULL_TREE;
1021 }
1022 
1023 /* Given a PARLOOP that is a normal for loop construct but also a part of a
1024    combined construct with a simd loop, eliminate the simd loop.  */
1025 
1026 static void
1027 grid_eliminate_combined_simd_part (gomp_for *parloop)
1028 {
1029   struct walk_stmt_info wi;
1030 
1031   memset (&wi, 0, sizeof (wi));
1032   wi.val_only = true;
1033   enum gf_mask msk = GF_OMP_FOR_KIND_SIMD;
1034   wi.info = (void *) &msk;
1035   walk_gimple_seq (gimple_omp_body (parloop), omp_find_combined_for, NULL, &wi);
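  /* omp_find_combined_for stores the simd loop it finds back into wi.info.  */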
1036   gimple *stmt = (gimple *) wi.info;
1037   /* We expect that the SIMD is the only statement in the parallel loop.  */
1038   gcc_assert (stmt
1039 	      && gimple_code (stmt) == GIMPLE_OMP_FOR
1040 	      && (gimple_omp_for_kind (stmt) == GF_OMP_FOR_KIND_SIMD)
1041 	      && gimple_omp_for_combined_into_p (stmt)
1042 	      && !gimple_omp_for_combined_p (stmt));
1043   gomp_for *simd = as_a <gomp_for *> (stmt);
1044 
1045   /* Copy over the iteration properties because the body refers to the index in
1046      the bottom-most loop.  */
1047   unsigned i, collapse = gimple_omp_for_collapse (parloop);
1048   gcc_checking_assert (collapse == gimple_omp_for_collapse (simd));
1049   for (i = 0; i < collapse; i++)
1050     {
1051       gimple_omp_for_set_index (parloop, i, gimple_omp_for_index (simd, i));
1052       gimple_omp_for_set_initial (parloop, i, gimple_omp_for_initial (simd, i));
1053       gimple_omp_for_set_final (parloop, i, gimple_omp_for_final (simd, i));
1054       gimple_omp_for_set_incr (parloop, i, gimple_omp_for_incr (simd, i));
1055     }
1056 
1057   tree *tgt = gimple_omp_for_clauses_ptr (parloop);
1058   while (*tgt)
1059     tgt = &OMP_CLAUSE_CHAIN (*tgt);
1060 
1061   /* Copy over all clauses, except for linear clauses, which are turned into
1062      private clauses, and all other simd-specific clauses, which are
1063      ignored.  */
1064   tree *pc = gimple_omp_for_clauses_ptr (simd);
1065   while (*pc)
1066     {
1067       tree c = *pc;
1068       switch (OMP_CLAUSE_CODE (c))
1069 	{
1070 	case OMP_CLAUSE_LINEAR:
1071 	  {
1072 	    tree priv = build_omp_clause (UNKNOWN_LOCATION, OMP_CLAUSE_PRIVATE);
1073 	    OMP_CLAUSE_DECL (priv) = OMP_CLAUSE_DECL (c);
1074 	    OMP_CLAUSE_CHAIN (priv) = NULL;
1075 	    *tgt = priv;
1076 	    tgt = &OMP_CLAUSE_CHAIN (priv);
1077 	    pc = &OMP_CLAUSE_CHAIN (c);
1078 	    break;
1079 	  }
1080 
1081 	case OMP_CLAUSE_SAFELEN:
1082 	case OMP_CLAUSE_SIMDLEN:
1083 	case OMP_CLAUSE_ALIGNED:
1084 	  pc = &OMP_CLAUSE_CHAIN (c);
1085 	  break;
1086 
1087 	default:
1088 	  *pc = OMP_CLAUSE_CHAIN (c);
1089 	  OMP_CLAUSE_CHAIN (c) = NULL;
1090 	  *tgt = c;
1091 	  tgt = &OMP_CLAUSE_CHAIN (c);
1092 	  break;
1093 	}
1094     }
1095 
1096   /* Finally, throw away the simd and mark the parallel loop as not
1097      combined.  */
1098   gimple_omp_set_body (parloop, gimple_omp_body (simd));
1099   gimple_omp_for_set_combined_p (parloop, false);
1100 }
1101 
1102 /* Statement walker function marking all loops as grid ones representing
1103    threads of a particular thread group.  */
1104 
1105 static tree
1106 grid_mark_tiling_loops (gimple_stmt_iterator *gsi, bool *handled_ops_p,
1107 			struct walk_stmt_info *wi_in)
1108 {
1109   *handled_ops_p = false;
1110   if (gomp_for *loop = dyn_cast <gomp_for *> (gsi_stmt (*gsi)))
1111     {
1112       *handled_ops_p = true;
1113       gimple_omp_for_set_kind (loop, GF_OMP_FOR_KIND_GRID_LOOP);
1114       gimple_omp_for_set_grid_intra_group (loop, true);
1115       if (gimple_omp_for_combined_p (loop))
1116 	grid_eliminate_combined_simd_part (loop);
1117 
1118       struct walk_stmt_info body_wi;
1119       memset (&body_wi, 0, sizeof (body_wi));
1120       walk_gimple_seq_mod (gimple_omp_body_ptr (loop),
1121 			   grid_process_grid_body, NULL, &body_wi);
1122 
1123       gbind *bind = (gbind *) wi_in->info;
1124       tree c;
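      /* For each lastprivate clause, create a copy of the variable in the
	 group segment, store the final value into it at the end of the loop
	 and copy it back to the original variable after the loop, so that the
	 value is visible in the enclosing group-level context.  */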
1125       for (c = gimple_omp_for_clauses (loop); c; c = OMP_CLAUSE_CHAIN (c))
1126 	if (OMP_CLAUSE_CODE (c) == OMP_CLAUSE_LASTPRIVATE)
1127 	  {
1128 	    push_gimplify_context ();
1129 	    tree ov = OMP_CLAUSE_DECL (c);
1130 	    tree gv = copy_var_decl (ov, create_tmp_var_name (NULL),
1131 				    TREE_TYPE (ov));
1132 
1133 	    grid_mark_variable_segment (gv, GRID_SEGMENT_GROUP);
1134 	    DECL_CONTEXT (gv) = current_function_decl;
1135 	    gimple_bind_append_vars (bind, gv);
1136 	    tree x = lang_hooks.decls.omp_clause_assign_op (c, gv, ov);
1137 	    gimplify_and_add (x, &OMP_CLAUSE_LASTPRIVATE_GIMPLE_SEQ (c));
1138 	    x = lang_hooks.decls.omp_clause_copy_ctor (c, ov, gv);
1139 	    gimple_seq l = NULL;
1140 	    gimplify_and_add (x, &l);
1141 	    gsi_insert_seq_after (gsi, l, GSI_SAME_STMT);
1142 	    pop_gimplify_context (bind);
1143 	  }
1144     }
1145   return NULL_TREE;
1146 }
1147 
1148 /* Statement walker function marking all parallels as grid_phony and loops as
1149    grid ones representing threads of a particular thread group.  */
1150 
1151 static tree
1152 grid_mark_tiling_parallels_and_loops (gimple_stmt_iterator *gsi,
1153 				      bool *handled_ops_p,
1154 				      struct walk_stmt_info *wi_in)
1155 {
1156   *handled_ops_p = false;
1157   wi_in->removed_stmt = false;
1158   gimple *stmt = gsi_stmt (*gsi);
1159   if (gbind *bind = dyn_cast <gbind *> (stmt))
1160     {
1161       for (tree var = gimple_bind_vars (bind); var; var = DECL_CHAIN (var))
1162 	grid_mark_variable_segment (var, GRID_SEGMENT_GROUP);
1163     }
1164   else if (gomp_parallel *parallel = dyn_cast <gomp_parallel *> (stmt))
1165     {
1166       *handled_ops_p = true;
1167       gimple_omp_parallel_set_grid_phony (parallel, true);
1168 
1169       gbind *new_bind = gimple_build_bind (NULL, NULL, make_node (BLOCK));
1170       gimple_bind_set_body (new_bind, gimple_omp_body (parallel));
1171       gimple_seq s = NULL;
1172       gimple_seq_add_stmt (&s, new_bind);
1173       gimple_omp_set_body (parallel, s);
1174 
1175       struct walk_stmt_info wi_par;
1176       memset (&wi_par, 0, sizeof (wi_par));
1177       wi_par.info = new_bind;
1178       walk_gimple_seq_mod (gimple_bind_body_ptr (new_bind),
1179 			   grid_mark_tiling_loops, NULL, &wi_par);
1180     }
1181   else if (is_a <gcall *> (stmt))
1182     wi_in->removed_stmt = grid_handle_call_in_distribute (gsi);
1183   return NULL_TREE;
1184 }
1185 
1186 /* Given the freshly copied top level kernel SEQ, identify the individual OMP
1187    components, mark them as part of the kernel, copy assignments leading to
1188    them to just before DST, remapping them using WI and adding new temporaries
1189    to TGT_BIND, and return the loop that will be used for kernel dispatch.  */
1190 
1191 static gomp_for *
1192 grid_process_kernel_body_copy (grid_prop *grid, gimple_seq seq,
1193 			       gimple_stmt_iterator *dst,
1194 			       gbind *tgt_bind, struct walk_stmt_info *wi)
1195 {
1196   gimple *stmt = grid_copy_leading_local_assignments (seq, dst, tgt_bind,
1197 						      GRID_SEGMENT_GLOBAL, wi);
1198   gomp_teams *teams = dyn_cast <gomp_teams *> (stmt);
1199   gcc_assert (teams);
1200   gimple_omp_teams_set_grid_phony (teams, true);
1201   stmt = grid_copy_leading_local_assignments (gimple_omp_body (teams), dst,
1202 					      tgt_bind, GRID_SEGMENT_GLOBAL,
1203 					      wi);
1204   gcc_checking_assert (stmt);
1205   gomp_for *dist = dyn_cast <gomp_for *> (stmt);
1206   gcc_assert (dist);
1207   gimple_seq prebody = gimple_omp_for_pre_body (dist);
1208   if (prebody)
1209     grid_copy_leading_local_assignments (prebody, dst, tgt_bind,
1210 					 GRID_SEGMENT_GROUP, wi);
1211 
1212   if (grid->tiling)
1213     {
1214       gimple_omp_for_set_kind (dist, GF_OMP_FOR_KIND_GRID_LOOP);
1215       gimple_omp_for_set_grid_group_iter (dist, true);
1216 
1217       struct walk_stmt_info wi_tiled;
1218       memset (&wi_tiled, 0, sizeof (wi_tiled));
1219       walk_gimple_seq_mod (gimple_omp_body_ptr (dist),
1220 			   grid_mark_tiling_parallels_and_loops, NULL,
1221 			   &wi_tiled);
1222       return dist;
1223     }
1224   else
1225     {
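      /* In the combined (non-tiling) case, the distribute and parallel
	 statements become phony, i.e. they are essentially skipped when the
	 kernel copy is lowered, and the innermost for loop is what the kernel
	 grid is built around.  */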
1226       gimple_omp_for_set_grid_phony (dist, true);
1227       stmt = grid_copy_leading_local_assignments (gimple_omp_body (dist), dst,
1228 						  tgt_bind,
1229 						  GRID_SEGMENT_PRIVATE, wi);
1230       gcc_checking_assert (stmt);
1231       gomp_parallel *parallel = as_a <gomp_parallel *> (stmt);
1232       gimple_omp_parallel_set_grid_phony (parallel, true);
1233       stmt = grid_copy_leading_local_assignments (gimple_omp_body (parallel),
1234 						  dst, tgt_bind,
1235 						  GRID_SEGMENT_PRIVATE, wi);
1236       gomp_for *inner_loop = as_a <gomp_for *> (stmt);
1237       gimple_omp_for_set_kind (inner_loop, GF_OMP_FOR_KIND_GRID_LOOP);
1238       prebody = gimple_omp_for_pre_body (inner_loop);
1239       if (prebody)
1240 	grid_copy_leading_local_assignments (prebody, dst, tgt_bind,
1241 					     GRID_SEGMENT_PRIVATE, wi);
1242 
1243       if (gimple_omp_for_combined_p (inner_loop))
1244 	grid_eliminate_combined_simd_part (inner_loop);
1245       struct walk_stmt_info body_wi;
1246       memset (&body_wi, 0, sizeof (body_wi));
1247       walk_gimple_seq_mod (gimple_omp_body_ptr (inner_loop),
1248 			   grid_process_grid_body, NULL, &body_wi);
1249 
1250       return inner_loop;
1251     }
1252 }
1253 
1254 /* If TARGET points to a GOMP_TARGET which follows a gridifiable pattern,
1255    create a GPU kernel for it.  GSI must point to the same statement, TGT_BIND
1256    is the bind into which temporaries inserted before TARGET should be
1257    added.  */
1258 
1259 static void
1260 grid_attempt_target_gridification (gomp_target *target,
1261 				   gimple_stmt_iterator *gsi,
1262 				   gbind *tgt_bind)
1263 {
1265   grid_prop grid = {};
1266   if (!target || !grid_target_follows_gridifiable_pattern (target, &grid))
1267     return;
1268 
1269   location_t loc = gimple_location (target);
1270   if (dump_enabled_p ())
1271     dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, target,
1272 		     "Target construct will be turned into a gridified HSA "
1273 		     "kernel\n");
1274 
1275   /* Copy target body to a GPUKERNEL construct:  */
1276   gimple_seq kernel_seq = copy_gimple_seq_and_replace_locals
1277     (gimple_omp_body (target));
1278 
1279   hash_map<tree, tree> *declmap = new hash_map<tree, tree>;
1280   struct walk_stmt_info wi;
1281   memset (&wi, 0, sizeof (struct walk_stmt_info));
1282   wi.info = declmap;
1283 
1284   /* Copy assignments in between OMP statements before target, mark OMP
1285      statements within copy appropriately.  */
1286   gomp_for *inner_loop = grid_process_kernel_body_copy (&grid, kernel_seq, gsi,
1287 							tgt_bind, &wi);
1288 
1289   gbind *old_bind
1290     = as_a <gbind *> (gimple_seq_first (gimple_omp_body (target)));
1291   gbind *new_bind = as_a <gbind *> (gimple_seq_first (kernel_seq));
1292   tree new_block = gimple_bind_block (new_bind);
1293   tree enc_block = BLOCK_SUPERCONTEXT (gimple_bind_block (old_bind));
1294   BLOCK_CHAIN (new_block) = BLOCK_SUBBLOCKS (enc_block);
1295   BLOCK_SUBBLOCKS (enc_block) = new_block;
1296   BLOCK_SUPERCONTEXT (new_block) = enc_block;
1297   gimple *gpukernel = gimple_build_omp_grid_body (kernel_seq);
1298   gimple_seq_add_stmt
1299     (gimple_bind_body_ptr (as_a <gbind *> (gimple_omp_body (target))),
1300      gpukernel);
1301 
1302   for (size_t i = 0; i < grid.collapse; i++)
1303     walk_tree (&grid.group_sizes[i], grid_remap_prebody_decls, &wi, NULL);
1304   push_gimplify_context ();
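  /* For each collapsed dimension, compute the total number of iterations of
     the inner loop (scaled by the step when tiling, because the grid is then
     expressed in units of the original iteration space), gimplify it together
     with the requested group size, and record both in an artificial _griddim_
     clause attached to the target statement.  */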
1305   for (size_t i = 0; i < grid.collapse; i++)
1306     {
1307       tree index_var = gimple_omp_for_index (inner_loop, i);
1308       tree itype, type = TREE_TYPE (index_var);
1309       if (POINTER_TYPE_P (type))
1310 	itype = signed_type_for (type);
1311       else
1312 	itype = type;
1313 
1314       enum tree_code cond_code = gimple_omp_for_cond (inner_loop, i);
1315       tree n1 = unshare_expr (gimple_omp_for_initial (inner_loop, i));
1316       walk_tree (&n1, grid_remap_prebody_decls, &wi, NULL);
1317       tree n2 = unshare_expr (gimple_omp_for_final (inner_loop, i));
1318       walk_tree (&n2, grid_remap_prebody_decls, &wi, NULL);
1319       tree step
1320 	= omp_get_for_step_from_incr (loc, gimple_omp_for_incr (inner_loop, i));
1321       omp_adjust_for_condition (loc, &cond_code, &n2, index_var, step);
1322       n1 = fold_convert (itype, n1);
1323       n2 = fold_convert (itype, n2);
1324 
1325       tree cond = fold_build2 (cond_code, boolean_type_node, n1, n2);
1326 
1327       tree t = build_int_cst (itype, (cond_code == LT_EXPR ? -1 : 1));
1328       t = fold_build2 (PLUS_EXPR, itype, step, t);
1329       t = fold_build2 (PLUS_EXPR, itype, t, n2);
1330       t = fold_build2 (MINUS_EXPR, itype, t, n1);
1331       if (TYPE_UNSIGNED (itype) && cond_code == GT_EXPR)
1332 	t = fold_build2 (TRUNC_DIV_EXPR, itype,
1333 			 fold_build1 (NEGATE_EXPR, itype, t),
1334 			 fold_build1 (NEGATE_EXPR, itype, step));
1335       else
1336 	t = fold_build2 (TRUNC_DIV_EXPR, itype, t, step);
1337       t = fold_build3 (COND_EXPR, itype, cond, t, build_zero_cst (itype));
1338       if (grid.tiling)
1339 	{
1340 	  if (cond_code == GT_EXPR)
1341 	    step = fold_build1 (NEGATE_EXPR, itype, step);
1342 	  t = fold_build2 (MULT_EXPR, itype, t, step);
1343 	}
1344 
1345       tree gs = fold_convert (uint32_type_node, t);
1346       gimple_seq tmpseq = NULL;
1347       gimplify_expr (&gs, &tmpseq, NULL, is_gimple_val, fb_rvalue);
1348       if (!gimple_seq_empty_p (tmpseq))
1349 	gsi_insert_seq_before (gsi, tmpseq, GSI_SAME_STMT);
1350 
1351       tree ws;
1352       if (grid.group_sizes[i])
1353 	{
1354 	  ws = fold_convert (uint32_type_node, grid.group_sizes[i]);
1355 	  tmpseq = NULL;
1356 	  gimplify_expr (&ws, &tmpseq, NULL, is_gimple_val, fb_rvalue);
1357 	  if (!gimple_seq_empty_p (tmpseq))
1358 	    gsi_insert_seq_before (gsi, tmpseq, GSI_SAME_STMT);
1359 	}
1360       else
1361 	ws = build_zero_cst (uint32_type_node);
1362 
1363       tree c = build_omp_clause (UNKNOWN_LOCATION, OMP_CLAUSE__GRIDDIM_);
1364       OMP_CLAUSE__GRIDDIM__DIMENSION (c) = i;
1365       OMP_CLAUSE__GRIDDIM__SIZE (c) = gs;
1366       OMP_CLAUSE__GRIDDIM__GROUP (c) = ws;
1367       OMP_CLAUSE_CHAIN (c) = gimple_omp_target_clauses (target);
1368       gimple_omp_target_set_clauses (target, c);
1369     }
1370   pop_gimplify_context (tgt_bind);
1371   delete declmap;
1372   return;
1373 }
1374 
1375 /* Walker function doing all the work for create_target_kernels.  */
1376 
1377 static tree
1378 grid_gridify_all_targets_stmt (gimple_stmt_iterator *gsi,
1379 				   bool *handled_ops_p,
1380 				   struct walk_stmt_info *incoming)
1381 {
1382   *handled_ops_p = false;
1383 
1384   gimple *stmt = gsi_stmt (*gsi);
1385   gomp_target *target = dyn_cast <gomp_target *> (stmt);
1386   if (target)
1387     {
1388       gbind *tgt_bind = (gbind *) incoming->info;
1389       gcc_checking_assert (tgt_bind);
1390       grid_attempt_target_gridification (target, gsi, tgt_bind);
1391       return NULL_TREE;
1392     }
1393   gbind *bind = dyn_cast <gbind *> (stmt);
1394   if (bind)
1395     {
1396       *handled_ops_p = true;
1397       struct walk_stmt_info wi;
1398       memset (&wi, 0, sizeof (wi));
1399       wi.info = bind;
1400       walk_gimple_seq_mod (gimple_bind_body_ptr (bind),
1401 			   grid_gridify_all_targets_stmt, NULL, &wi);
1402     }
1403   return NULL_TREE;
1404 }
1405 
1406 /* Attempt to gridify all target constructs in BODY_P.  All such targets will
1407    have their bodies duplicated, with the new copy being put into a
1408    gimple_omp_grid_body statement.  All kernel-related constructs within the
1409    grid_body will be marked with phony flags or kernel kinds.  Moreover, some
1410    re-structuring is often needed, such as copying pre-bodies before the target
1411    construct so that kernel grid sizes can be computed.  */
1412 
1413 void
1414 omp_grid_gridify_all_targets (gimple_seq *body_p)
1415 {
1416   struct walk_stmt_info wi;
1417   memset (&wi, 0, sizeof (wi));
1418   walk_gimple_seq_mod (body_p, grid_gridify_all_targets_stmt, NULL, &wi);
1419 }
1420