1 /* Loop autoparallelization.
2    Copyright (C) 2006-2016 Free Software Foundation, Inc.
3    Contributed by Sebastian Pop <pop@cri.ensmp.fr>
4    Zdenek Dvorak <dvorakz@suse.cz> and Razya Ladelsky <razya@il.ibm.com>.
5 
6 This file is part of GCC.
7 
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
12 
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
16 for more details.
17 
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3.  If not see
20 <http://www.gnu.org/licenses/>.  */
21 
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "tree.h"
27 #include "gimple.h"
28 #include "cfghooks.h"
29 #include "tree-pass.h"
30 #include "ssa.h"
31 #include "cgraph.h"
32 #include "gimple-pretty-print.h"
33 #include "fold-const.h"
34 #include "gimplify.h"
35 #include "gimple-iterator.h"
36 #include "gimplify-me.h"
37 #include "gimple-walk.h"
38 #include "stor-layout.h"
39 #include "tree-nested.h"
40 #include "tree-cfg.h"
41 #include "tree-ssa-loop-ivopts.h"
42 #include "tree-ssa-loop-manip.h"
43 #include "tree-ssa-loop-niter.h"
44 #include "tree-ssa-loop.h"
45 #include "tree-into-ssa.h"
46 #include "cfgloop.h"
47 #include "tree-scalar-evolution.h"
48 #include "langhooks.h"
49 #include "tree-vectorizer.h"
50 #include "tree-hasher.h"
51 #include "tree-parloops.h"
52 #include "omp-low.h"
53 #include "tree-ssa.h"
54 #include "params.h"
55 #include "params-enum.h"
56 #include "tree-ssa-alias.h"
57 #include "tree-eh.h"
58 #include "gomp-constants.h"
59 #include "tree-dfa.h"
60 
61 /* This pass tries to distribute iterations of loops into several threads.
62    The implementation is straightforward -- for each loop we test whether its
63    iterations are independent, and if it is the case (and some additional
64    conditions regarding profitability and correctness are satisfied), we
65    add GIMPLE_OMP_PARALLEL and GIMPLE_OMP_FOR codes and let omp expansion
66    machinery do its job.
67 
   Most of the complexity is in bringing the code into shape expected
69    by the omp expanders:
70    -- for GIMPLE_OMP_FOR, ensuring that the loop has only one induction
71       variable and that the exit test is at the start of the loop body
72    -- for GIMPLE_OMP_PARALLEL, replacing the references to local addressable
73       variables by accesses through pointers, and breaking up ssa chains
74       by storing the values incoming to the parallelized loop to a structure
75       passed to the new function as an argument (something similar is done
76       in omp gimplification, unfortunately only a small part of the code
77       can be shared).
78 
79    TODO:
80    -- if there are several parallelizable loops in a function, it may be
81       possible to generate the threads just once (using synchronization to
82       ensure that cross-loop dependences are obeyed).
83    -- handling of common reduction patterns for outer loops.
84 
85    More info can also be found at http://gcc.gnu.org/wiki/AutoParInGCC  */
86 /*
87   Reduction handling:
88   currently we use vect_force_simple_reduction() to detect reduction patterns.
89   The code transformation will be introduced by an example.
90 
91 
92 parloop
93 {
94   int sum=1;
95 
96   for (i = 0; i < N; i++)
97    {
98     x[i] = i + 3;
99     sum+=x[i];
100    }
101 }
102 
103 gimple-like code:
104 header_bb:
105 
106   # sum_29 = PHI <sum_11(5), 1(3)>
107   # i_28 = PHI <i_12(5), 0(3)>
108   D.1795_8 = i_28 + 3;
109   x[i_28] = D.1795_8;
110   sum_11 = D.1795_8 + sum_29;
111   i_12 = i_28 + 1;
112   if (N_6(D) > i_12)
113     goto header_bb;
114 
115 
116 exit_bb:
117 
118   # sum_21 = PHI <sum_11(4)>
119   printf (&"%d"[0], sum_21);
120 
121 
122 after reduction transformation (only relevant parts):
123 
124 parloop
125 {
126 
127 ....
128 
129 
130   # Storing the initial value given by the user.  #
131 
132   .paral_data_store.32.sum.27 = 1;
133 
134   #pragma omp parallel num_threads(4)
135 
136   #pragma omp for schedule(static)
137 
138   # The neutral element corresponding to the particular
139   reduction's operation, e.g. 0 for PLUS_EXPR,
140   1 for MULT_EXPR, etc. replaces the user's initial value.  #
141 
142   # sum.27_29 = PHI <sum.27_11, 0>
143 
144   sum.27_11 = D.1827_8 + sum.27_29;
145 
146   GIMPLE_OMP_CONTINUE
147 
148   # Adding this reduction phi is done at create_phi_for_local_result() #
149   # sum.27_56 = PHI <sum.27_11, 0>
150   GIMPLE_OMP_RETURN
151 
152   # Creating the atomic operation is done at
153   create_call_for_reduction_1()  #
154 
155   #pragma omp atomic_load
156   D.1839_59 = *&.paral_data_load.33_51->reduction.23;
157   D.1840_60 = sum.27_56 + D.1839_59;
158   #pragma omp atomic_store (D.1840_60);
159 
160   GIMPLE_OMP_RETURN
161 
162  # collecting the result after the join of the threads is done at
163   create_loads_for_reductions().
164   The value computed by the threads is loaded from the
165   shared struct.  #
166 
167 
168   .paral_data_load.33_52 = &.paral_data_store.32;
169   sum_37 =  .paral_data_load.33_52->sum.27;
170   sum_43 = D.1795_41 + sum_37;
171 
172   exit bb:
173   # sum_21 = PHI <sum_43, sum_26>
174   printf (&"%d"[0], sum_21);
175 
176 ...
177 
178 }
179 
180 */
181 
/* Minimal number of iterations of a loop that should be executed in each
   thread.  NOTE(review): the consumers of this constant are outside this
   chunk; presumably it bounds how many threads a small loop is split
   across -- confirm against the callers.  */
#define MIN_PER_THREAD 100
185 
/* Element of the hashtable, representing a
   reduction in the current loop.  */
struct reduction_info
{
  gimple *reduc_stmt;		/* The reduction statement itself.  */
  gimple *reduc_phi;		/* The phi node defining the reduction;
				   used as the equality key of the hash
				   table entry.  */
  enum tree_code reduction_code;/* Code of the reduction operation
				   (e.g. PLUS_EXPR, MULT_EXPR).  */
  unsigned reduc_version;	/* SSA_NAME_VERSION of original reduc_phi
				   result; used as the hash value.  */
  gphi *keep_res;		/* The PHI_RESULT of this phi is the resulting value
				   of the reduction variable when exiting the loop.  */
  tree initial_value;		/* The initial value of the reduction var before entering the loop.  */
  tree field;			/* The name of the field in the parloop data structure intended for reduction.  */
  tree reduc_addr;		/* The address of the reduction variable for
				   openacc reductions.  */
  tree init;			/* Reduction initialization value (the neutral
				   element of the operation).  */
  gphi *new_phi;		/* (helper field) Newly created phi node whose result
				   will be passed to the atomic operation.  Represents
				   the local result each thread computed for the reduction
				   operation.  */
};
207 
/* Reduction info hashtable helpers.  Entries hash on REDUC_VERSION and
   compare equal when they refer to the same REDUC_PHI.  */

struct reduction_hasher : free_ptr_hash <reduction_info>
{
  static inline hashval_t hash (const reduction_info *);
  static inline bool equal (const reduction_info *, const reduction_info *);
};
215 
216 /* Equality and hash functions for hashtab code.  */
217 
218 inline bool
equal(const reduction_info * a,const reduction_info * b)219 reduction_hasher::equal (const reduction_info *a, const reduction_info *b)
220 {
221   return (a->reduc_phi == b->reduc_phi);
222 }
223 
224 inline hashval_t
hash(const reduction_info * a)225 reduction_hasher::hash (const reduction_info *a)
226 {
227   return a->reduc_version;
228 }
229 
230 typedef hash_table<reduction_hasher> reduction_info_table_type;
231 
232 
233 static struct reduction_info *
reduction_phi(reduction_info_table_type * reduction_list,gimple * phi)234 reduction_phi (reduction_info_table_type *reduction_list, gimple *phi)
235 {
236   struct reduction_info tmpred, *red;
237 
238   if (reduction_list->elements () == 0 || phi == NULL)
239     return NULL;
240 
241   if (gimple_uid (phi) == (unsigned int)-1
242       || gimple_uid (phi) == 0)
243     return NULL;
244 
245   tmpred.reduc_phi = phi;
246   tmpred.reduc_version = gimple_uid (phi);
247   red = reduction_list->find (&tmpred);
248   gcc_assert (red == NULL || red->reduc_phi == phi);
249 
250   return red;
251 }
252 
/* Element of hashtable of names to copy: records the duplicate created
   for one SSA name.  */

struct name_to_copy_elt
{
  unsigned version;	/* The version of the name to copy; also serves
			   as the hash key.  */
  tree new_name;	/* The new name used in the copy.  */
  tree field;		/* The field of the structure used to pass the
			   value.  */
};
262 
/* Name copies hashtable helpers.  Entries hash and compare on the SSA
   version number stored in the element.  */

struct name_to_copy_hasher : free_ptr_hash <name_to_copy_elt>
{
  static inline hashval_t hash (const name_to_copy_elt *);
  static inline bool equal (const name_to_copy_elt *, const name_to_copy_elt *);
};
270 
271 /* Equality and hash functions for hashtab code.  */
272 
273 inline bool
equal(const name_to_copy_elt * a,const name_to_copy_elt * b)274 name_to_copy_hasher::equal (const name_to_copy_elt *a, const name_to_copy_elt *b)
275 {
276   return a->version == b->version;
277 }
278 
279 inline hashval_t
hash(const name_to_copy_elt * a)280 name_to_copy_hasher::hash (const name_to_copy_elt *a)
281 {
282   return (hashval_t) a->version;
283 }
284 
285 typedef hash_table<name_to_copy_hasher> name_to_copy_table_type;
286 
/* A transformation matrix, which is a self-contained ROWSIZE x COLSIZE
   matrix.  Rather than use floats, we simply keep a single DENOMINATOR that
   represents the denominator for every element in the matrix.  */
typedef struct lambda_trans_matrix_s
{
  lambda_matrix matrix;		/* The integer numerators.  */
  int rowsize;			/* Number of rows.  */
  int colsize;			/* Number of columns.  */
  int denominator;		/* Common denominator of all elements.  */
} *lambda_trans_matrix;

/* Accessors for the fields of a lambda_trans_matrix.  */
#define LTM_MATRIX(T) ((T)->matrix)
#define LTM_ROWSIZE(T) ((T)->rowsize)
#define LTM_COLSIZE(T) ((T)->colsize)
#define LTM_DENOMINATOR(T) ((T)->denominator)
301 
302 /* Allocate a new transformation matrix.  */
303 
304 static lambda_trans_matrix
lambda_trans_matrix_new(int colsize,int rowsize,struct obstack * lambda_obstack)305 lambda_trans_matrix_new (int colsize, int rowsize,
306 			 struct obstack * lambda_obstack)
307 {
308   lambda_trans_matrix ret;
309 
310   ret = (lambda_trans_matrix)
311     obstack_alloc (lambda_obstack, sizeof (struct lambda_trans_matrix_s));
312   LTM_MATRIX (ret) = lambda_matrix_new (rowsize, colsize, lambda_obstack);
313   LTM_ROWSIZE (ret) = rowsize;
314   LTM_COLSIZE (ret) = colsize;
315   LTM_DENOMINATOR (ret) = 1;
316   return ret;
317 }
318 
319 /* Multiply a vector VEC by a matrix MAT.
320    MAT is an M*N matrix, and VEC is a vector with length N.  The result
321    is stored in DEST which must be a vector of length M.  */
322 
323 static void
lambda_matrix_vector_mult(lambda_matrix matrix,int m,int n,lambda_vector vec,lambda_vector dest)324 lambda_matrix_vector_mult (lambda_matrix matrix, int m, int n,
325 			   lambda_vector vec, lambda_vector dest)
326 {
327   int i, j;
328 
329   lambda_vector_clear (dest, m);
330   for (i = 0; i < m; i++)
331     for (j = 0; j < n; j++)
332       dest[i] += matrix[i][j] * vec[j];
333 }
334 
/* Return true if TRANS is a legal transformation matrix that respects
   the dependence vectors in DISTS and DIRS.  The conservative answer
   is false.

   "Wolfe proves that a unimodular transformation represented by the
   matrix T is legal when applied to a loop nest with a set of
   lexicographically non-negative distance vectors RDG if and only if
   for each vector d in RDG, (T.d >= 0) is lexicographically positive.
   i.e.: if and only if it transforms the lexicographically positive
   distance vectors to lexicographically positive vectors.  Note that
   a unimodular matrix must transform the zero vector (and only it) to
   the zero vector." S.Muchnick.  */

static bool
lambda_transform_legal_p (lambda_trans_matrix trans,
			  int nb_loops,
			  vec<ddr_p> dependence_relations)
{
  unsigned int i, j;
  lambda_vector distres;
  struct data_dependence_relation *ddr;

  gcc_assert (LTM_COLSIZE (trans) == nb_loops
	      && LTM_ROWSIZE (trans) == nb_loops);

  /* When there are no dependences, the transformation is correct.  */
  if (dependence_relations.length () == 0)
    return true;

  ddr = dependence_relations[0];
  if (ddr == NULL)
    return true;

  /* When there is an unknown relation in the dependence_relations, we
     know that it is not worth looking at this loop nest: give up.  */
  if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
    return false;

  /* Scratch vector for the transformed distance vectors.  */
  distres = lambda_vector_new (nb_loops);

  /* For each distance vector in the dependence graph.  */
  FOR_EACH_VEC_ELT (dependence_relations, i, ddr)
    {
      /* Don't care about relations for which we know that there is no
	 dependence, nor about read-read (aka. output-dependences):
	 these data accesses can happen in any order.  */
      if (DDR_ARE_DEPENDENT (ddr) == chrec_known
	  || (DR_IS_READ (DDR_A (ddr)) && DR_IS_READ (DDR_B (ddr))))
	continue;

      /* Conservatively answer: "this transformation is not valid".  */
      if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
	return false;

      /* If the dependence could not be captured by a distance vector,
	 conservatively answer that the transform is not valid.  */
      if (DDR_NUM_DIST_VECTS (ddr) == 0)
	return false;

      /* Compute trans.dist_vect and require it to stay
	 lexicographically positive.  */
      for (j = 0; j < DDR_NUM_DIST_VECTS (ddr); j++)
	{
	  lambda_matrix_vector_mult (LTM_MATRIX (trans), nb_loops, nb_loops,
				     DDR_DIST_VECT (ddr, j), distres);

	  if (!lambda_vector_lexico_pos (distres, nb_loops))
	    return false;
	}
    }
  return true;
}
406 
/* Data dependency analysis.  Returns true if the iterations of LOOP
   are independent of each other (that is, if we can execute them
   in parallel).  */

static bool
loop_parallel_p (struct loop *loop, struct obstack * parloop_obstack)
{
  vec<ddr_p> dependence_relations;
  vec<data_reference_p> datarefs;
  lambda_trans_matrix trans;
  bool ret = false;

  if (dump_file && (dump_flags & TDF_DETAILS))
  {
    fprintf (dump_file, "Considering loop %d\n", loop->num);
    if (!loop->inner)
      fprintf (dump_file, "loop is innermost\n");
    else
      fprintf (dump_file, "loop NOT innermost\n");
   }

  /* Check for problems with dependences.  If the loop can be reversed,
     the iterations are independent.  */
  auto_vec<loop_p, 3> loop_nest;
  datarefs.create (10);
  dependence_relations.create (100);
  if (! compute_data_dependences_for_loop (loop, true, &loop_nest, &datarefs,
					   &dependence_relations))
    {
      if (dump_file && (dump_flags & TDF_DETAILS))
	fprintf (dump_file, "  FAILED: cannot analyze data dependencies\n");
      ret = false;
      goto end;
    }
  if (dump_file && (dump_flags & TDF_DETAILS))
    dump_data_dependence_relations (dump_file, dependence_relations);

  /* A 1x1 matrix with the single entry -1 represents reversing the
     loop; if the reversal respects all dependences, there is no
     loop-carried dependence in either direction.  */
  trans = lambda_trans_matrix_new (1, 1, parloop_obstack);
  LTM_MATRIX (trans)[0][0] = -1;

  if (lambda_transform_legal_p (trans, 1, dependence_relations))
    {
      ret = true;
      if (dump_file && (dump_flags & TDF_DETAILS))
	fprintf (dump_file, "  SUCCESS: may be parallelized\n");
    }
  else if (dump_file && (dump_flags & TDF_DETAILS))
    fprintf (dump_file,
	     "  FAILED: data dependencies exist across iterations\n");

 end:
  free_dependence_relations (dependence_relations);
  free_data_refs (datarefs);

  return ret;
}
463 
464 /* Return true when LOOP contains basic blocks marked with the
465    BB_IRREDUCIBLE_LOOP flag.  */
466 
467 static inline bool
loop_has_blocks_with_irreducible_flag(struct loop * loop)468 loop_has_blocks_with_irreducible_flag (struct loop *loop)
469 {
470   unsigned i;
471   basic_block *bbs = get_loop_body_in_dom_order (loop);
472   bool res = true;
473 
474   for (i = 0; i < loop->num_nodes; i++)
475     if (bbs[i]->flags & BB_IRREDUCIBLE_LOOP)
476       goto end;
477 
478   res = false;
479  end:
480   free (bbs);
481   return res;
482 }
483 
/* Assigns the address of OBJ in TYPE to an ssa name, and returns this name.
   The assignment statement is placed on edge ENTRY.  DECL_ADDRESS maps decls
   to their addresses that can be reused.  The address of OBJ is known to
   be invariant in the whole function.  Other needed statements are placed
   right before GSI.  When GSI is NULL no statements may be emitted, and
   NULL is returned if the base address is not already recorded in
   DECL_ADDRESS.  */

static tree
take_address_of (tree obj, tree type, edge entry,
		 int_tree_htab_type *decl_address, gimple_stmt_iterator *gsi)
{
  int uid;
  tree *var_p, name, addr;
  gassign *stmt;
  gimple_seq stmts;

  /* Since the address of OBJ is invariant, the trees may be shared.
     Avoid rewriting unrelated parts of the code.  */
  obj = unshare_expr (obj);
  /* Strip handled components to reach the base of the reference.  */
  for (var_p = &obj;
       handled_component_p (*var_p);
       var_p = &TREE_OPERAND (*var_p, 0))
    continue;

  /* Canonicalize the access to base on a MEM_REF.  */
  if (DECL_P (*var_p))
    *var_p = build_simple_mem_ref (build_fold_addr_expr (*var_p));

  /* Assign a canonical SSA name to the address of the base decl used
     in the address and share it for all accesses and addresses based
     on it.  */
  uid = DECL_UID (TREE_OPERAND (TREE_OPERAND (*var_p, 0), 0));
  int_tree_map elt;
  elt.uid = uid;
  int_tree_map *slot = decl_address->find_slot (elt, INSERT);
  if (!slot->to)
    {
      /* No canonical name yet; without a GSI we may not create one.  */
      if (gsi == NULL)
	return NULL;
      addr = TREE_OPERAND (*var_p, 0);
      const char *obj_name
	= get_name (TREE_OPERAND (TREE_OPERAND (*var_p, 0), 0));
      if (obj_name)
	name = make_temp_ssa_name (TREE_TYPE (addr), NULL, obj_name);
      else
	name = make_ssa_name (TREE_TYPE (addr));
      /* Materialize the address on the ENTRY edge, outside the region.  */
      stmt = gimple_build_assign (name, addr);
      gsi_insert_on_edge_immediate (entry, stmt);

      slot->uid = uid;
      slot->to = name;
    }
  else
    name = slot->to;

  /* Express the address in terms of the canonical SSA name.  */
  TREE_OPERAND (*var_p, 0) = name;
  if (gsi == NULL)
    return build_fold_addr_expr_with_type (obj, type);

  name = force_gimple_operand (build_addr (obj),
			       &stmts, true, NULL_TREE);
  if (!gimple_seq_empty_p (stmts))
    gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);

  /* Convert to the requested pointer TYPE if necessary.  */
  if (!useless_type_conversion_p (type, TREE_TYPE (name)))
    {
      name = force_gimple_operand (fold_convert (type, name), &stmts, true,
				   NULL_TREE);
      if (!gimple_seq_empty_p (stmts))
	gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
    }

  return name;
}
558 
559 static tree
reduc_stmt_res(gimple * stmt)560 reduc_stmt_res (gimple *stmt)
561 {
562   return (gimple_code (stmt) == GIMPLE_PHI
563 	  ? gimple_phi_result (stmt)
564 	  : gimple_assign_lhs (stmt));
565 }
566 
567 /* Callback for htab_traverse.  Create the initialization statement
568    for reduction described in SLOT, and place it at the preheader of
569    the loop described in DATA.  */
570 
571 int
initialize_reductions(reduction_info ** slot,struct loop * loop)572 initialize_reductions (reduction_info **slot, struct loop *loop)
573 {
574   tree init;
575   tree type, arg;
576   edge e;
577 
578   struct reduction_info *const reduc = *slot;
579 
580   /* Create initialization in preheader:
581      reduction_variable = initialization value of reduction.  */
582 
583   /* In the phi node at the header, replace the argument coming
584      from the preheader with the reduction initialization value.  */
585 
586   /* Initialize the reduction.  */
587   type = TREE_TYPE (PHI_RESULT (reduc->reduc_phi));
588   init = omp_reduction_init_op (gimple_location (reduc->reduc_stmt),
589 				reduc->reduction_code, type);
590   reduc->init = init;
591 
592   /* Replace the argument representing the initialization value
593      with the initialization value for the reduction (neutral
594      element for the particular operation, e.g. 0 for PLUS_EXPR,
595      1 for MULT_EXPR, etc).
596      Keep the old value in a new variable "reduction_initial",
597      that will be taken in consideration after the parallel
598      computing is done.  */
599 
600   e = loop_preheader_edge (loop);
601   arg = PHI_ARG_DEF_FROM_EDGE (reduc->reduc_phi, e);
602   /* Create new variable to hold the initial value.  */
603 
604   SET_USE (PHI_ARG_DEF_PTR_FROM_EDGE
605 	   (reduc->reduc_phi, loop_preheader_edge (loop)), init);
606   reduc->initial_value = arg;
607   return 1;
608 }
609 
/* Data passed to the eliminate_local_variables_1 callback through
   walk_tree / walk_gimple_op.  */

struct elv_data
{
  struct walk_stmt_info info;	/* Must be the first member: the walkers
				   receive &info, and the callback casts it
				   back to the enclosing elv_data.  */
  edge entry;			/* Entry edge of the SESE region; address
				   computations are inserted on it.  */
  int_tree_htab_type *decl_address;  /* Maps decl uids to the canonical SSA
					names holding their addresses.  */
  gimple_stmt_iterator *gsi;	/* Iterator at the statement being rewritten,
				   or NULL for debug statements where no new
				   code may be emitted.  */
  bool changed;			/* Set when the expression was changed.  */
  bool reset;			/* Set when a debug bind value could not be
				   rewritten and the bind must be reset.  */
};
619 
/* Eliminates references to local variables in *TP out of the single
   entry single exit region starting at DTA->ENTRY.
   DECL_ADDRESS contains addresses of the references that had their
   address taken already.  If the expression is changed, CHANGED is
   set to true.  Callback for walk_tree.  */

static tree
eliminate_local_variables_1 (tree *tp, int *walk_subtrees, void *data)
{
  struct elv_data *const dta = (struct elv_data *) data;
  tree t = *tp, var, addr, addr_type, type, obj;

  if (DECL_P (t))
    {
      *walk_subtrees = 0;

      if (!SSA_VAR_P (t) || DECL_EXTERNAL (t))
	return NULL_TREE;

      /* Replace the direct variable use with a dereference of its
	 address, which take_address_of materializes outside the
	 region.  */
      type = TREE_TYPE (t);
      addr_type = build_pointer_type (type);
      addr = take_address_of (t, addr_type, dta->entry, dta->decl_address,
			      dta->gsi);
      if (dta->gsi == NULL && addr == NULL_TREE)
	{
	  /* In a debug statement we may not emit code; ask for the
	     debug bind to be reset instead.  */
	  dta->reset = true;
	  return NULL_TREE;
	}

      *tp = build_simple_mem_ref (addr);

      dta->changed = true;
      return NULL_TREE;
    }

  if (TREE_CODE (t) == ADDR_EXPR)
    {
      /* ADDR_EXPR may appear in two contexts:
	 -- as a gimple operand, when the address taken is a function invariant
	 -- as gimple rhs, when the resulting address in not a function
	    invariant
	 We do not need to do anything special in the latter case (the base of
	 the memory reference whose address is taken may be replaced in the
	 DECL_P case).  The former case is more complicated, as we need to
	 ensure that the new address is still a gimple operand.  Thus, it
	 is not sufficient to replace just the base of the memory reference --
	 we need to move the whole computation of the address out of the
	 loop.  */
      if (!is_gimple_val (t))
	return NULL_TREE;

      *walk_subtrees = 0;
      obj = TREE_OPERAND (t, 0);
      var = get_base_address (obj);
      if (!var || !SSA_VAR_P (var) || DECL_EXTERNAL (var))
	return NULL_TREE;

      addr_type = TREE_TYPE (t);
      addr = take_address_of (obj, addr_type, dta->entry, dta->decl_address,
			      dta->gsi);
      if (dta->gsi == NULL && addr == NULL_TREE)
	{
	  dta->reset = true;
	  return NULL_TREE;
	}
      *tp = addr;

      dta->changed = true;
      return NULL_TREE;
    }

  /* Do not descend into non-expression trees.  */
  if (!EXPR_P (t))
    *walk_subtrees = 0;

  return NULL_TREE;
}
696 
/* Moves the references to local variables in STMT at *GSI out of the single
   entry single exit region starting at ENTRY.  DECL_ADDRESS contains
   addresses of the references that had their address taken
   already.  */

static void
eliminate_local_variables_stmt (edge entry, gimple_stmt_iterator *gsi,
				int_tree_htab_type *decl_address)
{
  struct elv_data dta;
  gimple *stmt = gsi_stmt (*gsi);

  memset (&dta.info, '\0', sizeof (dta.info));
  dta.entry = entry;
  dta.decl_address = decl_address;
  dta.changed = false;
  dta.reset = false;

  if (gimple_debug_bind_p (stmt))
    {
      /* For a debug bind no new statements may be emitted (GSI stays
	 NULL); if the value cannot be rewritten, reset the bind.  */
      dta.gsi = NULL;
      walk_tree (gimple_debug_bind_get_value_ptr (stmt),
		 eliminate_local_variables_1, &dta.info, NULL);
      if (dta.reset)
	{
	  gimple_debug_bind_reset_value (stmt);
	  dta.changed = true;
	}
    }
  else if (gimple_clobber_p (stmt))
    {
      /* Drop clobbers entirely: unlink the virtual def and replace the
	 statement with a no-op.  */
      unlink_stmt_vdef (stmt);
      stmt = gimple_build_nop ();
      gsi_replace (gsi, stmt, false);
      dta.changed = true;
    }
  else
    {
      /* An ordinary statement: rewrite all of its operands in place.  */
      dta.gsi = gsi;
      walk_gimple_op (stmt, eliminate_local_variables_1, &dta.info);
    }

  if (dta.changed)
    update_stmt (stmt);
}
742 
/* Eliminates the references to local variables from the single entry
   single exit region between the ENTRY and EXIT edges.

   This includes:
   1) Taking address of a local variable -- these are moved out of the
   region (and temporary variable is created to hold the address if
   necessary).

   2) Dereferencing a local variable -- these are replaced with indirect
   references.  */

static void
eliminate_local_variables (edge entry, edge exit)
{
  basic_block bb;
  auto_vec<basic_block, 3> body;
  unsigned i;
  gimple_stmt_iterator gsi;
  bool has_debug_stmt = false;
  int_tree_htab_type decl_address (10);
  basic_block entry_bb = entry->src;
  basic_block exit_bb = exit->dest;

  gather_blocks_in_sese_region (entry_bb, exit_bb, &body);

  /* First pass: rewrite the real statements, populating DECL_ADDRESS;
     debug binds are only noted here.  */
  FOR_EACH_VEC_ELT (body, i, bb)
    if (bb != entry_bb && bb != exit_bb)
      {
        for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
	  if (is_gimple_debug (gsi_stmt (gsi)))
	    {
	      if (gimple_debug_bind_p (gsi_stmt (gsi)))
	        has_debug_stmt = true;
	    }
	  else
	    eliminate_local_variables_stmt (entry, &gsi, &decl_address);
      }

  /* Second pass: rewrite the debug binds.  Since no new code may be
     emitted for them, they can only reuse addresses that the first
     pass already recorded in DECL_ADDRESS.  */
  if (has_debug_stmt)
    FOR_EACH_VEC_ELT (body, i, bb)
      if (bb != entry_bb && bb != exit_bb)
	for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
	  if (gimple_debug_bind_p (gsi_stmt (gsi)))
	    eliminate_local_variables_stmt (entry, &gsi, &decl_address);
}
788 
789 /* Returns true if expression EXPR is not defined between ENTRY and
790    EXIT, i.e. if all its operands are defined outside of the region.  */
791 
792 static bool
expr_invariant_in_region_p(edge entry,edge exit,tree expr)793 expr_invariant_in_region_p (edge entry, edge exit, tree expr)
794 {
795   basic_block entry_bb = entry->src;
796   basic_block exit_bb = exit->dest;
797   basic_block def_bb;
798 
799   if (is_gimple_min_invariant (expr))
800     return true;
801 
802   if (TREE_CODE (expr) == SSA_NAME)
803     {
804       def_bb = gimple_bb (SSA_NAME_DEF_STMT (expr));
805       if (def_bb
806 	  && dominated_by_p (CDI_DOMINATORS, def_bb, entry_bb)
807 	  && !dominated_by_p (CDI_DOMINATORS, def_bb, exit_bb))
808 	return false;
809 
810       return true;
811     }
812 
813   return false;
814 }
815 
/* If COPY_NAME_P is true, creates and returns a duplicate of NAME.
   The copies are stored to NAME_COPIES, if NAME was already duplicated,
   its duplicate stored in NAME_COPIES is returned.

   Regardless of COPY_NAME_P, the decl used as a base of the ssa name is also
   duplicated, storing the copies in DECL_COPIES.  */

static tree
separate_decls_in_region_name (tree name, name_to_copy_table_type *name_copies,
			       int_tree_htab_type *decl_copies,
			       bool copy_name_p)
{
  tree copy, var, var_copy;
  unsigned idx, uid, nuid;
  struct int_tree_map ielt;
  struct name_to_copy_elt elt, *nelt;
  name_to_copy_elt **slot;
  int_tree_map *dslot;

  if (TREE_CODE (name) != SSA_NAME)
    return name;

  idx = SSA_NAME_VERSION (name);
  elt.version = idx;
  /* Only allocate a hash slot when we are allowed to create a copy.  */
  slot = name_copies->find_slot_with_hash (&elt, idx,
					   copy_name_p ? INSERT : NO_INSERT);
  if (slot && *slot)
    /* NAME was duplicated earlier; reuse that copy.  */
    return (*slot)->new_name;

  if (copy_name_p)
    {
      copy = duplicate_ssa_name (name, NULL);
      nelt = XNEW (struct name_to_copy_elt);
      nelt->version = idx;
      nelt->new_name = copy;
      nelt->field = NULL_TREE;
      *slot = nelt;
    }
  else
    {
      gcc_assert (!slot);
      copy = name;
    }

  /* Duplicate the base variable of the name as well, if it has one.  */
  var = SSA_NAME_VAR (name);
  if (!var)
    return copy;

  uid = DECL_UID (var);
  ielt.uid = uid;
  dslot = decl_copies->find_slot_with_hash (ielt, uid, INSERT);
  if (!dslot->to)
    {
      var_copy = create_tmp_var (TREE_TYPE (var), get_name (var));
      DECL_GIMPLE_REG_P (var_copy) = DECL_GIMPLE_REG_P (var);
      dslot->uid = uid;
      dslot->to = var_copy;

      /* Ensure that when we meet this decl next time, we won't duplicate
         it again.  */
      nuid = DECL_UID (var_copy);
      ielt.uid = nuid;
      dslot = decl_copies->find_slot_with_hash (ielt, nuid, INSERT);
      gcc_assert (!dslot->to);
      dslot->uid = nuid;
      dslot->to = var_copy;
    }
  else
    var_copy = dslot->to;

  /* Rebase the (possibly copied) name on the duplicated variable.  */
  replace_ssa_name_symbol (copy, var_copy);
  return copy;
}
889 
/* Finds the ssa names used in STMT that are defined outside the
   region between ENTRY and EXIT and replaces such ssa names with
   their duplicates.  The duplicates are stored to NAME_COPIES.  Base
   decls of all ssa names used in STMT (including those defined in
   the region) are replaced with the new temporary variables; the
   replacement decls are stored in DECL_COPIES.  */

static void
separate_decls_in_region_stmt (edge entry, edge exit, gimple *stmt,
			       name_to_copy_table_type *name_copies,
			       int_tree_htab_type *decl_copies)
{
  use_operand_p use;
  def_operand_p def;
  ssa_op_iter oi;
  tree name, copy;
  bool copy_name_p;

  /* Defs are produced inside the region, so they are never copied;
     only their base decl is replaced (copy_name_p == false).  */
  FOR_EACH_PHI_OR_STMT_DEF (def, stmt, oi, SSA_OP_DEF)
  {
    name = DEF_FROM_PTR (def);
    gcc_assert (TREE_CODE (name) == SSA_NAME);
    copy = separate_decls_in_region_name (name, name_copies, decl_copies,
					  false);
    gcc_assert (copy == name);
  }

  /* Uses defined outside the region are replaced by their duplicates.  */
  FOR_EACH_PHI_OR_STMT_USE (use, stmt, oi, SSA_OP_USE)
  {
    name = USE_FROM_PTR (use);
    if (TREE_CODE (name) != SSA_NAME)
      continue;

    copy_name_p = expr_invariant_in_region_p (entry, exit, name);
    copy = separate_decls_in_region_name (name, name_copies, decl_copies,
					  copy_name_p);
    SET_USE (use, copy);
  }
}
929 
/* Finds the variable bound by the debug statement STMT and the ssa
   names used in it, and replaces them with their duplicates from
   DECL_COPIES and NAME_COPIES respectively.  Returns true if the
   caller should remove STMT: STMT is not a debug bind, binds a
   DEBUG_EXPR_DECL or LABEL_DECL, or binds a variable for which no
   duplicate decl was recorded.  */
936 
static bool
separate_decls_in_region_debug (gimple *stmt,
				name_to_copy_table_type *name_copies,
				int_tree_htab_type *decl_copies)
{
  use_operand_p use;
  ssa_op_iter oi;
  tree var, name;
  struct int_tree_map ielt;
  struct name_to_copy_elt elt;
  name_to_copy_elt **slot;
  int_tree_map *dslot;

  if (gimple_debug_bind_p (stmt))
    var = gimple_debug_bind_get_var (stmt);
  else if (gimple_debug_source_bind_p (stmt))
    var = gimple_debug_source_bind_get_var (stmt);
  else
    return true;
  /* Debug expressions and labels are never duplicated; tell the caller
     to drop the stmt.  */
  if (TREE_CODE (var) == DEBUG_EXPR_DECL || TREE_CODE (var) == LABEL_DECL)
    return true;
  gcc_assert (DECL_P (var) && SSA_VAR_P (var));
  ielt.uid = DECL_UID (var);
  dslot = decl_copies->find_slot_with_hash (ielt, ielt.uid, NO_INSERT);
  /* No duplicate decl recorded for VAR, so no debug info can be kept
     for it in the outlined body.  */
  if (!dslot)
    return true;
  /* Rebind the stmt to the duplicated variable.  */
  if (gimple_debug_bind_p (stmt))
    gimple_debug_bind_set_var (stmt, dslot->to);
  else if (gimple_debug_source_bind_p (stmt))
    gimple_debug_source_bind_set_var (stmt, dslot->to);

  FOR_EACH_PHI_OR_STMT_USE (use, stmt, oi, SSA_OP_USE)
  {
    name = USE_FROM_PTR (use);
    if (TREE_CODE (name) != SSA_NAME)
      continue;

    elt.version = SSA_NAME_VERSION (name);
    slot = name_copies->find_slot_with_hash (&elt, elt.version, NO_INSERT);
    if (!slot)
      {
	/* A used name has no duplicate: the bound value cannot be
	   expressed in the outlined body, so reset it (the stmt is
	   kept, but carries no value).  */
	gimple_debug_bind_reset_value (stmt);
	update_stmt (stmt);
	break;
      }

    SET_USE (use, (*slot)->new_name);
  }

  return false;
}
988 
989 /* Callback for htab_traverse.  Adds a field corresponding to the reduction
990    specified in SLOT. The type is passed in DATA.  */
991 
992 int
add_field_for_reduction(reduction_info ** slot,tree type)993 add_field_for_reduction (reduction_info **slot, tree type)
994 {
995 
996   struct reduction_info *const red = *slot;
997   tree var = reduc_stmt_res (red->reduc_stmt);
998   tree field = build_decl (gimple_location (red->reduc_stmt), FIELD_DECL,
999 			   SSA_NAME_IDENTIFIER (var), TREE_TYPE (var));
1000 
1001   insert_field_into_struct (type, field);
1002 
1003   red->field = field;
1004 
1005   return 1;
1006 }
1007 
1008 /* Callback for htab_traverse.  Adds a field corresponding to a ssa name
1009    described in SLOT. The type is passed in DATA.  */
1010 
1011 int
add_field_for_name(name_to_copy_elt ** slot,tree type)1012 add_field_for_name (name_to_copy_elt **slot, tree type)
1013 {
1014   struct name_to_copy_elt *const elt = *slot;
1015   tree name = ssa_name (elt->version);
1016   tree field = build_decl (UNKNOWN_LOCATION,
1017 			   FIELD_DECL, SSA_NAME_IDENTIFIER (name),
1018 			   TREE_TYPE (name));
1019 
1020   insert_field_into_struct (type, field);
1021   elt->field = field;
1022 
1023   return 1;
1024 }
1025 
1026 /* Callback for htab_traverse.  A local result is the intermediate result
1027    computed by a single
1028    thread, or the initial value in case no iteration was executed.
1029    This function creates a phi node reflecting these values.
1030    The phi's result will be stored in NEW_PHI field of the
1031    reduction's data structure.  */
1032 
int
create_phi_for_local_result (reduction_info **slot, struct loop *loop)
{
  struct reduction_info *const reduc = *slot;
  edge e;
  gphi *new_phi;
  basic_block store_bb, continue_bb;
  tree local_res;
  source_location locus;

  /* STORE_BB is the block where the phi
     should be stored.  It is the destination of the loop exit.
     (Find the fallthru edge from GIMPLE_OMP_CONTINUE).  */
  continue_bb = single_pred (loop->latch);
  store_bb = FALLTHRU_EDGE (continue_bb)->dest;

  /* STORE_BB has two predecessors.  One coming from  the loop
     (the reduction's result is computed at the loop),
     and another coming from a block preceding the loop,
     when no iterations
     are executed (the initial value should be taken).  */
  if (EDGE_PRED (store_bb, 0) == FALLTHRU_EDGE (continue_bb))
    e = EDGE_PRED (store_bb, 1);
  else
    e = EDGE_PRED (store_bb, 0);
  /* E is now the edge bypassing the loop body.  */
  tree lhs = reduc_stmt_res (reduc->reduc_stmt);
  local_res = copy_ssa_name (lhs);
  locus = gimple_location (reduc->reduc_stmt);
  new_phi = create_phi_node (local_res, store_bb);
  /* Initial value on the no-iteration edge, the computed partial
     result on the edge leaving the loop.  */
  add_phi_arg (new_phi, reduc->init, e, locus);
  add_phi_arg (new_phi, lhs, FALLTHRU_EDGE (continue_bb), locus);
  reduc->new_phi = new_phi;

  return 1;
}
1068 
/* Describes the shared data structure used to communicate values
   between the threads: the variable holding the data, a pointer
   through which it is accessed, and the blocks where the stores to
   and loads from it are emitted.  */

struct clsn_data
{
  tree store;	/* The structure variable the values are stored to.  */
  tree load;	/* Pointer (ssa name) through which they are loaded.  */

  basic_block store_bb;	/* Block receiving the stores into STORE.  */
  basic_block load_bb;	/* Block receiving the loads through LOAD.  */
};
1077 
1078 /* Callback for htab_traverse.  Create an atomic instruction for the
1079    reduction described in SLOT.
1080    DATA annotates the place in memory the atomic operation relates to,
1081    and the basic block it needs to be generated in.  */
1082 
int
create_call_for_reduction_1 (reduction_info **slot, struct clsn_data *clsn_data)
{
  struct reduction_info *const reduc = *slot;
  gimple_stmt_iterator gsi;
  tree type = TREE_TYPE (PHI_RESULT (reduc->reduc_phi));
  tree load_struct;
  basic_block bb;
  basic_block new_bb;
  edge e;
  tree t, addr, ref, x;
  tree tmp_load, name;
  gimple *load;

  if (reduc->reduc_addr == NULL_TREE)
    {
      /* The reduction lives in a field of the shared structure; the
	 atomic update targets that field's address.  */
      load_struct = build_simple_mem_ref (clsn_data->load);
      t = build3 (COMPONENT_REF, type, load_struct, reduc->field, NULL_TREE);

      addr = build_addr (t);
    }
  else
    {
      /* Set the address for the atomic store.  */
      addr = reduc->reduc_addr;

      /* Remove the non-atomic store '*addr = sum'.  */
      tree res = PHI_RESULT (reduc->keep_res);
      use_operand_p use_p;
      gimple *stmt;
      bool single_use_p = single_imm_use (res, &use_p, &stmt);
      /* RES must feed exactly the store we are about to delete.  */
      gcc_assert (single_use_p);
      replace_uses_by (gimple_vdef (stmt),
		       gimple_vuse (stmt));
      gimple_stmt_iterator gsi = gsi_for_stmt (stmt);
      gsi_remove (&gsi, true);
    }

  /* Create phi node.  */
  bb = clsn_data->load_bb;

  gsi = gsi_last_bb (bb);
  e = split_block (bb, gsi_stmt (gsi));
  new_bb = e->dest;

  /* tmp_load = *addr, wrapped in a GIMPLE_OMP_ATOMIC_LOAD.  */
  tmp_load = create_tmp_var (TREE_TYPE (TREE_TYPE (addr)));
  tmp_load = make_ssa_name (tmp_load);
  load = gimple_build_omp_atomic_load (tmp_load, addr);
  SSA_NAME_DEF_STMT (tmp_load) = load;
  gsi = gsi_start_bb (new_bb);
  gsi_insert_after (&gsi, load, GSI_NEW_STMT);

  /* Split again so the atomic load ends its block, then combine the
     loaded value with this thread's partial result.  */
  e = split_block (new_bb, load);
  new_bb = e->dest;
  gsi = gsi_start_bb (new_bb);
  ref = tmp_load;
  x = fold_build2 (reduc->reduction_code,
		   TREE_TYPE (PHI_RESULT (reduc->new_phi)), ref,
		   PHI_RESULT (reduc->new_phi));

  name = force_gimple_operand_gsi (&gsi, x, true, NULL_TREE, true,
				   GSI_CONTINUE_LINKING);

  /* Write the combined value back with a GIMPLE_OMP_ATOMIC_STORE.  */
  gsi_insert_after (&gsi, gimple_build_omp_atomic_store (name), GSI_NEW_STMT);
  return 1;
}
1149 
1150 /* Create the atomic operation at the join point of the threads.
1151    REDUCTION_LIST describes the reductions in the LOOP.
1152    LD_ST_DATA describes the shared data structure where
1153    shared data is stored in and loaded from.  */
static void
create_call_for_reduction (struct loop *loop,
			   reduction_info_table_type *reduction_list,
			   struct clsn_data *ld_st_data)
{
  /* First create, for every reduction, the phi merging the
     thread-local partial result with the initial value; the second
     traversal below relies on reduc->new_phi being set.  */
  reduction_list->traverse <struct loop *, create_phi_for_local_result> (loop);
  /* Find the fallthru edge from GIMPLE_OMP_CONTINUE.  */
  basic_block continue_bb = single_pred (loop->latch);
  ld_st_data->load_bb = FALLTHRU_EDGE (continue_bb)->dest;
  reduction_list
    ->traverse <struct clsn_data *, create_call_for_reduction_1> (ld_st_data);
}
1166 
1167 /* Callback for htab_traverse.  Loads the final reduction value at the
1168    join point of all threads, and inserts it in the right place.  */
1169 
int
create_loads_for_reductions (reduction_info **slot, struct clsn_data *clsn_data)
{
  struct reduction_info *const red = *slot;
  gimple *stmt;
  gimple_stmt_iterator gsi;
  tree type = TREE_TYPE (reduc_stmt_res (red->reduc_stmt));
  tree load_struct;
  tree name;
  tree x;

  /* If there's no exit phi, the result of the reduction is unused.  */
  if (red->keep_res == NULL)
    return 1;

  /* Emit 'result = load->field' at the start of the join block.  */
  gsi = gsi_after_labels (clsn_data->load_bb);
  load_struct = build_simple_mem_ref (clsn_data->load);
  load_struct = build3 (COMPONENT_REF, type, load_struct, red->field,
			NULL_TREE);

  x = load_struct;
  name = PHI_RESULT (red->keep_res);
  stmt = gimple_build_assign (name, x);

  gsi_insert_after (&gsi, stmt, GSI_NEW_STMT);

  /* The assignment above now defines the exit value, so the exit phi
     is redundant; find it among the phis of its block and remove it.  */
  for (gsi = gsi_start_phis (gimple_bb (red->keep_res));
       !gsi_end_p (gsi); gsi_next (&gsi))
    if (gsi_stmt (gsi) == red->keep_res)
      {
	remove_phi_node (&gsi, false);
	return 1;
      }
  /* KEEP_RES must be present in its own block's phi list.  */
  gcc_unreachable ();
}
1205 
1206 /* Load the reduction result that was stored in LD_ST_DATA.
1207    REDUCTION_LIST describes the list of reductions that the
1208    loads should be generated for.  */
1209 static void
create_final_loads_for_reduction(reduction_info_table_type * reduction_list,struct clsn_data * ld_st_data)1210 create_final_loads_for_reduction (reduction_info_table_type *reduction_list,
1211 				  struct clsn_data *ld_st_data)
1212 {
1213   gimple_stmt_iterator gsi;
1214   tree t;
1215   gimple *stmt;
1216 
1217   gsi = gsi_after_labels (ld_st_data->load_bb);
1218   t = build_fold_addr_expr (ld_st_data->store);
1219   stmt = gimple_build_assign (ld_st_data->load, t);
1220 
1221   gsi_insert_before (&gsi, stmt, GSI_NEW_STMT);
1222 
1223   reduction_list
1224     ->traverse <struct clsn_data *, create_loads_for_reductions> (ld_st_data);
1225 
1226 }
1227 
1228 /* Callback for htab_traverse.  Store the neutral value for the
1229   particular reduction's operation, e.g. 0 for PLUS_EXPR,
1230   1 for MULT_EXPR, etc. into the reduction field.
1231   The reduction is specified in SLOT. The store information is
1232   passed in DATA.  */
1233 
1234 int
create_stores_for_reduction(reduction_info ** slot,struct clsn_data * clsn_data)1235 create_stores_for_reduction (reduction_info **slot, struct clsn_data *clsn_data)
1236 {
1237   struct reduction_info *const red = *slot;
1238   tree t;
1239   gimple *stmt;
1240   gimple_stmt_iterator gsi;
1241   tree type = TREE_TYPE (reduc_stmt_res (red->reduc_stmt));
1242 
1243   gsi = gsi_last_bb (clsn_data->store_bb);
1244   t = build3 (COMPONENT_REF, type, clsn_data->store, red->field, NULL_TREE);
1245   stmt = gimple_build_assign (t, red->initial_value);
1246   gsi_insert_after (&gsi, stmt, GSI_NEW_STMT);
1247 
1248   return 1;
1249 }
1250 
1251 /* Callback for htab_traverse.  Creates loads to a field of LOAD in LOAD_BB and
1252    store to a field of STORE in STORE_BB for the ssa name and its duplicate
1253    specified in SLOT.  */
1254 
1255 int
create_loads_and_stores_for_name(name_to_copy_elt ** slot,struct clsn_data * clsn_data)1256 create_loads_and_stores_for_name (name_to_copy_elt **slot,
1257 				  struct clsn_data *clsn_data)
1258 {
1259   struct name_to_copy_elt *const elt = *slot;
1260   tree t;
1261   gimple *stmt;
1262   gimple_stmt_iterator gsi;
1263   tree type = TREE_TYPE (elt->new_name);
1264   tree load_struct;
1265 
1266   gsi = gsi_last_bb (clsn_data->store_bb);
1267   t = build3 (COMPONENT_REF, type, clsn_data->store, elt->field, NULL_TREE);
1268   stmt = gimple_build_assign (t, ssa_name (elt->version));
1269   gsi_insert_after (&gsi, stmt, GSI_NEW_STMT);
1270 
1271   gsi = gsi_last_bb (clsn_data->load_bb);
1272   load_struct = build_simple_mem_ref (clsn_data->load);
1273   t = build3 (COMPONENT_REF, type, load_struct, elt->field, NULL_TREE);
1274   stmt = gimple_build_assign (elt->new_name, t);
1275   gsi_insert_after (&gsi, stmt, GSI_NEW_STMT);
1276 
1277   return 1;
1278 }
1279 
1280 /* Moves all the variables used in LOOP and defined outside of it (including
1281    the initial values of loop phi nodes, and *PER_THREAD if it is a ssa
1282    name) to a structure created for this purpose.  The code
1283 
1284    while (1)
1285      {
1286        use (a);
1287        use (b);
1288      }
1289 
1290    is transformed this way:
1291 
1292    bb0:
1293    old.a = a;
1294    old.b = b;
1295 
1296    bb1:
1297    a' = new->a;
1298    b' = new->b;
1299    while (1)
1300      {
1301        use (a');
1302        use (b');
1303      }
1304 
1305    `old' is stored to *ARG_STRUCT and `new' is stored to NEW_ARG_STRUCT.  The
1306    pointer `new' is intentionally not initialized (the loop will be split to a
1307    separate function later, and `new' will be initialized from its arguments).
1308    LD_ST_DATA holds information about the shared data structure used to pass
1309    information among the threads.  It is initialized here, and
1310    gen_parallel_loop will pass it to create_call_for_reduction that
1311    needs this information.  REDUCTION_LIST describes the reductions
1312    in LOOP.  */
1313 
static void
separate_decls_in_region (edge entry, edge exit,
			  reduction_info_table_type *reduction_list,
			  tree *arg_struct, tree *new_arg_struct,
			  struct clsn_data *ld_st_data)

{
  /* BB0 gets the stores into the structure, BB1 (the new block split
     off the entry edge) gets the loads from it.  */
  basic_block bb1 = split_edge (entry);
  basic_block bb0 = single_pred (bb1);
  name_to_copy_table_type name_copies (10);
  int_tree_htab_type decl_copies (10);
  unsigned i;
  tree type, type_name, nvar;
  gimple_stmt_iterator gsi;
  struct clsn_data clsn_data;
  auto_vec<basic_block, 3> body;
  basic_block bb;
  basic_block entry_bb = bb1;
  basic_block exit_bb = exit->dest;
  bool has_debug_stmt = false;

  entry = single_succ_edge (entry_bb);
  gather_blocks_in_sese_region (entry_bb, exit_bb, &body);

  /* First pass: duplicate names/decls in all non-debug stmts of the
     region, recording the copies in NAME_COPIES and DECL_COPIES.  */
  FOR_EACH_VEC_ELT (body, i, bb)
    {
      if (bb != entry_bb && bb != exit_bb)
	{
	  for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
	    separate_decls_in_region_stmt (entry, exit, gsi_stmt (gsi),
					   &name_copies, &decl_copies);

	  for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
	    {
	      gimple *stmt = gsi_stmt (gsi);

	      if (is_gimple_debug (stmt))
		has_debug_stmt = true;
	      else
		separate_decls_in_region_stmt (entry, exit, stmt,
					       &name_copies, &decl_copies);
	    }
	}
    }

  /* Now process debug bind stmts.  We must not create decls while
     processing debug stmts, so we defer their processing so as to
     make sure we will have debug info for as many variables as
     possible (all of those that were dealt with in the loop above),
     and discard those for which we know there's nothing we can
     do.  */
  if (has_debug_stmt)
    FOR_EACH_VEC_ELT (body, i, bb)
      if (bb != entry_bb && bb != exit_bb)
	{
	  for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi);)
	    {
	      gimple *stmt = gsi_stmt (gsi);

	      if (is_gimple_debug (stmt))
		{
		  /* A true return means the debug stmt cannot be
		     salvaged; delete it.  */
		  if (separate_decls_in_region_debug (stmt, &name_copies,
						      &decl_copies))
		    {
		      gsi_remove (&gsi, true);
		      continue;
		    }
		}

	      gsi_next (&gsi);
	    }
	}

  if (name_copies.elements () == 0 && reduction_list->elements () == 0)
    {
      /* It may happen that there is nothing to copy (if there are only
         loop carried and external variables in the loop).  */
      *arg_struct = NULL;
      *new_arg_struct = NULL;
    }
  else
    {
      /* Create the type for the structure to store the ssa names to.  */
      type = lang_hooks.types.make_type (RECORD_TYPE);
      type_name = build_decl (UNKNOWN_LOCATION,
			      TYPE_DECL, create_tmp_var_name (".paral_data"),
			      type);
      TYPE_NAME (type) = type_name;

      /* One field per copied name, plus one per reduction.  */
      name_copies.traverse <tree, add_field_for_name> (type);
      if (reduction_list && reduction_list->elements () > 0)
	{
	  /* Create the fields for reductions.  */
	  reduction_list->traverse <tree, add_field_for_reduction> (type);
	}
      layout_type (type);

      /* Create the loads and stores.  */
      *arg_struct = create_tmp_var (type, ".paral_data_store");
      nvar = create_tmp_var (build_pointer_type (type), ".paral_data_load");
      *new_arg_struct = make_ssa_name (nvar);

      ld_st_data->store = *arg_struct;
      ld_st_data->load = *new_arg_struct;
      ld_st_data->store_bb = bb0;
      ld_st_data->load_bb = bb1;

      name_copies
	.traverse <struct clsn_data *, create_loads_and_stores_for_name>
		  (ld_st_data);

      /* Load the calculation from memory (after the join of the threads).  */

      if (reduction_list && reduction_list->elements () > 0)
	{
	  /* Store the initial reduction values before the region, and
	     load the final values at the exit (CLSN_DATA uses a fresh
	     copy of the pointer, valid after the join).  */
	  reduction_list
	    ->traverse <struct clsn_data *, create_stores_for_reduction>
	    (ld_st_data);
	  clsn_data.load = make_ssa_name (nvar);
	  clsn_data.load_bb = exit->dest;
	  clsn_data.store = ld_st_data->store;
	  create_final_loads_for_reduction (reduction_list, &clsn_data);
	}
    }
}
1439 
1440 /* Returns true if FN was created to run in parallel.  */
1441 
1442 bool
parallelized_function_p(tree fndecl)1443 parallelized_function_p (tree fndecl)
1444 {
1445   cgraph_node *node = cgraph_node::get (fndecl);
1446   gcc_assert (node != NULL);
1447   return node->parallelized_function;
1448 }
1449 
1450 /* Creates and returns an empty function that will receive the body of
1451    a parallelized loop.  */
1452 
static tree
create_loop_fn (location_t loc)
{
  char buf[100];
  char *tname;
  tree decl, type, name, t;
  struct function *act_cfun = cfun;
  static unsigned loopfn_num;

  loc = LOCATION_LOCUS (loc);
  /* Derive a unique assembler-private name from the enclosing
     function's name and a per-translation-unit counter.  */
  snprintf (buf, 100, "%s.$loopfn", current_function_name ());
  ASM_FORMAT_PRIVATE_NAME (tname, buf, loopfn_num++);
  clean_symbol_name (tname);
  name = get_identifier (tname);
  /* The outlined function takes a single pointer (to the shared data
     structure) and returns nothing.  */
  type = build_function_type_list (void_type_node, ptr_type_node, NULL_TREE);

  decl = build_decl (loc, FUNCTION_DECL, name, type);
  TREE_STATIC (decl) = 1;
  TREE_USED (decl) = 1;
  DECL_ARTIFICIAL (decl) = 1;
  DECL_IGNORED_P (decl) = 0;
  TREE_PUBLIC (decl) = 0;
  DECL_UNINLINABLE (decl) = 1;	/* Must stay a separate thread body.  */
  DECL_EXTERNAL (decl) = 0;
  DECL_CONTEXT (decl) = NULL_TREE;
  DECL_INITIAL (decl) = make_node (BLOCK);

  /* Void result decl.  */
  t = build_decl (loc, RESULT_DECL, NULL_TREE, void_type_node);
  DECL_ARTIFICIAL (t) = 1;
  DECL_IGNORED_P (t) = 1;
  DECL_RESULT (decl) = t;

  /* The single parameter through which the shared data is passed.  */
  t = build_decl (loc, PARM_DECL, get_identifier (".paral_data_param"),
		  ptr_type_node);
  DECL_ARTIFICIAL (t) = 1;
  DECL_ARG_TYPE (t) = ptr_type_node;
  DECL_CONTEXT (t) = decl;
  TREE_USED (t) = 1;
  DECL_ARGUMENTS (decl) = t;

  allocate_struct_function (decl, false);

  /* The call to allocate_struct_function clobbers CFUN, so we need to restore
     it.  */
  set_cfun (act_cfun);

  return decl;
}
1501 
1502 /* Replace uses of NAME by VAL in block BB.  */
1503 
1504 static void
replace_uses_in_bb_by(tree name,tree val,basic_block bb)1505 replace_uses_in_bb_by (tree name, tree val, basic_block bb)
1506 {
1507   gimple *use_stmt;
1508   imm_use_iterator imm_iter;
1509 
1510   FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, name)
1511     {
1512       if (gimple_bb (use_stmt) != bb)
1513 	continue;
1514 
1515       use_operand_p use_p;
1516       FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
1517 	SET_USE (use_p, val);
1518     }
1519 }
1520 
1521 /* Do transformation from:
1522 
1523      <bb preheader>:
1524      ...
1525      goto <bb header>
1526 
1527      <bb header>:
1528      ivtmp_a = PHI <ivtmp_init (preheader), ivtmp_b (latch)>
1529      sum_a = PHI <sum_init (preheader), sum_b (latch)>
1530      ...
1531      use (ivtmp_a)
1532      ...
1533      sum_b = sum_a + sum_update
1534      ...
1535      if (ivtmp_a < n)
1536        goto <bb latch>;
1537      else
1538        goto <bb exit>;
1539 
1540      <bb latch>:
1541      ivtmp_b = ivtmp_a + 1;
1542      goto <bb header>
1543 
1544      <bb exit>:
1545      sum_z = PHI <sum_b (cond[1]), ...>
1546 
1547      [1] Where <bb cond> is single_pred (bb latch); In the simplest case,
1548 	 that's <bb header>.
1549 
1550    to:
1551 
1552      <bb preheader>:
1553      ...
1554      goto <bb newheader>
1555 
1556      <bb header>:
1557      ivtmp_a = PHI <ivtmp_c (latch)>
1558      sum_a = PHI <sum_c (latch)>
1559      ...
1560      use (ivtmp_a)
1561      ...
1562      sum_b = sum_a + sum_update
1563      ...
1564      goto <bb latch>;
1565 
1566      <bb newheader>:
1567      ivtmp_c = PHI <ivtmp_init (preheader), ivtmp_b (latch)>
1568      sum_c = PHI <sum_init (preheader), sum_b (latch)>
1569      if (ivtmp_c < n + 1)
1570        goto <bb header>;
1571      else
1572        goto <bb newexit>;
1573 
1574      <bb latch>:
1575      ivtmp_b = ivtmp_a + 1;
1576      goto <bb newheader>
1577 
1578      <bb newexit>:
1579      sum_y = PHI <sum_c (newheader)>
1580 
1581      <bb exit>:
1582      sum_z = PHI <sum_y (newexit), ...>
1583 
1584 
1585    In unified diff format:
1586 
1587       <bb preheader>:
1588       ...
1589 -     goto <bb header>
1590 +     goto <bb newheader>
1591 
1592       <bb header>:
1593 -     ivtmp_a = PHI <ivtmp_init (preheader), ivtmp_b (latch)>
1594 -     sum_a = PHI <sum_init (preheader), sum_b (latch)>
1595 +     ivtmp_a = PHI <ivtmp_c (latch)>
1596 +     sum_a = PHI <sum_c (latch)>
1597       ...
1598       use (ivtmp_a)
1599       ...
1600       sum_b = sum_a + sum_update
1601       ...
1602 -     if (ivtmp_a < n)
1603 -       goto <bb latch>;
1604 +     goto <bb latch>;
1605 +
1606 +     <bb newheader>:
1607 +     ivtmp_c = PHI <ivtmp_init (preheader), ivtmp_b (latch)>
1608 +     sum_c = PHI <sum_init (preheader), sum_b (latch)>
1609 +     if (ivtmp_c < n + 1)
1610 +       goto <bb header>;
1611       else
1612 	goto <bb exit>;
1613 
1614       <bb latch>:
1615       ivtmp_b = ivtmp_a + 1;
1616 -     goto <bb header>
1617 +     goto <bb newheader>
1618 
1619 +    <bb newexit>:
1620 +    sum_y = PHI <sum_c (newheader)>
1621 
1622       <bb exit>:
1623 -     sum_z = PHI <sum_b (cond[1]), ...>
1624 +     sum_z = PHI <sum_y (newexit), ...>
1625 
1626    Note: the example does not show any virtual phis, but these are handled more
1627    or less as reductions.
1628 
1629 
1630    Moves the exit condition of LOOP to the beginning of its header.
1631    REDUCTION_LIST describes the reductions in LOOP.  BOUND is the new loop
1632    bound.  */
1633 
static void
transform_to_exit_first_loop_alt (struct loop *loop,
				  reduction_info_table_type *reduction_list,
				  tree bound)
{
  basic_block header = loop->header;
  basic_block latch = loop->latch;
  edge exit = single_dom_exit (loop);
  basic_block exit_block = exit->dest;
  gcond *cond_stmt = as_a <gcond *> (last_stmt (exit->src));
  tree control = gimple_cond_lhs (cond_stmt);
  edge e;

  /* Rewriting virtuals into loop-closed ssa normal form makes this
     transformation simpler.  It also ensures that the virtuals are in
     loop-closed ssa normal from after the transformation, which is required by
     create_parallel_loop.  */
  rewrite_virtuals_into_loop_closed_ssa (loop);

  /* Create the new_header block.  */
  basic_block new_header = split_block_before_cond_jump (exit->src);
  edge edge_at_split = single_pred_edge (new_header);

  /* Redirect entry edge to new_header.  */
  edge entry = loop_preheader_edge (loop);
  e = redirect_edge_and_branch (entry, new_header);
  gcc_assert (e == entry);

  /* Redirect post_inc_edge to new_header.  */
  edge post_inc_edge = single_succ_edge (latch);
  e = redirect_edge_and_branch (post_inc_edge, new_header);
  gcc_assert (e == post_inc_edge);

  /* Redirect post_cond_edge to header.  */
  edge post_cond_edge = single_pred_edge (latch);
  e = redirect_edge_and_branch (post_cond_edge, header);
  gcc_assert (e == post_cond_edge);

  /* Redirect edge_at_split to latch.  */
  e = redirect_edge_and_branch (edge_at_split, latch);
  gcc_assert (e == edge_at_split);

  /* Set the new loop bound.  */
  gimple_cond_set_rhs (cond_stmt, bound);
  update_stmt (cond_stmt);

  /* Repair the ssa.  Walk the old header phis in lockstep with the
     phi args recorded on POST_INC_EDGE by the redirections above.  */
  vec<edge_var_map> *v = redirect_edge_var_map_vector (post_inc_edge);
  edge_var_map *vm;
  gphi_iterator gsi;
  int i;
  for (gsi = gsi_start_phis (header), i = 0;
       !gsi_end_p (gsi) && v->iterate (i, &vm);
       gsi_next (&gsi), i++)
    {
      gphi *phi = gsi.phi ();
      tree res_a = PHI_RESULT (phi);

      /* Create new phi.  */
      tree res_c = copy_ssa_name (res_a, phi);
      gphi *nphi = create_phi_node (res_c, new_header);

      /* Replace ivtmp_a with ivtmp_c in condition 'if (ivtmp_a < n)'.  */
      replace_uses_in_bb_by (res_a, res_c, new_header);

      /* Replace ivtmp/sum_b with ivtmp/sum_c in header phi.  */
      add_phi_arg (phi, res_c, post_cond_edge, UNKNOWN_LOCATION);

      /* Replace sum_b with sum_c in exit phi.  */
      tree res_b = redirect_edge_var_map_def (vm);
      replace_uses_in_bb_by (res_b, res_c, exit_block);

      /* Every header phi must be a virtual, the control iv, or a
	 registered reduction; anything else would lose a value.  */
      struct reduction_info *red = reduction_phi (reduction_list, phi);
      gcc_assert (virtual_operand_p (res_a)
		  || res_a == control
		  || red != NULL);

      if (red)
	{
	  /* Register the new reduction phi.  */
	  red->reduc_phi = nphi;
	  gimple_set_uid (red->reduc_phi, red->reduc_version);
	}
    }
  gcc_assert (gsi_end_p (gsi) && !v->iterate (i, &vm));

  /* Set the preheader argument of the new phis to ivtmp/sum_init.  */
  flush_pending_stmts (entry);

  /* Set the latch arguments of the new phis to ivtmp/sum_b.  */
  flush_pending_stmts (post_inc_edge);


  basic_block new_exit_block = NULL;
  if (!single_pred_p (exit->dest))
    {
      /* Create a new empty exit block, inbetween the new loop header and the
	 old exit block.  The function separate_decls_in_region needs this block
	 to insert code that is active on loop exit, but not any other path.  */
      new_exit_block = split_edge (exit);
    }

  /* Insert and register the reduction exit phis.  */
  for (gphi_iterator gsi = gsi_start_phis (exit_block);
       !gsi_end_p (gsi);
       gsi_next (&gsi))
    {
      gphi *phi = gsi.phi ();
      gphi *nphi = NULL;
      tree res_z = PHI_RESULT (phi);
      tree res_c;

      if (new_exit_block != NULL)
	{
	  /* Now that we have a new exit block, duplicate the phi of the old
	     exit block in the new exit block to preserve loop-closed ssa.  */
	  edge succ_new_exit_block = single_succ_edge (new_exit_block);
	  edge pred_new_exit_block = single_pred_edge (new_exit_block);
	  tree res_y = copy_ssa_name (res_z, phi);
	  nphi = create_phi_node (res_y, new_exit_block);
	  res_c = PHI_ARG_DEF_FROM_EDGE (phi, succ_new_exit_block);
	  add_phi_arg (nphi, res_c, pred_new_exit_block, UNKNOWN_LOCATION);
	  add_phi_arg (phi, res_y, succ_new_exit_block, UNKNOWN_LOCATION);
	}
      else
	res_c = PHI_ARG_DEF_FROM_EDGE (phi, exit);

      if (virtual_operand_p (res_z))
	continue;

      gimple *reduc_phi = SSA_NAME_DEF_STMT (res_c);
      struct reduction_info *red = reduction_phi (reduction_list, reduc_phi);
      if (red != NULL)
	red->keep_res = (nphi != NULL
			 ? nphi
			 : phi);
    }

  /* We're going to cancel the loop at the end of gen_parallel_loop, but until
     then we're still using some fields, so only bother about fields that are
     still used: header and latch.
     The loop has a new header bb, so we update it.  The latch bb stays the
     same.  */
  loop->header = new_header;

  /* Recalculate dominance info.  */
  free_dominance_info (CDI_DOMINATORS);
  calculate_dominance_info (CDI_DOMINATORS);

  checking_verify_ssa (true, true);
}
1785 
/* Tries to move the exit condition of LOOP to the beginning of its header
   without duplication of the loop body.  NIT is the number of iterations of the
   loop.  REDUCTION_LIST describes the reductions in LOOP.  Return true if
   transformation is successful.  */

static bool
try_transform_to_exit_first_loop_alt (struct loop *loop,
				      reduction_info_table_type *reduction_list,
				      tree nit)
{
  /* Check whether the latch contains a single statement.  */
  if (!gimple_seq_nondebug_singleton_p (bb_seq (loop->latch)))
    return false;

  /* Check whether the latch contains no phis.  */
  if (phi_nodes (loop->latch) != NULL)
    return false;

  /* Check whether the latch contains the loop iv increment.  The single
     statement allowed in the latch must be the increment of the control iv,
     i.e. the ssa name feeding the latch argument of the control phi must be
     defined in the latch.  */
  edge back = single_succ_edge (loop->latch);
  edge exit = single_dom_exit (loop);
  gcond *cond_stmt = as_a <gcond *> (last_stmt (exit->src));
  tree control = gimple_cond_lhs (cond_stmt);
  gphi *phi = as_a <gphi *> (SSA_NAME_DEF_STMT (control));
  tree inc_res = gimple_phi_arg_def (phi, back->dest_idx);
  if (gimple_bb (SSA_NAME_DEF_STMT (inc_res)) != loop->latch)
    return false;

  /* Check whether there's no code between the loop condition and the latch.  */
  if (!single_pred_p (loop->latch)
      || single_pred (loop->latch) != exit->src)
    return false;

  tree alt_bound = NULL_TREE;
  tree nit_type = TREE_TYPE (nit);

  /* Figure out whether nit + 1 overflows.  */
  if (TREE_CODE (nit) == INTEGER_CST)
    {
      if (!tree_int_cst_equal (nit, TYPE_MAXVAL (nit_type)))
	{
	  /* nit is a constant below the type maximum, so nit + 1 folds to
	     another constant without overflow.  */
	  alt_bound = fold_build2_loc (UNKNOWN_LOCATION, PLUS_EXPR, nit_type,
				       nit, build_one_cst (nit_type));

	  gcc_assert (TREE_CODE (alt_bound) == INTEGER_CST);
	  transform_to_exit_first_loop_alt (loop, reduction_list, alt_bound);
	  return true;
	}
      else
	{
	  /* Todo: Figure out if we can trigger this, if it's worth to handle
	     optimally, and if we can handle it optimally.  */
	  return false;
	}
    }

  gcc_assert (TREE_CODE (nit) == SSA_NAME);

  /* Variable nit is the loop bound as returned by canonicalize_loop_ivs, for an
     iv with base 0 and step 1 that is incremented in the latch, like this:

     <bb header>:
     # iv_1 = PHI <0 (preheader), iv_2 (latch)>
     ...
     if (iv_1 < nit)
       goto <bb latch>;
     else
       goto <bb exit>;

     <bb latch>:
     iv_2 = iv_1 + 1;
     goto <bb header>;

     The range of iv_1 is [0, nit].  The latch edge is taken for
     iv_1 == [0, nit - 1] and the exit edge is taken for iv_1 == nit.  So the
     number of latch executions is equal to nit.

     The function max_loop_iterations gives us the maximum number of latch
     executions, so it gives us the maximum value of nit.  */
  widest_int nit_max;
  if (!max_loop_iterations (loop, &nit_max))
    return false;

  /* Check if nit + 1 overflows.  */
  widest_int type_max = wi::to_widest (TYPE_MAXVAL (nit_type));
  if (!wi::lts_p (nit_max, type_max))
    return false;

  gimple *def = SSA_NAME_DEF_STMT (nit);

  /* Try to find nit + 1, in the form of n in an assignment nit = n - 1.  */
  if (def
      && is_gimple_assign (def)
      && gimple_assign_rhs_code (def) == PLUS_EXPR)
    {
      tree op1 = gimple_assign_rhs1 (def);
      tree op2 = gimple_assign_rhs2 (def);
      if (integer_minus_onep (op1))
	alt_bound = op2;
      else if (integer_minus_onep (op2))
	alt_bound = op1;
    }

  /* If not found, insert nit + 1.  */
  if (alt_bound == NULL_TREE)
    {
      alt_bound = fold_build2 (PLUS_EXPR, nit_type, nit,
			       build_int_cst_type (nit_type, 1));

      gimple_stmt_iterator gsi = gsi_last_bb (loop_preheader_edge (loop)->src);

      alt_bound
	= force_gimple_operand_gsi (&gsi, alt_bound, true, NULL_TREE, false,
				    GSI_CONTINUE_LINKING);
    }

  transform_to_exit_first_loop_alt (loop, reduction_list, alt_bound);
  return true;
}
1905 
/* Moves the exit condition of LOOP to the beginning of its header.  NIT is the
   number of iterations of the loop.  REDUCTION_LIST describes the reductions in
   LOOP.  */

static void
transform_to_exit_first_loop (struct loop *loop,
			      reduction_info_table_type *reduction_list,
			      tree nit)
{
  basic_block *bbs, *nbbs, ex_bb, orig_header;
  unsigned n;
  bool ok;
  edge exit = single_dom_exit (loop), hpred;
  tree control, control_name, res, t;
  gphi *phi, *nphi;
  gassign *stmt;
  gcond *cond_stmt, *cond_nit;
  tree nit_1;

  /* Split off the phis so the (new, empty) header only holds labels; the
     original header becomes its single successor.  */
  split_block_after_labels (loop->header);
  orig_header = single_succ (loop->header);
  hpred = single_succ_edge (loop->header);

  cond_stmt = as_a <gcond *> (last_stmt (exit->src));
  control = gimple_cond_lhs (cond_stmt);
  gcc_assert (gimple_cond_rhs (cond_stmt) == nit);

  /* Make sure that we have phi nodes on exit for all loop header phis
     (create_parallel_loop requires that).  */
  for (gphi_iterator gsi = gsi_start_phis (loop->header);
       !gsi_end_p (gsi);
       gsi_next (&gsi))
    {
      phi = gsi.phi ();
      res = PHI_RESULT (phi);
      /* Give the header phi a fresh result T and let a new phi in
	 ORIG_HEADER produce the original name RES from T.  */
      t = copy_ssa_name (res, phi);
      SET_PHI_RESULT (phi, t);
      nphi = create_phi_node (res, orig_header);
      add_phi_arg (nphi, t, hpred, UNKNOWN_LOCATION);

      if (res == control)
	{
	  /* The exit condition tested the old name; retarget it to the
	     renamed header phi result.  */
	  gimple_cond_set_lhs (cond_stmt, t);
	  update_stmt (cond_stmt);
	  control = t;
	}
    }

  bbs = get_loop_body_in_dom_order (loop);

  for (n = 0; bbs[n] != exit->src; n++)
   continue;
  nbbs = XNEWVEC (basic_block, n);
  /* Duplicate the part of the last iteration that gets disabled to the
     exit of the loop (everything dominated by the header up to the exit
     test, excluding the header itself).  */
  ok = gimple_duplicate_sese_tail (single_succ_edge (loop->header), exit,
				   bbs + 1, n, nbbs);
  gcc_assert (ok);
  free (bbs);
  ex_bb = nbbs[0];
  free (nbbs);

  /* Other than reductions, the only gimple reg that should be copied
     out of the loop is the control variable.  */
  exit = single_dom_exit (loop);
  control_name = NULL_TREE;
  for (gphi_iterator gsi = gsi_start_phis (ex_bb);
       !gsi_end_p (gsi); )
    {
      phi = gsi.phi ();
      res = PHI_RESULT (phi);
      if (virtual_operand_p (res))
	{
	  gsi_next (&gsi);
	  continue;
	}

      /* Check if it is a part of reduction.  If it is,
         keep the phi at the reduction's keep_res field.  The
         PHI_RESULT of this phi is the resulting value of the reduction
         variable when exiting the loop.  */

      if (reduction_list->elements () > 0)
	{
	  struct reduction_info *red;

	  tree val = PHI_ARG_DEF_FROM_EDGE (phi, exit);
	  red = reduction_phi (reduction_list, SSA_NAME_DEF_STMT (val));
	  if (red)
	    {
	      red->keep_res = phi;
	      gsi_next (&gsi);
	      continue;
	    }
	}
      /* Anything else copied out must be the control variable, exactly
	 once.  */
      gcc_assert (control_name == NULL_TREE
		  && SSA_NAME_VAR (res) == SSA_NAME_VAR (control));
      control_name = res;
      remove_phi_node (&gsi, false);
    }
  gcc_assert (control_name != NULL_TREE);

  /* Initialize the control variable to number of iterations
     according to the rhs of the exit condition.  */
  gimple_stmt_iterator gsi = gsi_after_labels (ex_bb);
  cond_nit = as_a <gcond *> (last_stmt (exit->src));
  nit_1 = gimple_cond_rhs (cond_nit);
  nit_1 = force_gimple_operand_gsi (&gsi,
				  fold_convert (TREE_TYPE (control_name), nit_1),
				  false, NULL_TREE, false, GSI_SAME_STMT);
  stmt = gimple_build_assign (control_name, nit_1);
  gsi_insert_before (&gsi, stmt, GSI_NEW_STMT);
}
2017 
/* Create the parallel constructs for LOOP as described in gen_parallel_loop.
   LOOP_FN and DATA are the arguments of GIMPLE_OMP_PARALLEL.
   NEW_DATA is the variable that should be initialized from the argument
   of LOOP_FN.  N_THREADS is the requested number of threads, which can be 0 if
   that number is to be determined later.  */

static void
create_parallel_loop (struct loop *loop, tree loop_fn, tree data,
		      tree new_data, unsigned n_threads, location_t loc,
		      bool oacc_kernels_p)
{
  gimple_stmt_iterator gsi;
  basic_block for_bb, ex_bb, continue_bb;
  tree t, param;
  gomp_parallel *omp_par_stmt;
  gimple *omp_return_stmt1, *omp_return_stmt2;
  gimple *phi;
  gcond *cond_stmt;
  gomp_for *for_stmt;
  gomp_continue *omp_cont_stmt;
  tree cvar, cvar_init, initvar, cvar_next, cvar_base, type;
  edge exit, nexit, guard, end, e;

  /* Prepare the GIMPLE_OMP_PARALLEL statement.  For OpenACC kernels the
     parallelism is expressed as a num_gangs attribute on the function
     instead of an explicit parallel region.  */
  if (oacc_kernels_p)
    {
      tree clause = build_omp_clause (loc, OMP_CLAUSE_NUM_GANGS);
      OMP_CLAUSE_NUM_GANGS_EXPR (clause)
	= build_int_cst (integer_type_node, n_threads);
      set_oacc_fn_attrib (cfun->decl, clause, true, NULL);
    }
  else
    {
      basic_block bb = loop_preheader_edge (loop)->src;
      basic_block paral_bb = single_pred (bb);
      gsi = gsi_last_bb (paral_bb);

      gcc_checking_assert (n_threads != 0);
      t = build_omp_clause (loc, OMP_CLAUSE_NUM_THREADS);
      OMP_CLAUSE_NUM_THREADS_EXPR (t)
	= build_int_cst (integer_type_node, n_threads);
      omp_par_stmt = gimple_build_omp_parallel (NULL, t, loop_fn, data);
      gimple_set_location (omp_par_stmt, loc);

      gsi_insert_after (&gsi, omp_par_stmt, GSI_NEW_STMT);

      /* Initialize NEW_DATA.  */
      if (data)
	{
	  gassign *assign_stmt;

	  gsi = gsi_after_labels (bb);

	  /* NEW_DATA = (type of NEW_DATA) &DATA; i.e. load the shared-data
	     struct pointer from the outlined function's argument.  */
	  param = make_ssa_name (DECL_ARGUMENTS (loop_fn));
	  assign_stmt = gimple_build_assign (param, build_fold_addr_expr (data));
	  gsi_insert_before (&gsi, assign_stmt, GSI_SAME_STMT);

	  assign_stmt = gimple_build_assign (new_data,
					     fold_convert (TREE_TYPE (new_data), param));
	  gsi_insert_before (&gsi, assign_stmt, GSI_SAME_STMT);
	}

      /* Emit GIMPLE_OMP_RETURN for GIMPLE_OMP_PARALLEL.  */
      bb = split_loop_exit_edge (single_dom_exit (loop));
      gsi = gsi_last_bb (bb);
      omp_return_stmt1 = gimple_build_omp_return (false);
      gimple_set_location (omp_return_stmt1, loc);
      gsi_insert_after (&gsi, omp_return_stmt1, GSI_NEW_STMT);
    }

  /* Extract data for GIMPLE_OMP_FOR.  The loop is in exit-first form, so
     the exit condition is the last statement of the header.  */
  gcc_assert (loop->header == single_dom_exit (loop)->src);
  cond_stmt = as_a <gcond *> (last_stmt (loop->header));

  cvar = gimple_cond_lhs (cond_stmt);
  cvar_base = SSA_NAME_VAR (cvar);
  phi = SSA_NAME_DEF_STMT (cvar);
  cvar_init = PHI_ARG_DEF_FROM_EDGE (phi, loop_preheader_edge (loop));
  initvar = copy_ssa_name (cvar);
  SET_USE (PHI_ARG_DEF_PTR_FROM_EDGE (phi, loop_preheader_edge (loop)),
	   initvar);
  cvar_next = PHI_ARG_DEF_FROM_EDGE (phi, loop_latch_edge (loop));

  /* Remove the iv increment; GIMPLE_OMP_FOR's incr expression and the
     GIMPLE_OMP_CONTINUE take over that role.  */
  gsi = gsi_last_nondebug_bb (loop->latch);
  gcc_assert (gsi_stmt (gsi) == SSA_NAME_DEF_STMT (cvar_next));
  gsi_remove (&gsi, true);

  /* Prepare cfg.  */
  for_bb = split_edge (loop_preheader_edge (loop));
  ex_bb = split_loop_exit_edge (single_dom_exit (loop));
  extract_true_false_edges_from_block (loop->header, &nexit, &exit);
  gcc_assert (exit == single_dom_exit (loop));

  guard = make_edge (for_bb, ex_bb, 0);
  /* Split the latch edge, so LOOPS_HAVE_SIMPLE_LATCHES is still valid.  */
  loop->latch = split_edge (single_succ_edge (loop->latch));
  single_pred_edge (loop->latch)->flags = 0;
  end = make_edge (single_pred (loop->latch), ex_bb, EDGE_FALLTHRU);
  rescan_loop_exit (end, true, false);

  /* Complete the phis in EX_BB with arguments for the new GUARD and END
     edges.  */
  for (gphi_iterator gpi = gsi_start_phis (ex_bb);
       !gsi_end_p (gpi); gsi_next (&gpi))
    {
      source_location locus;
      gphi *phi = gpi.phi ();
      tree def = PHI_ARG_DEF_FROM_EDGE (phi, exit);
      gimple *def_stmt = SSA_NAME_DEF_STMT (def);

      /* If the exit phi is not connected to a header phi in the same loop, this
	 value is not modified in the loop, and we're done with this phi.  */
      if (!(gimple_code (def_stmt) == GIMPLE_PHI
	    && gimple_bb (def_stmt) == loop->header))
	{
	  locus = gimple_phi_arg_location_from_edge (phi, exit);
	  add_phi_arg (phi, def, guard, locus);
	  add_phi_arg (phi, def, end, locus);
	  continue;
	}

      /* Over GUARD the loop did not run, so the initial (preheader) value
	 flows out; over END the latch value flows out.  */
      gphi *stmt = as_a <gphi *> (def_stmt);
      def = PHI_ARG_DEF_FROM_EDGE (stmt, loop_preheader_edge (loop));
      locus = gimple_phi_arg_location_from_edge (stmt,
						 loop_preheader_edge (loop));
      add_phi_arg (phi, def, guard, locus);

      def = PHI_ARG_DEF_FROM_EDGE (stmt, loop_latch_edge (loop));
      locus = gimple_phi_arg_location_from_edge (stmt, loop_latch_edge (loop));
      add_phi_arg (phi, def, end, locus);
    }
  e = redirect_edge_and_branch (exit, nexit->dest);
  PENDING_STMT (e) = NULL;

  /* Emit GIMPLE_OMP_FOR.  */
  if (oacc_kernels_p)
    /* In combination with the NUM_GANGS on the parallel.  */
    t = build_omp_clause (loc, OMP_CLAUSE_GANG);
  else
    {
      t = build_omp_clause (loc, OMP_CLAUSE_SCHEDULE);
      int chunk_size = PARAM_VALUE (PARAM_PARLOOPS_CHUNK_SIZE);
      enum PARAM_PARLOOPS_SCHEDULE_KIND schedule_type \
	= (enum PARAM_PARLOOPS_SCHEDULE_KIND) PARAM_VALUE (PARAM_PARLOOPS_SCHEDULE);
      switch (schedule_type)
	{
	case PARAM_PARLOOPS_SCHEDULE_KIND_static:
	  OMP_CLAUSE_SCHEDULE_KIND (t) = OMP_CLAUSE_SCHEDULE_STATIC;
	  break;
	case PARAM_PARLOOPS_SCHEDULE_KIND_dynamic:
	  OMP_CLAUSE_SCHEDULE_KIND (t) = OMP_CLAUSE_SCHEDULE_DYNAMIC;
	  break;
	case PARAM_PARLOOPS_SCHEDULE_KIND_guided:
	  OMP_CLAUSE_SCHEDULE_KIND (t) = OMP_CLAUSE_SCHEDULE_GUIDED;
	  break;
	case PARAM_PARLOOPS_SCHEDULE_KIND_auto:
	  OMP_CLAUSE_SCHEDULE_KIND (t) = OMP_CLAUSE_SCHEDULE_AUTO;
	  /* auto and runtime schedules do not take a chunk size.  */
	  chunk_size = 0;
	  break;
	case PARAM_PARLOOPS_SCHEDULE_KIND_runtime:
	  OMP_CLAUSE_SCHEDULE_KIND (t) = OMP_CLAUSE_SCHEDULE_RUNTIME;
	  chunk_size = 0;
	  break;
	default:
	  gcc_unreachable ();
	}
      if (chunk_size != 0)
	OMP_CLAUSE_SCHEDULE_CHUNK_EXPR (t)
	  = build_int_cst (integer_type_node, chunk_size);
    }

  for_stmt = gimple_build_omp_for (NULL,
				   (oacc_kernels_p
				    ? GF_OMP_FOR_KIND_OACC_LOOP
				    : GF_OMP_FOR_KIND_FOR),
				   t, 1, NULL);

  gimple_cond_set_lhs (cond_stmt, cvar_base);
  type = TREE_TYPE (cvar);
  gimple_set_location (for_stmt, loc);
  gimple_omp_for_set_index (for_stmt, 0, initvar);
  gimple_omp_for_set_initial (for_stmt, 0, cvar_init);
  gimple_omp_for_set_final (for_stmt, 0, gimple_cond_rhs (cond_stmt));
  gimple_omp_for_set_cond (for_stmt, 0, gimple_cond_code (cond_stmt));
  gimple_omp_for_set_incr (for_stmt, 0, build2 (PLUS_EXPR, type,
						cvar_base,
						build_int_cst (type, 1)));

  gsi = gsi_last_bb (for_bb);
  gsi_insert_after (&gsi, for_stmt, GSI_NEW_STMT);
  SSA_NAME_DEF_STMT (initvar) = for_stmt;

  /* Emit GIMPLE_OMP_CONTINUE.  */
  continue_bb = single_pred (loop->latch);
  gsi = gsi_last_bb (continue_bb);
  omp_cont_stmt = gimple_build_omp_continue (cvar_next, cvar);
  gimple_set_location (omp_cont_stmt, loc);
  gsi_insert_after (&gsi, omp_cont_stmt, GSI_NEW_STMT);
  SSA_NAME_DEF_STMT (cvar_next) = omp_cont_stmt;

  /* Emit GIMPLE_OMP_RETURN for GIMPLE_OMP_FOR.  */
  gsi = gsi_last_bb (ex_bb);
  omp_return_stmt2 = gimple_build_omp_return (true);
  gimple_set_location (omp_return_stmt2, loc);
  gsi_insert_after (&gsi, omp_return_stmt2, GSI_NEW_STMT);

  /* After the above dom info is hosed.  Re-compute it.  */
  free_dominance_info (CDI_DOMINATORS);
  calculate_dominance_info (CDI_DOMINATORS);
}
2226 
/* Generates code to execute the iterations of LOOP in N_THREADS
   threads in parallel, which can be 0 if that number is to be determined
   later.

   NITER describes number of iterations of LOOP.
   REDUCTION_LIST describes the reductions existent in the LOOP.  */

static void
gen_parallel_loop (struct loop *loop,
		   reduction_info_table_type *reduction_list,
		   unsigned n_threads, struct tree_niter_desc *niter,
		   bool oacc_kernels_p)
{
  tree many_iterations_cond, type, nit;
  tree arg_struct, new_arg_struct;
  gimple_seq stmts;
  edge entry, exit;
  struct clsn_data clsn_data;
  unsigned prob;
  location_t loc;
  gimple *cond_stmt;
  unsigned int m_p_thread=2;

  /* From

     ---------------------------------------------------------------------
     loop
       {
	 IV = phi (INIT, IV + STEP)
	 BODY1;
	 if (COND)
	   break;
	 BODY2;
       }
     ---------------------------------------------------------------------

     with # of iterations NITER (possibly with MAY_BE_ZERO assumption),
     we generate the following code:

     ---------------------------------------------------------------------

     if (MAY_BE_ZERO
     || NITER < MIN_PER_THREAD * N_THREADS)
     goto original;

     BODY1;
     store all local loop-invariant variables used in body of the loop to DATA.
     GIMPLE_OMP_PARALLEL (OMP_CLAUSE_NUM_THREADS (N_THREADS), LOOPFN, DATA);
     load the variables from DATA.
     GIMPLE_OMP_FOR (IV = INIT; COND; IV += STEP) (OMP_CLAUSE_SCHEDULE (static))
     BODY2;
     BODY1;
     GIMPLE_OMP_CONTINUE;
     GIMPLE_OMP_RETURN         -- GIMPLE_OMP_FOR
     GIMPLE_OMP_RETURN         -- GIMPLE_OMP_PARALLEL
     goto end;

     original:
     loop
       {
	 IV = phi (INIT, IV + STEP)
	 BODY1;
	 if (COND)
	   break;
	 BODY2;
       }

     end:

   */

  /* Create two versions of the loop -- in the old one, we know that the
     number of iterations is large enough, and we will transform it into the
     loop that will be split to loop_fn, the new one will be used for the
     remaining iterations.  */

  /* We should compute a better number-of-iterations value for outer loops.
     That is, if we have

    for (i = 0; i < n; ++i)
      for (j = 0; j < m; ++j)
        ...

    we should compute nit = n * m, not nit = n.
    Also may_be_zero handling would need to be adjusted.  */

  type = TREE_TYPE (niter->niter);
  nit = force_gimple_operand (unshare_expr (niter->niter), &stmts, true,
			      NULL_TREE);
  if (stmts)
    gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), stmts);

  if (!oacc_kernels_p)
    {
      /* For loops with an inner loop, a smaller minimum-iterations-per-thread
	 threshold is used, since the outer iteration count understates the
	 total work (see the comment above).  */
      if (loop->inner)
	m_p_thread=2;
      else
	m_p_thread=MIN_PER_THREAD;

      gcc_checking_assert (n_threads != 0);
      many_iterations_cond =
	fold_build2 (GE_EXPR, boolean_type_node,
		     nit, build_int_cst (type, m_p_thread * n_threads));

      many_iterations_cond
	= fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
		       invert_truthvalue (unshare_expr (niter->may_be_zero)),
		       many_iterations_cond);
      many_iterations_cond
	= force_gimple_operand (many_iterations_cond, &stmts, false, NULL_TREE);
      if (stmts)
	gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), stmts);
      if (!is_gimple_condexpr (many_iterations_cond))
	{
	  many_iterations_cond
	    = force_gimple_operand (many_iterations_cond, &stmts,
				    true, NULL_TREE);
	  if (stmts)
	    gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop),
					      stmts);
	}

      initialize_original_copy_tables ();

      /* We assume that the loop usually iterates a lot.  */
      prob = 4 * REG_BR_PROB_BASE / 5;
      loop_version (loop, many_iterations_cond, NULL,
		    prob, prob, REG_BR_PROB_BASE - prob, true);
      update_ssa (TODO_update_ssa);
      free_original_copy_tables ();
    }

  /* Base all the induction variables in LOOP on a single control one.  */
  canonicalize_loop_ivs (loop, &nit, true);

  /* Ensure that the exit condition is the first statement in the loop.
     The common case is that latch of the loop is empty (apart from the
     increment) and immediately follows the loop exit test.  Attempt to move the
     entry of the loop directly before the exit check and increase the number of
     iterations of the loop by one.  */
  if (try_transform_to_exit_first_loop_alt (loop, reduction_list, nit))
    {
      if (dump_file
	  && (dump_flags & TDF_DETAILS))
	fprintf (dump_file,
		 "alternative exit-first loop transform succeeded"
		 " for loop %d\n", loop->num);
    }
  else
    {
      if (oacc_kernels_p)
	n_threads = 1;

      /* Fall back on the method that handles more cases, but duplicates the
	 loop body: move the exit condition of LOOP to the beginning of its
	 header, and duplicate the part of the last iteration that gets disabled
	 to the exit of the loop.  */
      transform_to_exit_first_loop (loop, reduction_list, nit);
    }

  /* Generate initializations for reductions.  */
  if (reduction_list->elements () > 0)
    reduction_list->traverse <struct loop *, initialize_reductions> (loop);

  /* Eliminate the references to local variables from the loop.  */
  gcc_assert (single_exit (loop));
  entry = loop_preheader_edge (loop);
  exit = single_dom_exit (loop);

  /* This rewrites the body in terms of new variables.  This has already
     been done for oacc_kernels_p in pass_lower_omp/lower_omp ().  */
  if (!oacc_kernels_p)
    {
      eliminate_local_variables (entry, exit);
      /* In the old loop, move all variables non-local to the loop to a
	 structure and back, and create separate decls for the variables used in
	 loop.  */
      separate_decls_in_region (entry, exit, reduction_list, &arg_struct,
				&new_arg_struct, &clsn_data);
    }
  else
    {
      arg_struct = NULL_TREE;
      new_arg_struct = NULL_TREE;
      clsn_data.load = NULL_TREE;
      clsn_data.load_bb = exit->dest;
      clsn_data.store = NULL_TREE;
      clsn_data.store_bb = NULL;
    }

  /* Create the parallel constructs.  */
  loc = UNKNOWN_LOCATION;
  cond_stmt = last_stmt (loop->header);
  if (cond_stmt)
    loc = gimple_location (cond_stmt);
  create_parallel_loop (loop, create_loop_fn (loc), arg_struct, new_arg_struct,
			n_threads, loc, oacc_kernels_p);
  if (reduction_list->elements () > 0)
    create_call_for_reduction (loop, reduction_list, &clsn_data);

  scev_reset ();

  /* Free loop bound estimations that could contain references to
     removed statements.  */
  FOR_EACH_LOOP (loop, 0)
    free_numbers_of_iterations_estimates_loop (loop);
}
2434 
2435 /* Returns true when LOOP contains vector phi nodes.  */
2436 
2437 static bool
loop_has_vector_phi_nodes(struct loop * loop ATTRIBUTE_UNUSED)2438 loop_has_vector_phi_nodes (struct loop *loop ATTRIBUTE_UNUSED)
2439 {
2440   unsigned i;
2441   basic_block *bbs = get_loop_body_in_dom_order (loop);
2442   gphi_iterator gsi;
2443   bool res = true;
2444 
2445   for (i = 0; i < loop->num_nodes; i++)
2446     for (gsi = gsi_start_phis (bbs[i]); !gsi_end_p (gsi); gsi_next (&gsi))
2447       if (TREE_CODE (TREE_TYPE (PHI_RESULT (gsi.phi ()))) == VECTOR_TYPE)
2448 	goto end;
2449 
2450   res = false;
2451  end:
2452   free (bbs);
2453   return res;
2454 }
2455 
2456 /* Create a reduction_info struct, initialize it with REDUC_STMT
2457    and PHI, insert it to the REDUCTION_LIST.  */
2458 
2459 static void
build_new_reduction(reduction_info_table_type * reduction_list,gimple * reduc_stmt,gphi * phi)2460 build_new_reduction (reduction_info_table_type *reduction_list,
2461 		     gimple *reduc_stmt, gphi *phi)
2462 {
2463   reduction_info **slot;
2464   struct reduction_info *new_reduction;
2465   enum tree_code reduction_code;
2466 
2467   gcc_assert (reduc_stmt);
2468 
2469   if (dump_file && (dump_flags & TDF_DETAILS))
2470     {
2471       fprintf (dump_file,
2472 	       "Detected reduction. reduction stmt is:\n");
2473       print_gimple_stmt (dump_file, reduc_stmt, 0, 0);
2474       fprintf (dump_file, "\n");
2475     }
2476 
2477   if (gimple_code (reduc_stmt) == GIMPLE_PHI)
2478     {
2479       tree op1 = PHI_ARG_DEF (reduc_stmt, 0);
2480       gimple *def1 = SSA_NAME_DEF_STMT (op1);
2481       reduction_code = gimple_assign_rhs_code (def1);
2482     }
2483 
2484   else
2485     reduction_code = gimple_assign_rhs_code (reduc_stmt);
2486 
2487   new_reduction = XCNEW (struct reduction_info);
2488 
2489   new_reduction->reduc_stmt = reduc_stmt;
2490   new_reduction->reduc_phi = phi;
2491   new_reduction->reduc_version = SSA_NAME_VERSION (gimple_phi_result (phi));
2492   new_reduction->reduction_code = reduction_code;
2493   slot = reduction_list->find_slot (new_reduction, INSERT);
2494   *slot = new_reduction;
2495 }
2496 
2497 /* Callback for htab_traverse.  Sets gimple_uid of reduc_phi stmts.  */
2498 
2499 int
set_reduc_phi_uids(reduction_info ** slot,void * data ATTRIBUTE_UNUSED)2500 set_reduc_phi_uids (reduction_info **slot, void *data ATTRIBUTE_UNUSED)
2501 {
2502   struct reduction_info *const red = *slot;
2503   gimple_set_uid (red->reduc_phi, red->reduc_version);
2504   return 1;
2505 }
2506 
/* Detect all reductions in the LOOP, insert them into REDUCTION_LIST.  */

static void
gather_scalar_reductions (loop_p loop, reduction_info_table_type *reduction_list)
{
  gphi_iterator gsi;
  loop_vec_info simple_loop_info;
  auto_vec<gphi *, 4> double_reduc_phis;
  auto_vec<gimple *, 4> double_reduc_stmts;

  /* The vectorizer analysis machinery used below requires the
     stmt_vec_info vector to exist.  */
  if (!stmt_vec_info_vec.exists ())
    init_stmt_vec_info_vec ();

  simple_loop_info = vect_analyze_loop_form (loop);
  if (simple_loop_info == NULL)
    goto gather_done;

  for (gsi = gsi_start_phis (loop->header); !gsi_end_p (gsi); gsi_next (&gsi))
    {
      gphi *phi = gsi.phi ();
      affine_iv iv;
      tree res = PHI_RESULT (phi);
      bool double_reduc;

      if (virtual_operand_p (res))
	continue;

      /* Simple induction variables are not reductions.  */
      if (simple_iv (loop, loop, res, &iv, true))
	continue;

      gimple *reduc_stmt
	= vect_force_simple_reduction (simple_loop_info, phi, true,
				       &double_reduc, true);
      if (!reduc_stmt)
	continue;

      if (double_reduc)
	{
	  /* Double reductions are only handled for loop nests of depth
	     two.  */
	  if (loop->inner->inner != NULL)
	    continue;

	  /* Defer double reductions; they are validated against the inner
	     loop below.  */
	  double_reduc_phis.safe_push (phi);
	  double_reduc_stmts.safe_push (reduc_stmt);
	  continue;
	}

      build_new_reduction (reduction_list, reduc_stmt, phi);
    }
  destroy_loop_vec_info (simple_loop_info, true);

  if (!double_reduc_phis.is_empty ())
    {
      simple_loop_info = vect_analyze_loop_form (loop->inner);
      if (simple_loop_info)
	{
	  gphi *phi;
	  unsigned int i;

	  FOR_EACH_VEC_ELT (double_reduc_phis, i, phi)
	    {
	      affine_iv iv;
	      tree res = PHI_RESULT (phi);
	      bool double_reduc;

	      /* The outer phi must feed exactly one inner-loop phi.  */
	      use_operand_p use_p;
	      gimple *inner_stmt;
	      bool single_use_p = single_imm_use (res, &use_p, &inner_stmt);
	      gcc_assert (single_use_p);
	      if (gimple_code (inner_stmt) != GIMPLE_PHI)
		continue;
	      gphi *inner_phi = as_a <gphi *> (inner_stmt);
	      if (simple_iv (loop->inner, loop->inner, PHI_RESULT (inner_phi),
			     &iv, true))
		continue;

	      /* Confirm the inner phi is itself a (non-double) reduction;
		 only then record the deferred outer reduction.  */
	      gimple *inner_reduc_stmt
		= vect_force_simple_reduction (simple_loop_info, inner_phi,
					       true, &double_reduc, true);
	      gcc_assert (!double_reduc);
	      if (inner_reduc_stmt == NULL)
		continue;

	      build_new_reduction (reduction_list, double_reduc_stmts[i], phi);
	    }
	  destroy_loop_vec_info (simple_loop_info, true);
	}
    }

 gather_done:
  /* Release the claim on gimple_uid.  */
  free_stmt_vec_info_vec ();

  if (reduction_list->elements () == 0)
    return;

  /* As gimple_uid is used by the vectorizer in between vect_analyze_loop_form
     and free_stmt_vec_info_vec, we can set gimple_uid of reduc_phi stmts only
     now.  */
  basic_block bb;
  FOR_EACH_BB_FN (bb, cfun)
    for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
      gimple_set_uid (gsi_stmt (gsi), (unsigned int)-1);
  reduction_list->traverse <void *, set_reduc_phi_uids> (NULL);
}
2611 
2612 /* Try to initialize NITER for code generation part.  */
2613 
2614 static bool
try_get_loop_niter(loop_p loop,struct tree_niter_desc * niter)2615 try_get_loop_niter (loop_p loop, struct tree_niter_desc *niter)
2616 {
2617   edge exit = single_dom_exit (loop);
2618 
2619   gcc_assert (exit);
2620 
2621   /* We need to know # of iterations, and there should be no uses of values
2622      defined inside loop outside of it, unless the values are invariants of
2623      the loop.  */
2624   if (!number_of_iterations_exit (loop, exit, niter, false))
2625     {
2626       if (dump_file && (dump_flags & TDF_DETAILS))
2627 	fprintf (dump_file, "  FAILED: number of iterations not known\n");
2628       return false;
2629     }
2630 
2631   return true;
2632 }
2633 
2634 /* Return the default def of the first function argument.  */
2635 
2636 static tree
get_omp_data_i_param(void)2637 get_omp_data_i_param (void)
2638 {
2639   tree decl = DECL_ARGUMENTS (cfun->decl);
2640   gcc_assert (DECL_CHAIN (decl) == NULL_TREE);
2641   return ssa_default_def (cfun, decl);
2642 }
2643 
/* For PHI in loop header of LOOP, look for pattern:

   <bb preheader>
   .omp_data_i = &.omp_data_arr;
   addr = .omp_data_i->sum;
   sum_a = *addr;

   <bb header>:
   sum_b = PHI <sum_a (preheader), sum_c (latch)>

   and return addr.  Otherwise, return NULL_TREE.  */

static tree
find_reduc_addr (struct loop *loop, gphi *phi)
{
  edge e = loop_preheader_edge (loop);
  tree arg = PHI_ARG_DEF_FROM_EDGE (phi, e);
  /* Match "sum_a = *addr": a single-rhs assignment loading through a
     MEM_REF.  */
  gimple *stmt = SSA_NAME_DEF_STMT (arg);
  if (!gimple_assign_single_p (stmt))
    return NULL_TREE;
  tree memref = gimple_assign_rhs1 (stmt);
  if (TREE_CODE (memref) != MEM_REF)
    return NULL_TREE;
  tree addr = TREE_OPERAND (memref, 0);

  /* Match "addr = .omp_data_i->sum": a COMPONENT_REF whose base is a
     MEM_REF of the .omp_data_i parameter's default def.  */
  gimple *stmt2 = SSA_NAME_DEF_STMT (addr);
  if (!gimple_assign_single_p (stmt2))
    return NULL_TREE;
  tree compref = gimple_assign_rhs1 (stmt2);
  if (TREE_CODE (compref) != COMPONENT_REF)
    return NULL_TREE;
  tree addr2 = TREE_OPERAND (compref, 0);
  if (TREE_CODE (addr2) != MEM_REF)
    return NULL_TREE;
  addr2 = TREE_OPERAND (addr2, 0);
  if (TREE_CODE (addr2) != SSA_NAME
      || addr2 != get_omp_data_i_param ())
    return NULL_TREE;

  return addr;
}
2685 
/* Try to initialize REDUCTION_LIST for code generation part.
   REDUCTION_LIST describes the reductions.  Returns false if the loop
   cannot be parallelized because a value live after the loop is neither
   loop-invariant nor the result of a recognized reduction.  */

static bool
try_create_reduction_list (loop_p loop,
			   reduction_info_table_type *reduction_list,
			   bool oacc_kernels_p)
{
  edge exit = single_dom_exit (loop);
  gphi_iterator gsi;

  gcc_assert (exit);

  /* Try to get rid of exit phis.  */
  final_value_replacement_loop (loop);

  gather_scalar_reductions (loop, reduction_list);


  /* Every phi at the exit block represents a value computed inside the
     loop and used after it; each such value must be part of a reduction,
     otherwise the iterations cannot run in parallel.  */
  for (gsi = gsi_start_phis (exit->dest); !gsi_end_p (gsi); gsi_next (&gsi))
    {
      gphi *phi = gsi.phi ();
      struct reduction_info *red;
      imm_use_iterator imm_iter;
      use_operand_p use_p;
      gimple *reduc_phi;
      tree val = PHI_ARG_DEF_FROM_EDGE (phi, exit);

      if (!virtual_operand_p (val))
	{
	  if (dump_file && (dump_flags & TDF_DETAILS))
	    {
	      fprintf (dump_file, "phi is ");
	      print_gimple_stmt (dump_file, phi, 0, 0);
	      fprintf (dump_file, "arg of phi to exit:   value ");
	      print_generic_expr (dump_file, val, 0);
	      fprintf (dump_file, " used outside loop\n");
	      fprintf (dump_file,
		       "  checking if it is part of reduction pattern:\n");
	    }
	  if (reduction_list->elements () == 0)
	    {
	      if (dump_file && (dump_flags & TDF_DETAILS))
		fprintf (dump_file,
			 "  FAILED: it is not a part of reduction.\n");
	      return false;
	    }
	  /* Find the in-loop phi that uses VAL; for a reduction, that is
	     the reduction phi registered in REDUCTION_LIST.  */
	  reduc_phi = NULL;
	  FOR_EACH_IMM_USE_FAST (use_p, imm_iter, val)
	    {
	      if (!gimple_debug_bind_p (USE_STMT (use_p))
		  && flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
		{
		  reduc_phi = USE_STMT (use_p);
		  break;
		}
	    }
	  red = reduction_phi (reduction_list, reduc_phi);
	  if (red == NULL)
	    {
	      if (dump_file && (dump_flags & TDF_DETAILS))
		fprintf (dump_file,
			 "  FAILED: it is not a part of reduction.\n");
	      return false;
	    }
	  /* Each reduction may feed at most one exit phi; a second one
	     means we cannot represent the result.  */
	  if (red->keep_res != NULL)
	    {
	      if (dump_file && (dump_flags & TDF_DETAILS))
		fprintf (dump_file,
			 "  FAILED: reduction has multiple exit phis.\n");
	      return false;
	    }
	  red->keep_res = phi;
	  if (dump_file && (dump_flags & TDF_DETAILS))
	    {
	      fprintf (dump_file, "reduction phi is  ");
	      print_gimple_stmt (dump_file, red->reduc_phi, 0, 0);
	      fprintf (dump_file, "reduction stmt is  ");
	      print_gimple_stmt (dump_file, red->reduc_stmt, 0, 0);
	    }
	}
    }

  /* The iterations of the loop may communicate only through bivs whose
     iteration space can be distributed efficiently.  */
  for (gsi = gsi_start_phis (loop->header); !gsi_end_p (gsi); gsi_next (&gsi))
    {
      gphi *phi = gsi.phi ();
      tree def = PHI_RESULT (phi);
      affine_iv iv;

      if (!virtual_operand_p (def) && !simple_iv (loop, loop, def, &iv, true))
	{
	  struct reduction_info *red;

	  red = reduction_phi (reduction_list, phi);
	  if (red == NULL)
	    {
	      if (dump_file && (dump_flags & TDF_DETAILS))
		fprintf (dump_file,
			 "  FAILED: scalar dependency between iterations\n");
	      return false;
	    }
	}
    }

  if (oacc_kernels_p)
    {
      /* For OpenACC kernels, additionally record for each reduction the
	 address it is loaded from in the preheader; code generation needs
	 it to store the result back.  Any header phi that is not a simple
	 iv is known to be a reduction here, because the loop above would
	 have returned false otherwise.  */
      for (gsi = gsi_start_phis (loop->header); !gsi_end_p (gsi);
	   gsi_next (&gsi))
	{
	  gphi *phi = gsi.phi ();
	  tree def = PHI_RESULT (phi);
	  affine_iv iv;

	  if (!virtual_operand_p (def)
	      && !simple_iv (loop, loop, def, &iv, true))
	    {
	      tree addr = find_reduc_addr (loop, phi);
	      if (addr == NULL_TREE)
		return false;
	      struct reduction_info *red = reduction_phi (reduction_list, phi);
	      red->reduc_addr = addr;
	    }
	}
    }

  return true;
}
2815 
2816 /* Return true if LOOP contains phis with ADDR_EXPR in args.  */
2817 
2818 static bool
loop_has_phi_with_address_arg(struct loop * loop)2819 loop_has_phi_with_address_arg (struct loop *loop)
2820 {
2821   basic_block *bbs = get_loop_body (loop);
2822   bool res = false;
2823 
2824   unsigned i, j;
2825   gphi_iterator gsi;
2826   for (i = 0; i < loop->num_nodes; i++)
2827     for (gsi = gsi_start_phis (bbs[i]); !gsi_end_p (gsi); gsi_next (&gsi))
2828       {
2829 	gphi *phi = gsi.phi ();
2830 	for (j = 0; j < gimple_phi_num_args (phi); j++)
2831 	  {
2832 	    tree arg = gimple_phi_arg_def (phi, j);
2833 	    if (TREE_CODE (arg) == ADDR_EXPR)
2834 	      {
2835 		/* This should be handled by eliminate_local_variables, but that
2836 		   function currently ignores phis.  */
2837 		res = true;
2838 		goto end;
2839 	      }
2840 	  }
2841       }
2842  end:
2843   free (bbs);
2844 
2845   return res;
2846 }
2847 
/* Return true if memory ref REF (corresponding to the stmt at GSI in
   REGIONS_BB[I]) conflicts with the statements in REGIONS_BB[I] after gsi,
   or the statements in REGIONS_BB[I + n].  REF_IS_STORE indicates if REF is a
   store.  Ignore conflicts with SKIP_STMT.  */

static bool
ref_conflicts_with_region (gimple_stmt_iterator gsi, ao_ref *ref,
			   bool ref_is_store, vec<basic_block> region_bbs,
			   unsigned int i, gimple *skip_stmt)
{
  basic_block bb = region_bbs[i];
  /* Start the walk at the statement after REF's own statement.  */
  gsi_next (&gsi);

  /* Walk the remainder of REGION_BBS[I], then all following blocks in
     REGION_BBS, in order.  */
  while (true)
    {
      for (; !gsi_end_p (gsi);
	   gsi_next (&gsi))
	{
	  gimple *stmt = gsi_stmt (gsi);
	  if (stmt == skip_stmt)
	    {
	      if (dump_file)
		{
		  fprintf (dump_file, "skipping reduction store: ");
		  print_gimple_stmt (dump_file, stmt, 0, 0);
		}
	      continue;
	    }

	  /* Statements without virtual operands cannot touch memory.  */
	  if (!gimple_vdef (stmt)
	      && !gimple_vuse (stmt))
	    continue;

	  if (gimple_code (stmt) == GIMPLE_RETURN)
	    continue;

	  if (ref_is_store)
	    {
	      /* A store conflicts with any later read of the same
		 location.  */
	      if (ref_maybe_used_by_stmt_p (stmt, ref))
		{
		  if (dump_file)
		    {
		      fprintf (dump_file, "Stmt ");
		      print_gimple_stmt (dump_file, stmt, 0, 0);
		    }
		  return true;
		}
	    }
	  else
	    {
	      /* A load conflicts with any later statement that may clobber
		 the location it reads.  */
	      if (stmt_may_clobber_ref_p_1 (stmt, ref))
		{
		  if (dump_file)
		    {
		      fprintf (dump_file, "Stmt ");
		      print_gimple_stmt (dump_file, stmt, 0, 0);
		    }
		  return true;
		}
	    }
	}
      /* Advance to the next block of the region, if any.  */
      i++;
      if (i == region_bbs.length ())
	break;
      bb = region_bbs[i];
      gsi = gsi_start_bb (bb);
    }

  return false;
}
2918 
/* Return true if the bbs in REGION_BBS but not in in_loop_bbs can be executed
   in parallel with REGION_BBS containing the loop.  Return the stores of
   reduction results in REDUCTION_STORES.  */

static bool
oacc_entry_exit_ok_1 (bitmap in_loop_bbs, vec<basic_block> region_bbs,
		      reduction_info_table_type *reduction_list,
		      bitmap reduction_stores)
{
  tree omp_data_i = get_omp_data_i_param ();

  unsigned i;
  basic_block bb;
  FOR_EACH_VEC_ELT (region_bbs, i, bb)
    {
      /* Only the entry/exit blocks of the region are of interest; the
	 loop body itself is handled elsewhere.  */
      if (bitmap_bit_p (in_loop_bbs, bb->index))
	continue;

      gimple_stmt_iterator gsi;
      for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
	   gsi_next (&gsi))
	{
	  gimple *stmt = gsi_stmt (gsi);
	  gimple *skip_stmt = NULL;

	  if (is_gimple_debug (stmt)
	      || gimple_code (stmt) == GIMPLE_COND)
	    continue;

	  ao_ref ref;
	  bool ref_is_store = false;
	  if (gimple_assign_load_p (stmt))
	    {
	      /* Loads from the .omp_data_i structure itself are benign;
		 every thread reads the same values.  */
	      tree rhs = gimple_assign_rhs1 (stmt);
	      tree base = get_base_address (rhs);
	      if (TREE_CODE (base) == MEM_REF
		  && operand_equal_p (TREE_OPERAND (base, 0), omp_data_i, 0))
		continue;

	      /* Detect the load of a reduction result: the loaded value
		 feeds an exit phi of a reduction, whose result is then
		 stored back.  Record that store in REDUCTION_STORES and
		 ignore it during the conflict check below.  */
	      tree lhs = gimple_assign_lhs (stmt);
	      if (TREE_CODE (lhs) == SSA_NAME
		  && has_single_use (lhs))
		{
		  use_operand_p use_p;
		  gimple *use_stmt;
		  single_imm_use (lhs, &use_p, &use_stmt);
		  if (gimple_code (use_stmt) == GIMPLE_PHI)
		    {
		      struct reduction_info *red;
		      red = reduction_phi (reduction_list, use_stmt);
		      /* NOTE(review): reduction_phi can return NULL when
			 USE_STMT is not a registered reduction phi; RED is
			 dereferenced unconditionally below — confirm that
			 every phi fed from such a load here is necessarily
			 a reduction phi.  */
		      tree val = PHI_RESULT (red->keep_res);
		      if (has_single_use (val))
			{
			  single_imm_use (val, &use_p, &use_stmt);
			  if (gimple_store_p (use_stmt))
			    {
			      unsigned int id
				= SSA_NAME_VERSION (gimple_vdef (use_stmt));
			      bitmap_set_bit (reduction_stores, id);
			      skip_stmt = use_stmt;
			      if (dump_file)
				{
				  fprintf (dump_file, "found reduction load: ");
				  print_gimple_stmt (dump_file, stmt, 0, 0);
				}
			    }
			}
		    }
		}

	      ao_ref_init (&ref, rhs);
	    }
	  else if (gimple_store_p (stmt))
	    {
	      ao_ref_init (&ref, gimple_assign_lhs (stmt));
	      ref_is_store = true;
	    }
	  else if (gimple_code (stmt) == GIMPLE_OMP_RETURN)
	    continue;
	  else if (!gimple_has_side_effects (stmt)
		   && !gimple_could_trap_p (stmt)
		   && !stmt_could_throw_p (stmt)
		   && !gimple_vdef (stmt)
		   && !gimple_vuse (stmt))
	    continue;
	  else if (is_gimple_call (stmt)
		   && gimple_call_internal_p (stmt)
		   && gimple_call_internal_fn (stmt) == IFN_GOACC_DIM_POS)
	    continue;
	  else if (gimple_code (stmt) == GIMPLE_RETURN)
	    continue;
	  else
	    {
	      /* Anything else might have an unanalyzable effect; give up.  */
	      if (dump_file)
		{
		  fprintf (dump_file, "Unhandled stmt in entry/exit: ");
		  print_gimple_stmt (dump_file, stmt, 0, 0);
		}
	      return false;
	    }

	  /* Check the memory ref against all following statements in the
	     region (excluding the recorded reduction store).  */
	  if (ref_conflicts_with_region (gsi, &ref, ref_is_store, region_bbs,
					 i, skip_stmt))
	    {
	      if (dump_file)
		{
		  fprintf (dump_file, "conflicts with entry/exit stmt: ");
		  print_gimple_stmt (dump_file, stmt, 0, 0);
		}
	      return false;
	    }
	}
    }

  return true;
}
3035 
/* Find stores inside REGION_BBS and outside IN_LOOP_BBS, and guard them with
   gang_pos == 0, except when the stores are REDUCTION_STORES.  Return true
   if any changes were made.  */

static bool
oacc_entry_exit_single_gang (bitmap in_loop_bbs, vec<basic_block> region_bbs,
			     bitmap reduction_stores)
{
  /* SSA name holding IFN_GOACC_DIM_POS (gang); created lazily at the first
     store that needs neutering.  */
  tree gang_pos = NULL_TREE;
  bool changed = false;

  unsigned i;
  basic_block bb;
  FOR_EACH_VEC_ELT (region_bbs, i, bb)
    {
      if (bitmap_bit_p (in_loop_bbs, bb->index))
	continue;

      gimple_stmt_iterator gsi;
      for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi);)
	{
	  gimple *stmt = gsi_stmt (gsi);

	  if (!gimple_store_p (stmt))
	    {
	      /* Update gsi to point to next stmt.  */
	      gsi_next (&gsi);
	      continue;
	    }

	  /* Reduction result stores must be executed by every gang, so
	     they are not neutered.  */
	  if (bitmap_bit_p (reduction_stores,
			    SSA_NAME_VERSION (gimple_vdef (stmt))))
	    {
	      if (dump_file)
		{
		  fprintf (dump_file,
			   "skipped reduction store for single-gang"
			   " neutering: ");
		  print_gimple_stmt (dump_file, stmt, 0, 0);
		}

	      /* Update gsi to point to next stmt.  */
	      gsi_next (&gsi);
	      continue;
	    }

	  changed = true;

	  /* Materialize gang_pos = GOACC_DIM_POS (GANG) once, at the start
	     of the function.  */
	  if (gang_pos == NULL_TREE)
	    {
	      tree arg = build_int_cst (integer_type_node, GOMP_DIM_GANG);
	      gcall *gang_single
		= gimple_build_call_internal (IFN_GOACC_DIM_POS, 1, arg);
	      gang_pos = make_ssa_name (integer_type_node);
	      gimple_call_set_lhs (gang_single, gang_pos);
	      gimple_stmt_iterator start
		= gsi_start_bb (single_succ (ENTRY_BLOCK_PTR_FOR_FN (cfun)));
	      tree vuse = ssa_default_def (cfun, gimple_vop (cfun));
	      gimple_set_vuse (gang_single, vuse);
	      gsi_insert_before (&start, gang_single, GSI_SAME_STMT);
	    }

	  if (dump_file)
	    {
	      fprintf (dump_file,
		       "found store that needs single-gang neutering: ");
	      print_gimple_stmt (dump_file, stmt, 0, 0);
	    }

	  {
	    /* Split block before store.  */
	    gimple_stmt_iterator gsi2 = gsi;
	    gsi_prev (&gsi2);
	    edge e;
	    if (gsi_end_p (gsi2))
	      {
		/* The store is the first statement of BB.  */
		e = split_block_after_labels (bb);
		gsi2 = gsi_last_bb (bb);
	      }
	    else
	      e = split_block (bb, gsi_stmt (gsi2));
	    basic_block bb2 = e->dest;

	    /* Split block after store.  */
	    gimple_stmt_iterator gsi3 = gsi_start_bb (bb2);
	    edge e2 = split_block (bb2, gsi_stmt (gsi3));
	    basic_block bb3 = e2->dest;

	    /* Guard the store: only gang 0 takes the branch into BB2.  */
	    gimple *cond
	      = gimple_build_cond (EQ_EXPR, gang_pos, integer_zero_node,
				   NULL_TREE, NULL_TREE);
	    gsi_insert_after (&gsi2, cond, GSI_NEW_STMT);

	    edge e3 = make_edge (bb, bb3, EDGE_FALSE_VALUE);
	    e->flags = EDGE_TRUE_VALUE;

	    /* Merge the virtual operand state of the two paths (store
	       executed vs. skipped) with a phi in BB3.  */
	    tree vdef = gimple_vdef (stmt);
	    tree vuse = gimple_vuse (stmt);

	    tree phi_res = copy_ssa_name (vdef);
	    gphi *new_phi = create_phi_node (phi_res, bb3);
	    replace_uses_by (vdef, phi_res);
	    add_phi_arg (new_phi, vuse, e3, UNKNOWN_LOCATION);
	    add_phi_arg (new_phi, vdef, e2, UNKNOWN_LOCATION);

	    /* Update gsi to point to next stmt.  */
	    bb = bb3;
	    gsi = gsi_start_bb (bb);
	  }
	}
    }

  return changed;
}
3150 
3151 /* Return true if the statements before and after the LOOP can be executed in
3152    parallel with the function containing the loop.  Resolve conflicting stores
3153    outside LOOP by guarding them such that only a single gang executes them.  */
3154 
3155 static bool
oacc_entry_exit_ok(struct loop * loop,reduction_info_table_type * reduction_list)3156 oacc_entry_exit_ok (struct loop *loop,
3157 		    reduction_info_table_type *reduction_list)
3158 {
3159   basic_block *loop_bbs = get_loop_body_in_dom_order (loop);
3160   vec<basic_block> region_bbs
3161     = get_all_dominated_blocks (CDI_DOMINATORS, ENTRY_BLOCK_PTR_FOR_FN (cfun));
3162 
3163   bitmap in_loop_bbs = BITMAP_ALLOC (NULL);
3164   bitmap_clear (in_loop_bbs);
3165   for (unsigned int i = 0; i < loop->num_nodes; i++)
3166     bitmap_set_bit (in_loop_bbs, loop_bbs[i]->index);
3167 
3168   bitmap reduction_stores = BITMAP_ALLOC (NULL);
3169   bool res = oacc_entry_exit_ok_1 (in_loop_bbs, region_bbs, reduction_list,
3170 				   reduction_stores);
3171 
3172   if (res)
3173     {
3174       bool changed = oacc_entry_exit_single_gang (in_loop_bbs, region_bbs,
3175 						  reduction_stores);
3176       if (changed)
3177 	{
3178 	  free_dominance_info (CDI_DOMINATORS);
3179 	  calculate_dominance_info (CDI_DOMINATORS);
3180 	}
3181     }
3182 
3183   free (loop_bbs);
3184 
3185   BITMAP_FREE (in_loop_bbs);
3186   BITMAP_FREE (reduction_stores);
3187 
3188   return res;
3189 }
3190 
/* Detect parallel loops and generate parallel code using libgomp
   primitives.  Returns true if some loop was parallelized, false
   otherwise.  */

static bool
parallelize_loops (bool oacc_kernels_p)
{
  unsigned n_threads;
  bool changed = false;
  struct loop *loop;
  /* First inner loop of a loop we have already handled (parallelized or
     found to be in an oacc kernels region); used to skip nested loops.  */
  struct loop *skip_loop = NULL;
  struct tree_niter_desc niter_desc;
  struct obstack parloop_obstack;
  HOST_WIDE_INT estimated;
  source_location loop_loc;

  /* Do not parallelize loops in the functions created by parallelization.  */
  if (!oacc_kernels_p
      && parallelized_function_p (cfun->decl))
    return false;

  /* Do not parallelize loops in offloaded functions.  */
  if (!oacc_kernels_p
      && get_oacc_fn_attrib (cfun->decl) != NULL)
     return false;

  if (cfun->has_nonlocal_label)
    return false;

  /* For OpenACC kernels, n_threads will be determined later; otherwise, it's
     the argument to -ftree-parallelize-loops.  */
  if (oacc_kernels_p)
    n_threads = 0;
  else
    n_threads = flag_tree_parallelize_loops;

  gcc_obstack_init (&parloop_obstack);
  reduction_info_table_type reduction_list (10);

  calculate_dominance_info (CDI_DOMINATORS);

  FOR_EACH_LOOP (loop, 0)
    {
      /* Inner loops of an already-parallelized loop are not candidates.  */
      if (loop == skip_loop)
	{
	  if (!loop->in_oacc_kernels_region
	      && dump_file && (dump_flags & TDF_DETAILS))
	    fprintf (dump_file,
		     "Skipping loop %d as inner loop of parallelized loop\n",
		     loop->num);

	  skip_loop = loop->inner;
	  continue;
	}
      else
	skip_loop = NULL;

      /* REDUCTION_LIST is reused across loops; start each candidate with
	 an empty table.  */
      reduction_list.empty ();

      if (oacc_kernels_p)
	{
	  if (!loop->in_oacc_kernels_region)
	    continue;

	  /* Don't try to parallelize inner loops in an oacc kernels region.  */
	  if (loop->inner)
	    skip_loop = loop->inner;

	  if (dump_file && (dump_flags & TDF_DETAILS))
	    fprintf (dump_file,
		     "Trying loop %d with header bb %d in oacc kernels"
		     " region\n", loop->num, loop->header->index);
	}

      if (dump_file && (dump_flags & TDF_DETAILS))
      {
        fprintf (dump_file, "Trying loop %d as candidate\n",loop->num);
	if (loop->inner)
	  fprintf (dump_file, "loop %d is not innermost\n",loop->num);
	else
	  fprintf (dump_file, "loop %d is innermost\n",loop->num);
      }

      /* If we use autopar in graphite pass, we use its marked dependency
      checking results.  */
      if (flag_loop_parallelize_all && !loop->can_be_parallel)
      {
        if (dump_file && (dump_flags & TDF_DETAILS))
	   fprintf (dump_file, "loop is not parallel according to graphite\n");
	continue;
      }

      if (!single_dom_exit (loop))
      {

        if (dump_file && (dump_flags & TDF_DETAILS))
	  fprintf (dump_file, "loop is !single_dom_exit\n");

	continue;
      }

      if (/* And of course, the loop must be parallelizable.  */
	  !can_duplicate_loop_p (loop)
	  || loop_has_blocks_with_irreducible_flag (loop)
	  || (loop_preheader_edge (loop)->src->flags & BB_IRREDUCIBLE_LOOP)
	  /* FIXME: the check for vector phi nodes could be removed.  */
	  || loop_has_vector_phi_nodes (loop))
	continue;

      /* Heuristic: only parallelize loops with enough work per thread.  */
      estimated = estimated_stmt_executions_int (loop);
      if (estimated == -1)
	estimated = max_stmt_executions_int (loop);
      /* FIXME: Bypass this check as graphite doesn't update the
	 count and frequency correctly now.  */
      if (!flag_loop_parallelize_all
	  && !oacc_kernels_p
	  && ((estimated != -1
	       && estimated <= (HOST_WIDE_INT) n_threads * MIN_PER_THREAD)
	      /* Do not bother with loops in cold areas.  */
	      || optimize_loop_nest_for_size_p (loop)))
	continue;

      if (!try_get_loop_niter (loop, &niter_desc))
	continue;

      if (!try_create_reduction_list (loop, &reduction_list, oacc_kernels_p))
	continue;

      if (loop_has_phi_with_address_arg (loop))
	continue;

      /* With -floop-parallelize-all the dependency analysis was already
	 done by graphite; otherwise run it here.  */
      if (!flag_loop_parallelize_all
	  && !loop_parallel_p (loop, &parloop_obstack))
	continue;

      if (oacc_kernels_p
	&& !oacc_entry_exit_ok (loop, &reduction_list))
	{
	  if (dump_file)
	    fprintf (dump_file, "entry/exit not ok: FAILED\n");
	  continue;
	}

      changed = true;
      skip_loop = loop->inner;
      if (dump_file && (dump_flags & TDF_DETAILS))
      {
	if (loop->inner)
	  fprintf (dump_file, "parallelizing outer loop %d\n",loop->header->index);
	else
	  fprintf (dump_file, "parallelizing inner loop %d\n",loop->header->index);
	loop_loc = find_loop_location (loop);
	if (loop_loc != UNKNOWN_LOCATION)
	  fprintf (dump_file, "\nloop at %s:%d: ",
		   LOCATION_FILE (loop_loc), LOCATION_LINE (loop_loc));
      }

      gen_parallel_loop (loop, &reduction_list,
			 n_threads, &niter_desc, oacc_kernels_p);
    }

  obstack_free (&parloop_obstack, NULL);

  /* Parallelization will cause new function calls to be inserted through
     which local variables will escape.  Reset the points-to solution
     for ESCAPED.  */
  if (changed)
    pt_solution_reset (&cfun->gimple_df->escaped);

  return changed;
}
3362 
/* Parallelization.  */

namespace {

/* Pass descriptor for the loop autoparallelization pass implemented by
   pass_parallelize_loops below.  */

const pass_data pass_data_parallelize_loops =
{
  GIMPLE_PASS, /* type */
  "parloops", /* name */
  OPTGROUP_LOOP, /* optinfo_flags */
  TV_TREE_PARALLELIZE_LOOPS, /* tv_id */
  ( PROP_cfg | PROP_ssa ), /* properties_required */
  0, /* properties_provided */
  0, /* properties_destroyed */
  0, /* todo_flags_start */
  0, /* todo_flags_finish */
};
3379 
/* The pass is instantiated twice: once as "parloops" proper and once for
   OpenACC kernels regions; set_pass_param selects the variant.  */

class pass_parallelize_loops : public gimple_opt_pass
{
public:
  pass_parallelize_loops (gcc::context *ctxt)
    : gimple_opt_pass (pass_data_parallelize_loops, ctxt),
      oacc_kernels_p (false)
  {}

  /* opt_pass methods: */
  virtual bool gate (function *)
  {
    /* The oacc-kernels variant runs whenever OpenACC is enabled; the plain
       variant only for -ftree-parallelize-loops=N with N > 1.  */
    if (oacc_kernels_p)
      return flag_openacc;
    else
      return flag_tree_parallelize_loops > 1;
  }
  virtual unsigned int execute (function *);
  opt_pass * clone () { return new pass_parallelize_loops (m_ctxt); }
  /* Single bool parameter: true for the OpenACC kernels variant.  */
  void set_pass_param (unsigned int n, bool param)
    {
      gcc_assert (n == 0);
      oacc_kernels_p = param;
    }

 private:
  bool oacc_kernels_p;
}; // class pass_parallelize_loops
3407 
3408 unsigned
execute(function * fun)3409 pass_parallelize_loops::execute (function *fun)
3410 {
3411   tree nthreads = builtin_decl_explicit (BUILT_IN_OMP_GET_NUM_THREADS);
3412   if (nthreads == NULL_TREE)
3413     return 0;
3414 
3415   bool in_loop_pipeline = scev_initialized_p ();
3416   if (!in_loop_pipeline)
3417     loop_optimizer_init (LOOPS_NORMAL
3418 			 | LOOPS_HAVE_RECORDED_EXITS);
3419 
3420   if (number_of_loops (fun) <= 1)
3421     return 0;
3422 
3423   if (!in_loop_pipeline)
3424     {
3425       rewrite_into_loop_closed_ssa (NULL, TODO_update_ssa);
3426       scev_initialize ();
3427     }
3428 
3429   unsigned int todo = 0;
3430   if (parallelize_loops (oacc_kernels_p))
3431     {
3432       fun->curr_properties &= ~(PROP_gimple_eomp);
3433 
3434       checking_verify_loop_structure ();
3435 
3436       todo |= TODO_update_ssa;
3437     }
3438 
3439   if (!in_loop_pipeline)
3440     {
3441       scev_finalize ();
3442       loop_optimizer_finalize ();
3443     }
3444 
3445   return todo;
3446 }
3447 
3448 } // anon namespace
3449 
/* Factory for the parloops pass, called by the pass manager.  */

gimple_opt_pass *
make_pass_parallelize_loops (gcc::context *ctxt)
{
  return new pass_parallelize_loops (ctxt);
}
3455