1 /* Loop autoparallelization.
2    Copyright (C) 2006-2019 Free Software Foundation, Inc.
3    Contributed by Sebastian Pop <pop@cri.ensmp.fr>
4    Zdenek Dvorak <dvorakz@suse.cz> and Razya Ladelsky <razya@il.ibm.com>.
5 
6 This file is part of GCC.
7 
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
12 
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
16 for more details.
17 
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3.  If not see
20 <http://www.gnu.org/licenses/>.  */
21 
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "tree.h"
27 #include "gimple.h"
28 #include "cfghooks.h"
29 #include "tree-pass.h"
30 #include "ssa.h"
31 #include "cgraph.h"
32 #include "gimple-pretty-print.h"
33 #include "fold-const.h"
34 #include "gimplify.h"
35 #include "gimple-iterator.h"
36 #include "gimplify-me.h"
37 #include "gimple-walk.h"
38 #include "stor-layout.h"
39 #include "tree-nested.h"
40 #include "tree-cfg.h"
41 #include "tree-ssa-loop-ivopts.h"
42 #include "tree-ssa-loop-manip.h"
43 #include "tree-ssa-loop-niter.h"
44 #include "tree-ssa-loop.h"
45 #include "tree-into-ssa.h"
46 #include "cfgloop.h"
47 #include "tree-scalar-evolution.h"
48 #include "langhooks.h"
49 #include "tree-vectorizer.h"
50 #include "tree-hasher.h"
51 #include "tree-parloops.h"
52 #include "omp-general.h"
53 #include "omp-low.h"
54 #include "tree-ssa.h"
55 #include "params.h"
56 #include "params-enum.h"
57 #include "tree-ssa-alias.h"
58 #include "tree-eh.h"
59 #include "gomp-constants.h"
60 #include "tree-dfa.h"
61 #include "stringpool.h"
62 #include "attribs.h"
63 
64 /* This pass tries to distribute iterations of loops into several threads.
65    The implementation is straightforward -- for each loop we test whether its
66    iterations are independent, and if it is the case (and some additional
67    conditions regarding profitability and correctness are satisfied), we
68    add GIMPLE_OMP_PARALLEL and GIMPLE_OMP_FOR codes and let omp expansion
69    machinery do its job.
70 
   Most of the complexity is in bringing the code into the shape expected
72    by the omp expanders:
73    -- for GIMPLE_OMP_FOR, ensuring that the loop has only one induction
74       variable and that the exit test is at the start of the loop body
75    -- for GIMPLE_OMP_PARALLEL, replacing the references to local addressable
76       variables by accesses through pointers, and breaking up ssa chains
77       by storing the values incoming to the parallelized loop to a structure
78       passed to the new function as an argument (something similar is done
79       in omp gimplification, unfortunately only a small part of the code
80       can be shared).
81 
82    TODO:
83    -- if there are several parallelizable loops in a function, it may be
84       possible to generate the threads just once (using synchronization to
85       ensure that cross-loop dependences are obeyed).
86    -- handling of common reduction patterns for outer loops.
87 
88    More info can also be found at http://gcc.gnu.org/wiki/AutoParInGCC  */
89 /*
90   Reduction handling:
91   currently we use vect_force_simple_reduction() to detect reduction patterns.
92   The code transformation will be introduced by an example.
93 
94 
95 parloop
96 {
97   int sum=1;
98 
99   for (i = 0; i < N; i++)
100    {
101     x[i] = i + 3;
102     sum+=x[i];
103    }
104 }
105 
106 gimple-like code:
107 header_bb:
108 
109   # sum_29 = PHI <sum_11(5), 1(3)>
110   # i_28 = PHI <i_12(5), 0(3)>
111   D.1795_8 = i_28 + 3;
112   x[i_28] = D.1795_8;
113   sum_11 = D.1795_8 + sum_29;
114   i_12 = i_28 + 1;
115   if (N_6(D) > i_12)
116     goto header_bb;
117 
118 
119 exit_bb:
120 
121   # sum_21 = PHI <sum_11(4)>
122   printf (&"%d"[0], sum_21);
123 
124 
125 after reduction transformation (only relevant parts):
126 
127 parloop
128 {
129 
130 ....
131 
132 
133   # Storing the initial value given by the user.  #
134 
135   .paral_data_store.32.sum.27 = 1;
136 
137   #pragma omp parallel num_threads(4)
138 
139   #pragma omp for schedule(static)
140 
141   # The neutral element corresponding to the particular
142   reduction's operation, e.g. 0 for PLUS_EXPR,
143   1 for MULT_EXPR, etc. replaces the user's initial value.  #
144 
145   # sum.27_29 = PHI <sum.27_11, 0>
146 
147   sum.27_11 = D.1827_8 + sum.27_29;
148 
149   GIMPLE_OMP_CONTINUE
150 
151   # Adding this reduction phi is done at create_phi_for_local_result() #
152   # sum.27_56 = PHI <sum.27_11, 0>
153   GIMPLE_OMP_RETURN
154 
155   # Creating the atomic operation is done at
156   create_call_for_reduction_1()  #
157 
158   #pragma omp atomic_load
159   D.1839_59 = *&.paral_data_load.33_51->reduction.23;
160   D.1840_60 = sum.27_56 + D.1839_59;
161   #pragma omp atomic_store (D.1840_60);
162 
163   GIMPLE_OMP_RETURN
164 
165  # collecting the result after the join of the threads is done at
166   create_loads_for_reductions().
167   The value computed by the threads is loaded from the
168   shared struct.  #
169 
170 
171   .paral_data_load.33_52 = &.paral_data_store.32;
172   sum_37 =  .paral_data_load.33_52->sum.27;
173   sum_43 = D.1795_41 + sum_37;
174 
175   exit bb:
176   # sum_21 = PHI <sum_43, sum_26>
177   printf (&"%d"[0], sum_21);
178 
179 ...
180 
181 }
182 
183 */
184 
185 /* Minimal number of iterations of a loop that should be executed in each
186    thread.  */
187 #define MIN_PER_THREAD PARAM_VALUE (PARAM_PARLOOPS_MIN_PER_THREAD)
188 
/* Element of the hashtable, representing a
   reduction in the current loop.  */
struct reduction_info
{
  gimple *reduc_stmt;		/* Reduction statement.  */
  gimple *reduc_phi;		/* The phi node defining the reduction.  */
  enum tree_code reduction_code;/* Code for the reduction operation.  */
  unsigned reduc_version;	/* SSA_NAME_VERSION of original reduc_phi
				   result.  */
  gphi *keep_res;		/* The PHI_RESULT of this phi is the resulting value
				   of the reduction variable when exiting the loop.  */
  tree initial_value;		/* The initial value of the reduction var before entering the loop.  */
  tree field;			/* The name of the field in the parloop data structure intended for reduction.  */
  tree reduc_addr;		/* The address of the reduction variable for
				   openacc reductions.  */
  tree init;			/* Reduction initialization value (the neutral
				   element of the operation).  */
  gphi *new_phi;		/* (helper field) Newly created phi node whose result
				   will be passed to the atomic operation.  Represents
				   the local result each thread computed for the reduction
				   operation.  */
};
210 
/* Reduction info hashtable helpers.  Entries hash on the SSA version
   of the reduction PHI result and compare equal on the PHI node
   itself.  */

struct reduction_hasher : free_ptr_hash <reduction_info>
{
  static inline hashval_t hash (const reduction_info *);
  static inline bool equal (const reduction_info *, const reduction_info *);
};
218 
219 /* Equality and hash functions for hashtab code.  */
220 
221 inline bool
equal(const reduction_info * a,const reduction_info * b)222 reduction_hasher::equal (const reduction_info *a, const reduction_info *b)
223 {
224   return (a->reduc_phi == b->reduc_phi);
225 }
226 
227 inline hashval_t
hash(const reduction_info * a)228 reduction_hasher::hash (const reduction_info *a)
229 {
230   return a->reduc_version;
231 }
232 
233 typedef hash_table<reduction_hasher> reduction_info_table_type;
234 
235 
236 static struct reduction_info *
reduction_phi(reduction_info_table_type * reduction_list,gimple * phi)237 reduction_phi (reduction_info_table_type *reduction_list, gimple *phi)
238 {
239   struct reduction_info tmpred, *red;
240 
241   if (reduction_list->elements () == 0 || phi == NULL)
242     return NULL;
243 
244   if (gimple_uid (phi) == (unsigned int)-1
245       || gimple_uid (phi) == 0)
246     return NULL;
247 
248   tmpred.reduc_phi = phi;
249   tmpred.reduc_version = gimple_uid (phi);
250   red = reduction_list->find (&tmpred);
251   gcc_assert (red == NULL || red->reduc_phi == phi);
252 
253   return red;
254 }
255 
/* Element of the hashtable of ssa names to copy, keyed on the SSA name
   version.  */

struct name_to_copy_elt
{
  unsigned version;	/* The version of the name to copy.  */
  tree new_name;	/* The new name used in the copy.  */
  tree field;		/* The field of the structure used to pass the
			   value.  */
};
265 
/* Name copies hashtable helpers.  Entries hash and compare on the SSA
   name version.  */

struct name_to_copy_hasher : free_ptr_hash <name_to_copy_elt>
{
  static inline hashval_t hash (const name_to_copy_elt *);
  static inline bool equal (const name_to_copy_elt *, const name_to_copy_elt *);
};
273 
274 /* Equality and hash functions for hashtab code.  */
275 
276 inline bool
equal(const name_to_copy_elt * a,const name_to_copy_elt * b)277 name_to_copy_hasher::equal (const name_to_copy_elt *a, const name_to_copy_elt *b)
278 {
279   return a->version == b->version;
280 }
281 
282 inline hashval_t
hash(const name_to_copy_elt * a)283 name_to_copy_hasher::hash (const name_to_copy_elt *a)
284 {
285   return (hashval_t) a->version;
286 }
287 
288 typedef hash_table<name_to_copy_hasher> name_to_copy_table_type;
289 
/* A transformation matrix, which is a self-contained ROWSIZE x COLSIZE
   matrix.  Rather than use floats, we simply keep a single DENOMINATOR that
   represents the denominator for every element in the matrix.  */
typedef struct lambda_trans_matrix_s
{
  lambda_matrix matrix;		/* The integer coefficients.  */
  int rowsize;			/* Number of rows.  */
  int colsize;			/* Number of columns.  */
  int denominator;		/* Common denominator of all elements.  */
} *lambda_trans_matrix;

/* Accessors for the fields of a lambda_trans_matrix.  */
#define LTM_MATRIX(T) ((T)->matrix)
#define LTM_ROWSIZE(T) ((T)->rowsize)
#define LTM_COLSIZE(T) ((T)->colsize)
#define LTM_DENOMINATOR(T) ((T)->denominator)
304 
305 /* Allocate a new transformation matrix.  */
306 
307 static lambda_trans_matrix
lambda_trans_matrix_new(int colsize,int rowsize,struct obstack * lambda_obstack)308 lambda_trans_matrix_new (int colsize, int rowsize,
309 			 struct obstack * lambda_obstack)
310 {
311   lambda_trans_matrix ret;
312 
313   ret = (lambda_trans_matrix)
314     obstack_alloc (lambda_obstack, sizeof (struct lambda_trans_matrix_s));
315   LTM_MATRIX (ret) = lambda_matrix_new (rowsize, colsize, lambda_obstack);
316   LTM_ROWSIZE (ret) = rowsize;
317   LTM_COLSIZE (ret) = colsize;
318   LTM_DENOMINATOR (ret) = 1;
319   return ret;
320 }
321 
322 /* Multiply a vector VEC by a matrix MAT.
323    MAT is an M*N matrix, and VEC is a vector with length N.  The result
324    is stored in DEST which must be a vector of length M.  */
325 
326 static void
lambda_matrix_vector_mult(lambda_matrix matrix,int m,int n,lambda_vector vec,lambda_vector dest)327 lambda_matrix_vector_mult (lambda_matrix matrix, int m, int n,
328 			   lambda_vector vec, lambda_vector dest)
329 {
330   int i, j;
331 
332   lambda_vector_clear (dest, m);
333   for (i = 0; i < m; i++)
334     for (j = 0; j < n; j++)
335       dest[i] += matrix[i][j] * vec[j];
336 }
337 
338 /* Return true if TRANS is a legal transformation matrix that respects
339    the dependence vectors in DISTS and DIRS.  The conservative answer
340    is false.
341 
342    "Wolfe proves that a unimodular transformation represented by the
343    matrix T is legal when applied to a loop nest with a set of
344    lexicographically non-negative distance vectors RDG if and only if
345    for each vector d in RDG, (T.d >= 0) is lexicographically positive.
346    i.e.: if and only if it transforms the lexicographically positive
347    distance vectors to lexicographically positive vectors.  Note that
348    a unimodular matrix must transform the zero vector (and only it) to
349    the zero vector." S.Muchnick.  */
350 
351 static bool
lambda_transform_legal_p(lambda_trans_matrix trans,int nb_loops,vec<ddr_p> dependence_relations)352 lambda_transform_legal_p (lambda_trans_matrix trans,
353 			  int nb_loops,
354 			  vec<ddr_p> dependence_relations)
355 {
356   unsigned int i, j;
357   lambda_vector distres;
358   struct data_dependence_relation *ddr;
359 
360   gcc_assert (LTM_COLSIZE (trans) == nb_loops
361 	      && LTM_ROWSIZE (trans) == nb_loops);
362 
363   /* When there are no dependences, the transformation is correct.  */
364   if (dependence_relations.length () == 0)
365     return true;
366 
367   ddr = dependence_relations[0];
368   if (ddr == NULL)
369     return true;
370 
371   /* When there is an unknown relation in the dependence_relations, we
372      know that it is no worth looking at this loop nest: give up.  */
373   if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
374     return false;
375 
376   distres = lambda_vector_new (nb_loops);
377 
378   /* For each distance vector in the dependence graph.  */
379   FOR_EACH_VEC_ELT (dependence_relations, i, ddr)
380     {
381       /* Don't care about relations for which we know that there is no
382 	 dependence, nor about read-read (aka. output-dependences):
383 	 these data accesses can happen in any order.  */
384       if (DDR_ARE_DEPENDENT (ddr) == chrec_known
385 	  || (DR_IS_READ (DDR_A (ddr)) && DR_IS_READ (DDR_B (ddr))))
386 	continue;
387 
388       /* Conservatively answer: "this transformation is not valid".  */
389       if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
390 	return false;
391 
392       /* If the dependence could not be captured by a distance vector,
393 	 conservatively answer that the transform is not valid.  */
394       if (DDR_NUM_DIST_VECTS (ddr) == 0)
395 	return false;
396 
397       /* Compute trans.dist_vect */
398       for (j = 0; j < DDR_NUM_DIST_VECTS (ddr); j++)
399 	{
400 	  lambda_matrix_vector_mult (LTM_MATRIX (trans), nb_loops, nb_loops,
401 				     DDR_DIST_VECT (ddr, j), distres);
402 
403 	  if (!lambda_vector_lexico_pos (distres, nb_loops))
404 	    return false;
405 	}
406     }
407   return true;
408 }
409 
410 /* Data dependency analysis. Returns true if the iterations of LOOP
411    are independent on each other (that is, if we can execute them
412    in parallel).  */
413 
414 static bool
loop_parallel_p(struct loop * loop,struct obstack * parloop_obstack)415 loop_parallel_p (struct loop *loop, struct obstack * parloop_obstack)
416 {
417   vec<ddr_p> dependence_relations;
418   vec<data_reference_p> datarefs;
419   lambda_trans_matrix trans;
420   bool ret = false;
421 
422   if (dump_file && (dump_flags & TDF_DETAILS))
423   {
424     fprintf (dump_file, "Considering loop %d\n", loop->num);
425     if (!loop->inner)
426       fprintf (dump_file, "loop is innermost\n");
427     else
428       fprintf (dump_file, "loop NOT innermost\n");
429    }
430 
431   /* Check for problems with dependences.  If the loop can be reversed,
432      the iterations are independent.  */
433   auto_vec<loop_p, 3> loop_nest;
434   datarefs.create (10);
435   dependence_relations.create (100);
436   if (! compute_data_dependences_for_loop (loop, true, &loop_nest, &datarefs,
437 					   &dependence_relations))
438     {
439       if (dump_file && (dump_flags & TDF_DETAILS))
440 	fprintf (dump_file, "  FAILED: cannot analyze data dependencies\n");
441       ret = false;
442       goto end;
443     }
444   if (dump_file && (dump_flags & TDF_DETAILS))
445     dump_data_dependence_relations (dump_file, dependence_relations);
446 
447   trans = lambda_trans_matrix_new (1, 1, parloop_obstack);
448   LTM_MATRIX (trans)[0][0] = -1;
449 
450   if (lambda_transform_legal_p (trans, 1, dependence_relations))
451     {
452       ret = true;
453       if (dump_file && (dump_flags & TDF_DETAILS))
454 	fprintf (dump_file, "  SUCCESS: may be parallelized\n");
455     }
456   else if (dump_file && (dump_flags & TDF_DETAILS))
457     fprintf (dump_file,
458 	     "  FAILED: data dependencies exist across iterations\n");
459 
460  end:
461   free_dependence_relations (dependence_relations);
462   free_data_refs (datarefs);
463 
464   return ret;
465 }
466 
467 /* Return true when LOOP contains basic blocks marked with the
468    BB_IRREDUCIBLE_LOOP flag.  */
469 
470 static inline bool
loop_has_blocks_with_irreducible_flag(struct loop * loop)471 loop_has_blocks_with_irreducible_flag (struct loop *loop)
472 {
473   unsigned i;
474   basic_block *bbs = get_loop_body_in_dom_order (loop);
475   bool res = true;
476 
477   for (i = 0; i < loop->num_nodes; i++)
478     if (bbs[i]->flags & BB_IRREDUCIBLE_LOOP)
479       goto end;
480 
481   res = false;
482  end:
483   free (bbs);
484   return res;
485 }
486 
/* Assigns the address of OBJ in TYPE to an ssa name, and returns this name.
   The assignment statement is placed on edge ENTRY.  DECL_ADDRESS maps decls
   to their addresses that can be reused.  The address of OBJ is known to
   be invariant in the whole function.  Other needed statements are placed
   right before GSI.

   GSI may be NULL (the callers pass NULL when processing debug binds,
   where no statements may be inserted); in that case, if the canonical
   address is not yet available in DECL_ADDRESS, NULL is returned
   instead of a name.  */

static tree
take_address_of (tree obj, tree type, edge entry,
		 int_tree_htab_type *decl_address, gimple_stmt_iterator *gsi)
{
  int uid;
  tree *var_p, name, addr;
  gassign *stmt;
  gimple_seq stmts;

  /* Since the address of OBJ is invariant, the trees may be shared.
     Avoid rewriting unrelated parts of the code.  */
  obj = unshare_expr (obj);
  /* Strip handled components (ARRAY_REF, COMPONENT_REF, ...) to reach
     the base of the reference.  */
  for (var_p = &obj;
       handled_component_p (*var_p);
       var_p = &TREE_OPERAND (*var_p, 0))
    continue;

  /* Canonicalize the access to base on a MEM_REF.  */
  if (DECL_P (*var_p))
    *var_p = build_simple_mem_ref (build_fold_addr_expr (*var_p));

  /* Assign a canonical SSA name to the address of the base decl used
     in the address and share it for all accesses and addresses based
     on it.  */
  uid = DECL_UID (TREE_OPERAND (TREE_OPERAND (*var_p, 0), 0));
  int_tree_map elt;
  elt.uid = uid;
  int_tree_map *slot = decl_address->find_slot (elt, INSERT);
  if (!slot->to)
    {
      /* No canonical name recorded yet.  Without a GSI there is no
	 place to materialize one, so give up.  */
      if (gsi == NULL)
	return NULL;
      addr = TREE_OPERAND (*var_p, 0);
      const char *obj_name
	= get_name (TREE_OPERAND (TREE_OPERAND (*var_p, 0), 0));
      if (obj_name)
	name = make_temp_ssa_name (TREE_TYPE (addr), NULL, obj_name);
      else
	name = make_ssa_name (TREE_TYPE (addr));
      /* Compute the address once, on the entry edge of the region.  */
      stmt = gimple_build_assign (name, addr);
      gsi_insert_on_edge_immediate (entry, stmt);

      slot->uid = uid;
      slot->to = name;
    }
  else
    name = slot->to;

  /* Express the address in terms of the canonical SSA name.  */
  TREE_OPERAND (*var_p, 0) = name;
  if (gsi == NULL)
    return build_fold_addr_expr_with_type (obj, type);

  name = force_gimple_operand (build_addr (obj),
			       &stmts, true, NULL_TREE);
  if (!gimple_seq_empty_p (stmts))
    gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);

  /* Convert to the requested pointer type if the computed address has
     a different one.  */
  if (!useless_type_conversion_p (type, TREE_TYPE (name)))
    {
      name = force_gimple_operand (fold_convert (type, name), &stmts, true,
				   NULL_TREE);
      if (!gimple_seq_empty_p (stmts))
	gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
    }

  return name;
}
561 
562 static tree
reduc_stmt_res(gimple * stmt)563 reduc_stmt_res (gimple *stmt)
564 {
565   return (gimple_code (stmt) == GIMPLE_PHI
566 	  ? gimple_phi_result (stmt)
567 	  : gimple_assign_lhs (stmt));
568 }
569 
570 /* Callback for htab_traverse.  Create the initialization statement
571    for reduction described in SLOT, and place it at the preheader of
572    the loop described in DATA.  */
573 
574 int
initialize_reductions(reduction_info ** slot,struct loop * loop)575 initialize_reductions (reduction_info **slot, struct loop *loop)
576 {
577   tree init;
578   tree type, arg;
579   edge e;
580 
581   struct reduction_info *const reduc = *slot;
582 
583   /* Create initialization in preheader:
584      reduction_variable = initialization value of reduction.  */
585 
586   /* In the phi node at the header, replace the argument coming
587      from the preheader with the reduction initialization value.  */
588 
589   /* Initialize the reduction.  */
590   type = TREE_TYPE (PHI_RESULT (reduc->reduc_phi));
591   init = omp_reduction_init_op (gimple_location (reduc->reduc_stmt),
592 				reduc->reduction_code, type);
593   reduc->init = init;
594 
595   /* Replace the argument representing the initialization value
596      with the initialization value for the reduction (neutral
597      element for the particular operation, e.g. 0 for PLUS_EXPR,
598      1 for MULT_EXPR, etc).
599      Keep the old value in a new variable "reduction_initial",
600      that will be taken in consideration after the parallel
601      computing is done.  */
602 
603   e = loop_preheader_edge (loop);
604   arg = PHI_ARG_DEF_FROM_EDGE (reduc->reduc_phi, e);
605   /* Create new variable to hold the initial value.  */
606 
607   SET_USE (PHI_ARG_DEF_PTR_FROM_EDGE
608 	   (reduc->reduc_phi, loop_preheader_edge (loop)), init);
609   reduc->initial_value = arg;
610   return 1;
611 }
612 
/* Bookkeeping for eliminate_local_variables_1; INFO must be the first
   member so the struct can be passed as a walk_stmt_info.  */

struct elv_data
{
  struct walk_stmt_info info;	/* Embedded walker state (must be first).  */
  edge entry;			/* Entry edge of the SESE region.  */
  int_tree_htab_type *decl_address;  /* Decl uid -> canonical address name.  */
  gimple_stmt_iterator *gsi;	/* Insertion point; NULL for debug binds.  */
  bool changed;			/* Set when the walked expression was rewritten.  */
  bool reset;			/* Set when a debug bind value must be dropped.  */
};
622 
623 /* Eliminates references to local variables in *TP out of the single
624    entry single exit region starting at DTA->ENTRY.
625    DECL_ADDRESS contains addresses of the references that had their
626    address taken already.  If the expression is changed, CHANGED is
627    set to true.  Callback for walk_tree.  */
628 
629 static tree
eliminate_local_variables_1(tree * tp,int * walk_subtrees,void * data)630 eliminate_local_variables_1 (tree *tp, int *walk_subtrees, void *data)
631 {
632   struct elv_data *const dta = (struct elv_data *) data;
633   tree t = *tp, var, addr, addr_type, type, obj;
634 
635   if (DECL_P (t))
636     {
637       *walk_subtrees = 0;
638 
639       if (!SSA_VAR_P (t) || DECL_EXTERNAL (t))
640 	return NULL_TREE;
641 
642       type = TREE_TYPE (t);
643       addr_type = build_pointer_type (type);
644       addr = take_address_of (t, addr_type, dta->entry, dta->decl_address,
645 			      dta->gsi);
646       if (dta->gsi == NULL && addr == NULL_TREE)
647 	{
648 	  dta->reset = true;
649 	  return NULL_TREE;
650 	}
651 
652       *tp = build_simple_mem_ref (addr);
653 
654       dta->changed = true;
655       return NULL_TREE;
656     }
657 
658   if (TREE_CODE (t) == ADDR_EXPR)
659     {
660       /* ADDR_EXPR may appear in two contexts:
661 	 -- as a gimple operand, when the address taken is a function invariant
662 	 -- as gimple rhs, when the resulting address in not a function
663 	    invariant
664 	 We do not need to do anything special in the latter case (the base of
665 	 the memory reference whose address is taken may be replaced in the
666 	 DECL_P case).  The former case is more complicated, as we need to
667 	 ensure that the new address is still a gimple operand.  Thus, it
668 	 is not sufficient to replace just the base of the memory reference --
669 	 we need to move the whole computation of the address out of the
670 	 loop.  */
671       if (!is_gimple_val (t))
672 	return NULL_TREE;
673 
674       *walk_subtrees = 0;
675       obj = TREE_OPERAND (t, 0);
676       var = get_base_address (obj);
677       if (!var || !SSA_VAR_P (var) || DECL_EXTERNAL (var))
678 	return NULL_TREE;
679 
680       addr_type = TREE_TYPE (t);
681       addr = take_address_of (obj, addr_type, dta->entry, dta->decl_address,
682 			      dta->gsi);
683       if (dta->gsi == NULL && addr == NULL_TREE)
684 	{
685 	  dta->reset = true;
686 	  return NULL_TREE;
687 	}
688       *tp = addr;
689 
690       dta->changed = true;
691       return NULL_TREE;
692     }
693 
694   if (!EXPR_P (t))
695     *walk_subtrees = 0;
696 
697   return NULL_TREE;
698 }
699 
700 /* Moves the references to local variables in STMT at *GSI out of the single
701    entry single exit region starting at ENTRY.  DECL_ADDRESS contains
702    addresses of the references that had their address taken
703    already.  */
704 
705 static void
eliminate_local_variables_stmt(edge entry,gimple_stmt_iterator * gsi,int_tree_htab_type * decl_address)706 eliminate_local_variables_stmt (edge entry, gimple_stmt_iterator *gsi,
707 				int_tree_htab_type *decl_address)
708 {
709   struct elv_data dta;
710   gimple *stmt = gsi_stmt (*gsi);
711 
712   memset (&dta.info, '\0', sizeof (dta.info));
713   dta.entry = entry;
714   dta.decl_address = decl_address;
715   dta.changed = false;
716   dta.reset = false;
717 
718   if (gimple_debug_bind_p (stmt))
719     {
720       dta.gsi = NULL;
721       walk_tree (gimple_debug_bind_get_value_ptr (stmt),
722 		 eliminate_local_variables_1, &dta.info, NULL);
723       if (dta.reset)
724 	{
725 	  gimple_debug_bind_reset_value (stmt);
726 	  dta.changed = true;
727 	}
728     }
729   else if (gimple_clobber_p (stmt))
730     {
731       unlink_stmt_vdef (stmt);
732       stmt = gimple_build_nop ();
733       gsi_replace (gsi, stmt, false);
734       dta.changed = true;
735     }
736   else
737     {
738       dta.gsi = gsi;
739       walk_gimple_op (stmt, eliminate_local_variables_1, &dta.info);
740     }
741 
742   if (dta.changed)
743     update_stmt (stmt);
744 }
745 
746 /* Eliminates the references to local variables from the single entry
747    single exit region between the ENTRY and EXIT edges.
748 
749    This includes:
750    1) Taking address of a local variable -- these are moved out of the
751    region (and temporary variable is created to hold the address if
752    necessary).
753 
754    2) Dereferencing a local variable -- these are replaced with indirect
755    references.  */
756 
757 static void
eliminate_local_variables(edge entry,edge exit)758 eliminate_local_variables (edge entry, edge exit)
759 {
760   basic_block bb;
761   auto_vec<basic_block, 3> body;
762   unsigned i;
763   gimple_stmt_iterator gsi;
764   bool has_debug_stmt = false;
765   int_tree_htab_type decl_address (10);
766   basic_block entry_bb = entry->src;
767   basic_block exit_bb = exit->dest;
768 
769   gather_blocks_in_sese_region (entry_bb, exit_bb, &body);
770 
771   FOR_EACH_VEC_ELT (body, i, bb)
772     if (bb != entry_bb && bb != exit_bb)
773       {
774         for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
775 	  if (is_gimple_debug (gsi_stmt (gsi)))
776 	    {
777 	      if (gimple_debug_bind_p (gsi_stmt (gsi)))
778 	        has_debug_stmt = true;
779 	    }
780 	  else
781 	    eliminate_local_variables_stmt (entry, &gsi, &decl_address);
782       }
783 
784   if (has_debug_stmt)
785     FOR_EACH_VEC_ELT (body, i, bb)
786       if (bb != entry_bb && bb != exit_bb)
787 	for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
788 	  if (gimple_debug_bind_p (gsi_stmt (gsi)))
789 	    eliminate_local_variables_stmt (entry, &gsi, &decl_address);
790 }
791 
792 /* Returns true if expression EXPR is not defined between ENTRY and
793    EXIT, i.e. if all its operands are defined outside of the region.  */
794 
795 static bool
expr_invariant_in_region_p(edge entry,edge exit,tree expr)796 expr_invariant_in_region_p (edge entry, edge exit, tree expr)
797 {
798   basic_block entry_bb = entry->src;
799   basic_block exit_bb = exit->dest;
800   basic_block def_bb;
801 
802   if (is_gimple_min_invariant (expr))
803     return true;
804 
805   if (TREE_CODE (expr) == SSA_NAME)
806     {
807       def_bb = gimple_bb (SSA_NAME_DEF_STMT (expr));
808       if (def_bb
809 	  && dominated_by_p (CDI_DOMINATORS, def_bb, entry_bb)
810 	  && !dominated_by_p (CDI_DOMINATORS, def_bb, exit_bb))
811 	return false;
812 
813       return true;
814     }
815 
816   return false;
817 }
818 
/* If COPY_NAME_P is true, creates and returns a duplicate of NAME.
   The copies are stored to NAME_COPIES, if NAME was already duplicated,
   its duplicate stored in NAME_COPIES is returned.

   Regardless of COPY_NAME_P, the decl used as a base of the ssa name is also
   duplicated, storing the copies in DECL_COPIES.  */

static tree
separate_decls_in_region_name (tree name, name_to_copy_table_type *name_copies,
			       int_tree_htab_type *decl_copies,
			       bool copy_name_p)
{
  tree copy, var, var_copy;
  unsigned idx, uid, nuid;
  struct int_tree_map ielt;
  struct name_to_copy_elt elt, *nelt;
  name_to_copy_elt **slot;
  int_tree_map *dslot;

  if (TREE_CODE (name) != SSA_NAME)
    return name;

  /* Look up (or, when copying, reserve a slot for) NAME by its SSA
     version.  */
  idx = SSA_NAME_VERSION (name);
  elt.version = idx;
  slot = name_copies->find_slot_with_hash (&elt, idx,
					   copy_name_p ? INSERT : NO_INSERT);
  if (slot && *slot)
    return (*slot)->new_name;

  if (copy_name_p)
    {
      copy = duplicate_ssa_name (name, NULL);
      nelt = XNEW (struct name_to_copy_elt);
      nelt->version = idx;
      nelt->new_name = copy;
      nelt->field = NULL_TREE;
      *slot = nelt;
    }
  else
    {
      /* NO_INSERT lookup that missed: NAME itself is used unchanged.  */
      gcc_assert (!slot);
      copy = name;
    }

  /* Names without an underlying variable need no decl duplication.  */
  var = SSA_NAME_VAR (name);
  if (!var)
    return copy;

  uid = DECL_UID (var);
  ielt.uid = uid;
  dslot = decl_copies->find_slot_with_hash (ielt, uid, INSERT);
  if (!dslot->to)
    {
      var_copy = create_tmp_var (TREE_TYPE (var), get_name (var));
      DECL_GIMPLE_REG_P (var_copy) = DECL_GIMPLE_REG_P (var);
      dslot->uid = uid;
      dslot->to = var_copy;

      /* Ensure that when we meet this decl next time, we won't duplicate
         it again.  */
      nuid = DECL_UID (var_copy);
      ielt.uid = nuid;
      dslot = decl_copies->find_slot_with_hash (ielt, nuid, INSERT);
      gcc_assert (!dslot->to);
      dslot->uid = nuid;
      dslot->to = var_copy;
    }
  else
    var_copy = dslot->to;

  /* Rebase the (possibly copied) name onto the duplicated decl.  */
  replace_ssa_name_symbol (copy, var_copy);
  return copy;
}
892 
893 /* Finds the ssa names used in STMT that are defined outside the
894    region between ENTRY and EXIT and replaces such ssa names with
895    their duplicates.  The duplicates are stored to NAME_COPIES.  Base
896    decls of all ssa names used in STMT (including those defined in
897    LOOP) are replaced with the new temporary variables; the
898    replacement decls are stored in DECL_COPIES.  */
899 
900 static void
separate_decls_in_region_stmt(edge entry,edge exit,gimple * stmt,name_to_copy_table_type * name_copies,int_tree_htab_type * decl_copies)901 separate_decls_in_region_stmt (edge entry, edge exit, gimple *stmt,
902 			       name_to_copy_table_type *name_copies,
903 			       int_tree_htab_type *decl_copies)
904 {
905   use_operand_p use;
906   def_operand_p def;
907   ssa_op_iter oi;
908   tree name, copy;
909   bool copy_name_p;
910 
911   FOR_EACH_PHI_OR_STMT_DEF (def, stmt, oi, SSA_OP_DEF)
912   {
913     name = DEF_FROM_PTR (def);
914     gcc_assert (TREE_CODE (name) == SSA_NAME);
915     copy = separate_decls_in_region_name (name, name_copies, decl_copies,
916 					  false);
917     gcc_assert (copy == name);
918   }
919 
920   FOR_EACH_PHI_OR_STMT_USE (use, stmt, oi, SSA_OP_USE)
921   {
922     name = USE_FROM_PTR (use);
923     if (TREE_CODE (name) != SSA_NAME)
924       continue;
925 
926     copy_name_p = expr_invariant_in_region_p (entry, exit, name);
927     copy = separate_decls_in_region_name (name, name_copies, decl_copies,
928 					  copy_name_p);
929     SET_USE (use, copy);
930   }
931 }
932 
/* Adjusts debug bind or debug source bind STMT for the region whose
   decls were separated by separate_decls_in_region_stmt: the bound
   variable is replaced with its duplicate found in DECL_COPIES, and
   every ssa name used in the bound value is replaced with its copy
   from NAME_COPIES.  Returns true if the debug stmt carries no useful
   information after the transformation (no decl copy exists for the
   bound variable, or the stmt is neither kind of debug bind) and
   should be removed by the caller; returns false otherwise.  */

static bool
separate_decls_in_region_debug (gimple *stmt,
				name_to_copy_table_type *name_copies,
				int_tree_htab_type *decl_copies)
{
  use_operand_p use;
  ssa_op_iter oi;
  tree var, name;
  struct int_tree_map ielt;
  struct name_to_copy_elt elt;
  name_to_copy_elt **slot;
  int_tree_map *dslot;

  if (gimple_debug_bind_p (stmt))
    var = gimple_debug_bind_get_var (stmt);
  else if (gimple_debug_source_bind_p (stmt))
    var = gimple_debug_source_bind_get_var (stmt);
  else
    return true;
  /* DEBUG_EXPR_DECLs and LABEL_DECLs are never duplicated; keep the
     stmt as-is would be wrong, so request its removal.  */
  if (TREE_CODE (var) == DEBUG_EXPR_DECL || TREE_CODE (var) == LABEL_DECL)
    return true;
  gcc_assert (DECL_P (var) && SSA_VAR_P (var));
  ielt.uid = DECL_UID (var);
  dslot = decl_copies->find_slot_with_hash (ielt, ielt.uid, NO_INSERT);
  /* No duplicate decl was created for VAR — nothing to bind to.  */
  if (!dslot)
    return true;
  if (gimple_debug_bind_p (stmt))
    gimple_debug_bind_set_var (stmt, dslot->to);
  else if (gimple_debug_source_bind_p (stmt))
    gimple_debug_source_bind_set_var (stmt, dslot->to);

  /* Rewrite ssa name uses in the bound value; if any use has no copy,
     salvage the stmt by resetting its value rather than removing it.  */
  FOR_EACH_PHI_OR_STMT_USE (use, stmt, oi, SSA_OP_USE)
  {
    name = USE_FROM_PTR (use);
    if (TREE_CODE (name) != SSA_NAME)
      continue;

    elt.version = SSA_NAME_VERSION (name);
    slot = name_copies->find_slot_with_hash (&elt, elt.version, NO_INSERT);
    if (!slot)
      {
	gimple_debug_bind_reset_value (stmt);
	update_stmt (stmt);
	break;
      }

    SET_USE (use, (*slot)->new_name);
  }

  return false;
}
991 
992 /* Callback for htab_traverse.  Adds a field corresponding to the reduction
993    specified in SLOT. The type is passed in DATA.  */
994 
995 int
add_field_for_reduction(reduction_info ** slot,tree type)996 add_field_for_reduction (reduction_info **slot, tree type)
997 {
998 
999   struct reduction_info *const red = *slot;
1000   tree var = reduc_stmt_res (red->reduc_stmt);
1001   tree field = build_decl (gimple_location (red->reduc_stmt), FIELD_DECL,
1002 			   SSA_NAME_IDENTIFIER (var), TREE_TYPE (var));
1003 
1004   insert_field_into_struct (type, field);
1005 
1006   red->field = field;
1007 
1008   return 1;
1009 }
1010 
1011 /* Callback for htab_traverse.  Adds a field corresponding to a ssa name
1012    described in SLOT. The type is passed in DATA.  */
1013 
1014 int
add_field_for_name(name_to_copy_elt ** slot,tree type)1015 add_field_for_name (name_to_copy_elt **slot, tree type)
1016 {
1017   struct name_to_copy_elt *const elt = *slot;
1018   tree name = ssa_name (elt->version);
1019   tree field = build_decl (UNKNOWN_LOCATION,
1020 			   FIELD_DECL, SSA_NAME_IDENTIFIER (name),
1021 			   TREE_TYPE (name));
1022 
1023   insert_field_into_struct (type, field);
1024   elt->field = field;
1025 
1026   return 1;
1027 }
1028 
1029 /* Callback for htab_traverse.  A local result is the intermediate result
1030    computed by a single
1031    thread, or the initial value in case no iteration was executed.
1032    This function creates a phi node reflecting these values.
1033    The phi's result will be stored in NEW_PHI field of the
1034    reduction's data structure.  */
1035 
1036 int
create_phi_for_local_result(reduction_info ** slot,struct loop * loop)1037 create_phi_for_local_result (reduction_info **slot, struct loop *loop)
1038 {
1039   struct reduction_info *const reduc = *slot;
1040   edge e;
1041   gphi *new_phi;
1042   basic_block store_bb, continue_bb;
1043   tree local_res;
1044   location_t locus;
1045 
1046   /* STORE_BB is the block where the phi
1047      should be stored.  It is the destination of the loop exit.
1048      (Find the fallthru edge from GIMPLE_OMP_CONTINUE).  */
1049   continue_bb = single_pred (loop->latch);
1050   store_bb = FALLTHRU_EDGE (continue_bb)->dest;
1051 
1052   /* STORE_BB has two predecessors.  One coming from  the loop
1053      (the reduction's result is computed at the loop),
1054      and another coming from a block preceding the loop,
1055      when no iterations
1056      are executed (the initial value should be taken).  */
1057   if (EDGE_PRED (store_bb, 0) == FALLTHRU_EDGE (continue_bb))
1058     e = EDGE_PRED (store_bb, 1);
1059   else
1060     e = EDGE_PRED (store_bb, 0);
1061   tree lhs = reduc_stmt_res (reduc->reduc_stmt);
1062   local_res = copy_ssa_name (lhs);
1063   locus = gimple_location (reduc->reduc_stmt);
1064   new_phi = create_phi_node (local_res, store_bb);
1065   add_phi_arg (new_phi, reduc->init, e, locus);
1066   add_phi_arg (new_phi, lhs, FALLTHRU_EDGE (continue_bb), locus);
1067   reduc->new_phi = new_phi;
1068 
1069   return 1;
1070 }
1071 
/* Bookkeeping for moving data in and out of the structure used to pass
   values between the threads.  */

struct clsn_data
{
  /* The shared structure variable the stores write into
     (".paral_data_store" in separate_decls_in_region).  */
  tree store;
  /* Pointer through which the loads read the structure back
     (".paral_data_load" in separate_decls_in_region).  */
  tree load;

  /* Basic block in which the stores are emitted.  */
  basic_block store_bb;
  /* Basic block in which the loads are emitted.  */
  basic_block load_bb;
};
1080 
/* Callback for htab_traverse.  Create an atomic instruction for the
   reduction described in SLOT.
   CLSN_DATA annotates the place in memory the atomic operation relates to,
   and the basic block it needs to be generated in.  Always returns 1 so the
   traversal continues.  */

int
create_call_for_reduction_1 (reduction_info **slot, struct clsn_data *clsn_data)
{
  struct reduction_info *const reduc = *slot;
  gimple_stmt_iterator gsi;
  tree type = TREE_TYPE (PHI_RESULT (reduc->reduc_phi));
  tree load_struct;
  basic_block bb;
  basic_block new_bb;
  edge e;
  tree t, addr, ref, x;
  tree tmp_load, name;
  gimple *load;

  /* Compute the address the atomic update operates on: either the
     reduction's field in the shared structure, or the address already
     recorded in the reduction info.  */
  if (reduc->reduc_addr == NULL_TREE)
    {
      load_struct = build_simple_mem_ref (clsn_data->load);
      t = build3 (COMPONENT_REF, type, load_struct, reduc->field, NULL_TREE);

      addr = build_addr (t);
    }
  else
    {
      /* Set the address for the atomic store.  */
      addr = reduc->reduc_addr;

      /* Remove the non-atomic store '*addr = sum'.  */
      tree res = PHI_RESULT (reduc->keep_res);
      use_operand_p use_p;
      gimple *stmt;
      bool single_use_p = single_imm_use (res, &use_p, &stmt);
      gcc_assert (single_use_p);
      /* Splice the store out of the virtual use-def chain before
	 deleting it.  */
      replace_uses_by (gimple_vdef (stmt),
		       gimple_vuse (stmt));
      gimple_stmt_iterator gsi = gsi_for_stmt (stmt);
      gsi_remove (&gsi, true);
    }

  /* Create phi node.  */
  bb = clsn_data->load_bb;

  gsi = gsi_last_bb (bb);
  e = split_block (bb, gsi_stmt (gsi));
  new_bb = e->dest;

  /* Atomically load the current value at ADDR into TMP_LOAD.  */
  tmp_load = create_tmp_var (TREE_TYPE (TREE_TYPE (addr)));
  tmp_load = make_ssa_name (tmp_load);
  load = gimple_build_omp_atomic_load (tmp_load, addr,
				       OMP_MEMORY_ORDER_RELAXED);
  SSA_NAME_DEF_STMT (tmp_load) = load;
  gsi = gsi_start_bb (new_bb);
  gsi_insert_after (&gsi, load, GSI_NEW_STMT);

  e = split_block (new_bb, load);
  new_bb = e->dest;
  gsi = gsi_start_bb (new_bb);
  ref = tmp_load;
  /* Combine the loaded value with this thread's partial result using the
     reduction operation.  */
  x = fold_build2 (reduc->reduction_code,
		   TREE_TYPE (PHI_RESULT (reduc->new_phi)), ref,
		   PHI_RESULT (reduc->new_phi));

  name = force_gimple_operand_gsi (&gsi, x, true, NULL_TREE, true,
				   GSI_CONTINUE_LINKING);

  /* Atomically store the combined value back to ADDR.  */
  gimple *store = gimple_build_omp_atomic_store (name,
						 OMP_MEMORY_ORDER_RELAXED);
  gsi_insert_after (&gsi, store, GSI_NEW_STMT);
  return 1;
}
1155 
1156 /* Create the atomic operation at the join point of the threads.
1157    REDUCTION_LIST describes the reductions in the LOOP.
1158    LD_ST_DATA describes the shared data structure where
1159    shared data is stored in and loaded from.  */
1160 static void
create_call_for_reduction(struct loop * loop,reduction_info_table_type * reduction_list,struct clsn_data * ld_st_data)1161 create_call_for_reduction (struct loop *loop,
1162 			   reduction_info_table_type *reduction_list,
1163 			   struct clsn_data *ld_st_data)
1164 {
1165   reduction_list->traverse <struct loop *, create_phi_for_local_result> (loop);
1166   /* Find the fallthru edge from GIMPLE_OMP_CONTINUE.  */
1167   basic_block continue_bb = single_pred (loop->latch);
1168   ld_st_data->load_bb = FALLTHRU_EDGE (continue_bb)->dest;
1169   reduction_list
1170     ->traverse <struct clsn_data *, create_call_for_reduction_1> (ld_st_data);
1171 }
1172 
1173 /* Callback for htab_traverse.  Loads the final reduction value at the
1174    join point of all threads, and inserts it in the right place.  */
1175 
1176 int
create_loads_for_reductions(reduction_info ** slot,struct clsn_data * clsn_data)1177 create_loads_for_reductions (reduction_info **slot, struct clsn_data *clsn_data)
1178 {
1179   struct reduction_info *const red = *slot;
1180   gimple *stmt;
1181   gimple_stmt_iterator gsi;
1182   tree type = TREE_TYPE (reduc_stmt_res (red->reduc_stmt));
1183   tree load_struct;
1184   tree name;
1185   tree x;
1186 
1187   /* If there's no exit phi, the result of the reduction is unused.  */
1188   if (red->keep_res == NULL)
1189     return 1;
1190 
1191   gsi = gsi_after_labels (clsn_data->load_bb);
1192   load_struct = build_simple_mem_ref (clsn_data->load);
1193   load_struct = build3 (COMPONENT_REF, type, load_struct, red->field,
1194 			NULL_TREE);
1195 
1196   x = load_struct;
1197   name = PHI_RESULT (red->keep_res);
1198   stmt = gimple_build_assign (name, x);
1199 
1200   gsi_insert_after (&gsi, stmt, GSI_NEW_STMT);
1201 
1202   for (gsi = gsi_start_phis (gimple_bb (red->keep_res));
1203        !gsi_end_p (gsi); gsi_next (&gsi))
1204     if (gsi_stmt (gsi) == red->keep_res)
1205       {
1206 	remove_phi_node (&gsi, false);
1207 	return 1;
1208       }
1209   gcc_unreachable ();
1210 }
1211 
1212 /* Load the reduction result that was stored in LD_ST_DATA.
1213    REDUCTION_LIST describes the list of reductions that the
1214    loads should be generated for.  */
1215 static void
create_final_loads_for_reduction(reduction_info_table_type * reduction_list,struct clsn_data * ld_st_data)1216 create_final_loads_for_reduction (reduction_info_table_type *reduction_list,
1217 				  struct clsn_data *ld_st_data)
1218 {
1219   gimple_stmt_iterator gsi;
1220   tree t;
1221   gimple *stmt;
1222 
1223   gsi = gsi_after_labels (ld_st_data->load_bb);
1224   t = build_fold_addr_expr (ld_st_data->store);
1225   stmt = gimple_build_assign (ld_st_data->load, t);
1226 
1227   gsi_insert_before (&gsi, stmt, GSI_NEW_STMT);
1228 
1229   reduction_list
1230     ->traverse <struct clsn_data *, create_loads_for_reductions> (ld_st_data);
1231 
1232 }
1233 
1234 /* Callback for htab_traverse.  Store the neutral value for the
1235   particular reduction's operation, e.g. 0 for PLUS_EXPR,
1236   1 for MULT_EXPR, etc. into the reduction field.
1237   The reduction is specified in SLOT. The store information is
1238   passed in DATA.  */
1239 
1240 int
create_stores_for_reduction(reduction_info ** slot,struct clsn_data * clsn_data)1241 create_stores_for_reduction (reduction_info **slot, struct clsn_data *clsn_data)
1242 {
1243   struct reduction_info *const red = *slot;
1244   tree t;
1245   gimple *stmt;
1246   gimple_stmt_iterator gsi;
1247   tree type = TREE_TYPE (reduc_stmt_res (red->reduc_stmt));
1248 
1249   gsi = gsi_last_bb (clsn_data->store_bb);
1250   t = build3 (COMPONENT_REF, type, clsn_data->store, red->field, NULL_TREE);
1251   stmt = gimple_build_assign (t, red->initial_value);
1252   gsi_insert_after (&gsi, stmt, GSI_NEW_STMT);
1253 
1254   return 1;
1255 }
1256 
1257 /* Callback for htab_traverse.  Creates loads to a field of LOAD in LOAD_BB and
1258    store to a field of STORE in STORE_BB for the ssa name and its duplicate
1259    specified in SLOT.  */
1260 
1261 int
create_loads_and_stores_for_name(name_to_copy_elt ** slot,struct clsn_data * clsn_data)1262 create_loads_and_stores_for_name (name_to_copy_elt **slot,
1263 				  struct clsn_data *clsn_data)
1264 {
1265   struct name_to_copy_elt *const elt = *slot;
1266   tree t;
1267   gimple *stmt;
1268   gimple_stmt_iterator gsi;
1269   tree type = TREE_TYPE (elt->new_name);
1270   tree load_struct;
1271 
1272   gsi = gsi_last_bb (clsn_data->store_bb);
1273   t = build3 (COMPONENT_REF, type, clsn_data->store, elt->field, NULL_TREE);
1274   stmt = gimple_build_assign (t, ssa_name (elt->version));
1275   gsi_insert_after (&gsi, stmt, GSI_NEW_STMT);
1276 
1277   gsi = gsi_last_bb (clsn_data->load_bb);
1278   load_struct = build_simple_mem_ref (clsn_data->load);
1279   t = build3 (COMPONENT_REF, type, load_struct, elt->field, NULL_TREE);
1280   stmt = gimple_build_assign (elt->new_name, t);
1281   gsi_insert_after (&gsi, stmt, GSI_NEW_STMT);
1282 
1283   return 1;
1284 }
1285 
1286 /* Moves all the variables used in LOOP and defined outside of it (including
1287    the initial values of loop phi nodes, and *PER_THREAD if it is a ssa
1288    name) to a structure created for this purpose.  The code
1289 
1290    while (1)
1291      {
1292        use (a);
1293        use (b);
1294      }
1295 
1296    is transformed this way:
1297 
1298    bb0:
1299    old.a = a;
1300    old.b = b;
1301 
1302    bb1:
1303    a' = new->a;
1304    b' = new->b;
1305    while (1)
1306      {
1307        use (a');
1308        use (b');
1309      }
1310 
1311    `old' is stored to *ARG_STRUCT and `new' is stored to NEW_ARG_STRUCT.  The
1312    pointer `new' is intentionally not initialized (the loop will be split to a
1313    separate function later, and `new' will be initialized from its arguments).
1314    LD_ST_DATA holds information about the shared data structure used to pass
1315    information among the threads.  It is initialized here, and
1316    gen_parallel_loop will pass it to create_call_for_reduction that
1317    needs this information.  REDUCTION_LIST describes the reductions
1318    in LOOP.  */
1319 
static void
separate_decls_in_region (edge entry, edge exit,
			  reduction_info_table_type *reduction_list,
			  tree *arg_struct, tree *new_arg_struct,
			  struct clsn_data *ld_st_data)

{
  /* BB0 holds the stores into the structure, BB1 the loads from it.  */
  basic_block bb1 = split_edge (entry);
  basic_block bb0 = single_pred (bb1);
  name_to_copy_table_type name_copies (10);
  int_tree_htab_type decl_copies (10);
  unsigned i;
  tree type, type_name, nvar;
  gimple_stmt_iterator gsi;
  struct clsn_data clsn_data;
  auto_vec<basic_block, 3> body;
  basic_block bb;
  basic_block entry_bb = bb1;
  basic_block exit_bb = exit->dest;
  bool has_debug_stmt = false;

  entry = single_succ_edge (entry_bb);
  gather_blocks_in_sese_region (entry_bb, exit_bb, &body);

  /* First pass: duplicate names and decls for all non-debug stmts in the
     region, recording the mappings in NAME_COPIES and DECL_COPIES.  */
  FOR_EACH_VEC_ELT (body, i, bb)
    {
      if (bb != entry_bb && bb != exit_bb)
	{
	  for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
	    separate_decls_in_region_stmt (entry, exit, gsi_stmt (gsi),
					   &name_copies, &decl_copies);

	  for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
	    {
	      gimple *stmt = gsi_stmt (gsi);

	      if (is_gimple_debug (stmt))
		has_debug_stmt = true;
	      else
		separate_decls_in_region_stmt (entry, exit, stmt,
					       &name_copies, &decl_copies);
	    }
	}
    }

  /* Now process debug bind stmts.  We must not create decls while
     processing debug stmts, so we defer their processing so as to
     make sure we will have debug info for as many variables as
     possible (all of those that were dealt with in the loop above),
     and discard those for which we know there's nothing we can
     do.  */
  if (has_debug_stmt)
    FOR_EACH_VEC_ELT (body, i, bb)
      if (bb != entry_bb && bb != exit_bb)
	{
	  for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi);)
	    {
	      gimple *stmt = gsi_stmt (gsi);

	      if (is_gimple_debug (stmt))
		{
		  /* A true return means no replacement info was available;
		     drop the debug stmt entirely.  */
		  if (separate_decls_in_region_debug (stmt, &name_copies,
						      &decl_copies))
		    {
		      gsi_remove (&gsi, true);
		      continue;
		    }
		}

	      gsi_next (&gsi);
	    }
	}

  if (name_copies.elements () == 0 && reduction_list->elements () == 0)
    {
      /* It may happen that there is nothing to copy (if there are only
         loop carried and external variables in the loop).  */
      *arg_struct = NULL;
      *new_arg_struct = NULL;
    }
  else
    {
      /* Create the type for the structure to store the ssa names to.  */
      type = lang_hooks.types.make_type (RECORD_TYPE);
      type_name = build_decl (UNKNOWN_LOCATION,
			      TYPE_DECL, create_tmp_var_name (".paral_data"),
			      type);
      TYPE_NAME (type) = type_name;

      /* One field per copied ssa name, plus one per reduction.  */
      name_copies.traverse <tree, add_field_for_name> (type);
      if (reduction_list && reduction_list->elements () > 0)
	{
	  /* Create the fields for reductions.  */
	  reduction_list->traverse <tree, add_field_for_reduction> (type);
	}
      layout_type (type);

      /* Create the loads and stores.  */
      *arg_struct = create_tmp_var (type, ".paral_data_store");
      nvar = create_tmp_var (build_pointer_type (type), ".paral_data_load");
      *new_arg_struct = make_ssa_name (nvar);

      ld_st_data->store = *arg_struct;
      ld_st_data->load = *new_arg_struct;
      ld_st_data->store_bb = bb0;
      ld_st_data->load_bb = bb1;

      name_copies
	.traverse <struct clsn_data *, create_loads_and_stores_for_name>
		  (ld_st_data);

      /* Load the calculation from memory (after the join of the threads).  */

      if (reduction_list && reduction_list->elements () > 0)
	{
	  reduction_list
	    ->traverse <struct clsn_data *, create_stores_for_reduction>
	    (ld_st_data);
	  /* The final loads happen after the join, in the exit block, and
	     through a separate pointer ssa name.  */
	  clsn_data.load = make_ssa_name (nvar);
	  clsn_data.load_bb = exit->dest;
	  clsn_data.store = ld_st_data->store;
	  create_final_loads_for_reduction (reduction_list, &clsn_data);
	}
    }
}
1445 
1446 /* Returns true if FN was created to run in parallel.  */
1447 
1448 bool
parallelized_function_p(tree fndecl)1449 parallelized_function_p (tree fndecl)
1450 {
1451   cgraph_node *node = cgraph_node::get (fndecl);
1452   gcc_assert (node != NULL);
1453   return node->parallelized_function;
1454 }
1455 
1456 /* Creates and returns an empty function that will receive the body of
1457    a parallelized loop.  */
1458 
1459 static tree
create_loop_fn(location_t loc)1460 create_loop_fn (location_t loc)
1461 {
1462   char buf[100];
1463   char *tname;
1464   tree decl, type, name, t;
1465   struct function *act_cfun = cfun;
1466   static unsigned loopfn_num;
1467 
1468   loc = LOCATION_LOCUS (loc);
1469   snprintf (buf, 100, "%s.$loopfn", current_function_name ());
1470   ASM_FORMAT_PRIVATE_NAME (tname, buf, loopfn_num++);
1471   clean_symbol_name (tname);
1472   name = get_identifier (tname);
1473   type = build_function_type_list (void_type_node, ptr_type_node, NULL_TREE);
1474 
1475   decl = build_decl (loc, FUNCTION_DECL, name, type);
1476   TREE_STATIC (decl) = 1;
1477   TREE_USED (decl) = 1;
1478   DECL_ARTIFICIAL (decl) = 1;
1479   DECL_IGNORED_P (decl) = 0;
1480   TREE_PUBLIC (decl) = 0;
1481   DECL_UNINLINABLE (decl) = 1;
1482   DECL_EXTERNAL (decl) = 0;
1483   DECL_CONTEXT (decl) = NULL_TREE;
1484   DECL_INITIAL (decl) = make_node (BLOCK);
1485   BLOCK_SUPERCONTEXT (DECL_INITIAL (decl)) = decl;
1486 
1487   t = build_decl (loc, RESULT_DECL, NULL_TREE, void_type_node);
1488   DECL_ARTIFICIAL (t) = 1;
1489   DECL_IGNORED_P (t) = 1;
1490   DECL_RESULT (decl) = t;
1491 
1492   t = build_decl (loc, PARM_DECL, get_identifier (".paral_data_param"),
1493 		  ptr_type_node);
1494   DECL_ARTIFICIAL (t) = 1;
1495   DECL_ARG_TYPE (t) = ptr_type_node;
1496   DECL_CONTEXT (t) = decl;
1497   TREE_USED (t) = 1;
1498   DECL_ARGUMENTS (decl) = t;
1499 
1500   allocate_struct_function (decl, false);
1501   DECL_STRUCT_FUNCTION (decl)->last_clique = act_cfun->last_clique;
1502 
1503   /* The call to allocate_struct_function clobbers CFUN, so we need to restore
1504      it.  */
1505   set_cfun (act_cfun);
1506 
1507   return decl;
1508 }
1509 
1510 /* Replace uses of NAME by VAL in block BB.  */
1511 
1512 static void
replace_uses_in_bb_by(tree name,tree val,basic_block bb)1513 replace_uses_in_bb_by (tree name, tree val, basic_block bb)
1514 {
1515   gimple *use_stmt;
1516   imm_use_iterator imm_iter;
1517 
1518   FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, name)
1519     {
1520       if (gimple_bb (use_stmt) != bb)
1521 	continue;
1522 
1523       use_operand_p use_p;
1524       FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
1525 	SET_USE (use_p, val);
1526     }
1527 }
1528 
1529 /* Do transformation from:
1530 
1531      <bb preheader>:
1532      ...
1533      goto <bb header>
1534 
1535      <bb header>:
1536      ivtmp_a = PHI <ivtmp_init (preheader), ivtmp_b (latch)>
1537      sum_a = PHI <sum_init (preheader), sum_b (latch)>
1538      ...
1539      use (ivtmp_a)
1540      ...
1541      sum_b = sum_a + sum_update
1542      ...
1543      if (ivtmp_a < n)
1544        goto <bb latch>;
1545      else
1546        goto <bb exit>;
1547 
1548      <bb latch>:
1549      ivtmp_b = ivtmp_a + 1;
1550      goto <bb header>
1551 
1552      <bb exit>:
1553      sum_z = PHI <sum_b (cond[1]), ...>
1554 
1555      [1] Where <bb cond> is single_pred (bb latch); In the simplest case,
1556 	 that's <bb header>.
1557 
1558    to:
1559 
1560      <bb preheader>:
1561      ...
1562      goto <bb newheader>
1563 
1564      <bb header>:
1565      ivtmp_a = PHI <ivtmp_c (latch)>
1566      sum_a = PHI <sum_c (latch)>
1567      ...
1568      use (ivtmp_a)
1569      ...
1570      sum_b = sum_a + sum_update
1571      ...
1572      goto <bb latch>;
1573 
1574      <bb newheader>:
1575      ivtmp_c = PHI <ivtmp_init (preheader), ivtmp_b (latch)>
1576      sum_c = PHI <sum_init (preheader), sum_b (latch)>
1577      if (ivtmp_c < n + 1)
1578        goto <bb header>;
1579      else
1580        goto <bb newexit>;
1581 
1582      <bb latch>:
1583      ivtmp_b = ivtmp_a + 1;
1584      goto <bb newheader>
1585 
1586      <bb newexit>:
1587      sum_y = PHI <sum_c (newheader)>
1588 
1589      <bb exit>:
1590      sum_z = PHI <sum_y (newexit), ...>
1591 
1592 
1593    In unified diff format:
1594 
1595       <bb preheader>:
1596       ...
1597 -     goto <bb header>
1598 +     goto <bb newheader>
1599 
1600       <bb header>:
1601 -     ivtmp_a = PHI <ivtmp_init (preheader), ivtmp_b (latch)>
1602 -     sum_a = PHI <sum_init (preheader), sum_b (latch)>
1603 +     ivtmp_a = PHI <ivtmp_c (latch)>
1604 +     sum_a = PHI <sum_c (latch)>
1605       ...
1606       use (ivtmp_a)
1607       ...
1608       sum_b = sum_a + sum_update
1609       ...
1610 -     if (ivtmp_a < n)
1611 -       goto <bb latch>;
1612 +     goto <bb latch>;
1613 +
1614 +     <bb newheader>:
1615 +     ivtmp_c = PHI <ivtmp_init (preheader), ivtmp_b (latch)>
1616 +     sum_c = PHI <sum_init (preheader), sum_b (latch)>
1617 +     if (ivtmp_c < n + 1)
1618 +       goto <bb header>;
1619       else
1620 	goto <bb exit>;
1621 
1622       <bb latch>:
1623       ivtmp_b = ivtmp_a + 1;
1624 -     goto <bb header>
1625 +     goto <bb newheader>
1626 
1627 +    <bb newexit>:
1628 +    sum_y = PHI <sum_c (newheader)>
1629 
1630       <bb exit>:
1631 -     sum_z = PHI <sum_b (cond[1]), ...>
1632 +     sum_z = PHI <sum_y (newexit), ...>
1633 
1634    Note: the example does not show any virtual phis, but these are handled more
1635    or less as reductions.
1636 
1637 
1638    Moves the exit condition of LOOP to the beginning of its header.
1639    REDUCTION_LIST describes the reductions in LOOP.  BOUND is the new loop
1640    bound.  */
1641 
static void
transform_to_exit_first_loop_alt (struct loop *loop,
				  reduction_info_table_type *reduction_list,
				  tree bound)
{
  basic_block header = loop->header;
  basic_block latch = loop->latch;
  edge exit = single_dom_exit (loop);
  basic_block exit_block = exit->dest;
  gcond *cond_stmt = as_a <gcond *> (last_stmt (exit->src));
  /* CONTROL is the iv tested by the exit condition (ivtmp_a above).  */
  tree control = gimple_cond_lhs (cond_stmt);
  edge e;

  /* Rewriting virtuals into loop-closed ssa normal form makes this
     transformation simpler.  It also ensures that the virtuals are in
     loop-closed ssa normal from after the transformation, which is required by
     create_parallel_loop.  */
  rewrite_virtuals_into_loop_closed_ssa (loop);

  /* Create the new_header block.  */
  basic_block new_header = split_block_before_cond_jump (exit->src);
  edge edge_at_split = single_pred_edge (new_header);

  /* Redirect entry edge to new_header.  */
  edge entry = loop_preheader_edge (loop);
  e = redirect_edge_and_branch (entry, new_header);
  gcc_assert (e == entry);

  /* Redirect post_inc_edge to new_header.  */
  edge post_inc_edge = single_succ_edge (latch);
  e = redirect_edge_and_branch (post_inc_edge, new_header);
  gcc_assert (e == post_inc_edge);

  /* Redirect post_cond_edge to header.  */
  edge post_cond_edge = single_pred_edge (latch);
  e = redirect_edge_and_branch (post_cond_edge, header);
  gcc_assert (e == post_cond_edge);

  /* Redirect edge_at_split to latch.  */
  e = redirect_edge_and_branch (edge_at_split, latch);
  gcc_assert (e == edge_at_split);

  /* Set the new loop bound.  */
  gimple_cond_set_rhs (cond_stmt, bound);
  update_stmt (cond_stmt);

  /* Repair the ssa.  */
  /* The phi args queued by the edge redirections above; V holds the
     pending latch arguments (ivtmp_b/sum_b).  */
  vec<edge_var_map> *v = redirect_edge_var_map_vector (post_inc_edge);
  edge_var_map *vm;
  gphi_iterator gsi;
  int i;
  /* Walk the old header phis and the pending latch args in lock step.  */
  for (gsi = gsi_start_phis (header), i = 0;
       !gsi_end_p (gsi) && v->iterate (i, &vm);
       gsi_next (&gsi), i++)
    {
      gphi *phi = gsi.phi ();
      tree res_a = PHI_RESULT (phi);

      /* Create new phi.  */
      tree res_c = copy_ssa_name (res_a, phi);
      gphi *nphi = create_phi_node (res_c, new_header);

      /* Replace ivtmp_a with ivtmp_c in condition 'if (ivtmp_a < n)'.  */
      replace_uses_in_bb_by (res_a, res_c, new_header);

      /* Replace ivtmp/sum_b with ivtmp/sum_c in header phi.  */
      add_phi_arg (phi, res_c, post_cond_edge, UNKNOWN_LOCATION);

      /* Replace sum_b with sum_c in exit phi.  */
      tree res_b = redirect_edge_var_map_def (vm);
      replace_uses_in_bb_by (res_b, res_c, exit_block);

      struct reduction_info *red = reduction_phi (reduction_list, phi);
      /* Every header phi is either a virtual, the control iv, or a
	 reduction — anything else would mean the loop was not suitable
	 for this transformation.  */
      gcc_assert (virtual_operand_p (res_a)
		  || res_a == control
		  || red != NULL);

      if (red)
	{
	  /* Register the new reduction phi.  */
	  red->reduc_phi = nphi;
	  gimple_set_uid (red->reduc_phi, red->reduc_version);
	}
    }
  /* The header phis and the pending latch args must have matched 1:1.  */
  gcc_assert (gsi_end_p (gsi) && !v->iterate (i, &vm));

  /* Set the preheader argument of the new phis to ivtmp/sum_init.  */
  flush_pending_stmts (entry);

  /* Set the latch arguments of the new phis to ivtmp/sum_b.  */
  flush_pending_stmts (post_inc_edge);


  basic_block new_exit_block = NULL;
  if (!single_pred_p (exit->dest))
    {
      /* Create a new empty exit block, inbetween the new loop header and the
	 old exit block.  The function separate_decls_in_region needs this block
	 to insert code that is active on loop exit, but not any other path.  */
      new_exit_block = split_edge (exit);
    }

  /* Insert and register the reduction exit phis.  */
  for (gphi_iterator gsi = gsi_start_phis (exit_block);
       !gsi_end_p (gsi);
       gsi_next (&gsi))
    {
      gphi *phi = gsi.phi ();
      gphi *nphi = NULL;
      tree res_z = PHI_RESULT (phi);
      tree res_c;

      if (new_exit_block != NULL)
	{
	  /* Now that we have a new exit block, duplicate the phi of the old
	     exit block in the new exit block to preserve loop-closed ssa.  */
	  edge succ_new_exit_block = single_succ_edge (new_exit_block);
	  edge pred_new_exit_block = single_pred_edge (new_exit_block);
	  tree res_y = copy_ssa_name (res_z, phi);
	  nphi = create_phi_node (res_y, new_exit_block);
	  res_c = PHI_ARG_DEF_FROM_EDGE (phi, succ_new_exit_block);
	  add_phi_arg (nphi, res_c, pred_new_exit_block, UNKNOWN_LOCATION);
	  add_phi_arg (phi, res_y, succ_new_exit_block, UNKNOWN_LOCATION);
	}
      else
	res_c = PHI_ARG_DEF_FROM_EDGE (phi, exit);

      if (virtual_operand_p (res_z))
	continue;

      /* Record which phi carries the reduction result on loop exit.  */
      gimple *reduc_phi = SSA_NAME_DEF_STMT (res_c);
      struct reduction_info *red = reduction_phi (reduction_list, reduc_phi);
      if (red != NULL)
	red->keep_res = (nphi != NULL
			 ? nphi
			 : phi);
    }

  /* We're going to cancel the loop at the end of gen_parallel_loop, but until
     then we're still using some fields, so only bother about fields that are
     still used: header and latch.
     The loop has a new header bb, so we update it.  The latch bb stays the
     same.  */
  loop->header = new_header;

  /* Recalculate dominance info.  */
  free_dominance_info (CDI_DOMINATORS);
  calculate_dominance_info (CDI_DOMINATORS);

  checking_verify_ssa (true, true);
}
1793 
1794 /* Tries to moves the exit condition of LOOP to the beginning of its header
1795    without duplication of the loop body.  NIT is the number of iterations of the
1796    loop.  REDUCTION_LIST describes the reductions in LOOP.  Return true if
1797    transformation is successful.  */
1798 
1799 static bool
try_transform_to_exit_first_loop_alt(struct loop * loop,reduction_info_table_type * reduction_list,tree nit)1800 try_transform_to_exit_first_loop_alt (struct loop *loop,
1801 				      reduction_info_table_type *reduction_list,
1802 				      tree nit)
1803 {
1804   /* Check whether the latch contains a single statement.  */
1805   if (!gimple_seq_nondebug_singleton_p (bb_seq (loop->latch)))
1806     return false;
1807 
1808   /* Check whether the latch contains no phis.  */
1809   if (phi_nodes (loop->latch) != NULL)
1810     return false;
1811 
1812   /* Check whether the latch contains the loop iv increment.  */
1813   edge back = single_succ_edge (loop->latch);
1814   edge exit = single_dom_exit (loop);
1815   gcond *cond_stmt = as_a <gcond *> (last_stmt (exit->src));
1816   tree control = gimple_cond_lhs (cond_stmt);
1817   gphi *phi = as_a <gphi *> (SSA_NAME_DEF_STMT (control));
1818   tree inc_res = gimple_phi_arg_def (phi, back->dest_idx);
1819   if (gimple_bb (SSA_NAME_DEF_STMT (inc_res)) != loop->latch)
1820     return false;
1821 
1822   /* Check whether there's no code between the loop condition and the latch.  */
1823   if (!single_pred_p (loop->latch)
1824       || single_pred (loop->latch) != exit->src)
1825     return false;
1826 
1827   tree alt_bound = NULL_TREE;
1828   tree nit_type = TREE_TYPE (nit);
1829 
1830   /* Figure out whether nit + 1 overflows.  */
1831   if (TREE_CODE (nit) == INTEGER_CST)
1832     {
1833       if (!tree_int_cst_equal (nit, TYPE_MAX_VALUE (nit_type)))
1834 	{
1835 	  alt_bound = fold_build2_loc (UNKNOWN_LOCATION, PLUS_EXPR, nit_type,
1836 				       nit, build_one_cst (nit_type));
1837 
1838 	  gcc_assert (TREE_CODE (alt_bound) == INTEGER_CST);
1839 	  transform_to_exit_first_loop_alt (loop, reduction_list, alt_bound);
1840 	  return true;
1841 	}
1842       else
1843 	{
1844 	  /* Todo: Figure out if we can trigger this, if it's worth to handle
1845 	     optimally, and if we can handle it optimally.  */
1846 	  return false;
1847 	}
1848     }
1849 
1850   gcc_assert (TREE_CODE (nit) == SSA_NAME);
1851 
1852   /* Variable nit is the loop bound as returned by canonicalize_loop_ivs, for an
1853      iv with base 0 and step 1 that is incremented in the latch, like this:
1854 
1855      <bb header>:
1856      # iv_1 = PHI <0 (preheader), iv_2 (latch)>
1857      ...
1858      if (iv_1 < nit)
1859        goto <bb latch>;
1860      else
1861        goto <bb exit>;
1862 
1863      <bb latch>:
1864      iv_2 = iv_1 + 1;
1865      goto <bb header>;
1866 
1867      The range of iv_1 is [0, nit].  The latch edge is taken for
1868      iv_1 == [0, nit - 1] and the exit edge is taken for iv_1 == nit.  So the
1869      number of latch executions is equal to nit.
1870 
1871      The function max_loop_iterations gives us the maximum number of latch
1872      executions, so it gives us the maximum value of nit.  */
1873   widest_int nit_max;
1874   if (!max_loop_iterations (loop, &nit_max))
1875     return false;
1876 
1877   /* Check if nit + 1 overflows.  */
1878   widest_int type_max = wi::to_widest (TYPE_MAX_VALUE (nit_type));
1879   if (nit_max >= type_max)
1880     return false;
1881 
1882   gimple *def = SSA_NAME_DEF_STMT (nit);
1883 
1884   /* Try to find nit + 1, in the form of n in an assignment nit = n - 1.  */
1885   if (def
1886       && is_gimple_assign (def)
1887       && gimple_assign_rhs_code (def) == PLUS_EXPR)
1888     {
1889       tree op1 = gimple_assign_rhs1 (def);
1890       tree op2 = gimple_assign_rhs2 (def);
1891       if (integer_minus_onep (op1))
1892 	alt_bound = op2;
1893       else if (integer_minus_onep (op2))
1894 	alt_bound = op1;
1895     }
1896 
1897   /* If not found, insert nit + 1.  */
1898   if (alt_bound == NULL_TREE)
1899     {
1900       alt_bound = fold_build2 (PLUS_EXPR, nit_type, nit,
1901 			       build_int_cst_type (nit_type, 1));
1902 
1903       gimple_stmt_iterator gsi = gsi_last_bb (loop_preheader_edge (loop)->src);
1904 
1905       alt_bound
1906 	= force_gimple_operand_gsi (&gsi, alt_bound, true, NULL_TREE, false,
1907 				    GSI_CONTINUE_LINKING);
1908     }
1909 
1910   transform_to_exit_first_loop_alt (loop, reduction_list, alt_bound);
1911   return true;
1912 }
1913 
1914 /* Moves the exit condition of LOOP to the beginning of its header.  NIT is the
1915    number of iterations of the loop.  REDUCTION_LIST describes the reductions in
1916    LOOP.  */
1917 
1918 static void
transform_to_exit_first_loop(struct loop * loop,reduction_info_table_type * reduction_list,tree nit)1919 transform_to_exit_first_loop (struct loop *loop,
1920 			      reduction_info_table_type *reduction_list,
1921 			      tree nit)
1922 {
1923   basic_block *bbs, *nbbs, ex_bb, orig_header;
1924   unsigned n;
1925   bool ok;
1926   edge exit = single_dom_exit (loop), hpred;
1927   tree control, control_name, res, t;
1928   gphi *phi, *nphi;
1929   gassign *stmt;
1930   gcond *cond_stmt, *cond_nit;
1931   tree nit_1;
1932 
     /* Split the header after its labels: the loop phis stay in loop->header,
	while the original header code now lives in ORIG_HEADER.  */
1933   split_block_after_labels (loop->header);
1934   orig_header = single_succ (loop->header);
1935   hpred = single_succ_edge (loop->header);
1936 
1937   cond_stmt = as_a <gcond *> (last_stmt (exit->src));
1938   control = gimple_cond_lhs (cond_stmt);
1939   gcc_assert (gimple_cond_rhs (cond_stmt) == nit);
1940 
1941   /* Make sure that we have phi nodes on exit for all loop header phis
1942      (create_parallel_loop requires that).  */
1943   for (gphi_iterator gsi = gsi_start_phis (loop->header);
1944        !gsi_end_p (gsi);
1945        gsi_next (&gsi))
1946     {
1947       phi = gsi.phi ();
1948       res = PHI_RESULT (phi);
	 /* Give the header phi a fresh result T and create a new phi in
	    ORIG_HEADER producing the old name RES from T, so every header
	    phi value is also available via a phi outside loop->header.  */
1949       t = copy_ssa_name (res, phi);
1950       SET_PHI_RESULT (phi, t);
1951       nphi = create_phi_node (res, orig_header);
1952       add_phi_arg (nphi, t, hpred, UNKNOWN_LOCATION);
1953 
1954       if (res == control)
1955 	{
	     /* The exit condition tested the old name; retarget it to the
		renamed header phi result.  */
1956 	  gimple_cond_set_lhs (cond_stmt, t);
1957 	  update_stmt (cond_stmt);
1958 	  control = t;
1959 	}
1960     }
1961 
1962   bbs = get_loop_body_in_dom_order (loop);
1963 
     /* N is the index of the exit test block within the dom-ordered body.  */
1964   for (n = 0; bbs[n] != exit->src; n++)
1965    continue;
1966   nbbs = XNEWVEC (basic_block, n);
     /* Duplicate the part of the last iteration that runs before the exit
	test (bbs[1] .. bbs[n]) to after the loop; the copies are returned in
	NBBS.  This effectively moves the exit test to the loop entry.  */
1967   ok = gimple_duplicate_sese_tail (single_succ_edge (loop->header), exit,
1968 				   bbs + 1, n, nbbs);
1969   gcc_assert (ok);
1970   free (bbs);
1971   ex_bb = nbbs[0];
1972   free (nbbs);
1973 
1974   /* Other than reductions, the only gimple reg that should be copied
1975      out of the loop is the control variable.  */
1976   exit = single_dom_exit (loop);
1977   control_name = NULL_TREE;
1978   for (gphi_iterator gsi = gsi_start_phis (ex_bb);
1979        !gsi_end_p (gsi); )
1980     {
1981       phi = gsi.phi ();
1982       res = PHI_RESULT (phi);
1983       if (virtual_operand_p (res))
1984 	{
1985 	  gsi_next (&gsi);
1986 	  continue;
1987 	}
1988 
1989       /* Check if it is a part of reduction.  If it is,
1990          keep the phi at the reduction's keep_res field.  The
1991          PHI_RESULT of this phi is the resulting value of the reduction
1992          variable when exiting the loop.  */
1993 
1994       if (reduction_list->elements () > 0)
1995 	{
1996 	  struct reduction_info *red;
1997 
1998 	  tree val = PHI_ARG_DEF_FROM_EDGE (phi, exit);
1999 	  red = reduction_phi (reduction_list, SSA_NAME_DEF_STMT (val));
2000 	  if (red)
2001 	    {
2002 	      red->keep_res = phi;
2003 	      gsi_next (&gsi);
2004 	      continue;
2005 	    }
2006 	}
	 /* Not virtual, not a reduction: it must be the control iv, and
	    there can be only one such phi.  */
2007       gcc_assert (control_name == NULL_TREE
2008 		  && SSA_NAME_VAR (res) == SSA_NAME_VAR (control));
2009       control_name = res;
2010       remove_phi_node (&gsi, false);
2011     }
2012   gcc_assert (control_name != NULL_TREE);
2013 
2014   /* Initialize the control variable to number of iterations
2015      according to the rhs of the exit condition.  */
2016   gimple_stmt_iterator gsi = gsi_after_labels (ex_bb);
2017   cond_nit = as_a <gcond *> (last_stmt (exit->src));
2018   nit_1 =  gimple_cond_rhs (cond_nit);
2019   nit_1 = force_gimple_operand_gsi (&gsi,
2020 				  fold_convert (TREE_TYPE (control_name), nit_1),
2021 				  false, NULL_TREE, false, GSI_SAME_STMT);
2022   stmt = gimple_build_assign (control_name, nit_1);
2023   gsi_insert_before (&gsi, stmt, GSI_NEW_STMT);
2024 }
2025 
2026 /* Create the parallel constructs for LOOP as described in gen_parallel_loop.
2027    LOOP_FN and DATA are the arguments of GIMPLE_OMP_PARALLEL.
2028    NEW_DATA is the variable that should be initialized from the argument
2029    of LOOP_FN.  N_THREADS is the requested number of threads, which can be 0 if
2030    that number is to be determined later.  */
2031 
2032 static void
create_parallel_loop(struct loop * loop,tree loop_fn,tree data,tree new_data,unsigned n_threads,location_t loc,bool oacc_kernels_p)2033 create_parallel_loop (struct loop *loop, tree loop_fn, tree data,
2034 		      tree new_data, unsigned n_threads, location_t loc,
2035 		      bool oacc_kernels_p)
2036 {
2037   gimple_stmt_iterator gsi;
2038   basic_block for_bb, ex_bb, continue_bb;
2039   tree t, param;
2040   gomp_parallel *omp_par_stmt;
2041   gimple *omp_return_stmt1, *omp_return_stmt2;
2042   gimple *phi;
2043   gcond *cond_stmt;
2044   gomp_for *for_stmt;
2045   gomp_continue *omp_cont_stmt;
2046   tree cvar, cvar_init, initvar, cvar_next, cvar_base, type;
2047   edge exit, nexit, guard, end, e;
2048 
     /* For OpenACC kernels we only mark the function; for OpenMP we emit a
	GIMPLE_OMP_PARALLEL region around the loop.  */
2049   if (oacc_kernels_p)
2050     {
2051       gcc_checking_assert (lookup_attribute ("oacc kernels",
2052 					     DECL_ATTRIBUTES (cfun->decl)));
2053       /* Indicate to later processing that this is a parallelized OpenACC
2054 	 kernels construct.  */
2055       DECL_ATTRIBUTES (cfun->decl)
2056 	= tree_cons (get_identifier ("oacc kernels parallelized"),
2057 		     NULL_TREE, DECL_ATTRIBUTES (cfun->decl));
2058     }
2059   else
2060     {
2061       /* Prepare the GIMPLE_OMP_PARALLEL statement.  */
2062 
2063       basic_block bb = loop_preheader_edge (loop)->src;
2064       basic_block paral_bb = single_pred (bb);
2065       gsi = gsi_last_bb (paral_bb);
2066 
2067       gcc_checking_assert (n_threads != 0);
2068       t = build_omp_clause (loc, OMP_CLAUSE_NUM_THREADS);
2069       OMP_CLAUSE_NUM_THREADS_EXPR (t)
2070 	= build_int_cst (integer_type_node, n_threads);
2071       omp_par_stmt = gimple_build_omp_parallel (NULL, t, loop_fn, data);
2072       gimple_set_location (omp_par_stmt, loc);
2073 
2074       gsi_insert_after (&gsi, omp_par_stmt, GSI_NEW_STMT);
2075 
2076       /* Initialize NEW_DATA.  */
2077       if (data)
2078 	{
2079 	  gassign *assign_stmt;
2080 
2081 	  gsi = gsi_after_labels (bb);
2082 
	     /* NEW_DATA = (type of NEW_DATA) &DATA, via the outlined
		function's first argument.  */
2083 	  param = make_ssa_name (DECL_ARGUMENTS (loop_fn));
2084 	  assign_stmt = gimple_build_assign (param, build_fold_addr_expr (data));
2085 	  gsi_insert_before (&gsi, assign_stmt, GSI_SAME_STMT);
2086 
2087 	  assign_stmt = gimple_build_assign (new_data,
2088 					     fold_convert (TREE_TYPE (new_data), param));
2089 	  gsi_insert_before (&gsi, assign_stmt, GSI_SAME_STMT);
2090 	}
2091 
2092       /* Emit GIMPLE_OMP_RETURN for GIMPLE_OMP_PARALLEL.  */
2093       bb = split_loop_exit_edge (single_dom_exit (loop));
2094       gsi = gsi_last_bb (bb);
2095       omp_return_stmt1 = gimple_build_omp_return (false);
2096       gimple_set_location (omp_return_stmt1, loc);
2097       gsi_insert_after (&gsi, omp_return_stmt1, GSI_NEW_STMT);
2098     }
2099 
2100   /* Extract data for GIMPLE_OMP_FOR.  */
2101   gcc_assert (loop->header == single_dom_exit (loop)->src);
2102   cond_stmt = as_a <gcond *> (last_stmt (loop->header));
2103 
     /* CVAR is the control iv; INITVAR replaces its preheader value so the
	OMP for construct can define the per-thread initial value.  */
2104   cvar = gimple_cond_lhs (cond_stmt);
2105   cvar_base = SSA_NAME_VAR (cvar);
2106   phi = SSA_NAME_DEF_STMT (cvar);
2107   cvar_init = PHI_ARG_DEF_FROM_EDGE (phi, loop_preheader_edge (loop));
2108   initvar = copy_ssa_name (cvar);
2109   SET_USE (PHI_ARG_DEF_PTR_FROM_EDGE (phi, loop_preheader_edge (loop)),
2110 	   initvar);
2111   cvar_next = PHI_ARG_DEF_FROM_EDGE (phi, loop_latch_edge (loop));
2112 
     /* Remove the iv increment from the latch; GIMPLE_OMP_CONTINUE will
	define CVAR_NEXT instead.  */
2113   gsi = gsi_last_nondebug_bb (loop->latch);
2114   gcc_assert (gsi_stmt (gsi) == SSA_NAME_DEF_STMT (cvar_next));
2115   gsi_remove (&gsi, true);
2116 
2117   /* Prepare cfg.  */
2118   for_bb = split_edge (loop_preheader_edge (loop));
2119   ex_bb = split_loop_exit_edge (single_dom_exit (loop));
2120   extract_true_false_edges_from_block (loop->header, &nexit, &exit);
2121   gcc_assert (exit == single_dom_exit (loop));
2122 
     /* GUARD models the "no iterations assigned to this thread" path from
	the OMP for block straight to the exit block.  */
2123   guard = make_edge (for_bb, ex_bb, 0);
2124   /* FIXME: What is the probability?  */
2125   guard->probability = profile_probability::guessed_never ();
2126   /* Split the latch edge, so LOOPS_HAVE_SIMPLE_LATCHES is still valid.  */
2127   loop->latch = split_edge (single_succ_edge (loop->latch));
2128   single_pred_edge (loop->latch)->flags = 0;
2129   end = make_single_succ_edge (single_pred (loop->latch), ex_bb, EDGE_FALLTHRU);
2130   rescan_loop_exit (end, true, false);
2131 
     /* Every phi in EX_BB needs arguments for the two new incoming edges
	GUARD and END.  */
2132   for (gphi_iterator gpi = gsi_start_phis (ex_bb);
2133        !gsi_end_p (gpi); gsi_next (&gpi))
2134     {
2135       location_t locus;
2136       gphi *phi = gpi.phi ();
2137       tree def = PHI_ARG_DEF_FROM_EDGE (phi, exit);
2138       gimple *def_stmt = SSA_NAME_DEF_STMT (def);
2139 
2140       /* If the exit phi is not connected to a header phi in the same loop, this
2141 	 value is not modified in the loop, and we're done with this phi.  */
2142       if (!(gimple_code (def_stmt) == GIMPLE_PHI
2143 	    && gimple_bb (def_stmt) == loop->header))
2144 	{
2145 	  locus = gimple_phi_arg_location_from_edge (phi, exit);
2146 	  add_phi_arg (phi, def, guard, locus);
2147 	  add_phi_arg (phi, def, end, locus);
2148 	  continue;
2149 	}
2150 
     /* Otherwise take the initial value over GUARD and the latch value
	over END, mirroring the header phi.  */
2151       gphi *stmt = as_a <gphi *> (def_stmt);
2152       def = PHI_ARG_DEF_FROM_EDGE (stmt, loop_preheader_edge (loop));
2153       locus = gimple_phi_arg_location_from_edge (stmt,
2154 						 loop_preheader_edge (loop));
2155       add_phi_arg (phi, def, guard, locus);
2156 
2157       def = PHI_ARG_DEF_FROM_EDGE (stmt, loop_latch_edge (loop));
2158       locus = gimple_phi_arg_location_from_edge (stmt, loop_latch_edge (loop));
2159       add_phi_arg (phi, def, end, locus);
2160     }
2161   e = redirect_edge_and_branch (exit, nexit->dest);
2162   PENDING_STMT (e) = NULL;
2163 
2164   /* Emit GIMPLE_OMP_FOR.  */
2165   if (oacc_kernels_p)
2166     /* Parallelized OpenACC kernels constructs use gang parallelism.  See also
2167        omp-offload.c:execute_oacc_device_lower.  */
2168     t = build_omp_clause (loc, OMP_CLAUSE_GANG);
2169   else
2170     {
	 /* Translate the --param parloops-schedule selection into the
	    corresponding OMP schedule clause; chunk size 0 means "none".  */
2171       t = build_omp_clause (loc, OMP_CLAUSE_SCHEDULE);
2172       int chunk_size = PARAM_VALUE (PARAM_PARLOOPS_CHUNK_SIZE);
2173       enum PARAM_PARLOOPS_SCHEDULE_KIND schedule_type \
2174 	= (enum PARAM_PARLOOPS_SCHEDULE_KIND) PARAM_VALUE (PARAM_PARLOOPS_SCHEDULE);
2175       switch (schedule_type)
2176 	{
2177 	case PARAM_PARLOOPS_SCHEDULE_KIND_static:
2178 	  OMP_CLAUSE_SCHEDULE_KIND (t) = OMP_CLAUSE_SCHEDULE_STATIC;
2179 	  break;
2180 	case PARAM_PARLOOPS_SCHEDULE_KIND_dynamic:
2181 	  OMP_CLAUSE_SCHEDULE_KIND (t) = OMP_CLAUSE_SCHEDULE_DYNAMIC;
2182 	  break;
2183 	case PARAM_PARLOOPS_SCHEDULE_KIND_guided:
2184 	  OMP_CLAUSE_SCHEDULE_KIND (t) = OMP_CLAUSE_SCHEDULE_GUIDED;
2185 	  break;
2186 	case PARAM_PARLOOPS_SCHEDULE_KIND_auto:
2187 	  OMP_CLAUSE_SCHEDULE_KIND (t) = OMP_CLAUSE_SCHEDULE_AUTO;
2188 	  chunk_size = 0;
2189 	  break;
2190 	case PARAM_PARLOOPS_SCHEDULE_KIND_runtime:
2191 	  OMP_CLAUSE_SCHEDULE_KIND (t) = OMP_CLAUSE_SCHEDULE_RUNTIME;
2192 	  chunk_size = 0;
2193 	  break;
2194 	default:
2195 	  gcc_unreachable ();
2196 	}
2197       if (chunk_size != 0)
2198 	OMP_CLAUSE_SCHEDULE_CHUNK_EXPR (t)
2199 	  = build_int_cst (integer_type_node, chunk_size);
2200     }
2201 
2202   for_stmt = gimple_build_omp_for (NULL,
2203 				   (oacc_kernels_p
2204 				    ? GF_OMP_FOR_KIND_OACC_LOOP
2205 				    : GF_OMP_FOR_KIND_FOR),
2206 				   t, 1, NULL);
2207 
2208   gimple_cond_set_lhs (cond_stmt, cvar_base);
2209   type = TREE_TYPE (cvar);
2210   gimple_set_location (for_stmt, loc);
2211   gimple_omp_for_set_index (for_stmt, 0, initvar);
2212   gimple_omp_for_set_initial (for_stmt, 0, cvar_init);
2213   gimple_omp_for_set_final (for_stmt, 0, gimple_cond_rhs (cond_stmt));
2214   gimple_omp_for_set_cond (for_stmt, 0, gimple_cond_code (cond_stmt));
2215   gimple_omp_for_set_incr (for_stmt, 0, build2 (PLUS_EXPR, type,
2216 						cvar_base,
2217 						build_int_cst (type, 1)));
2218 
2219   gsi = gsi_last_bb (for_bb);
2220   gsi_insert_after (&gsi, for_stmt, GSI_NEW_STMT);
     /* INITVAR is now defined by the OMP for construct.  */
2221   SSA_NAME_DEF_STMT (initvar) = for_stmt;
2222 
2223   /* Emit GIMPLE_OMP_CONTINUE.  */
2224   continue_bb = single_pred (loop->latch);
2225   gsi = gsi_last_bb (continue_bb);
2226   omp_cont_stmt = gimple_build_omp_continue (cvar_next, cvar);
2227   gimple_set_location (omp_cont_stmt, loc);
2228   gsi_insert_after (&gsi, omp_cont_stmt, GSI_NEW_STMT);
2229   SSA_NAME_DEF_STMT (cvar_next) = omp_cont_stmt;
2230 
2231   /* Emit GIMPLE_OMP_RETURN for GIMPLE_OMP_FOR.  */
2232   gsi = gsi_last_bb (ex_bb);
2233   omp_return_stmt2 = gimple_build_omp_return (true);
2234   gimple_set_location (omp_return_stmt2, loc);
2235   gsi_insert_after (&gsi, omp_return_stmt2, GSI_NEW_STMT);
2236 
2237   /* After the above dom info is hosed.  Re-compute it.  */
2238   free_dominance_info (CDI_DOMINATORS);
2239   calculate_dominance_info (CDI_DOMINATORS);
2240 }
2241 
2242 /* Return number of phis in bb.  If COUNT_VIRTUAL_P is false, don't count the
2243    virtual phi.  */
2244 
2245 static unsigned int
num_phis(basic_block bb,bool count_virtual_p)2246 num_phis (basic_block bb, bool count_virtual_p)
2247 {
2248   unsigned int nr_phis = 0;
2249   gphi_iterator gsi;
2250   for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
2251     {
2252       if (!count_virtual_p && virtual_operand_p (PHI_RESULT (gsi.phi ())))
2253 	continue;
2254 
2255       nr_phis++;
2256     }
2257 
2258   return nr_phis;
2259 }
2260 
2261 /* Generates code to execute the iterations of LOOP in N_THREADS
2262    threads in parallel, which can be 0 if that number is to be determined
2263    later.
2264 
2265    NITER describes number of iterations of LOOP.
2266    REDUCTION_LIST describes the reductions existent in the LOOP.  */
2267 
2268 static void
gen_parallel_loop(struct loop * loop,reduction_info_table_type * reduction_list,unsigned n_threads,struct tree_niter_desc * niter,bool oacc_kernels_p)2269 gen_parallel_loop (struct loop *loop,
2270 		   reduction_info_table_type *reduction_list,
2271 		   unsigned n_threads, struct tree_niter_desc *niter,
2272 		   bool oacc_kernels_p)
2273 {
2274   tree many_iterations_cond, type, nit;
2275   tree arg_struct, new_arg_struct;
2276   gimple_seq stmts;
2277   edge entry, exit;
2278   struct clsn_data clsn_data;
2279   location_t loc;
2280   gimple *cond_stmt;
     /* Minimum number of iterations per thread for parallelization to pay
	off; lowered to 2 for loops with an inner loop.  */
2281   unsigned int m_p_thread=2;
2282 
2283   /* From
2284 
2285      ---------------------------------------------------------------------
2286      loop
2287        {
2288 	 IV = phi (INIT, IV + STEP)
2289 	 BODY1;
2290 	 if (COND)
2291 	   break;
2292 	 BODY2;
2293        }
2294      ---------------------------------------------------------------------
2295 
2296      with # of iterations NITER (possibly with MAY_BE_ZERO assumption),
2297      we generate the following code:
2298 
2299      ---------------------------------------------------------------------
2300 
2301      if (MAY_BE_ZERO
2302      || NITER < MIN_PER_THREAD * N_THREADS)
2303      goto original;
2304 
2305      BODY1;
2306      store all local loop-invariant variables used in body of the loop to DATA.
2307      GIMPLE_OMP_PARALLEL (OMP_CLAUSE_NUM_THREADS (N_THREADS), LOOPFN, DATA);
2308      load the variables from DATA.
2309      GIMPLE_OMP_FOR (IV = INIT; COND; IV += STEP) (OMP_CLAUSE_SCHEDULE (static))
2310      BODY2;
2311      BODY1;
2312      GIMPLE_OMP_CONTINUE;
2313      GIMPLE_OMP_RETURN         -- GIMPLE_OMP_FOR
2314      GIMPLE_OMP_RETURN         -- GIMPLE_OMP_PARALLEL
2315      goto end;
2316 
2317      original:
2318      loop
2319        {
2320 	 IV = phi (INIT, IV + STEP)
2321 	 BODY1;
2322 	 if (COND)
2323 	   break;
2324 	 BODY2;
2325        }
2326 
2327      end:
2328 
2329    */
2330 
2331   /* Create two versions of the loop -- in the old one, we know that the
2332      number of iterations is large enough, and we will transform it into the
2333      loop that will be split to loop_fn, the new one will be used for the
2334      remaining iterations.  */
2335 
2336   /* We should compute a better number-of-iterations value for outer loops.
2337      That is, if we have
2338 
2339     for (i = 0; i < n; ++i)
2340       for (j = 0; j < m; ++j)
2341         ...
2342 
2343     we should compute nit = n * m, not nit = n.
2344     Also may_be_zero handling would need to be adjusted.  */
2345 
2346   type = TREE_TYPE (niter->niter);
     /* Gimplify the iteration count into NIT; any statements needed to
	compute it go on the preheader edge.  */
2347   nit = force_gimple_operand (unshare_expr (niter->niter), &stmts, true,
2348 			      NULL_TREE);
2349   if (stmts)
2350     gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), stmts);
2351 
     /* For OpenACC kernels no runtime profitability check is emitted; for
	OpenMP, version the loop on NITER >= m_p_thread * n_threads.  */
2352   if (!oacc_kernels_p)
2353     {
2354       if (loop->inner)
2355 	m_p_thread=2;
2356       else
2357 	m_p_thread=MIN_PER_THREAD;
2358 
2359       gcc_checking_assert (n_threads != 0);
2360       many_iterations_cond =
2361 	fold_build2 (GE_EXPR, boolean_type_node,
2362 		     nit, build_int_cst (type, m_p_thread * n_threads - 1));
2363 
2364       many_iterations_cond
2365 	= fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
2366 		       invert_truthvalue (unshare_expr (niter->may_be_zero)),
2367 		       many_iterations_cond);
2368       many_iterations_cond
2369 	= force_gimple_operand (many_iterations_cond, &stmts, false, NULL_TREE);
2370       if (stmts)
2371 	gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), stmts);
2372       if (!is_gimple_condexpr (many_iterations_cond))
2373 	{
2374 	  many_iterations_cond
2375 	    = force_gimple_operand (many_iterations_cond, &stmts,
2376 				    true, NULL_TREE);
2377 	  if (stmts)
2378 	    gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop),
2379 					      stmts);
2380 	}
2381 
2382       initialize_original_copy_tables ();
2383 
2384       /* We assume that the loop usually iterates a lot.  */
2385       loop_version (loop, many_iterations_cond, NULL,
2386 		    profile_probability::likely (),
2387 		    profile_probability::unlikely (),
2388 		    profile_probability::likely (),
2389 		    profile_probability::unlikely (), true);
2390       update_ssa (TODO_update_ssa);
2391       free_original_copy_tables ();
2392     }
2393 
2394   /* Base all the induction variables in LOOP on a single control one.  */
2395   canonicalize_loop_ivs (loop, &nit, true);
     /* On success the header has exactly one non-virtual phi per reduction
	plus the single control iv phi; otherwise bail out.  */
2396   if (num_phis (loop->header, false) != reduction_list->elements () + 1)
2397     {
2398       /* The call to canonicalize_loop_ivs above failed to "base all the
2399 	 induction variables in LOOP on a single control one".  Do damage
2400 	 control.  */
2401       basic_block preheader = loop_preheader_edge (loop)->src;
2402       basic_block cond_bb = single_pred (preheader);
2403       gcond *cond = as_a <gcond *> (gsi_stmt (gsi_last_bb (cond_bb)));
2404       gimple_cond_make_true (cond);
2405       update_stmt (cond);
2406       /* We've gotten rid of the duplicate loop created by loop_version, but
2407 	 we can't undo whatever canonicalize_loop_ivs has done.
2408 	 TODO: Fix this properly by ensuring that the call to
2409 	 canonicalize_loop_ivs succeeds.  */
2410       if (dump_file
2411 	  && (dump_flags & TDF_DETAILS))
2412 	fprintf (dump_file, "canonicalize_loop_ivs failed for loop %d,"
2413 		 " aborting transformation\n", loop->num);
2414       return;
2415     }
2416 
2417   /* Ensure that the exit condition is the first statement in the loop.
2418      The common case is that latch of the loop is empty (apart from the
2419      increment) and immediately follows the loop exit test.  Attempt to move the
2420      entry of the loop directly before the exit check and increase the number of
2421      iterations of the loop by one.  */
2422   if (try_transform_to_exit_first_loop_alt (loop, reduction_list, nit))
2423     {
2424       if (dump_file
2425 	  && (dump_flags & TDF_DETAILS))
2426 	fprintf (dump_file,
2427 		 "alternative exit-first loop transform succeeded"
2428 		 " for loop %d\n", loop->num);
2429     }
2430   else
2431     {
2432       if (oacc_kernels_p)
2433 	n_threads = 1;
2434 
2435       /* Fall back on the method that handles more cases, but duplicates the
2436 	 loop body: move the exit condition of LOOP to the beginning of its
2437 	 header, and duplicate the part of the last iteration that gets disabled
2438 	 to the exit of the loop.  */
2439       transform_to_exit_first_loop (loop, reduction_list, nit);
2440     }
2441 
2442   /* Generate initializations for reductions.  */
2443   if (reduction_list->elements () > 0)
2444     reduction_list->traverse <struct loop *, initialize_reductions> (loop);
2445 
2446   /* Eliminate the references to local variables from the loop.  */
2447   gcc_assert (single_exit (loop));
2448   entry = loop_preheader_edge (loop);
2449   exit = single_dom_exit (loop);
2450 
2451   /* This rewrites the body in terms of new variables.  This has already
2452      been done for oacc_kernels_p in pass_lower_omp/lower_omp ().  */
2453   if (!oacc_kernels_p)
2454     {
2455       eliminate_local_variables (entry, exit);
2456       /* In the old loop, move all variables non-local to the loop to a
2457 	 structure and back, and create separate decls for the variables used in
2458 	 loop.  */
2459       separate_decls_in_region (entry, exit, reduction_list, &arg_struct,
2460 				&new_arg_struct, &clsn_data);
2461     }
2462   else
2463     {
2464       arg_struct = NULL_TREE;
2465       new_arg_struct = NULL_TREE;
2466       clsn_data.load = NULL_TREE;
2467       clsn_data.load_bb = exit->dest;
2468       clsn_data.store = NULL_TREE;
2469       clsn_data.store_bb = NULL;
2470     }
2471 
2472   /* Create the parallel constructs.  */
2473   loc = UNKNOWN_LOCATION;
2474   cond_stmt = last_stmt (loop->header);
2475   if (cond_stmt)
2476     loc = gimple_location (cond_stmt);
2477   create_parallel_loop (loop, create_loop_fn (loc), arg_struct, new_arg_struct,
2478 			n_threads, loc, oacc_kernels_p);
2479   if (reduction_list->elements () > 0)
2480     create_call_for_reduction (loop, reduction_list, &clsn_data);
2481 
2482   scev_reset ();
2483 
2484   /* Free loop bound estimations that could contain references to
2485      removed statements.  */
2486   free_numbers_of_iterations_estimates (cfun);
2487 }
2488 
2489 /* Returns true when LOOP contains vector phi nodes.  */
2490 
2491 static bool
loop_has_vector_phi_nodes(struct loop * loop ATTRIBUTE_UNUSED)2492 loop_has_vector_phi_nodes (struct loop *loop ATTRIBUTE_UNUSED)
2493 {
2494   unsigned i;
2495   basic_block *bbs = get_loop_body_in_dom_order (loop);
2496   gphi_iterator gsi;
2497   bool res = true;
2498 
2499   for (i = 0; i < loop->num_nodes; i++)
2500     for (gsi = gsi_start_phis (bbs[i]); !gsi_end_p (gsi); gsi_next (&gsi))
2501       if (TREE_CODE (TREE_TYPE (PHI_RESULT (gsi.phi ()))) == VECTOR_TYPE)
2502 	goto end;
2503 
2504   res = false;
2505  end:
2506   free (bbs);
2507   return res;
2508 }
2509 
2510 /* Create a reduction_info struct, initialize it with REDUC_STMT
2511    and PHI, insert it to the REDUCTION_LIST.  */
2512 
2513 static void
build_new_reduction(reduction_info_table_type * reduction_list,gimple * reduc_stmt,gphi * phi)2514 build_new_reduction (reduction_info_table_type *reduction_list,
2515 		     gimple *reduc_stmt, gphi *phi)
2516 {
2517   reduction_info **slot;
2518   struct reduction_info *new_reduction;
2519   enum tree_code reduction_code;
2520 
2521   gcc_assert (reduc_stmt);
2522 
2523   if (gimple_code (reduc_stmt) == GIMPLE_PHI)
2524     {
2525       tree op1 = PHI_ARG_DEF (reduc_stmt, 0);
2526       gimple *def1 = SSA_NAME_DEF_STMT (op1);
2527       reduction_code = gimple_assign_rhs_code (def1);
2528     }
2529   else
2530     reduction_code = gimple_assign_rhs_code (reduc_stmt);
2531   /* Check for OpenMP supported reduction.  */
2532   switch (reduction_code)
2533     {
2534     case PLUS_EXPR:
2535     case MULT_EXPR:
2536     case MAX_EXPR:
2537     case MIN_EXPR:
2538     case BIT_IOR_EXPR:
2539     case BIT_XOR_EXPR:
2540     case BIT_AND_EXPR:
2541     case TRUTH_OR_EXPR:
2542     case TRUTH_XOR_EXPR:
2543     case TRUTH_AND_EXPR:
2544       break;
2545     default:
2546       return;
2547     }
2548 
2549   if (dump_file && (dump_flags & TDF_DETAILS))
2550     {
2551       fprintf (dump_file,
2552 	       "Detected reduction. reduction stmt is:\n");
2553       print_gimple_stmt (dump_file, reduc_stmt, 0);
2554       fprintf (dump_file, "\n");
2555     }
2556 
2557   new_reduction = XCNEW (struct reduction_info);
2558 
2559   new_reduction->reduc_stmt = reduc_stmt;
2560   new_reduction->reduc_phi = phi;
2561   new_reduction->reduc_version = SSA_NAME_VERSION (gimple_phi_result (phi));
2562   new_reduction->reduction_code = reduction_code;
2563   slot = reduction_list->find_slot (new_reduction, INSERT);
2564   *slot = new_reduction;
2565 }
2566 
2567 /* Callback for htab_traverse.  Sets gimple_uid of reduc_phi stmts.  */
2568 
2569 int
set_reduc_phi_uids(reduction_info ** slot,void * data ATTRIBUTE_UNUSED)2570 set_reduc_phi_uids (reduction_info **slot, void *data ATTRIBUTE_UNUSED)
2571 {
2572   struct reduction_info *const red = *slot;
2573   gimple_set_uid (red->reduc_phi, red->reduc_version);
2574   return 1;
2575 }
2576 
2577 /* Return true if the type of reduction performed by STMT_INFO is suitable
2578    for this pass.  */
2579 
2580 static bool
valid_reduction_p(stmt_vec_info stmt_info)2581 valid_reduction_p (stmt_vec_info stmt_info)
2582 {
2583   /* Parallelization would reassociate the operation, which isn't
2584      allowed for in-order reductions.  */
2585   vect_reduction_type reduc_type = STMT_VINFO_REDUC_TYPE (stmt_info);
2586   return reduc_type != FOLD_LEFT_REDUCTION;
2587 }
2588 
2589 /* Detect all reductions in the LOOP, insert them into REDUCTION_LIST.  */
2590 
2591 static void
gather_scalar_reductions(loop_p loop,reduction_info_table_type * reduction_list)2592 gather_scalar_reductions (loop_p loop, reduction_info_table_type *reduction_list)
2593 {
2594   gphi_iterator gsi;
2595   loop_vec_info simple_loop_info;
     /* Outer-loop phis of double reductions, and the corresponding inner
	reduction statements, collected for a second analysis pass below.  */
2596   auto_vec<gphi *, 4> double_reduc_phis;
2597   auto_vec<gimple *, 4> double_reduc_stmts;
2598 
2599   vec_info_shared shared;
2600   simple_loop_info = vect_analyze_loop_form (loop, &shared);
2601   if (simple_loop_info == NULL)
2602     goto gather_done;
2603 
2604   for (gsi = gsi_start_phis (loop->header); !gsi_end_p (gsi); gsi_next (&gsi))
2605     {
2606       gphi *phi = gsi.phi ();
2607       affine_iv iv;
2608       tree res = PHI_RESULT (phi);
2609       bool double_reduc;
2610 
2611       if (virtual_operand_p (res))
2612 	continue;
2613 
	 /* A simple induction variable is not a reduction.  */
2614       if (simple_iv (loop, loop, res, &iv, true))
2615 	continue;
2616 
2617       stmt_vec_info reduc_stmt_info
2618 	= vect_force_simple_reduction (simple_loop_info,
2619 				       simple_loop_info->lookup_stmt (phi),
2620 				       &double_reduc, true);
2621       if (!reduc_stmt_info || !valid_reduction_p (reduc_stmt_info))
2622 	continue;
2623 
2624       if (double_reduc)
2625 	{
	     /* Double reductions are only handled for a nest depth of
		exactly two.  */
2626 	  if (loop->inner->inner != NULL)
2627 	    continue;
2628 
2629 	  double_reduc_phis.safe_push (phi);
2630 	  double_reduc_stmts.safe_push (reduc_stmt_info->stmt);
2631 	  continue;
2632 	}
2633 
2634       build_new_reduction (reduction_list, reduc_stmt_info->stmt, phi);
2635     }
2636   delete simple_loop_info;
2637 
     /* Second pass: validate the collected double reductions by analyzing
	the inner loop's form.  */
2638   if (!double_reduc_phis.is_empty ())
2639     {
2640       vec_info_shared shared;
2641       simple_loop_info = vect_analyze_loop_form (loop->inner, &shared);
2642       if (simple_loop_info)
2643 	{
2644 	  gphi *phi;
2645 	  unsigned int i;
2646 
2647 	  FOR_EACH_VEC_ELT (double_reduc_phis, i, phi)
2648 	    {
2649 	      affine_iv iv;
2650 	      tree res = PHI_RESULT (phi);
2651 	      bool double_reduc;
2652 
		 /* The outer phi result must feed exactly one use: the
		    inner loop's header phi.  */
2653 	      use_operand_p use_p;
2654 	      gimple *inner_stmt;
2655 	      bool single_use_p = single_imm_use (res, &use_p, &inner_stmt);
2656 	      gcc_assert (single_use_p);
2657 	      if (gimple_code (inner_stmt) != GIMPLE_PHI)
2658 		continue;
2659 	      gphi *inner_phi = as_a <gphi *> (inner_stmt);
2660 	      if (simple_iv (loop->inner, loop->inner, PHI_RESULT (inner_phi),
2661 			     &iv, true))
2662 		continue;
2663 
2664 	      stmt_vec_info inner_phi_info
2665 		= simple_loop_info->lookup_stmt (inner_phi);
2666 	      stmt_vec_info inner_reduc_stmt_info
2667 		= vect_force_simple_reduction (simple_loop_info,
2668 					       inner_phi_info,
2669 					       &double_reduc, true);
2670 	      gcc_assert (!double_reduc);
2671 	      if (!inner_reduc_stmt_info
2672 		  || !valid_reduction_p (inner_reduc_stmt_info))
2673 		continue;
2674 
2675 	      build_new_reduction (reduction_list, double_reduc_stmts[i], phi);
2676 	    }
2677 	  delete simple_loop_info;
2678 	}
2679     }
2680 
2681  gather_done:
2682   if (reduction_list->elements () == 0)
2683     return;
2684 
2685   /* As gimple_uid is used by the vectorizer in between vect_analyze_loop_form
2686      and delete simple_loop_info, we can set gimple_uid of reduc_phi stmts only
2687      now.  */
2688   basic_block bb;
2689   FOR_EACH_BB_FN (bb, cfun)
2690     for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
2691       gimple_set_uid (gsi_stmt (gsi), (unsigned int)-1);
2692   reduction_list->traverse <void *, set_reduc_phi_uids> (NULL);
2693 }
2694 
2695 /* Try to initialize NITER for code generation part.  */
2696 
2697 static bool
try_get_loop_niter(loop_p loop,struct tree_niter_desc * niter)2698 try_get_loop_niter (loop_p loop, struct tree_niter_desc *niter)
2699 {
2700   edge exit = single_dom_exit (loop);
2701 
2702   gcc_assert (exit);
2703 
2704   /* We need to know # of iterations, and there should be no uses of values
2705      defined inside loop outside of it, unless the values are invariants of
2706      the loop.  */
2707   if (!number_of_iterations_exit (loop, exit, niter, false))
2708     {
2709       if (dump_file && (dump_flags & TDF_DETAILS))
2710 	fprintf (dump_file, "  FAILED: number of iterations not known\n");
2711       return false;
2712     }
2713 
2714   return true;
2715 }
2716 
2717 /* Return the default def of the first function argument.  */
2718 
2719 static tree
get_omp_data_i_param(void)2720 get_omp_data_i_param (void)
2721 {
2722   tree decl = DECL_ARGUMENTS (cfun->decl);
2723   gcc_assert (DECL_CHAIN (decl) == NULL_TREE);
2724   return ssa_default_def (cfun, decl);
2725 }
2726 
2727 /* For PHI in loop header of LOOP, look for pattern:
2728 
2729    <bb preheader>
2730    .omp_data_i = &.omp_data_arr;
2731    addr = .omp_data_i->sum;
2732    sum_a = *addr;
2733 
2734    <bb header>:
2735    sum_b = PHI <sum_a (preheader), sum_c (latch)>
2736 
2737    and return addr.  Otherwise, return NULL_TREE.  */
2738 
2739 static tree
find_reduc_addr(struct loop * loop,gphi * phi)2740 find_reduc_addr (struct loop *loop, gphi *phi)
2741 {
2742   edge e = loop_preheader_edge (loop);
2743   tree arg = PHI_ARG_DEF_FROM_EDGE (phi, e);
2744   gimple *stmt = SSA_NAME_DEF_STMT (arg);
2745   if (!gimple_assign_single_p (stmt))
2746     return NULL_TREE;
2747   tree memref = gimple_assign_rhs1 (stmt);
2748   if (TREE_CODE (memref) != MEM_REF)
2749     return NULL_TREE;
2750   tree addr = TREE_OPERAND (memref, 0);
2751 
2752   gimple *stmt2 = SSA_NAME_DEF_STMT (addr);
2753   if (!gimple_assign_single_p (stmt2))
2754     return NULL_TREE;
2755   tree compref = gimple_assign_rhs1 (stmt2);
2756   if (TREE_CODE (compref) != COMPONENT_REF)
2757     return NULL_TREE;
2758   tree addr2 = TREE_OPERAND (compref, 0);
2759   if (TREE_CODE (addr2) != MEM_REF)
2760     return NULL_TREE;
2761   addr2 = TREE_OPERAND (addr2, 0);
2762   if (TREE_CODE (addr2) != SSA_NAME
2763       || addr2 != get_omp_data_i_param ())
2764     return NULL_TREE;
2765 
2766   return addr;
2767 }
2768 
2769 /* Try to initialize REDUCTION_LIST for code generation part.
2770    REDUCTION_LIST describes the reductions.  */
2771 
static bool
try_create_reduction_list (loop_p loop,
			   reduction_info_table_type *reduction_list,
			   bool oacc_kernels_p)
{
  edge exit = single_dom_exit (loop);
  gphi_iterator gsi;

  gcc_assert (exit);

  /* Try to get rid of exit phis.  */
  final_value_replacement_loop (loop);

  /* Detect the reductions of LOOP and record them in REDUCTION_LIST.  */
  gather_scalar_reductions (loop, reduction_list);


  /* Every non-virtual value live across the exit must be the result of a
     recognized reduction; record the exit phi of each such reduction in
     its keep_res field.  Anything else is a scalar dependency we cannot
     parallelize.  */
  for (gsi = gsi_start_phis (exit->dest); !gsi_end_p (gsi); gsi_next (&gsi))
    {
      gphi *phi = gsi.phi ();
      struct reduction_info *red;
      imm_use_iterator imm_iter;
      use_operand_p use_p;
      gimple *reduc_phi;
      tree val = PHI_ARG_DEF_FROM_EDGE (phi, exit);

      if (TREE_CODE (val) == SSA_NAME && !virtual_operand_p (val))
	{
	  if (dump_file && (dump_flags & TDF_DETAILS))
	    {
	      fprintf (dump_file, "phi is ");
	      print_gimple_stmt (dump_file, phi, 0);
	      fprintf (dump_file, "arg of phi to exit:   value ");
	      print_generic_expr (dump_file, val);
	      fprintf (dump_file, " used outside loop\n");
	      fprintf (dump_file,
		       "  checking if it is part of reduction pattern:\n");
	    }
	  /* No reductions were found at all, so this live-out value
	     cannot be part of one.  */
	  if (reduction_list->elements () == 0)
	    {
	      if (dump_file && (dump_flags & TDF_DETAILS))
		fprintf (dump_file,
			 "  FAILED: it is not a part of reduction.\n");
	      return false;
	    }
	  /* Find the use of VAL inside the loop; for a reduction this is
	     the reduction phi in the loop header.  */
	  reduc_phi = NULL;
	  FOR_EACH_IMM_USE_FAST (use_p, imm_iter, val)
	    {
	      if (!gimple_debug_bind_p (USE_STMT (use_p))
		  && flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
		{
		  reduc_phi = USE_STMT (use_p);
		  break;
		}
	    }
	  red = reduction_phi (reduction_list, reduc_phi);
	  if (red == NULL)
	    {
	      if (dump_file && (dump_flags & TDF_DETAILS))
		fprintf (dump_file,
			 "  FAILED: it is not a part of reduction.\n");
	      return false;
	    }
	  /* Each reduction may feed at most one exit phi; a second one
	     means we cannot rebuild the final value.  */
	  if (red->keep_res != NULL)
	    {
	      if (dump_file && (dump_flags & TDF_DETAILS))
		fprintf (dump_file,
			 "  FAILED: reduction has multiple exit phis.\n");
	      return false;
	    }
	  red->keep_res = phi;
	  if (dump_file && (dump_flags & TDF_DETAILS))
	    {
	      fprintf (dump_file, "reduction phi is  ");
	      print_gimple_stmt (dump_file, red->reduc_phi, 0);
	      fprintf (dump_file, "reduction stmt is  ");
	      print_gimple_stmt (dump_file, red->reduc_stmt, 0);
	    }
	}
    }

  /* The iterations of the loop may communicate only through bivs whose
     iteration space can be distributed efficiently.  */
  for (gsi = gsi_start_phis (loop->header); !gsi_end_p (gsi); gsi_next (&gsi))
    {
      gphi *phi = gsi.phi ();
      tree def = PHI_RESULT (phi);
      affine_iv iv;

      /* A header phi that is neither virtual, a simple iv, nor a
	 recorded reduction is a cross-iteration scalar dependency.  */
      if (!virtual_operand_p (def) && !simple_iv (loop, loop, def, &iv, true))
	{
	  struct reduction_info *red;

	  red = reduction_phi (reduction_list, phi);
	  if (red == NULL)
	    {
	      if (dump_file && (dump_flags & TDF_DETAILS))
		fprintf (dump_file,
			 "  FAILED: scalar dependency between iterations\n");
	      return false;
	    }
	}
    }

  /* For OpenACC kernels, additionally locate the address each reduction
     result is loaded from, so the result can be written back there.  */
  if (oacc_kernels_p)
    {
      for (gsi = gsi_start_phis (loop->header); !gsi_end_p (gsi);
	   gsi_next (&gsi))
	{
	  gphi *phi = gsi.phi ();
	  tree def = PHI_RESULT (phi);
	  affine_iv iv;

	  if (!virtual_operand_p (def)
	      && !simple_iv (loop, loop, def, &iv, true))
	    {
	      tree addr = find_reduc_addr (loop, phi);
	      if (addr == NULL_TREE)
		return false;
	      struct reduction_info *red = reduction_phi (reduction_list, phi);
	      red->reduc_addr = addr;
	    }
	}
    }

  return true;
}
2898 
2899 /* Return true if LOOP contains phis with ADDR_EXPR in args.  */
2900 
2901 static bool
loop_has_phi_with_address_arg(struct loop * loop)2902 loop_has_phi_with_address_arg (struct loop *loop)
2903 {
2904   basic_block *bbs = get_loop_body (loop);
2905   bool res = false;
2906 
2907   unsigned i, j;
2908   gphi_iterator gsi;
2909   for (i = 0; i < loop->num_nodes; i++)
2910     for (gsi = gsi_start_phis (bbs[i]); !gsi_end_p (gsi); gsi_next (&gsi))
2911       {
2912 	gphi *phi = gsi.phi ();
2913 	for (j = 0; j < gimple_phi_num_args (phi); j++)
2914 	  {
2915 	    tree arg = gimple_phi_arg_def (phi, j);
2916 	    if (TREE_CODE (arg) == ADDR_EXPR)
2917 	      {
2918 		/* This should be handled by eliminate_local_variables, but that
2919 		   function currently ignores phis.  */
2920 		res = true;
2921 		goto end;
2922 	      }
2923 	  }
2924       }
2925  end:
2926   free (bbs);
2927 
2928   return res;
2929 }
2930 
2931 /* Return true if memory ref REF (corresponding to the stmt at GSI in
2932    REGIONS_BB[I]) conflicts with the statements in REGIONS_BB[I] after gsi,
2933    or the statements in REGIONS_BB[I + n].  REF_IS_STORE indicates if REF is a
2934    store.  Ignore conflicts with SKIP_STMT.  */
2935 
2936 static bool
ref_conflicts_with_region(gimple_stmt_iterator gsi,ao_ref * ref,bool ref_is_store,vec<basic_block> region_bbs,unsigned int i,gimple * skip_stmt)2937 ref_conflicts_with_region (gimple_stmt_iterator gsi, ao_ref *ref,
2938 			   bool ref_is_store, vec<basic_block> region_bbs,
2939 			   unsigned int i, gimple *skip_stmt)
2940 {
2941   basic_block bb = region_bbs[i];
2942   gsi_next (&gsi);
2943 
2944   while (true)
2945     {
2946       for (; !gsi_end_p (gsi);
2947 	   gsi_next (&gsi))
2948 	{
2949 	  gimple *stmt = gsi_stmt (gsi);
2950 	  if (stmt == skip_stmt)
2951 	    {
2952 	      if (dump_file)
2953 		{
2954 		  fprintf (dump_file, "skipping reduction store: ");
2955 		  print_gimple_stmt (dump_file, stmt, 0);
2956 		}
2957 	      continue;
2958 	    }
2959 
2960 	  if (!gimple_vdef (stmt)
2961 	      && !gimple_vuse (stmt))
2962 	    continue;
2963 
2964 	  if (gimple_code (stmt) == GIMPLE_RETURN)
2965 	    continue;
2966 
2967 	  if (ref_is_store)
2968 	    {
2969 	      if (ref_maybe_used_by_stmt_p (stmt, ref))
2970 		{
2971 		  if (dump_file)
2972 		    {
2973 		      fprintf (dump_file, "Stmt ");
2974 		      print_gimple_stmt (dump_file, stmt, 0);
2975 		    }
2976 		  return true;
2977 		}
2978 	    }
2979 	  else
2980 	    {
2981 	      if (stmt_may_clobber_ref_p_1 (stmt, ref))
2982 		{
2983 		  if (dump_file)
2984 		    {
2985 		      fprintf (dump_file, "Stmt ");
2986 		      print_gimple_stmt (dump_file, stmt, 0);
2987 		    }
2988 		  return true;
2989 		}
2990 	    }
2991 	}
2992       i++;
2993       if (i == region_bbs.length ())
2994 	break;
2995       bb = region_bbs[i];
2996       gsi = gsi_start_bb (bb);
2997     }
2998 
2999   return false;
3000 }
3001 
3002 /* Return true if the bbs in REGION_BBS but not in in_loop_bbs can be executed
3003    in parallel with REGION_BBS containing the loop.  Return the stores of
3004    reduction results in REDUCTION_STORES.  */
3005 
static bool
oacc_entry_exit_ok_1 (bitmap in_loop_bbs, vec<basic_block> region_bbs,
		      reduction_info_table_type *reduction_list,
		      bitmap reduction_stores)
{
  tree omp_data_i = get_omp_data_i_param ();

  unsigned i;
  basic_block bb;
  FOR_EACH_VEC_ELT (region_bbs, i, bb)
    {
      /* Only the entry/exit blocks outside the loop are checked.  */
      if (bitmap_bit_p (in_loop_bbs, bb->index))
	continue;

      gimple_stmt_iterator gsi;
      for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
	   gsi_next (&gsi))
	{
	  gimple *stmt = gsi_stmt (gsi);
	  /* Set to the reduction store that is allowed to conflict with
	     this statement's memory reference, if any.  */
	  gimple *skip_stmt = NULL;

	  if (is_gimple_debug (stmt)
	      || gimple_code (stmt) == GIMPLE_COND)
	    continue;

	  ao_ref ref;
	  bool ref_is_store = false;
	  if (gimple_assign_load_p (stmt))
	    {
	      /* Loads of fields from the .omp_data_i receiver object are
		 harmless; they only read the context pointer block.  */
	      tree rhs = gimple_assign_rhs1 (stmt);
	      tree base = get_base_address (rhs);
	      if (TREE_CODE (base) == MEM_REF
		  && operand_equal_p (TREE_OPERAND (base, 0), omp_data_i, 0))
		continue;

	      /* Recognize the load feeding a reduction phi, and from
		 there the store of the final reduction result; record
		 that store in REDUCTION_STORES and skip it during the
		 conflict check below.  */
	      tree lhs = gimple_assign_lhs (stmt);
	      if (TREE_CODE (lhs) == SSA_NAME
		  && has_single_use (lhs))
		{
		  use_operand_p use_p;
		  gimple *use_stmt;
		  single_imm_use (lhs, &use_p, &use_stmt);
		  if (gimple_code (use_stmt) == GIMPLE_PHI)
		    {
		      struct reduction_info *red;
		      red = reduction_phi (reduction_list, use_stmt);
		      /* NOTE(review): reduction_phi can return NULL; this
			 assumes every phi fed by such a load is a recorded
			 reduction -- confirm.  */
		      tree val = PHI_RESULT (red->keep_res);
		      if (has_single_use (val))
			{
			  single_imm_use (val, &use_p, &use_stmt);
			  if (gimple_store_p (use_stmt))
			    {
			      unsigned int id
				= SSA_NAME_VERSION (gimple_vdef (use_stmt));
			      bitmap_set_bit (reduction_stores, id);
			      skip_stmt = use_stmt;
			      if (dump_file)
				{
				  fprintf (dump_file, "found reduction load: ");
				  print_gimple_stmt (dump_file, stmt, 0);
				}
			    }
			}
		    }
		}

	      ao_ref_init (&ref, rhs);
	    }
	  else if (gimple_store_p (stmt))
	    {
	      ao_ref_init (&ref, gimple_assign_lhs (stmt));
	      ref_is_store = true;
	    }
	  else if (gimple_code (stmt) == GIMPLE_OMP_RETURN)
	    continue;
	  else if (!gimple_has_side_effects (stmt)
		   && !gimple_could_trap_p (stmt)
		   && !stmt_could_throw_p (cfun, stmt)
		   && !gimple_vdef (stmt)
		   && !gimple_vuse (stmt))
	    continue;
	  else if (gimple_call_internal_p (stmt, IFN_GOACC_DIM_POS))
	    continue;
	  else if (gimple_code (stmt) == GIMPLE_RETURN)
	    continue;
	  else
	    {
	      /* Anything we cannot classify might not be safe to execute
		 redundantly in every gang; give up.  */
	      if (dump_file)
		{
		  fprintf (dump_file, "Unhandled stmt in entry/exit: ");
		  print_gimple_stmt (dump_file, stmt, 0);
		}
	      return false;
	    }

	  /* The classified load/store must not conflict with any later
	     statement in the region (except SKIP_STMT).  */
	  if (ref_conflicts_with_region (gsi, &ref, ref_is_store, region_bbs,
					 i, skip_stmt))
	    {
	      if (dump_file)
		{
		  fprintf (dump_file, "conflicts with entry/exit stmt: ");
		  print_gimple_stmt (dump_file, stmt, 0);
		}
	      return false;
	    }
	}
    }

  return true;
}
3116 
3117 /* Find stores inside REGION_BBS and outside IN_LOOP_BBS, and guard them with
3118    gang_pos == 0, except when the stores are REDUCTION_STORES.  Return true
3119    if any changes were made.  */
3120 
static bool
oacc_entry_exit_single_gang (bitmap in_loop_bbs, vec<basic_block> region_bbs,
			     bitmap reduction_stores)
{
  /* SSA name holding the gang position; created lazily on the first
     store that needs neutering.  */
  tree gang_pos = NULL_TREE;
  bool changed = false;

  unsigned i;
  basic_block bb;
  FOR_EACH_VEC_ELT (region_bbs, i, bb)
    {
      /* Stores inside the loop itself are left alone.  */
      if (bitmap_bit_p (in_loop_bbs, bb->index))
	continue;

      gimple_stmt_iterator gsi;
      for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi);)
	{
	  gimple *stmt = gsi_stmt (gsi);

	  if (!gimple_store_p (stmt))
	    {
	      /* Update gsi to point to next stmt.  */
	      gsi_next (&gsi);
	      continue;
	    }

	  /* Reduction result stores were validated separately; they must
	     not be guarded, or the reduction result would be lost.  */
	  if (bitmap_bit_p (reduction_stores,
			    SSA_NAME_VERSION (gimple_vdef (stmt))))
	    {
	      if (dump_file)
		{
		  fprintf (dump_file,
			   "skipped reduction store for single-gang"
			   " neutering: ");
		  print_gimple_stmt (dump_file, stmt, 0);
		}

	      /* Update gsi to point to next stmt.  */
	      gsi_next (&gsi);
	      continue;
	    }

	  changed = true;

	  /* Materialize the gang position once, at the start of the
	     function, so all guards can test it.  */
	  if (gang_pos == NULL_TREE)
	    {
	      tree arg = build_int_cst (integer_type_node, GOMP_DIM_GANG);
	      gcall *gang_single
		= gimple_build_call_internal (IFN_GOACC_DIM_POS, 1, arg);
	      gang_pos = make_ssa_name (integer_type_node);
	      gimple_call_set_lhs (gang_single, gang_pos);
	      gimple_stmt_iterator start
		= gsi_start_bb (single_succ (ENTRY_BLOCK_PTR_FOR_FN (cfun)));
	      tree vuse = ssa_default_def (cfun, gimple_vop (cfun));
	      gimple_set_vuse (gang_single, vuse);
	      gsi_insert_before (&start, gang_single, GSI_SAME_STMT);
	    }

	  if (dump_file)
	    {
	      fprintf (dump_file,
		       "found store that needs single-gang neutering: ");
	      print_gimple_stmt (dump_file, stmt, 0);
	    }

	  /* Isolate the store in its own block and guard it with
	     "if (gang_pos == 0)", merging the virtual operands back
	     together with a phi after the guarded block.  */
	  {
	    /* Split block before store.  */
	    gimple_stmt_iterator gsi2 = gsi;
	    gsi_prev (&gsi2);
	    edge e;
	    if (gsi_end_p (gsi2))
	      {
		e = split_block_after_labels (bb);
		gsi2 = gsi_last_bb (bb);
	      }
	    else
	      e = split_block (bb, gsi_stmt (gsi2));
	    basic_block bb2 = e->dest;

	    /* Split block after store.  */
	    gimple_stmt_iterator gsi3 = gsi_start_bb (bb2);
	    edge e2 = split_block (bb2, gsi_stmt (gsi3));
	    basic_block bb3 = e2->dest;

	    gimple *cond
	      = gimple_build_cond (EQ_EXPR, gang_pos, integer_zero_node,
				   NULL_TREE, NULL_TREE);
	    gsi_insert_after (&gsi2, cond, GSI_NEW_STMT);

	    edge e3 = make_edge (bb, bb3, EDGE_FALSE_VALUE);
	    /* FIXME: What is the probability?  */
	    e3->probability = profile_probability::guessed_never ();
	    e->flags = EDGE_TRUE_VALUE;

	    tree vdef = gimple_vdef (stmt);
	    tree vuse = gimple_vuse (stmt);

	    /* Merge the memory state of the taken (store executed) and
	       skipped paths.  */
	    tree phi_res = copy_ssa_name (vdef);
	    gphi *new_phi = create_phi_node (phi_res, bb3);
	    replace_uses_by (vdef, phi_res);
	    add_phi_arg (new_phi, vuse, e3, UNKNOWN_LOCATION);
	    add_phi_arg (new_phi, vdef, e2, UNKNOWN_LOCATION);

	    /* Update gsi to point to next stmt.  */
	    bb = bb3;
	    gsi = gsi_start_bb (bb);
	  }
	}
    }

  return changed;
}
3233 
3234 /* Return true if the statements before and after the LOOP can be executed in
3235    parallel with the function containing the loop.  Resolve conflicting stores
3236    outside LOOP by guarding them such that only a single gang executes them.  */
3237 
3238 static bool
oacc_entry_exit_ok(struct loop * loop,reduction_info_table_type * reduction_list)3239 oacc_entry_exit_ok (struct loop *loop,
3240 		    reduction_info_table_type *reduction_list)
3241 {
3242   basic_block *loop_bbs = get_loop_body_in_dom_order (loop);
3243   vec<basic_block> region_bbs
3244     = get_all_dominated_blocks (CDI_DOMINATORS, ENTRY_BLOCK_PTR_FOR_FN (cfun));
3245 
3246   bitmap in_loop_bbs = BITMAP_ALLOC (NULL);
3247   bitmap_clear (in_loop_bbs);
3248   for (unsigned int i = 0; i < loop->num_nodes; i++)
3249     bitmap_set_bit (in_loop_bbs, loop_bbs[i]->index);
3250 
3251   bitmap reduction_stores = BITMAP_ALLOC (NULL);
3252   bool res = oacc_entry_exit_ok_1 (in_loop_bbs, region_bbs, reduction_list,
3253 				   reduction_stores);
3254 
3255   if (res)
3256     {
3257       bool changed = oacc_entry_exit_single_gang (in_loop_bbs, region_bbs,
3258 						  reduction_stores);
3259       if (changed)
3260 	{
3261 	  free_dominance_info (CDI_DOMINATORS);
3262 	  calculate_dominance_info (CDI_DOMINATORS);
3263 	}
3264     }
3265 
3266   region_bbs.release ();
3267   free (loop_bbs);
3268 
3269   BITMAP_FREE (in_loop_bbs);
3270   BITMAP_FREE (reduction_stores);
3271 
3272   return res;
3273 }
3274 
3275 /* Detect parallel loops and generate parallel code using libgomp
3276    primitives.  Returns true if some loop was parallelized, false
3277    otherwise.  */
3278 
static bool
parallelize_loops (bool oacc_kernels_p)
{
  unsigned n_threads;
  bool changed = false;
  struct loop *loop;
  /* When set, this loop (and, transitively, its inner loops) is skipped,
     either because an enclosing loop was parallelized or because it lies
     inside an oacc kernels loop being handled as a whole.  */
  struct loop *skip_loop = NULL;
  struct tree_niter_desc niter_desc;
  struct obstack parloop_obstack;
  HOST_WIDE_INT estimated;

  /* Do not parallelize loops in the functions created by parallelization.  */
  if (!oacc_kernels_p
      && parallelized_function_p (cfun->decl))
    return false;

  /* Do not parallelize loops in offloaded functions.  */
  if (!oacc_kernels_p
      && oacc_get_fn_attrib (cfun->decl) != NULL)
     return false;

  if (cfun->has_nonlocal_label)
    return false;

  /* For OpenACC kernels, n_threads will be determined later; otherwise, it's
     the argument to -ftree-parallelize-loops.  */
  if (oacc_kernels_p)
    n_threads = 0;
  else
    n_threads = flag_tree_parallelize_loops;

  gcc_obstack_init (&parloop_obstack);
  reduction_info_table_type reduction_list (10);

  calculate_dominance_info (CDI_DOMINATORS);

  FOR_EACH_LOOP (loop, 0)
    {
      if (loop == skip_loop)
	{
	  if (!loop->in_oacc_kernels_region
	      && dump_file && (dump_flags & TDF_DETAILS))
	    fprintf (dump_file,
		     "Skipping loop %d as inner loop of parallelized loop\n",
		     loop->num);

	  skip_loop = loop->inner;
	  continue;
	}
      else
	skip_loop = NULL;

      /* The table is reused across loops; clear leftovers from the
	 previous candidate.  */
      reduction_list.empty ();

      if (oacc_kernels_p)
	{
	  if (!loop->in_oacc_kernels_region)
	    continue;

	  /* Don't try to parallelize inner loops in an oacc kernels region.  */
	  if (loop->inner)
	    skip_loop = loop->inner;

	  if (dump_file && (dump_flags & TDF_DETAILS))
	    fprintf (dump_file,
		     "Trying loop %d with header bb %d in oacc kernels"
		     " region\n", loop->num, loop->header->index);
	}

      if (dump_file && (dump_flags & TDF_DETAILS))
      {
        fprintf (dump_file, "Trying loop %d as candidate\n",loop->num);
	if (loop->inner)
	  fprintf (dump_file, "loop %d is not innermost\n",loop->num);
	else
	  fprintf (dump_file, "loop %d is innermost\n",loop->num);
      }

      if (!single_dom_exit (loop))
      {

        if (dump_file && (dump_flags & TDF_DETAILS))
	  fprintf (dump_file, "loop is !single_dom_exit\n");

	continue;
      }

      if (/* And of course, the loop must be parallelizable.  */
	  !can_duplicate_loop_p (loop)
	  || loop_has_blocks_with_irreducible_flag (loop)
	  || (loop_preheader_edge (loop)->src->flags & BB_IRREDUCIBLE_LOOP)
	  /* FIXME: the check for vector phi nodes could be removed.  */
	  || loop_has_vector_phi_nodes (loop))
	continue;

      /* Require enough iterations to amortize the thread-spawning
	 overhead, unless -floop-parallelize-all or oacc kernels.  */
      estimated = estimated_loop_iterations_int (loop);
      if (estimated == -1)
	estimated = get_likely_max_loop_iterations_int (loop);
      /* FIXME: Bypass this check as graphite doesn't update the
	 count and frequency correctly now.  */
      if (!flag_loop_parallelize_all
	  && !oacc_kernels_p
	  && ((estimated != -1
	       && (estimated
		   < ((HOST_WIDE_INT) n_threads
		      * (loop->inner ? 2 : MIN_PER_THREAD) - 1)))
	      /* Do not bother with loops in cold areas.  */
	      || optimize_loop_nest_for_size_p (loop)))
	continue;

      if (!try_get_loop_niter (loop, &niter_desc))
	continue;

      if (!try_create_reduction_list (loop, &reduction_list, oacc_kernels_p))
	continue;

      if (loop_has_phi_with_address_arg (loop))
	continue;

      /* can_be_parallel is set by graphite; otherwise run the dependence
	 analysis here.  */
      if (!loop->can_be_parallel
	  && !loop_parallel_p (loop, &parloop_obstack))
	continue;

      if (oacc_kernels_p
	&& !oacc_entry_exit_ok (loop, &reduction_list))
	{
	  if (dump_file)
	    fprintf (dump_file, "entry/exit not ok: FAILED\n");
	  continue;
	}

      changed = true;
      /* Inner loops of a parallelized loop are handled by the outlined
	 function; do not consider them again.  */
      skip_loop = loop->inner;

      if (dump_enabled_p ())
	{
	  dump_user_location_t loop_loc = find_loop_location (loop);
	  if (loop->inner)
	    dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, loop_loc,
			     "parallelizing outer loop %d\n", loop->num);
	  else
	    dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, loop_loc,
			     "parallelizing inner loop %d\n", loop->num);
	}

      gen_parallel_loop (loop, &reduction_list,
			 n_threads, &niter_desc, oacc_kernels_p);
    }

  obstack_free (&parloop_obstack, NULL);

  /* Parallelization will cause new function calls to be inserted through
     which local variables will escape.  Reset the points-to solution
     for ESCAPED.  */
  if (changed)
    pt_solution_reset (&cfun->gimple_df->escaped);

  return changed;
}
3438 
3439 /* Parallelization.  */
3440 
3441 namespace {
3442 
/* Pass descriptor for the "parloops" pass; registered with the pass
   manager via make_pass_parallelize_loops below.  */

const pass_data pass_data_parallelize_loops =
{
  GIMPLE_PASS, /* type */
  "parloops", /* name */
  OPTGROUP_LOOP, /* optinfo_flags */
  TV_TREE_PARALLELIZE_LOOPS, /* tv_id */
  ( PROP_cfg | PROP_ssa ), /* properties_required */
  0, /* properties_provided */
  0, /* properties_destroyed */
  0, /* todo_flags_start */
  0, /* todo_flags_finish */
};
3455 
/* The parloops pass.  It is instantiated twice: once as the classic
   -ftree-parallelize-loops pass, and once (with oacc_kernels_p set via
   set_pass_param) to parallelize OpenACC kernels regions.  */

class pass_parallelize_loops : public gimple_opt_pass
{
public:
  pass_parallelize_loops (gcc::context *ctxt)
    : gimple_opt_pass (pass_data_parallelize_loops, ctxt),
      oacc_kernels_p (false)
  {}

  /* opt_pass methods: */
  /* The oacc variant runs whenever OpenACC is enabled; the generic
     variant only for -ftree-parallelize-loops=2 or more threads.  */
  virtual bool gate (function *)
  {
    if (oacc_kernels_p)
      return flag_openacc;
    else
      return flag_tree_parallelize_loops > 1;
  }
  virtual unsigned int execute (function *);
  opt_pass * clone () { return new pass_parallelize_loops (m_ctxt); }
  /* Single bool pass parameter: true selects oacc kernels handling.  */
  void set_pass_param (unsigned int n, bool param)
    {
      gcc_assert (n == 0);
      oacc_kernels_p = param;
    }

 private:
  bool oacc_kernels_p;
}; // class pass_parallelize_loops
3483 
3484 unsigned
execute(function * fun)3485 pass_parallelize_loops::execute (function *fun)
3486 {
3487   tree nthreads = builtin_decl_explicit (BUILT_IN_OMP_GET_NUM_THREADS);
3488   if (nthreads == NULL_TREE)
3489     return 0;
3490 
3491   bool in_loop_pipeline = scev_initialized_p ();
3492   if (!in_loop_pipeline)
3493     loop_optimizer_init (LOOPS_NORMAL
3494 			 | LOOPS_HAVE_RECORDED_EXITS);
3495 
3496   if (number_of_loops (fun) <= 1)
3497     return 0;
3498 
3499   if (!in_loop_pipeline)
3500     {
3501       rewrite_into_loop_closed_ssa (NULL, TODO_update_ssa);
3502       scev_initialize ();
3503     }
3504 
3505   unsigned int todo = 0;
3506   if (parallelize_loops (oacc_kernels_p))
3507     {
3508       fun->curr_properties &= ~(PROP_gimple_eomp);
3509 
3510       checking_verify_loop_structure ();
3511 
3512       todo |= TODO_update_ssa;
3513     }
3514 
3515   if (!in_loop_pipeline)
3516     {
3517       scev_finalize ();
3518       loop_optimizer_finalize ();
3519     }
3520 
3521   return todo;
3522 }
3523 
3524 } // anon namespace
3525 
/* Factory used by the pass manager to instantiate the parloops pass.  */

gimple_opt_pass *
make_pass_parallelize_loops (gcc::context *ctxt)
{
  return new pass_parallelize_loops (ctxt);
}
3531