1 /* Loop autoparallelization.
2    Copyright (C) 2006-2018 Free Software Foundation, Inc.
3    Contributed by Sebastian Pop <pop@cri.ensmp.fr>
4    Zdenek Dvorak <dvorakz@suse.cz> and Razya Ladelsky <razya@il.ibm.com>.
5 
6 This file is part of GCC.
7 
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
12 
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
16 for more details.
17 
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3.  If not see
20 <http://www.gnu.org/licenses/>.  */
21 
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "tree.h"
27 #include "gimple.h"
28 #include "cfghooks.h"
29 #include "tree-pass.h"
30 #include "ssa.h"
31 #include "cgraph.h"
32 #include "gimple-pretty-print.h"
33 #include "fold-const.h"
34 #include "gimplify.h"
35 #include "gimple-iterator.h"
36 #include "gimplify-me.h"
37 #include "gimple-walk.h"
38 #include "stor-layout.h"
39 #include "tree-nested.h"
40 #include "tree-cfg.h"
41 #include "tree-ssa-loop-ivopts.h"
42 #include "tree-ssa-loop-manip.h"
43 #include "tree-ssa-loop-niter.h"
44 #include "tree-ssa-loop.h"
45 #include "tree-into-ssa.h"
46 #include "cfgloop.h"
47 #include "tree-scalar-evolution.h"
48 #include "langhooks.h"
49 #include "tree-vectorizer.h"
50 #include "tree-hasher.h"
51 #include "tree-parloops.h"
52 #include "omp-general.h"
53 #include "omp-low.h"
54 #include "tree-ssa.h"
55 #include "params.h"
56 #include "params-enum.h"
57 #include "tree-ssa-alias.h"
58 #include "tree-eh.h"
59 #include "gomp-constants.h"
60 #include "tree-dfa.h"
61 #include "stringpool.h"
62 #include "attribs.h"
63 
64 /* This pass tries to distribute iterations of loops into several threads.
65    The implementation is straightforward -- for each loop we test whether its
66    iterations are independent, and if it is the case (and some additional
67    conditions regarding profitability and correctness are satisfied), we
68    add GIMPLE_OMP_PARALLEL and GIMPLE_OMP_FOR codes and let omp expansion
69    machinery do its job.
70 
71    The most of the complexity is in bringing the code into shape expected
72    by the omp expanders:
73    -- for GIMPLE_OMP_FOR, ensuring that the loop has only one induction
74       variable and that the exit test is at the start of the loop body
75    -- for GIMPLE_OMP_PARALLEL, replacing the references to local addressable
76       variables by accesses through pointers, and breaking up ssa chains
77       by storing the values incoming to the parallelized loop to a structure
78       passed to the new function as an argument (something similar is done
79       in omp gimplification, unfortunately only a small part of the code
80       can be shared).
81 
82    TODO:
83    -- if there are several parallelizable loops in a function, it may be
84       possible to generate the threads just once (using synchronization to
85       ensure that cross-loop dependences are obeyed).
86    -- handling of common reduction patterns for outer loops.
87 
88    More info can also be found at http://gcc.gnu.org/wiki/AutoParInGCC  */
89 /*
90   Reduction handling:
91   currently we use vect_force_simple_reduction() to detect reduction patterns.
92   The code transformation will be introduced by an example.
93 
94 
95 parloop
96 {
97   int sum=1;
98 
99   for (i = 0; i < N; i++)
100    {
101     x[i] = i + 3;
102     sum+=x[i];
103    }
104 }
105 
106 gimple-like code:
107 header_bb:
108 
109   # sum_29 = PHI <sum_11(5), 1(3)>
110   # i_28 = PHI <i_12(5), 0(3)>
111   D.1795_8 = i_28 + 3;
112   x[i_28] = D.1795_8;
113   sum_11 = D.1795_8 + sum_29;
114   i_12 = i_28 + 1;
115   if (N_6(D) > i_12)
116     goto header_bb;
117 
118 
119 exit_bb:
120 
121   # sum_21 = PHI <sum_11(4)>
122   printf (&"%d"[0], sum_21);
123 
124 
125 after reduction transformation (only relevant parts):
126 
127 parloop
128 {
129 
130 ....
131 
132 
133   # Storing the initial value given by the user.  #
134 
135   .paral_data_store.32.sum.27 = 1;
136 
137   #pragma omp parallel num_threads(4)
138 
139   #pragma omp for schedule(static)
140 
141   # The neutral element corresponding to the particular
142   reduction's operation, e.g. 0 for PLUS_EXPR,
143   1 for MULT_EXPR, etc. replaces the user's initial value.  #
144 
145   # sum.27_29 = PHI <sum.27_11, 0>
146 
147   sum.27_11 = D.1827_8 + sum.27_29;
148 
149   GIMPLE_OMP_CONTINUE
150 
151   # Adding this reduction phi is done at create_phi_for_local_result() #
152   # sum.27_56 = PHI <sum.27_11, 0>
153   GIMPLE_OMP_RETURN
154 
155   # Creating the atomic operation is done at
156   create_call_for_reduction_1()  #
157 
158   #pragma omp atomic_load
159   D.1839_59 = *&.paral_data_load.33_51->reduction.23;
160   D.1840_60 = sum.27_56 + D.1839_59;
161   #pragma omp atomic_store (D.1840_60);
162 
163   GIMPLE_OMP_RETURN
164 
165  # collecting the result after the join of the threads is done at
166   create_loads_for_reductions().
167   The value computed by the threads is loaded from the
168   shared struct.  #
169 
170 
171   .paral_data_load.33_52 = &.paral_data_store.32;
172   sum_37 =  .paral_data_load.33_52->sum.27;
173   sum_43 = D.1795_41 + sum_37;
174 
175   exit bb:
176   # sum_21 = PHI <sum_43, sum_26>
177   printf (&"%d"[0], sum_21);
178 
179 ...
180 
181 }
182 
183 */
184 
185 /* Minimal number of iterations of a loop that should be executed in each
186    thread.  */
187 #define MIN_PER_THREAD PARAM_VALUE (PARAM_PARLOOPS_MIN_PER_THREAD)
188 
189 /* Element of the hashtable, representing a
190    reduction in the current loop.  */
191 struct reduction_info
192 {
193   gimple *reduc_stmt;		/* reduction statement.  */
194   gimple *reduc_phi;		/* The phi node defining the reduction.  */
195   enum tree_code reduction_code;/* code for the reduction operation.  */
196   unsigned reduc_version;	/* SSA_NAME_VERSION of original reduc_phi
197 				   result.  */
198   gphi *keep_res;		/* The PHI_RESULT of this phi is the resulting value
199 				   of the reduction variable when existing the loop. */
200   tree initial_value;		/* The initial value of the reduction var before entering the loop.  */
201   tree field;			/*  the name of the field in the parloop data structure intended for reduction.  */
202   tree reduc_addr;		/* The address of the reduction variable for
203 				   openacc reductions.  */
204   tree init;			/* reduction initialization value.  */
205   gphi *new_phi;		/* (helper field) Newly created phi node whose result
206 				   will be passed to the atomic operation.  Represents
207 				   the local result each thread computed for the reduction
208 				   operation.  */
209 };
210 
211 /* Reduction info hashtable helpers.  */
212 
213 struct reduction_hasher : free_ptr_hash <reduction_info>
214 {
215   static inline hashval_t hash (const reduction_info *);
216   static inline bool equal (const reduction_info *, const reduction_info *);
217 };
218 
219 /* Equality and hash functions for hashtab code.  */
220 
221 inline bool
equal(const reduction_info * a,const reduction_info * b)222 reduction_hasher::equal (const reduction_info *a, const reduction_info *b)
223 {
224   return (a->reduc_phi == b->reduc_phi);
225 }
226 
227 inline hashval_t
hash(const reduction_info * a)228 reduction_hasher::hash (const reduction_info *a)
229 {
230   return a->reduc_version;
231 }
232 
233 typedef hash_table<reduction_hasher> reduction_info_table_type;
234 
235 
236 static struct reduction_info *
reduction_phi(reduction_info_table_type * reduction_list,gimple * phi)237 reduction_phi (reduction_info_table_type *reduction_list, gimple *phi)
238 {
239   struct reduction_info tmpred, *red;
240 
241   if (reduction_list->elements () == 0 || phi == NULL)
242     return NULL;
243 
244   if (gimple_uid (phi) == (unsigned int)-1
245       || gimple_uid (phi) == 0)
246     return NULL;
247 
248   tmpred.reduc_phi = phi;
249   tmpred.reduc_version = gimple_uid (phi);
250   red = reduction_list->find (&tmpred);
251   gcc_assert (red == NULL || red->reduc_phi == phi);
252 
253   return red;
254 }
255 
256 /* Element of hashtable of names to copy.  */
257 
258 struct name_to_copy_elt
259 {
260   unsigned version;	/* The version of the name to copy.  */
261   tree new_name;	/* The new name used in the copy.  */
262   tree field;		/* The field of the structure used to pass the
263 			   value.  */
264 };
265 
266 /* Name copies hashtable helpers.  */
267 
268 struct name_to_copy_hasher : free_ptr_hash <name_to_copy_elt>
269 {
270   static inline hashval_t hash (const name_to_copy_elt *);
271   static inline bool equal (const name_to_copy_elt *, const name_to_copy_elt *);
272 };
273 
274 /* Equality and hash functions for hashtab code.  */
275 
276 inline bool
equal(const name_to_copy_elt * a,const name_to_copy_elt * b)277 name_to_copy_hasher::equal (const name_to_copy_elt *a, const name_to_copy_elt *b)
278 {
279   return a->version == b->version;
280 }
281 
282 inline hashval_t
hash(const name_to_copy_elt * a)283 name_to_copy_hasher::hash (const name_to_copy_elt *a)
284 {
285   return (hashval_t) a->version;
286 }
287 
288 typedef hash_table<name_to_copy_hasher> name_to_copy_table_type;
289 
290 /* A transformation matrix, which is a self-contained ROWSIZE x COLSIZE
291    matrix.  Rather than use floats, we simply keep a single DENOMINATOR that
292    represents the denominator for every element in the matrix.  */
293 typedef struct lambda_trans_matrix_s
294 {
295   lambda_matrix matrix;
296   int rowsize;
297   int colsize;
298   int denominator;
299 } *lambda_trans_matrix;
300 #define LTM_MATRIX(T) ((T)->matrix)
301 #define LTM_ROWSIZE(T) ((T)->rowsize)
302 #define LTM_COLSIZE(T) ((T)->colsize)
303 #define LTM_DENOMINATOR(T) ((T)->denominator)
304 
305 /* Allocate a new transformation matrix.  */
306 
307 static lambda_trans_matrix
lambda_trans_matrix_new(int colsize,int rowsize,struct obstack * lambda_obstack)308 lambda_trans_matrix_new (int colsize, int rowsize,
309 			 struct obstack * lambda_obstack)
310 {
311   lambda_trans_matrix ret;
312 
313   ret = (lambda_trans_matrix)
314     obstack_alloc (lambda_obstack, sizeof (struct lambda_trans_matrix_s));
315   LTM_MATRIX (ret) = lambda_matrix_new (rowsize, colsize, lambda_obstack);
316   LTM_ROWSIZE (ret) = rowsize;
317   LTM_COLSIZE (ret) = colsize;
318   LTM_DENOMINATOR (ret) = 1;
319   return ret;
320 }
321 
322 /* Multiply a vector VEC by a matrix MAT.
323    MAT is an M*N matrix, and VEC is a vector with length N.  The result
324    is stored in DEST which must be a vector of length M.  */
325 
326 static void
lambda_matrix_vector_mult(lambda_matrix matrix,int m,int n,lambda_vector vec,lambda_vector dest)327 lambda_matrix_vector_mult (lambda_matrix matrix, int m, int n,
328 			   lambda_vector vec, lambda_vector dest)
329 {
330   int i, j;
331 
332   lambda_vector_clear (dest, m);
333   for (i = 0; i < m; i++)
334     for (j = 0; j < n; j++)
335       dest[i] += matrix[i][j] * vec[j];
336 }
337 
338 /* Return true if TRANS is a legal transformation matrix that respects
339    the dependence vectors in DISTS and DIRS.  The conservative answer
340    is false.
341 
342    "Wolfe proves that a unimodular transformation represented by the
343    matrix T is legal when applied to a loop nest with a set of
344    lexicographically non-negative distance vectors RDG if and only if
345    for each vector d in RDG, (T.d >= 0) is lexicographically positive.
346    i.e.: if and only if it transforms the lexicographically positive
347    distance vectors to lexicographically positive vectors.  Note that
348    a unimodular matrix must transform the zero vector (and only it) to
349    the zero vector." S.Muchnick.  */
350 
351 static bool
lambda_transform_legal_p(lambda_trans_matrix trans,int nb_loops,vec<ddr_p> dependence_relations)352 lambda_transform_legal_p (lambda_trans_matrix trans,
353 			  int nb_loops,
354 			  vec<ddr_p> dependence_relations)
355 {
356   unsigned int i, j;
357   lambda_vector distres;
358   struct data_dependence_relation *ddr;
359 
360   gcc_assert (LTM_COLSIZE (trans) == nb_loops
361 	      && LTM_ROWSIZE (trans) == nb_loops);
362 
363   /* When there are no dependences, the transformation is correct.  */
364   if (dependence_relations.length () == 0)
365     return true;
366 
367   ddr = dependence_relations[0];
368   if (ddr == NULL)
369     return true;
370 
371   /* When there is an unknown relation in the dependence_relations, we
372      know that it is no worth looking at this loop nest: give up.  */
373   if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
374     return false;
375 
376   distres = lambda_vector_new (nb_loops);
377 
378   /* For each distance vector in the dependence graph.  */
379   FOR_EACH_VEC_ELT (dependence_relations, i, ddr)
380     {
381       /* Don't care about relations for which we know that there is no
382 	 dependence, nor about read-read (aka. output-dependences):
383 	 these data accesses can happen in any order.  */
384       if (DDR_ARE_DEPENDENT (ddr) == chrec_known
385 	  || (DR_IS_READ (DDR_A (ddr)) && DR_IS_READ (DDR_B (ddr))))
386 	continue;
387 
388       /* Conservatively answer: "this transformation is not valid".  */
389       if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
390 	return false;
391 
392       /* If the dependence could not be captured by a distance vector,
393 	 conservatively answer that the transform is not valid.  */
394       if (DDR_NUM_DIST_VECTS (ddr) == 0)
395 	return false;
396 
397       /* Compute trans.dist_vect */
398       for (j = 0; j < DDR_NUM_DIST_VECTS (ddr); j++)
399 	{
400 	  lambda_matrix_vector_mult (LTM_MATRIX (trans), nb_loops, nb_loops,
401 				     DDR_DIST_VECT (ddr, j), distres);
402 
403 	  if (!lambda_vector_lexico_pos (distres, nb_loops))
404 	    return false;
405 	}
406     }
407   return true;
408 }
409 
410 /* Data dependency analysis. Returns true if the iterations of LOOP
411    are independent on each other (that is, if we can execute them
412    in parallel).  */
413 
414 static bool
loop_parallel_p(struct loop * loop,struct obstack * parloop_obstack)415 loop_parallel_p (struct loop *loop, struct obstack * parloop_obstack)
416 {
417   vec<ddr_p> dependence_relations;
418   vec<data_reference_p> datarefs;
419   lambda_trans_matrix trans;
420   bool ret = false;
421 
422   if (dump_file && (dump_flags & TDF_DETAILS))
423   {
424     fprintf (dump_file, "Considering loop %d\n", loop->num);
425     if (!loop->inner)
426       fprintf (dump_file, "loop is innermost\n");
427     else
428       fprintf (dump_file, "loop NOT innermost\n");
429    }
430 
431   /* Check for problems with dependences.  If the loop can be reversed,
432      the iterations are independent.  */
433   auto_vec<loop_p, 3> loop_nest;
434   datarefs.create (10);
435   dependence_relations.create (100);
436   if (! compute_data_dependences_for_loop (loop, true, &loop_nest, &datarefs,
437 					   &dependence_relations))
438     {
439       if (dump_file && (dump_flags & TDF_DETAILS))
440 	fprintf (dump_file, "  FAILED: cannot analyze data dependencies\n");
441       ret = false;
442       goto end;
443     }
444   if (dump_file && (dump_flags & TDF_DETAILS))
445     dump_data_dependence_relations (dump_file, dependence_relations);
446 
447   trans = lambda_trans_matrix_new (1, 1, parloop_obstack);
448   LTM_MATRIX (trans)[0][0] = -1;
449 
450   if (lambda_transform_legal_p (trans, 1, dependence_relations))
451     {
452       ret = true;
453       if (dump_file && (dump_flags & TDF_DETAILS))
454 	fprintf (dump_file, "  SUCCESS: may be parallelized\n");
455     }
456   else if (dump_file && (dump_flags & TDF_DETAILS))
457     fprintf (dump_file,
458 	     "  FAILED: data dependencies exist across iterations\n");
459 
460  end:
461   free_dependence_relations (dependence_relations);
462   free_data_refs (datarefs);
463 
464   return ret;
465 }
466 
467 /* Return true when LOOP contains basic blocks marked with the
468    BB_IRREDUCIBLE_LOOP flag.  */
469 
470 static inline bool
loop_has_blocks_with_irreducible_flag(struct loop * loop)471 loop_has_blocks_with_irreducible_flag (struct loop *loop)
472 {
473   unsigned i;
474   basic_block *bbs = get_loop_body_in_dom_order (loop);
475   bool res = true;
476 
477   for (i = 0; i < loop->num_nodes; i++)
478     if (bbs[i]->flags & BB_IRREDUCIBLE_LOOP)
479       goto end;
480 
481   res = false;
482  end:
483   free (bbs);
484   return res;
485 }
486 
487 /* Assigns the address of OBJ in TYPE to an ssa name, and returns this name.
488    The assignment statement is placed on edge ENTRY.  DECL_ADDRESS maps decls
489    to their addresses that can be reused.  The address of OBJ is known to
490    be invariant in the whole function.  Other needed statements are placed
491    right before GSI.  */
492 
493 static tree
take_address_of(tree obj,tree type,edge entry,int_tree_htab_type * decl_address,gimple_stmt_iterator * gsi)494 take_address_of (tree obj, tree type, edge entry,
495 		 int_tree_htab_type *decl_address, gimple_stmt_iterator *gsi)
496 {
497   int uid;
498   tree *var_p, name, addr;
499   gassign *stmt;
500   gimple_seq stmts;
501 
502   /* Since the address of OBJ is invariant, the trees may be shared.
503      Avoid rewriting unrelated parts of the code.  */
504   obj = unshare_expr (obj);
505   for (var_p = &obj;
506        handled_component_p (*var_p);
507        var_p = &TREE_OPERAND (*var_p, 0))
508     continue;
509 
510   /* Canonicalize the access to base on a MEM_REF.  */
511   if (DECL_P (*var_p))
512     *var_p = build_simple_mem_ref (build_fold_addr_expr (*var_p));
513 
514   /* Assign a canonical SSA name to the address of the base decl used
515      in the address and share it for all accesses and addresses based
516      on it.  */
517   uid = DECL_UID (TREE_OPERAND (TREE_OPERAND (*var_p, 0), 0));
518   int_tree_map elt;
519   elt.uid = uid;
520   int_tree_map *slot = decl_address->find_slot (elt, INSERT);
521   if (!slot->to)
522     {
523       if (gsi == NULL)
524 	return NULL;
525       addr = TREE_OPERAND (*var_p, 0);
526       const char *obj_name
527 	= get_name (TREE_OPERAND (TREE_OPERAND (*var_p, 0), 0));
528       if (obj_name)
529 	name = make_temp_ssa_name (TREE_TYPE (addr), NULL, obj_name);
530       else
531 	name = make_ssa_name (TREE_TYPE (addr));
532       stmt = gimple_build_assign (name, addr);
533       gsi_insert_on_edge_immediate (entry, stmt);
534 
535       slot->uid = uid;
536       slot->to = name;
537     }
538   else
539     name = slot->to;
540 
541   /* Express the address in terms of the canonical SSA name.  */
542   TREE_OPERAND (*var_p, 0) = name;
543   if (gsi == NULL)
544     return build_fold_addr_expr_with_type (obj, type);
545 
546   name = force_gimple_operand (build_addr (obj),
547 			       &stmts, true, NULL_TREE);
548   if (!gimple_seq_empty_p (stmts))
549     gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
550 
551   if (!useless_type_conversion_p (type, TREE_TYPE (name)))
552     {
553       name = force_gimple_operand (fold_convert (type, name), &stmts, true,
554 				   NULL_TREE);
555       if (!gimple_seq_empty_p (stmts))
556 	gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
557     }
558 
559   return name;
560 }
561 
562 static tree
reduc_stmt_res(gimple * stmt)563 reduc_stmt_res (gimple *stmt)
564 {
565   return (gimple_code (stmt) == GIMPLE_PHI
566 	  ? gimple_phi_result (stmt)
567 	  : gimple_assign_lhs (stmt));
568 }
569 
570 /* Callback for htab_traverse.  Create the initialization statement
571    for reduction described in SLOT, and place it at the preheader of
572    the loop described in DATA.  */
573 
574 int
initialize_reductions(reduction_info ** slot,struct loop * loop)575 initialize_reductions (reduction_info **slot, struct loop *loop)
576 {
577   tree init;
578   tree type, arg;
579   edge e;
580 
581   struct reduction_info *const reduc = *slot;
582 
583   /* Create initialization in preheader:
584      reduction_variable = initialization value of reduction.  */
585 
586   /* In the phi node at the header, replace the argument coming
587      from the preheader with the reduction initialization value.  */
588 
589   /* Initialize the reduction.  */
590   type = TREE_TYPE (PHI_RESULT (reduc->reduc_phi));
591   init = omp_reduction_init_op (gimple_location (reduc->reduc_stmt),
592 				reduc->reduction_code, type);
593   reduc->init = init;
594 
595   /* Replace the argument representing the initialization value
596      with the initialization value for the reduction (neutral
597      element for the particular operation, e.g. 0 for PLUS_EXPR,
598      1 for MULT_EXPR, etc).
599      Keep the old value in a new variable "reduction_initial",
600      that will be taken in consideration after the parallel
601      computing is done.  */
602 
603   e = loop_preheader_edge (loop);
604   arg = PHI_ARG_DEF_FROM_EDGE (reduc->reduc_phi, e);
605   /* Create new variable to hold the initial value.  */
606 
607   SET_USE (PHI_ARG_DEF_PTR_FROM_EDGE
608 	   (reduc->reduc_phi, loop_preheader_edge (loop)), init);
609   reduc->initial_value = arg;
610   return 1;
611 }
612 
613 struct elv_data
614 {
615   struct walk_stmt_info info;
616   edge entry;
617   int_tree_htab_type *decl_address;
618   gimple_stmt_iterator *gsi;
619   bool changed;
620   bool reset;
621 };
622 
623 /* Eliminates references to local variables in *TP out of the single
624    entry single exit region starting at DTA->ENTRY.
625    DECL_ADDRESS contains addresses of the references that had their
626    address taken already.  If the expression is changed, CHANGED is
627    set to true.  Callback for walk_tree.  */
628 
629 static tree
eliminate_local_variables_1(tree * tp,int * walk_subtrees,void * data)630 eliminate_local_variables_1 (tree *tp, int *walk_subtrees, void *data)
631 {
632   struct elv_data *const dta = (struct elv_data *) data;
633   tree t = *tp, var, addr, addr_type, type, obj;
634 
635   if (DECL_P (t))
636     {
637       *walk_subtrees = 0;
638 
639       if (!SSA_VAR_P (t) || DECL_EXTERNAL (t))
640 	return NULL_TREE;
641 
642       type = TREE_TYPE (t);
643       addr_type = build_pointer_type (type);
644       addr = take_address_of (t, addr_type, dta->entry, dta->decl_address,
645 			      dta->gsi);
646       if (dta->gsi == NULL && addr == NULL_TREE)
647 	{
648 	  dta->reset = true;
649 	  return NULL_TREE;
650 	}
651 
652       *tp = build_simple_mem_ref (addr);
653 
654       dta->changed = true;
655       return NULL_TREE;
656     }
657 
658   if (TREE_CODE (t) == ADDR_EXPR)
659     {
660       /* ADDR_EXPR may appear in two contexts:
661 	 -- as a gimple operand, when the address taken is a function invariant
662 	 -- as gimple rhs, when the resulting address in not a function
663 	    invariant
664 	 We do not need to do anything special in the latter case (the base of
665 	 the memory reference whose address is taken may be replaced in the
666 	 DECL_P case).  The former case is more complicated, as we need to
667 	 ensure that the new address is still a gimple operand.  Thus, it
668 	 is not sufficient to replace just the base of the memory reference --
669 	 we need to move the whole computation of the address out of the
670 	 loop.  */
671       if (!is_gimple_val (t))
672 	return NULL_TREE;
673 
674       *walk_subtrees = 0;
675       obj = TREE_OPERAND (t, 0);
676       var = get_base_address (obj);
677       if (!var || !SSA_VAR_P (var) || DECL_EXTERNAL (var))
678 	return NULL_TREE;
679 
680       addr_type = TREE_TYPE (t);
681       addr = take_address_of (obj, addr_type, dta->entry, dta->decl_address,
682 			      dta->gsi);
683       if (dta->gsi == NULL && addr == NULL_TREE)
684 	{
685 	  dta->reset = true;
686 	  return NULL_TREE;
687 	}
688       *tp = addr;
689 
690       dta->changed = true;
691       return NULL_TREE;
692     }
693 
694   if (!EXPR_P (t))
695     *walk_subtrees = 0;
696 
697   return NULL_TREE;
698 }
699 
700 /* Moves the references to local variables in STMT at *GSI out of the single
701    entry single exit region starting at ENTRY.  DECL_ADDRESS contains
702    addresses of the references that had their address taken
703    already.  */
704 
705 static void
eliminate_local_variables_stmt(edge entry,gimple_stmt_iterator * gsi,int_tree_htab_type * decl_address)706 eliminate_local_variables_stmt (edge entry, gimple_stmt_iterator *gsi,
707 				int_tree_htab_type *decl_address)
708 {
709   struct elv_data dta;
710   gimple *stmt = gsi_stmt (*gsi);
711 
712   memset (&dta.info, '\0', sizeof (dta.info));
713   dta.entry = entry;
714   dta.decl_address = decl_address;
715   dta.changed = false;
716   dta.reset = false;
717 
718   if (gimple_debug_bind_p (stmt))
719     {
720       dta.gsi = NULL;
721       walk_tree (gimple_debug_bind_get_value_ptr (stmt),
722 		 eliminate_local_variables_1, &dta.info, NULL);
723       if (dta.reset)
724 	{
725 	  gimple_debug_bind_reset_value (stmt);
726 	  dta.changed = true;
727 	}
728     }
729   else if (gimple_clobber_p (stmt))
730     {
731       unlink_stmt_vdef (stmt);
732       stmt = gimple_build_nop ();
733       gsi_replace (gsi, stmt, false);
734       dta.changed = true;
735     }
736   else
737     {
738       dta.gsi = gsi;
739       walk_gimple_op (stmt, eliminate_local_variables_1, &dta.info);
740     }
741 
742   if (dta.changed)
743     update_stmt (stmt);
744 }
745 
746 /* Eliminates the references to local variables from the single entry
747    single exit region between the ENTRY and EXIT edges.
748 
749    This includes:
750    1) Taking address of a local variable -- these are moved out of the
751    region (and temporary variable is created to hold the address if
752    necessary).
753 
754    2) Dereferencing a local variable -- these are replaced with indirect
755    references.  */
756 
757 static void
eliminate_local_variables(edge entry,edge exit)758 eliminate_local_variables (edge entry, edge exit)
759 {
760   basic_block bb;
761   auto_vec<basic_block, 3> body;
762   unsigned i;
763   gimple_stmt_iterator gsi;
764   bool has_debug_stmt = false;
765   int_tree_htab_type decl_address (10);
766   basic_block entry_bb = entry->src;
767   basic_block exit_bb = exit->dest;
768 
769   gather_blocks_in_sese_region (entry_bb, exit_bb, &body);
770 
771   FOR_EACH_VEC_ELT (body, i, bb)
772     if (bb != entry_bb && bb != exit_bb)
773       {
774         for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
775 	  if (is_gimple_debug (gsi_stmt (gsi)))
776 	    {
777 	      if (gimple_debug_bind_p (gsi_stmt (gsi)))
778 	        has_debug_stmt = true;
779 	    }
780 	  else
781 	    eliminate_local_variables_stmt (entry, &gsi, &decl_address);
782       }
783 
784   if (has_debug_stmt)
785     FOR_EACH_VEC_ELT (body, i, bb)
786       if (bb != entry_bb && bb != exit_bb)
787 	for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
788 	  if (gimple_debug_bind_p (gsi_stmt (gsi)))
789 	    eliminate_local_variables_stmt (entry, &gsi, &decl_address);
790 }
791 
792 /* Returns true if expression EXPR is not defined between ENTRY and
793    EXIT, i.e. if all its operands are defined outside of the region.  */
794 
795 static bool
expr_invariant_in_region_p(edge entry,edge exit,tree expr)796 expr_invariant_in_region_p (edge entry, edge exit, tree expr)
797 {
798   basic_block entry_bb = entry->src;
799   basic_block exit_bb = exit->dest;
800   basic_block def_bb;
801 
802   if (is_gimple_min_invariant (expr))
803     return true;
804 
805   if (TREE_CODE (expr) == SSA_NAME)
806     {
807       def_bb = gimple_bb (SSA_NAME_DEF_STMT (expr));
808       if (def_bb
809 	  && dominated_by_p (CDI_DOMINATORS, def_bb, entry_bb)
810 	  && !dominated_by_p (CDI_DOMINATORS, def_bb, exit_bb))
811 	return false;
812 
813       return true;
814     }
815 
816   return false;
817 }
818 
819 /* If COPY_NAME_P is true, creates and returns a duplicate of NAME.
820    The copies are stored to NAME_COPIES, if NAME was already duplicated,
821    its duplicate stored in NAME_COPIES is returned.
822 
823    Regardless of COPY_NAME_P, the decl used as a base of the ssa name is also
824    duplicated, storing the copies in DECL_COPIES.  */
825 
826 static tree
separate_decls_in_region_name(tree name,name_to_copy_table_type * name_copies,int_tree_htab_type * decl_copies,bool copy_name_p)827 separate_decls_in_region_name (tree name, name_to_copy_table_type *name_copies,
828 			       int_tree_htab_type *decl_copies,
829 			       bool copy_name_p)
830 {
831   tree copy, var, var_copy;
832   unsigned idx, uid, nuid;
833   struct int_tree_map ielt;
834   struct name_to_copy_elt elt, *nelt;
835   name_to_copy_elt **slot;
836   int_tree_map *dslot;
837 
838   if (TREE_CODE (name) != SSA_NAME)
839     return name;
840 
841   idx = SSA_NAME_VERSION (name);
842   elt.version = idx;
843   slot = name_copies->find_slot_with_hash (&elt, idx,
844 					   copy_name_p ? INSERT : NO_INSERT);
845   if (slot && *slot)
846     return (*slot)->new_name;
847 
848   if (copy_name_p)
849     {
850       copy = duplicate_ssa_name (name, NULL);
851       nelt = XNEW (struct name_to_copy_elt);
852       nelt->version = idx;
853       nelt->new_name = copy;
854       nelt->field = NULL_TREE;
855       *slot = nelt;
856     }
857   else
858     {
859       gcc_assert (!slot);
860       copy = name;
861     }
862 
863   var = SSA_NAME_VAR (name);
864   if (!var)
865     return copy;
866 
867   uid = DECL_UID (var);
868   ielt.uid = uid;
869   dslot = decl_copies->find_slot_with_hash (ielt, uid, INSERT);
870   if (!dslot->to)
871     {
872       var_copy = create_tmp_var (TREE_TYPE (var), get_name (var));
873       DECL_GIMPLE_REG_P (var_copy) = DECL_GIMPLE_REG_P (var);
874       dslot->uid = uid;
875       dslot->to = var_copy;
876 
877       /* Ensure that when we meet this decl next time, we won't duplicate
878          it again.  */
879       nuid = DECL_UID (var_copy);
880       ielt.uid = nuid;
881       dslot = decl_copies->find_slot_with_hash (ielt, nuid, INSERT);
882       gcc_assert (!dslot->to);
883       dslot->uid = nuid;
884       dslot->to = var_copy;
885     }
886   else
887     var_copy = dslot->to;
888 
889   replace_ssa_name_symbol (copy, var_copy);
890   return copy;
891 }
892 
893 /* Finds the ssa names used in STMT that are defined outside the
894    region between ENTRY and EXIT and replaces such ssa names with
895    their duplicates.  The duplicates are stored to NAME_COPIES.  Base
896    decls of all ssa names used in STMT (including those defined in
897    LOOP) are replaced with the new temporary variables; the
898    replacement decls are stored in DECL_COPIES.  */
899 
900 static void
separate_decls_in_region_stmt(edge entry,edge exit,gimple * stmt,name_to_copy_table_type * name_copies,int_tree_htab_type * decl_copies)901 separate_decls_in_region_stmt (edge entry, edge exit, gimple *stmt,
902 			       name_to_copy_table_type *name_copies,
903 			       int_tree_htab_type *decl_copies)
904 {
905   use_operand_p use;
906   def_operand_p def;
907   ssa_op_iter oi;
908   tree name, copy;
909   bool copy_name_p;
910 
911   FOR_EACH_PHI_OR_STMT_DEF (def, stmt, oi, SSA_OP_DEF)
912   {
913     name = DEF_FROM_PTR (def);
914     gcc_assert (TREE_CODE (name) == SSA_NAME);
915     copy = separate_decls_in_region_name (name, name_copies, decl_copies,
916 					  false);
917     gcc_assert (copy == name);
918   }
919 
920   FOR_EACH_PHI_OR_STMT_USE (use, stmt, oi, SSA_OP_USE)
921   {
922     name = USE_FROM_PTR (use);
923     if (TREE_CODE (name) != SSA_NAME)
924       continue;
925 
926     copy_name_p = expr_invariant_in_region_p (entry, exit, name);
927     copy = separate_decls_in_region_name (name, name_copies, decl_copies,
928 					  copy_name_p);
929     SET_USE (use, copy);
930   }
931 }
932 
933 /* Finds the ssa names used in STMT that are defined outside the
934    region between ENTRY and EXIT and replaces such ssa names with
935    their duplicates.  The duplicates are stored to NAME_COPIES.  Base
936    decls of all ssa names used in STMT (including those defined in
937    LOOP) are replaced with the new temporary variables; the
938    replacement decls are stored in DECL_COPIES.  */
939 
940 static bool
separate_decls_in_region_debug(gimple * stmt,name_to_copy_table_type * name_copies,int_tree_htab_type * decl_copies)941 separate_decls_in_region_debug (gimple *stmt,
942 				name_to_copy_table_type *name_copies,
943 				int_tree_htab_type *decl_copies)
944 {
945   use_operand_p use;
946   ssa_op_iter oi;
947   tree var, name;
948   struct int_tree_map ielt;
949   struct name_to_copy_elt elt;
950   name_to_copy_elt **slot;
951   int_tree_map *dslot;
952 
953   if (gimple_debug_bind_p (stmt))
954     var = gimple_debug_bind_get_var (stmt);
955   else if (gimple_debug_source_bind_p (stmt))
956     var = gimple_debug_source_bind_get_var (stmt);
957   else
958     return true;
959   if (TREE_CODE (var) == DEBUG_EXPR_DECL || TREE_CODE (var) == LABEL_DECL)
960     return true;
961   gcc_assert (DECL_P (var) && SSA_VAR_P (var));
962   ielt.uid = DECL_UID (var);
963   dslot = decl_copies->find_slot_with_hash (ielt, ielt.uid, NO_INSERT);
964   if (!dslot)
965     return true;
966   if (gimple_debug_bind_p (stmt))
967     gimple_debug_bind_set_var (stmt, dslot->to);
968   else if (gimple_debug_source_bind_p (stmt))
969     gimple_debug_source_bind_set_var (stmt, dslot->to);
970 
971   FOR_EACH_PHI_OR_STMT_USE (use, stmt, oi, SSA_OP_USE)
972   {
973     name = USE_FROM_PTR (use);
974     if (TREE_CODE (name) != SSA_NAME)
975       continue;
976 
977     elt.version = SSA_NAME_VERSION (name);
978     slot = name_copies->find_slot_with_hash (&elt, elt.version, NO_INSERT);
979     if (!slot)
980       {
981 	gimple_debug_bind_reset_value (stmt);
982 	update_stmt (stmt);
983 	break;
984       }
985 
986     SET_USE (use, (*slot)->new_name);
987   }
988 
989   return false;
990 }
991 
992 /* Callback for htab_traverse.  Adds a field corresponding to the reduction
993    specified in SLOT. The type is passed in DATA.  */
994 
995 int
add_field_for_reduction(reduction_info ** slot,tree type)996 add_field_for_reduction (reduction_info **slot, tree type)
997 {
998 
999   struct reduction_info *const red = *slot;
1000   tree var = reduc_stmt_res (red->reduc_stmt);
1001   tree field = build_decl (gimple_location (red->reduc_stmt), FIELD_DECL,
1002 			   SSA_NAME_IDENTIFIER (var), TREE_TYPE (var));
1003 
1004   insert_field_into_struct (type, field);
1005 
1006   red->field = field;
1007 
1008   return 1;
1009 }
1010 
1011 /* Callback for htab_traverse.  Adds a field corresponding to a ssa name
1012    described in SLOT. The type is passed in DATA.  */
1013 
1014 int
add_field_for_name(name_to_copy_elt ** slot,tree type)1015 add_field_for_name (name_to_copy_elt **slot, tree type)
1016 {
1017   struct name_to_copy_elt *const elt = *slot;
1018   tree name = ssa_name (elt->version);
1019   tree field = build_decl (UNKNOWN_LOCATION,
1020 			   FIELD_DECL, SSA_NAME_IDENTIFIER (name),
1021 			   TREE_TYPE (name));
1022 
1023   insert_field_into_struct (type, field);
1024   elt->field = field;
1025 
1026   return 1;
1027 }
1028 
1029 /* Callback for htab_traverse.  A local result is the intermediate result
1030    computed by a single
1031    thread, or the initial value in case no iteration was executed.
1032    This function creates a phi node reflecting these values.
1033    The phi's result will be stored in NEW_PHI field of the
1034    reduction's data structure.  */
1035 
1036 int
create_phi_for_local_result(reduction_info ** slot,struct loop * loop)1037 create_phi_for_local_result (reduction_info **slot, struct loop *loop)
1038 {
1039   struct reduction_info *const reduc = *slot;
1040   edge e;
1041   gphi *new_phi;
1042   basic_block store_bb, continue_bb;
1043   tree local_res;
1044   source_location locus;
1045 
1046   /* STORE_BB is the block where the phi
1047      should be stored.  It is the destination of the loop exit.
1048      (Find the fallthru edge from GIMPLE_OMP_CONTINUE).  */
1049   continue_bb = single_pred (loop->latch);
1050   store_bb = FALLTHRU_EDGE (continue_bb)->dest;
1051 
1052   /* STORE_BB has two predecessors.  One coming from  the loop
1053      (the reduction's result is computed at the loop),
1054      and another coming from a block preceding the loop,
1055      when no iterations
1056      are executed (the initial value should be taken).  */
1057   if (EDGE_PRED (store_bb, 0) == FALLTHRU_EDGE (continue_bb))
1058     e = EDGE_PRED (store_bb, 1);
1059   else
1060     e = EDGE_PRED (store_bb, 0);
1061   tree lhs = reduc_stmt_res (reduc->reduc_stmt);
1062   local_res = copy_ssa_name (lhs);
1063   locus = gimple_location (reduc->reduc_stmt);
1064   new_phi = create_phi_node (local_res, store_bb);
1065   add_phi_arg (new_phi, reduc->init, e, locus);
1066   add_phi_arg (new_phi, lhs, FALLTHRU_EDGE (continue_bb), locus);
1067   reduc->new_phi = new_phi;
1068 
1069   return 1;
1070 }
1071 
1072 struct clsn_data
1073 {
1074   tree store;
1075   tree load;
1076 
1077   basic_block store_bb;
1078   basic_block load_bb;
1079 };
1080 
1081 /* Callback for htab_traverse.  Create an atomic instruction for the
1082    reduction described in SLOT.
1083    DATA annotates the place in memory the atomic operation relates to,
1084    and the basic block it needs to be generated in.  */
1085 
1086 int
create_call_for_reduction_1(reduction_info ** slot,struct clsn_data * clsn_data)1087 create_call_for_reduction_1 (reduction_info **slot, struct clsn_data *clsn_data)
1088 {
1089   struct reduction_info *const reduc = *slot;
1090   gimple_stmt_iterator gsi;
1091   tree type = TREE_TYPE (PHI_RESULT (reduc->reduc_phi));
1092   tree load_struct;
1093   basic_block bb;
1094   basic_block new_bb;
1095   edge e;
1096   tree t, addr, ref, x;
1097   tree tmp_load, name;
1098   gimple *load;
1099 
1100   if (reduc->reduc_addr == NULL_TREE)
1101     {
1102       load_struct = build_simple_mem_ref (clsn_data->load);
1103       t = build3 (COMPONENT_REF, type, load_struct, reduc->field, NULL_TREE);
1104 
1105       addr = build_addr (t);
1106     }
1107   else
1108     {
1109       /* Set the address for the atomic store.  */
1110       addr = reduc->reduc_addr;
1111 
1112       /* Remove the non-atomic store '*addr = sum'.  */
1113       tree res = PHI_RESULT (reduc->keep_res);
1114       use_operand_p use_p;
1115       gimple *stmt;
1116       bool single_use_p = single_imm_use (res, &use_p, &stmt);
1117       gcc_assert (single_use_p);
1118       replace_uses_by (gimple_vdef (stmt),
1119 		       gimple_vuse (stmt));
1120       gimple_stmt_iterator gsi = gsi_for_stmt (stmt);
1121       gsi_remove (&gsi, true);
1122     }
1123 
1124   /* Create phi node.  */
1125   bb = clsn_data->load_bb;
1126 
1127   gsi = gsi_last_bb (bb);
1128   e = split_block (bb, gsi_stmt (gsi));
1129   new_bb = e->dest;
1130 
1131   tmp_load = create_tmp_var (TREE_TYPE (TREE_TYPE (addr)));
1132   tmp_load = make_ssa_name (tmp_load);
1133   load = gimple_build_omp_atomic_load (tmp_load, addr);
1134   SSA_NAME_DEF_STMT (tmp_load) = load;
1135   gsi = gsi_start_bb (new_bb);
1136   gsi_insert_after (&gsi, load, GSI_NEW_STMT);
1137 
1138   e = split_block (new_bb, load);
1139   new_bb = e->dest;
1140   gsi = gsi_start_bb (new_bb);
1141   ref = tmp_load;
1142   x = fold_build2 (reduc->reduction_code,
1143 		   TREE_TYPE (PHI_RESULT (reduc->new_phi)), ref,
1144 		   PHI_RESULT (reduc->new_phi));
1145 
1146   name = force_gimple_operand_gsi (&gsi, x, true, NULL_TREE, true,
1147 				   GSI_CONTINUE_LINKING);
1148 
1149   gsi_insert_after (&gsi, gimple_build_omp_atomic_store (name), GSI_NEW_STMT);
1150   return 1;
1151 }
1152 
1153 /* Create the atomic operation at the join point of the threads.
1154    REDUCTION_LIST describes the reductions in the LOOP.
1155    LD_ST_DATA describes the shared data structure where
1156    shared data is stored in and loaded from.  */
1157 static void
create_call_for_reduction(struct loop * loop,reduction_info_table_type * reduction_list,struct clsn_data * ld_st_data)1158 create_call_for_reduction (struct loop *loop,
1159 			   reduction_info_table_type *reduction_list,
1160 			   struct clsn_data *ld_st_data)
1161 {
1162   reduction_list->traverse <struct loop *, create_phi_for_local_result> (loop);
1163   /* Find the fallthru edge from GIMPLE_OMP_CONTINUE.  */
1164   basic_block continue_bb = single_pred (loop->latch);
1165   ld_st_data->load_bb = FALLTHRU_EDGE (continue_bb)->dest;
1166   reduction_list
1167     ->traverse <struct clsn_data *, create_call_for_reduction_1> (ld_st_data);
1168 }
1169 
1170 /* Callback for htab_traverse.  Loads the final reduction value at the
1171    join point of all threads, and inserts it in the right place.  */
1172 
1173 int
create_loads_for_reductions(reduction_info ** slot,struct clsn_data * clsn_data)1174 create_loads_for_reductions (reduction_info **slot, struct clsn_data *clsn_data)
1175 {
1176   struct reduction_info *const red = *slot;
1177   gimple *stmt;
1178   gimple_stmt_iterator gsi;
1179   tree type = TREE_TYPE (reduc_stmt_res (red->reduc_stmt));
1180   tree load_struct;
1181   tree name;
1182   tree x;
1183 
1184   /* If there's no exit phi, the result of the reduction is unused.  */
1185   if (red->keep_res == NULL)
1186     return 1;
1187 
1188   gsi = gsi_after_labels (clsn_data->load_bb);
1189   load_struct = build_simple_mem_ref (clsn_data->load);
1190   load_struct = build3 (COMPONENT_REF, type, load_struct, red->field,
1191 			NULL_TREE);
1192 
1193   x = load_struct;
1194   name = PHI_RESULT (red->keep_res);
1195   stmt = gimple_build_assign (name, x);
1196 
1197   gsi_insert_after (&gsi, stmt, GSI_NEW_STMT);
1198 
1199   for (gsi = gsi_start_phis (gimple_bb (red->keep_res));
1200        !gsi_end_p (gsi); gsi_next (&gsi))
1201     if (gsi_stmt (gsi) == red->keep_res)
1202       {
1203 	remove_phi_node (&gsi, false);
1204 	return 1;
1205       }
1206   gcc_unreachable ();
1207 }
1208 
1209 /* Load the reduction result that was stored in LD_ST_DATA.
1210    REDUCTION_LIST describes the list of reductions that the
1211    loads should be generated for.  */
1212 static void
create_final_loads_for_reduction(reduction_info_table_type * reduction_list,struct clsn_data * ld_st_data)1213 create_final_loads_for_reduction (reduction_info_table_type *reduction_list,
1214 				  struct clsn_data *ld_st_data)
1215 {
1216   gimple_stmt_iterator gsi;
1217   tree t;
1218   gimple *stmt;
1219 
1220   gsi = gsi_after_labels (ld_st_data->load_bb);
1221   t = build_fold_addr_expr (ld_st_data->store);
1222   stmt = gimple_build_assign (ld_st_data->load, t);
1223 
1224   gsi_insert_before (&gsi, stmt, GSI_NEW_STMT);
1225 
1226   reduction_list
1227     ->traverse <struct clsn_data *, create_loads_for_reductions> (ld_st_data);
1228 
1229 }
1230 
1231 /* Callback for htab_traverse.  Store the neutral value for the
1232   particular reduction's operation, e.g. 0 for PLUS_EXPR,
1233   1 for MULT_EXPR, etc. into the reduction field.
1234   The reduction is specified in SLOT. The store information is
1235   passed in DATA.  */
1236 
1237 int
create_stores_for_reduction(reduction_info ** slot,struct clsn_data * clsn_data)1238 create_stores_for_reduction (reduction_info **slot, struct clsn_data *clsn_data)
1239 {
1240   struct reduction_info *const red = *slot;
1241   tree t;
1242   gimple *stmt;
1243   gimple_stmt_iterator gsi;
1244   tree type = TREE_TYPE (reduc_stmt_res (red->reduc_stmt));
1245 
1246   gsi = gsi_last_bb (clsn_data->store_bb);
1247   t = build3 (COMPONENT_REF, type, clsn_data->store, red->field, NULL_TREE);
1248   stmt = gimple_build_assign (t, red->initial_value);
1249   gsi_insert_after (&gsi, stmt, GSI_NEW_STMT);
1250 
1251   return 1;
1252 }
1253 
1254 /* Callback for htab_traverse.  Creates loads to a field of LOAD in LOAD_BB and
1255    store to a field of STORE in STORE_BB for the ssa name and its duplicate
1256    specified in SLOT.  */
1257 
1258 int
create_loads_and_stores_for_name(name_to_copy_elt ** slot,struct clsn_data * clsn_data)1259 create_loads_and_stores_for_name (name_to_copy_elt **slot,
1260 				  struct clsn_data *clsn_data)
1261 {
1262   struct name_to_copy_elt *const elt = *slot;
1263   tree t;
1264   gimple *stmt;
1265   gimple_stmt_iterator gsi;
1266   tree type = TREE_TYPE (elt->new_name);
1267   tree load_struct;
1268 
1269   gsi = gsi_last_bb (clsn_data->store_bb);
1270   t = build3 (COMPONENT_REF, type, clsn_data->store, elt->field, NULL_TREE);
1271   stmt = gimple_build_assign (t, ssa_name (elt->version));
1272   gsi_insert_after (&gsi, stmt, GSI_NEW_STMT);
1273 
1274   gsi = gsi_last_bb (clsn_data->load_bb);
1275   load_struct = build_simple_mem_ref (clsn_data->load);
1276   t = build3 (COMPONENT_REF, type, load_struct, elt->field, NULL_TREE);
1277   stmt = gimple_build_assign (elt->new_name, t);
1278   gsi_insert_after (&gsi, stmt, GSI_NEW_STMT);
1279 
1280   return 1;
1281 }
1282 
1283 /* Moves all the variables used in LOOP and defined outside of it (including
1284    the initial values of loop phi nodes, and *PER_THREAD if it is a ssa
1285    name) to a structure created for this purpose.  The code
1286 
1287    while (1)
1288      {
1289        use (a);
1290        use (b);
1291      }
1292 
1293    is transformed this way:
1294 
1295    bb0:
1296    old.a = a;
1297    old.b = b;
1298 
1299    bb1:
1300    a' = new->a;
1301    b' = new->b;
1302    while (1)
1303      {
1304        use (a');
1305        use (b');
1306      }
1307 
1308    `old' is stored to *ARG_STRUCT and `new' is stored to NEW_ARG_STRUCT.  The
1309    pointer `new' is intentionally not initialized (the loop will be split to a
1310    separate function later, and `new' will be initialized from its arguments).
1311    LD_ST_DATA holds information about the shared data structure used to pass
1312    information among the threads.  It is initialized here, and
1313    gen_parallel_loop will pass it to create_call_for_reduction that
1314    needs this information.  REDUCTION_LIST describes the reductions
1315    in LOOP.  */
1316 
1317 static void
separate_decls_in_region(edge entry,edge exit,reduction_info_table_type * reduction_list,tree * arg_struct,tree * new_arg_struct,struct clsn_data * ld_st_data)1318 separate_decls_in_region (edge entry, edge exit,
1319 			  reduction_info_table_type *reduction_list,
1320 			  tree *arg_struct, tree *new_arg_struct,
1321 			  struct clsn_data *ld_st_data)
1322 
1323 {
1324   basic_block bb1 = split_edge (entry);
1325   basic_block bb0 = single_pred (bb1);
1326   name_to_copy_table_type name_copies (10);
1327   int_tree_htab_type decl_copies (10);
1328   unsigned i;
1329   tree type, type_name, nvar;
1330   gimple_stmt_iterator gsi;
1331   struct clsn_data clsn_data;
1332   auto_vec<basic_block, 3> body;
1333   basic_block bb;
1334   basic_block entry_bb = bb1;
1335   basic_block exit_bb = exit->dest;
1336   bool has_debug_stmt = false;
1337 
1338   entry = single_succ_edge (entry_bb);
1339   gather_blocks_in_sese_region (entry_bb, exit_bb, &body);
1340 
1341   FOR_EACH_VEC_ELT (body, i, bb)
1342     {
1343       if (bb != entry_bb && bb != exit_bb)
1344 	{
1345 	  for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
1346 	    separate_decls_in_region_stmt (entry, exit, gsi_stmt (gsi),
1347 					   &name_copies, &decl_copies);
1348 
1349 	  for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
1350 	    {
1351 	      gimple *stmt = gsi_stmt (gsi);
1352 
1353 	      if (is_gimple_debug (stmt))
1354 		has_debug_stmt = true;
1355 	      else
1356 		separate_decls_in_region_stmt (entry, exit, stmt,
1357 					       &name_copies, &decl_copies);
1358 	    }
1359 	}
1360     }
1361 
1362   /* Now process debug bind stmts.  We must not create decls while
1363      processing debug stmts, so we defer their processing so as to
1364      make sure we will have debug info for as many variables as
1365      possible (all of those that were dealt with in the loop above),
1366      and discard those for which we know there's nothing we can
1367      do.  */
1368   if (has_debug_stmt)
1369     FOR_EACH_VEC_ELT (body, i, bb)
1370       if (bb != entry_bb && bb != exit_bb)
1371 	{
1372 	  for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi);)
1373 	    {
1374 	      gimple *stmt = gsi_stmt (gsi);
1375 
1376 	      if (is_gimple_debug (stmt))
1377 		{
1378 		  if (separate_decls_in_region_debug (stmt, &name_copies,
1379 						      &decl_copies))
1380 		    {
1381 		      gsi_remove (&gsi, true);
1382 		      continue;
1383 		    }
1384 		}
1385 
1386 	      gsi_next (&gsi);
1387 	    }
1388 	}
1389 
1390   if (name_copies.elements () == 0 && reduction_list->elements () == 0)
1391     {
1392       /* It may happen that there is nothing to copy (if there are only
1393          loop carried and external variables in the loop).  */
1394       *arg_struct = NULL;
1395       *new_arg_struct = NULL;
1396     }
1397   else
1398     {
1399       /* Create the type for the structure to store the ssa names to.  */
1400       type = lang_hooks.types.make_type (RECORD_TYPE);
1401       type_name = build_decl (UNKNOWN_LOCATION,
1402 			      TYPE_DECL, create_tmp_var_name (".paral_data"),
1403 			      type);
1404       TYPE_NAME (type) = type_name;
1405 
1406       name_copies.traverse <tree, add_field_for_name> (type);
1407       if (reduction_list && reduction_list->elements () > 0)
1408 	{
1409 	  /* Create the fields for reductions.  */
1410 	  reduction_list->traverse <tree, add_field_for_reduction> (type);
1411 	}
1412       layout_type (type);
1413 
1414       /* Create the loads and stores.  */
1415       *arg_struct = create_tmp_var (type, ".paral_data_store");
1416       nvar = create_tmp_var (build_pointer_type (type), ".paral_data_load");
1417       *new_arg_struct = make_ssa_name (nvar);
1418 
1419       ld_st_data->store = *arg_struct;
1420       ld_st_data->load = *new_arg_struct;
1421       ld_st_data->store_bb = bb0;
1422       ld_st_data->load_bb = bb1;
1423 
1424       name_copies
1425 	.traverse <struct clsn_data *, create_loads_and_stores_for_name>
1426 		  (ld_st_data);
1427 
1428       /* Load the calculation from memory (after the join of the threads).  */
1429 
1430       if (reduction_list && reduction_list->elements () > 0)
1431 	{
1432 	  reduction_list
1433 	    ->traverse <struct clsn_data *, create_stores_for_reduction>
1434 	    (ld_st_data);
1435 	  clsn_data.load = make_ssa_name (nvar);
1436 	  clsn_data.load_bb = exit->dest;
1437 	  clsn_data.store = ld_st_data->store;
1438 	  create_final_loads_for_reduction (reduction_list, &clsn_data);
1439 	}
1440     }
1441 }
1442 
1443 /* Returns true if FN was created to run in parallel.  */
1444 
1445 bool
parallelized_function_p(tree fndecl)1446 parallelized_function_p (tree fndecl)
1447 {
1448   cgraph_node *node = cgraph_node::get (fndecl);
1449   gcc_assert (node != NULL);
1450   return node->parallelized_function;
1451 }
1452 
1453 /* Creates and returns an empty function that will receive the body of
1454    a parallelized loop.  */
1455 
1456 static tree
create_loop_fn(location_t loc)1457 create_loop_fn (location_t loc)
1458 {
1459   char buf[100];
1460   char *tname;
1461   tree decl, type, name, t;
1462   struct function *act_cfun = cfun;
1463   static unsigned loopfn_num;
1464 
1465   loc = LOCATION_LOCUS (loc);
1466   snprintf (buf, 100, "%s.$loopfn", current_function_name ());
1467   ASM_FORMAT_PRIVATE_NAME (tname, buf, loopfn_num++);
1468   clean_symbol_name (tname);
1469   name = get_identifier (tname);
1470   type = build_function_type_list (void_type_node, ptr_type_node, NULL_TREE);
1471 
1472   decl = build_decl (loc, FUNCTION_DECL, name, type);
1473   TREE_STATIC (decl) = 1;
1474   TREE_USED (decl) = 1;
1475   DECL_ARTIFICIAL (decl) = 1;
1476   DECL_IGNORED_P (decl) = 0;
1477   TREE_PUBLIC (decl) = 0;
1478   DECL_UNINLINABLE (decl) = 1;
1479   DECL_EXTERNAL (decl) = 0;
1480   DECL_CONTEXT (decl) = NULL_TREE;
1481   DECL_INITIAL (decl) = make_node (BLOCK);
1482   BLOCK_SUPERCONTEXT (DECL_INITIAL (decl)) = decl;
1483 
1484   t = build_decl (loc, RESULT_DECL, NULL_TREE, void_type_node);
1485   DECL_ARTIFICIAL (t) = 1;
1486   DECL_IGNORED_P (t) = 1;
1487   DECL_RESULT (decl) = t;
1488 
1489   t = build_decl (loc, PARM_DECL, get_identifier (".paral_data_param"),
1490 		  ptr_type_node);
1491   DECL_ARTIFICIAL (t) = 1;
1492   DECL_ARG_TYPE (t) = ptr_type_node;
1493   DECL_CONTEXT (t) = decl;
1494   TREE_USED (t) = 1;
1495   DECL_ARGUMENTS (decl) = t;
1496 
1497   allocate_struct_function (decl, false);
1498 
1499   /* The call to allocate_struct_function clobbers CFUN, so we need to restore
1500      it.  */
1501   set_cfun (act_cfun);
1502 
1503   return decl;
1504 }
1505 
1506 /* Replace uses of NAME by VAL in block BB.  */
1507 
1508 static void
replace_uses_in_bb_by(tree name,tree val,basic_block bb)1509 replace_uses_in_bb_by (tree name, tree val, basic_block bb)
1510 {
1511   gimple *use_stmt;
1512   imm_use_iterator imm_iter;
1513 
1514   FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, name)
1515     {
1516       if (gimple_bb (use_stmt) != bb)
1517 	continue;
1518 
1519       use_operand_p use_p;
1520       FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
1521 	SET_USE (use_p, val);
1522     }
1523 }
1524 
1525 /* Do transformation from:
1526 
1527      <bb preheader>:
1528      ...
1529      goto <bb header>
1530 
1531      <bb header>:
1532      ivtmp_a = PHI <ivtmp_init (preheader), ivtmp_b (latch)>
1533      sum_a = PHI <sum_init (preheader), sum_b (latch)>
1534      ...
1535      use (ivtmp_a)
1536      ...
1537      sum_b = sum_a + sum_update
1538      ...
1539      if (ivtmp_a < n)
1540        goto <bb latch>;
1541      else
1542        goto <bb exit>;
1543 
1544      <bb latch>:
1545      ivtmp_b = ivtmp_a + 1;
1546      goto <bb header>
1547 
1548      <bb exit>:
1549      sum_z = PHI <sum_b (cond[1]), ...>
1550 
1551      [1] Where <bb cond> is single_pred (bb latch); In the simplest case,
1552 	 that's <bb header>.
1553 
1554    to:
1555 
1556      <bb preheader>:
1557      ...
1558      goto <bb newheader>
1559 
1560      <bb header>:
1561      ivtmp_a = PHI <ivtmp_c (latch)>
1562      sum_a = PHI <sum_c (latch)>
1563      ...
1564      use (ivtmp_a)
1565      ...
1566      sum_b = sum_a + sum_update
1567      ...
1568      goto <bb latch>;
1569 
1570      <bb newheader>:
1571      ivtmp_c = PHI <ivtmp_init (preheader), ivtmp_b (latch)>
1572      sum_c = PHI <sum_init (preheader), sum_b (latch)>
1573      if (ivtmp_c < n + 1)
1574        goto <bb header>;
1575      else
1576        goto <bb newexit>;
1577 
1578      <bb latch>:
1579      ivtmp_b = ivtmp_a + 1;
1580      goto <bb newheader>
1581 
1582      <bb newexit>:
1583      sum_y = PHI <sum_c (newheader)>
1584 
1585      <bb exit>:
1586      sum_z = PHI <sum_y (newexit), ...>
1587 
1588 
1589    In unified diff format:
1590 
1591       <bb preheader>:
1592       ...
1593 -     goto <bb header>
1594 +     goto <bb newheader>
1595 
1596       <bb header>:
1597 -     ivtmp_a = PHI <ivtmp_init (preheader), ivtmp_b (latch)>
1598 -     sum_a = PHI <sum_init (preheader), sum_b (latch)>
1599 +     ivtmp_a = PHI <ivtmp_c (latch)>
1600 +     sum_a = PHI <sum_c (latch)>
1601       ...
1602       use (ivtmp_a)
1603       ...
1604       sum_b = sum_a + sum_update
1605       ...
1606 -     if (ivtmp_a < n)
1607 -       goto <bb latch>;
1608 +     goto <bb latch>;
1609 +
1610 +     <bb newheader>:
1611 +     ivtmp_c = PHI <ivtmp_init (preheader), ivtmp_b (latch)>
1612 +     sum_c = PHI <sum_init (preheader), sum_b (latch)>
1613 +     if (ivtmp_c < n + 1)
1614 +       goto <bb header>;
1615       else
1616 	goto <bb exit>;
1617 
1618       <bb latch>:
1619       ivtmp_b = ivtmp_a + 1;
1620 -     goto <bb header>
1621 +     goto <bb newheader>
1622 
1623 +    <bb newexit>:
1624 +    sum_y = PHI <sum_c (newheader)>
1625 
1626       <bb exit>:
1627 -     sum_z = PHI <sum_b (cond[1]), ...>
1628 +     sum_z = PHI <sum_y (newexit), ...>
1629 
1630    Note: the example does not show any virtual phis, but these are handled more
1631    or less as reductions.
1632 
1633 
1634    Moves the exit condition of LOOP to the beginning of its header.
1635    REDUCTION_LIST describes the reductions in LOOP.  BOUND is the new loop
1636    bound.  */
1637 
1638 static void
transform_to_exit_first_loop_alt(struct loop * loop,reduction_info_table_type * reduction_list,tree bound)1639 transform_to_exit_first_loop_alt (struct loop *loop,
1640 				  reduction_info_table_type *reduction_list,
1641 				  tree bound)
1642 {
1643   basic_block header = loop->header;
1644   basic_block latch = loop->latch;
1645   edge exit = single_dom_exit (loop);
1646   basic_block exit_block = exit->dest;
1647   gcond *cond_stmt = as_a <gcond *> (last_stmt (exit->src));
1648   tree control = gimple_cond_lhs (cond_stmt);
1649   edge e;
1650 
1651   /* Rewriting virtuals into loop-closed ssa normal form makes this
1652      transformation simpler.  It also ensures that the virtuals are in
1653      loop-closed ssa normal from after the transformation, which is required by
1654      create_parallel_loop.  */
1655   rewrite_virtuals_into_loop_closed_ssa (loop);
1656 
1657   /* Create the new_header block.  */
1658   basic_block new_header = split_block_before_cond_jump (exit->src);
1659   edge edge_at_split = single_pred_edge (new_header);
1660 
1661   /* Redirect entry edge to new_header.  */
1662   edge entry = loop_preheader_edge (loop);
1663   e = redirect_edge_and_branch (entry, new_header);
1664   gcc_assert (e == entry);
1665 
1666   /* Redirect post_inc_edge to new_header.  */
1667   edge post_inc_edge = single_succ_edge (latch);
1668   e = redirect_edge_and_branch (post_inc_edge, new_header);
1669   gcc_assert (e == post_inc_edge);
1670 
1671   /* Redirect post_cond_edge to header.  */
1672   edge post_cond_edge = single_pred_edge (latch);
1673   e = redirect_edge_and_branch (post_cond_edge, header);
1674   gcc_assert (e == post_cond_edge);
1675 
1676   /* Redirect edge_at_split to latch.  */
1677   e = redirect_edge_and_branch (edge_at_split, latch);
1678   gcc_assert (e == edge_at_split);
1679 
1680   /* Set the new loop bound.  */
1681   gimple_cond_set_rhs (cond_stmt, bound);
1682   update_stmt (cond_stmt);
1683 
1684   /* Repair the ssa.  */
1685   vec<edge_var_map> *v = redirect_edge_var_map_vector (post_inc_edge);
1686   edge_var_map *vm;
1687   gphi_iterator gsi;
1688   int i;
1689   for (gsi = gsi_start_phis (header), i = 0;
1690        !gsi_end_p (gsi) && v->iterate (i, &vm);
1691        gsi_next (&gsi), i++)
1692     {
1693       gphi *phi = gsi.phi ();
1694       tree res_a = PHI_RESULT (phi);
1695 
1696       /* Create new phi.  */
1697       tree res_c = copy_ssa_name (res_a, phi);
1698       gphi *nphi = create_phi_node (res_c, new_header);
1699 
1700       /* Replace ivtmp_a with ivtmp_c in condition 'if (ivtmp_a < n)'.  */
1701       replace_uses_in_bb_by (res_a, res_c, new_header);
1702 
1703       /* Replace ivtmp/sum_b with ivtmp/sum_c in header phi.  */
1704       add_phi_arg (phi, res_c, post_cond_edge, UNKNOWN_LOCATION);
1705 
1706       /* Replace sum_b with sum_c in exit phi.  */
1707       tree res_b = redirect_edge_var_map_def (vm);
1708       replace_uses_in_bb_by (res_b, res_c, exit_block);
1709 
1710       struct reduction_info *red = reduction_phi (reduction_list, phi);
1711       gcc_assert (virtual_operand_p (res_a)
1712 		  || res_a == control
1713 		  || red != NULL);
1714 
1715       if (red)
1716 	{
1717 	  /* Register the new reduction phi.  */
1718 	  red->reduc_phi = nphi;
1719 	  gimple_set_uid (red->reduc_phi, red->reduc_version);
1720 	}
1721     }
1722   gcc_assert (gsi_end_p (gsi) && !v->iterate (i, &vm));
1723 
1724   /* Set the preheader argument of the new phis to ivtmp/sum_init.  */
1725   flush_pending_stmts (entry);
1726 
1727   /* Set the latch arguments of the new phis to ivtmp/sum_b.  */
1728   flush_pending_stmts (post_inc_edge);
1729 
1730 
1731   basic_block new_exit_block = NULL;
1732   if (!single_pred_p (exit->dest))
1733     {
1734       /* Create a new empty exit block, inbetween the new loop header and the
1735 	 old exit block.  The function separate_decls_in_region needs this block
1736 	 to insert code that is active on loop exit, but not any other path.  */
1737       new_exit_block = split_edge (exit);
1738     }
1739 
1740   /* Insert and register the reduction exit phis.  */
1741   for (gphi_iterator gsi = gsi_start_phis (exit_block);
1742        !gsi_end_p (gsi);
1743        gsi_next (&gsi))
1744     {
1745       gphi *phi = gsi.phi ();
1746       gphi *nphi = NULL;
1747       tree res_z = PHI_RESULT (phi);
1748       tree res_c;
1749 
1750       if (new_exit_block != NULL)
1751 	{
1752 	  /* Now that we have a new exit block, duplicate the phi of the old
1753 	     exit block in the new exit block to preserve loop-closed ssa.  */
1754 	  edge succ_new_exit_block = single_succ_edge (new_exit_block);
1755 	  edge pred_new_exit_block = single_pred_edge (new_exit_block);
1756 	  tree res_y = copy_ssa_name (res_z, phi);
1757 	  nphi = create_phi_node (res_y, new_exit_block);
1758 	  res_c = PHI_ARG_DEF_FROM_EDGE (phi, succ_new_exit_block);
1759 	  add_phi_arg (nphi, res_c, pred_new_exit_block, UNKNOWN_LOCATION);
1760 	  add_phi_arg (phi, res_y, succ_new_exit_block, UNKNOWN_LOCATION);
1761 	}
1762       else
1763 	res_c = PHI_ARG_DEF_FROM_EDGE (phi, exit);
1764 
1765       if (virtual_operand_p (res_z))
1766 	continue;
1767 
1768       gimple *reduc_phi = SSA_NAME_DEF_STMT (res_c);
1769       struct reduction_info *red = reduction_phi (reduction_list, reduc_phi);
1770       if (red != NULL)
1771 	red->keep_res = (nphi != NULL
1772 			 ? nphi
1773 			 : phi);
1774     }
1775 
1776   /* We're going to cancel the loop at the end of gen_parallel_loop, but until
1777      then we're still using some fields, so only bother about fields that are
1778      still used: header and latch.
1779      The loop has a new header bb, so we update it.  The latch bb stays the
1780      same.  */
1781   loop->header = new_header;
1782 
1783   /* Recalculate dominance info.  */
1784   free_dominance_info (CDI_DOMINATORS);
1785   calculate_dominance_info (CDI_DOMINATORS);
1786 
1787   checking_verify_ssa (true, true);
1788 }
1789 
1790 /* Tries to moves the exit condition of LOOP to the beginning of its header
1791    without duplication of the loop body.  NIT is the number of iterations of the
1792    loop.  REDUCTION_LIST describes the reductions in LOOP.  Return true if
1793    transformation is successful.  */
1794 
1795 static bool
try_transform_to_exit_first_loop_alt(struct loop * loop,reduction_info_table_type * reduction_list,tree nit)1796 try_transform_to_exit_first_loop_alt (struct loop *loop,
1797 				      reduction_info_table_type *reduction_list,
1798 				      tree nit)
1799 {
1800   /* Check whether the latch contains a single statement.  */
1801   if (!gimple_seq_nondebug_singleton_p (bb_seq (loop->latch)))
1802     return false;
1803 
1804   /* Check whether the latch contains no phis.  */
1805   if (phi_nodes (loop->latch) != NULL)
1806     return false;
1807 
1808   /* Check whether the latch contains the loop iv increment.  */
1809   edge back = single_succ_edge (loop->latch);
1810   edge exit = single_dom_exit (loop);
1811   gcond *cond_stmt = as_a <gcond *> (last_stmt (exit->src));
1812   tree control = gimple_cond_lhs (cond_stmt);
1813   gphi *phi = as_a <gphi *> (SSA_NAME_DEF_STMT (control));
1814   tree inc_res = gimple_phi_arg_def (phi, back->dest_idx);
1815   if (gimple_bb (SSA_NAME_DEF_STMT (inc_res)) != loop->latch)
1816     return false;
1817 
1818   /* Check whether there's no code between the loop condition and the latch.  */
1819   if (!single_pred_p (loop->latch)
1820       || single_pred (loop->latch) != exit->src)
1821     return false;
1822 
1823   tree alt_bound = NULL_TREE;
1824   tree nit_type = TREE_TYPE (nit);
1825 
1826   /* Figure out whether nit + 1 overflows.  */
1827   if (TREE_CODE (nit) == INTEGER_CST)
1828     {
1829       if (!tree_int_cst_equal (nit, TYPE_MAX_VALUE (nit_type)))
1830 	{
1831 	  alt_bound = fold_build2_loc (UNKNOWN_LOCATION, PLUS_EXPR, nit_type,
1832 				       nit, build_one_cst (nit_type));
1833 
1834 	  gcc_assert (TREE_CODE (alt_bound) == INTEGER_CST);
1835 	  transform_to_exit_first_loop_alt (loop, reduction_list, alt_bound);
1836 	  return true;
1837 	}
1838       else
1839 	{
1840 	  /* Todo: Figure out if we can trigger this, if it's worth to handle
1841 	     optimally, and if we can handle it optimally.  */
1842 	  return false;
1843 	}
1844     }
1845 
1846   gcc_assert (TREE_CODE (nit) == SSA_NAME);
1847 
1848   /* Variable nit is the loop bound as returned by canonicalize_loop_ivs, for an
1849      iv with base 0 and step 1 that is incremented in the latch, like this:
1850 
1851      <bb header>:
1852      # iv_1 = PHI <0 (preheader), iv_2 (latch)>
1853      ...
1854      if (iv_1 < nit)
1855        goto <bb latch>;
1856      else
1857        goto <bb exit>;
1858 
1859      <bb latch>:
1860      iv_2 = iv_1 + 1;
1861      goto <bb header>;
1862 
1863      The range of iv_1 is [0, nit].  The latch edge is taken for
1864      iv_1 == [0, nit - 1] and the exit edge is taken for iv_1 == nit.  So the
1865      number of latch executions is equal to nit.
1866 
1867      The function max_loop_iterations gives us the maximum number of latch
1868      executions, so it gives us the maximum value of nit.  */
1869   widest_int nit_max;
1870   if (!max_loop_iterations (loop, &nit_max))
1871     return false;
1872 
1873   /* Check if nit + 1 overflows.  */
1874   widest_int type_max = wi::to_widest (TYPE_MAX_VALUE (nit_type));
1875   if (nit_max >= type_max)
1876     return false;
1877 
1878   gimple *def = SSA_NAME_DEF_STMT (nit);
1879 
1880   /* Try to find nit + 1, in the form of n in an assignment nit = n - 1.  */
1881   if (def
1882       && is_gimple_assign (def)
1883       && gimple_assign_rhs_code (def) == PLUS_EXPR)
1884     {
1885       tree op1 = gimple_assign_rhs1 (def);
1886       tree op2 = gimple_assign_rhs2 (def);
1887       if (integer_minus_onep (op1))
1888 	alt_bound = op2;
1889       else if (integer_minus_onep (op2))
1890 	alt_bound = op1;
1891     }
1892 
1893   /* If not found, insert nit + 1.  */
1894   if (alt_bound == NULL_TREE)
1895     {
1896       alt_bound = fold_build2 (PLUS_EXPR, nit_type, nit,
1897 			       build_int_cst_type (nit_type, 1));
1898 
1899       gimple_stmt_iterator gsi = gsi_last_bb (loop_preheader_edge (loop)->src);
1900 
1901       alt_bound
1902 	= force_gimple_operand_gsi (&gsi, alt_bound, true, NULL_TREE, false,
1903 				    GSI_CONTINUE_LINKING);
1904     }
1905 
1906   transform_to_exit_first_loop_alt (loop, reduction_list, alt_bound);
1907   return true;
1908 }
1909 
1910 /* Moves the exit condition of LOOP to the beginning of its header.  NIT is the
1911    number of iterations of the loop.  REDUCTION_LIST describes the reductions in
1912    LOOP.  */
1913 
1914 static void
transform_to_exit_first_loop(struct loop * loop,reduction_info_table_type * reduction_list,tree nit)1915 transform_to_exit_first_loop (struct loop *loop,
1916 			      reduction_info_table_type *reduction_list,
1917 			      tree nit)
1918 {
1919   basic_block *bbs, *nbbs, ex_bb, orig_header;
1920   unsigned n;
1921   bool ok;
1922   edge exit = single_dom_exit (loop), hpred;
1923   tree control, control_name, res, t;
1924   gphi *phi, *nphi;
1925   gassign *stmt;
1926   gcond *cond_stmt, *cond_nit;
1927   tree nit_1;
1928 
1929   split_block_after_labels (loop->header);
1930   orig_header = single_succ (loop->header);
1931   hpred = single_succ_edge (loop->header);
1932 
1933   cond_stmt = as_a <gcond *> (last_stmt (exit->src));
1934   control = gimple_cond_lhs (cond_stmt);
1935   gcc_assert (gimple_cond_rhs (cond_stmt) == nit);
1936 
1937   /* Make sure that we have phi nodes on exit for all loop header phis
1938      (create_parallel_loop requires that).  */
1939   for (gphi_iterator gsi = gsi_start_phis (loop->header);
1940        !gsi_end_p (gsi);
1941        gsi_next (&gsi))
1942     {
1943       phi = gsi.phi ();
1944       res = PHI_RESULT (phi);
1945       t = copy_ssa_name (res, phi);
1946       SET_PHI_RESULT (phi, t);
1947       nphi = create_phi_node (res, orig_header);
1948       add_phi_arg (nphi, t, hpred, UNKNOWN_LOCATION);
1949 
1950       if (res == control)
1951 	{
1952 	  gimple_cond_set_lhs (cond_stmt, t);
1953 	  update_stmt (cond_stmt);
1954 	  control = t;
1955 	}
1956     }
1957 
1958   bbs = get_loop_body_in_dom_order (loop);
1959 
1960   for (n = 0; bbs[n] != exit->src; n++)
1961    continue;
1962   nbbs = XNEWVEC (basic_block, n);
1963   ok = gimple_duplicate_sese_tail (single_succ_edge (loop->header), exit,
1964 				   bbs + 1, n, nbbs);
1965   gcc_assert (ok);
1966   free (bbs);
1967   ex_bb = nbbs[0];
1968   free (nbbs);
1969 
1970   /* Other than reductions, the only gimple reg that should be copied
1971      out of the loop is the control variable.  */
1972   exit = single_dom_exit (loop);
1973   control_name = NULL_TREE;
1974   for (gphi_iterator gsi = gsi_start_phis (ex_bb);
1975        !gsi_end_p (gsi); )
1976     {
1977       phi = gsi.phi ();
1978       res = PHI_RESULT (phi);
1979       if (virtual_operand_p (res))
1980 	{
1981 	  gsi_next (&gsi);
1982 	  continue;
1983 	}
1984 
1985       /* Check if it is a part of reduction.  If it is,
1986          keep the phi at the reduction's keep_res field.  The
1987          PHI_RESULT of this phi is the resulting value of the reduction
1988          variable when exiting the loop.  */
1989 
1990       if (reduction_list->elements () > 0)
1991 	{
1992 	  struct reduction_info *red;
1993 
1994 	  tree val = PHI_ARG_DEF_FROM_EDGE (phi, exit);
1995 	  red = reduction_phi (reduction_list, SSA_NAME_DEF_STMT (val));
1996 	  if (red)
1997 	    {
1998 	      red->keep_res = phi;
1999 	      gsi_next (&gsi);
2000 	      continue;
2001 	    }
2002 	}
2003       gcc_assert (control_name == NULL_TREE
2004 		  && SSA_NAME_VAR (res) == SSA_NAME_VAR (control));
2005       control_name = res;
2006       remove_phi_node (&gsi, false);
2007     }
2008   gcc_assert (control_name != NULL_TREE);
2009 
2010   /* Initialize the control variable to number of iterations
2011      according to the rhs of the exit condition.  */
2012   gimple_stmt_iterator gsi = gsi_after_labels (ex_bb);
2013   cond_nit = as_a <gcond *> (last_stmt (exit->src));
2014   nit_1 =  gimple_cond_rhs (cond_nit);
2015   nit_1 = force_gimple_operand_gsi (&gsi,
2016 				  fold_convert (TREE_TYPE (control_name), nit_1),
2017 				  false, NULL_TREE, false, GSI_SAME_STMT);
2018   stmt = gimple_build_assign (control_name, nit_1);
2019   gsi_insert_before (&gsi, stmt, GSI_NEW_STMT);
2020 }
2021 
2022 /* Create the parallel constructs for LOOP as described in gen_parallel_loop.
2023    LOOP_FN and DATA are the arguments of GIMPLE_OMP_PARALLEL.
2024    NEW_DATA is the variable that should be initialized from the argument
2025    of LOOP_FN.  N_THREADS is the requested number of threads, which can be 0 if
2026    that number is to be determined later.  */
2027 
2028 static void
create_parallel_loop(struct loop * loop,tree loop_fn,tree data,tree new_data,unsigned n_threads,location_t loc,bool oacc_kernels_p)2029 create_parallel_loop (struct loop *loop, tree loop_fn, tree data,
2030 		      tree new_data, unsigned n_threads, location_t loc,
2031 		      bool oacc_kernels_p)
2032 {
2033   gimple_stmt_iterator gsi;
2034   basic_block for_bb, ex_bb, continue_bb;
2035   tree t, param;
2036   gomp_parallel *omp_par_stmt;
2037   gimple *omp_return_stmt1, *omp_return_stmt2;
2038   gimple *phi;
2039   gcond *cond_stmt;
2040   gomp_for *for_stmt;
2041   gomp_continue *omp_cont_stmt;
2042   tree cvar, cvar_init, initvar, cvar_next, cvar_base, type;
2043   edge exit, nexit, guard, end, e;
2044 
2045   if (oacc_kernels_p)
2046     {
2047       gcc_checking_assert (lookup_attribute ("oacc kernels",
2048 					     DECL_ATTRIBUTES (cfun->decl)));
2049       /* Indicate to later processing that this is a parallelized OpenACC
2050 	 kernels construct.  */
2051       DECL_ATTRIBUTES (cfun->decl)
2052 	= tree_cons (get_identifier ("oacc kernels parallelized"),
2053 		     NULL_TREE, DECL_ATTRIBUTES (cfun->decl));
2054     }
2055   else
2056     {
2057       /* Prepare the GIMPLE_OMP_PARALLEL statement.  */
2058 
2059       basic_block bb = loop_preheader_edge (loop)->src;
2060       basic_block paral_bb = single_pred (bb);
2061       gsi = gsi_last_bb (paral_bb);
2062 
2063       gcc_checking_assert (n_threads != 0);
2064       t = build_omp_clause (loc, OMP_CLAUSE_NUM_THREADS);
2065       OMP_CLAUSE_NUM_THREADS_EXPR (t)
2066 	= build_int_cst (integer_type_node, n_threads);
2067       omp_par_stmt = gimple_build_omp_parallel (NULL, t, loop_fn, data);
2068       gimple_set_location (omp_par_stmt, loc);
2069 
2070       gsi_insert_after (&gsi, omp_par_stmt, GSI_NEW_STMT);
2071 
2072       /* Initialize NEW_DATA.  */
2073       if (data)
2074 	{
2075 	  gassign *assign_stmt;
2076 
2077 	  gsi = gsi_after_labels (bb);
2078 
2079 	  param = make_ssa_name (DECL_ARGUMENTS (loop_fn));
2080 	  assign_stmt = gimple_build_assign (param, build_fold_addr_expr (data));
2081 	  gsi_insert_before (&gsi, assign_stmt, GSI_SAME_STMT);
2082 
2083 	  assign_stmt = gimple_build_assign (new_data,
2084 					     fold_convert (TREE_TYPE (new_data), param));
2085 	  gsi_insert_before (&gsi, assign_stmt, GSI_SAME_STMT);
2086 	}
2087 
2088       /* Emit GIMPLE_OMP_RETURN for GIMPLE_OMP_PARALLEL.  */
2089       bb = split_loop_exit_edge (single_dom_exit (loop));
2090       gsi = gsi_last_bb (bb);
2091       omp_return_stmt1 = gimple_build_omp_return (false);
2092       gimple_set_location (omp_return_stmt1, loc);
2093       gsi_insert_after (&gsi, omp_return_stmt1, GSI_NEW_STMT);
2094     }
2095 
2096   /* Extract data for GIMPLE_OMP_FOR.  */
2097   gcc_assert (loop->header == single_dom_exit (loop)->src);
2098   cond_stmt = as_a <gcond *> (last_stmt (loop->header));
2099 
2100   cvar = gimple_cond_lhs (cond_stmt);
2101   cvar_base = SSA_NAME_VAR (cvar);
2102   phi = SSA_NAME_DEF_STMT (cvar);
2103   cvar_init = PHI_ARG_DEF_FROM_EDGE (phi, loop_preheader_edge (loop));
2104   initvar = copy_ssa_name (cvar);
2105   SET_USE (PHI_ARG_DEF_PTR_FROM_EDGE (phi, loop_preheader_edge (loop)),
2106 	   initvar);
2107   cvar_next = PHI_ARG_DEF_FROM_EDGE (phi, loop_latch_edge (loop));
2108 
2109   gsi = gsi_last_nondebug_bb (loop->latch);
2110   gcc_assert (gsi_stmt (gsi) == SSA_NAME_DEF_STMT (cvar_next));
2111   gsi_remove (&gsi, true);
2112 
2113   /* Prepare cfg.  */
2114   for_bb = split_edge (loop_preheader_edge (loop));
2115   ex_bb = split_loop_exit_edge (single_dom_exit (loop));
2116   extract_true_false_edges_from_block (loop->header, &nexit, &exit);
2117   gcc_assert (exit == single_dom_exit (loop));
2118 
2119   guard = make_edge (for_bb, ex_bb, 0);
2120   /* FIXME: What is the probability?  */
2121   guard->probability = profile_probability::guessed_never ();
2122   /* Split the latch edge, so LOOPS_HAVE_SIMPLE_LATCHES is still valid.  */
2123   loop->latch = split_edge (single_succ_edge (loop->latch));
2124   single_pred_edge (loop->latch)->flags = 0;
2125   end = make_single_succ_edge (single_pred (loop->latch), ex_bb, EDGE_FALLTHRU);
2126   rescan_loop_exit (end, true, false);
2127 
2128   for (gphi_iterator gpi = gsi_start_phis (ex_bb);
2129        !gsi_end_p (gpi); gsi_next (&gpi))
2130     {
2131       source_location locus;
2132       gphi *phi = gpi.phi ();
2133       tree def = PHI_ARG_DEF_FROM_EDGE (phi, exit);
2134       gimple *def_stmt = SSA_NAME_DEF_STMT (def);
2135 
2136       /* If the exit phi is not connected to a header phi in the same loop, this
2137 	 value is not modified in the loop, and we're done with this phi.  */
2138       if (!(gimple_code (def_stmt) == GIMPLE_PHI
2139 	    && gimple_bb (def_stmt) == loop->header))
2140 	{
2141 	  locus = gimple_phi_arg_location_from_edge (phi, exit);
2142 	  add_phi_arg (phi, def, guard, locus);
2143 	  add_phi_arg (phi, def, end, locus);
2144 	  continue;
2145 	}
2146 
2147       gphi *stmt = as_a <gphi *> (def_stmt);
2148       def = PHI_ARG_DEF_FROM_EDGE (stmt, loop_preheader_edge (loop));
2149       locus = gimple_phi_arg_location_from_edge (stmt,
2150 						 loop_preheader_edge (loop));
2151       add_phi_arg (phi, def, guard, locus);
2152 
2153       def = PHI_ARG_DEF_FROM_EDGE (stmt, loop_latch_edge (loop));
2154       locus = gimple_phi_arg_location_from_edge (stmt, loop_latch_edge (loop));
2155       add_phi_arg (phi, def, end, locus);
2156     }
2157   e = redirect_edge_and_branch (exit, nexit->dest);
2158   PENDING_STMT (e) = NULL;
2159 
2160   /* Emit GIMPLE_OMP_FOR.  */
2161   if (oacc_kernels_p)
2162     /* Parallelized OpenACC kernels constructs use gang parallelism.  See also
2163        omp-offload.c:execute_oacc_device_lower.  */
2164     t = build_omp_clause (loc, OMP_CLAUSE_GANG);
2165   else
2166     {
2167       t = build_omp_clause (loc, OMP_CLAUSE_SCHEDULE);
2168       int chunk_size = PARAM_VALUE (PARAM_PARLOOPS_CHUNK_SIZE);
2169       enum PARAM_PARLOOPS_SCHEDULE_KIND schedule_type \
2170 	= (enum PARAM_PARLOOPS_SCHEDULE_KIND) PARAM_VALUE (PARAM_PARLOOPS_SCHEDULE);
2171       switch (schedule_type)
2172 	{
2173 	case PARAM_PARLOOPS_SCHEDULE_KIND_static:
2174 	  OMP_CLAUSE_SCHEDULE_KIND (t) = OMP_CLAUSE_SCHEDULE_STATIC;
2175 	  break;
2176 	case PARAM_PARLOOPS_SCHEDULE_KIND_dynamic:
2177 	  OMP_CLAUSE_SCHEDULE_KIND (t) = OMP_CLAUSE_SCHEDULE_DYNAMIC;
2178 	  break;
2179 	case PARAM_PARLOOPS_SCHEDULE_KIND_guided:
2180 	  OMP_CLAUSE_SCHEDULE_KIND (t) = OMP_CLAUSE_SCHEDULE_GUIDED;
2181 	  break;
2182 	case PARAM_PARLOOPS_SCHEDULE_KIND_auto:
2183 	  OMP_CLAUSE_SCHEDULE_KIND (t) = OMP_CLAUSE_SCHEDULE_AUTO;
2184 	  chunk_size = 0;
2185 	  break;
2186 	case PARAM_PARLOOPS_SCHEDULE_KIND_runtime:
2187 	  OMP_CLAUSE_SCHEDULE_KIND (t) = OMP_CLAUSE_SCHEDULE_RUNTIME;
2188 	  chunk_size = 0;
2189 	  break;
2190 	default:
2191 	  gcc_unreachable ();
2192 	}
2193       if (chunk_size != 0)
2194 	OMP_CLAUSE_SCHEDULE_CHUNK_EXPR (t)
2195 	  = build_int_cst (integer_type_node, chunk_size);
2196     }
2197 
2198   for_stmt = gimple_build_omp_for (NULL,
2199 				   (oacc_kernels_p
2200 				    ? GF_OMP_FOR_KIND_OACC_LOOP
2201 				    : GF_OMP_FOR_KIND_FOR),
2202 				   t, 1, NULL);
2203 
2204   gimple_cond_set_lhs (cond_stmt, cvar_base);
2205   type = TREE_TYPE (cvar);
2206   gimple_set_location (for_stmt, loc);
2207   gimple_omp_for_set_index (for_stmt, 0, initvar);
2208   gimple_omp_for_set_initial (for_stmt, 0, cvar_init);
2209   gimple_omp_for_set_final (for_stmt, 0, gimple_cond_rhs (cond_stmt));
2210   gimple_omp_for_set_cond (for_stmt, 0, gimple_cond_code (cond_stmt));
2211   gimple_omp_for_set_incr (for_stmt, 0, build2 (PLUS_EXPR, type,
2212 						cvar_base,
2213 						build_int_cst (type, 1)));
2214 
2215   gsi = gsi_last_bb (for_bb);
2216   gsi_insert_after (&gsi, for_stmt, GSI_NEW_STMT);
2217   SSA_NAME_DEF_STMT (initvar) = for_stmt;
2218 
2219   /* Emit GIMPLE_OMP_CONTINUE.  */
2220   continue_bb = single_pred (loop->latch);
2221   gsi = gsi_last_bb (continue_bb);
2222   omp_cont_stmt = gimple_build_omp_continue (cvar_next, cvar);
2223   gimple_set_location (omp_cont_stmt, loc);
2224   gsi_insert_after (&gsi, omp_cont_stmt, GSI_NEW_STMT);
2225   SSA_NAME_DEF_STMT (cvar_next) = omp_cont_stmt;
2226 
2227   /* Emit GIMPLE_OMP_RETURN for GIMPLE_OMP_FOR.  */
2228   gsi = gsi_last_bb (ex_bb);
2229   omp_return_stmt2 = gimple_build_omp_return (true);
2230   gimple_set_location (omp_return_stmt2, loc);
2231   gsi_insert_after (&gsi, omp_return_stmt2, GSI_NEW_STMT);
2232 
2233   /* After the above dom info is hosed.  Re-compute it.  */
2234   free_dominance_info (CDI_DOMINATORS);
2235   calculate_dominance_info (CDI_DOMINATORS);
2236 }
2237 
2238 /* Return number of phis in bb.  If COUNT_VIRTUAL_P is false, don't count the
2239    virtual phi.  */
2240 
2241 static unsigned int
num_phis(basic_block bb,bool count_virtual_p)2242 num_phis (basic_block bb, bool count_virtual_p)
2243 {
2244   unsigned int nr_phis = 0;
2245   gphi_iterator gsi;
2246   for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
2247     {
2248       if (!count_virtual_p && virtual_operand_p (PHI_RESULT (gsi.phi ())))
2249 	continue;
2250 
2251       nr_phis++;
2252     }
2253 
2254   return nr_phis;
2255 }
2256 
2257 /* Generates code to execute the iterations of LOOP in N_THREADS
2258    threads in parallel, which can be 0 if that number is to be determined
2259    later.
2260 
2261    NITER describes number of iterations of LOOP.
2262    REDUCTION_LIST describes the reductions existent in the LOOP.  */
2263 
2264 static void
gen_parallel_loop(struct loop * loop,reduction_info_table_type * reduction_list,unsigned n_threads,struct tree_niter_desc * niter,bool oacc_kernels_p)2265 gen_parallel_loop (struct loop *loop,
2266 		   reduction_info_table_type *reduction_list,
2267 		   unsigned n_threads, struct tree_niter_desc *niter,
2268 		   bool oacc_kernels_p)
2269 {
2270   tree many_iterations_cond, type, nit;
2271   tree arg_struct, new_arg_struct;
2272   gimple_seq stmts;
2273   edge entry, exit;
2274   struct clsn_data clsn_data;
2275   location_t loc;
2276   gimple *cond_stmt;
2277   unsigned int m_p_thread=2;
2278 
2279   /* From
2280 
2281      ---------------------------------------------------------------------
2282      loop
2283        {
2284 	 IV = phi (INIT, IV + STEP)
2285 	 BODY1;
2286 	 if (COND)
2287 	   break;
2288 	 BODY2;
2289        }
2290      ---------------------------------------------------------------------
2291 
2292      with # of iterations NITER (possibly with MAY_BE_ZERO assumption),
2293      we generate the following code:
2294 
2295      ---------------------------------------------------------------------
2296 
2297      if (MAY_BE_ZERO
2298      || NITER < MIN_PER_THREAD * N_THREADS)
2299      goto original;
2300 
2301      BODY1;
2302      store all local loop-invariant variables used in body of the loop to DATA.
2303      GIMPLE_OMP_PARALLEL (OMP_CLAUSE_NUM_THREADS (N_THREADS), LOOPFN, DATA);
2304      load the variables from DATA.
2305      GIMPLE_OMP_FOR (IV = INIT; COND; IV += STEP) (OMP_CLAUSE_SCHEDULE (static))
2306      BODY2;
2307      BODY1;
2308      GIMPLE_OMP_CONTINUE;
2309      GIMPLE_OMP_RETURN         -- GIMPLE_OMP_FOR
2310      GIMPLE_OMP_RETURN         -- GIMPLE_OMP_PARALLEL
2311      goto end;
2312 
2313      original:
2314      loop
2315        {
2316 	 IV = phi (INIT, IV + STEP)
2317 	 BODY1;
2318 	 if (COND)
2319 	   break;
2320 	 BODY2;
2321        }
2322 
2323      end:
2324 
2325    */
2326 
2327   /* Create two versions of the loop -- in the old one, we know that the
2328      number of iterations is large enough, and we will transform it into the
2329      loop that will be split to loop_fn, the new one will be used for the
2330      remaining iterations.  */
2331 
2332   /* We should compute a better number-of-iterations value for outer loops.
2333      That is, if we have
2334 
2335     for (i = 0; i < n; ++i)
2336       for (j = 0; j < m; ++j)
2337         ...
2338 
2339     we should compute nit = n * m, not nit = n.
2340     Also may_be_zero handling would need to be adjusted.  */
2341 
2342   type = TREE_TYPE (niter->niter);
2343   nit = force_gimple_operand (unshare_expr (niter->niter), &stmts, true,
2344 			      NULL_TREE);
2345   if (stmts)
2346     gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), stmts);
2347 
2348   if (!oacc_kernels_p)
2349     {
2350       if (loop->inner)
2351 	m_p_thread=2;
2352       else
2353 	m_p_thread=MIN_PER_THREAD;
2354 
2355       gcc_checking_assert (n_threads != 0);
2356       many_iterations_cond =
2357 	fold_build2 (GE_EXPR, boolean_type_node,
2358 		     nit, build_int_cst (type, m_p_thread * n_threads - 1));
2359 
2360       many_iterations_cond
2361 	= fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
2362 		       invert_truthvalue (unshare_expr (niter->may_be_zero)),
2363 		       many_iterations_cond);
2364       many_iterations_cond
2365 	= force_gimple_operand (many_iterations_cond, &stmts, false, NULL_TREE);
2366       if (stmts)
2367 	gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), stmts);
2368       if (!is_gimple_condexpr (many_iterations_cond))
2369 	{
2370 	  many_iterations_cond
2371 	    = force_gimple_operand (many_iterations_cond, &stmts,
2372 				    true, NULL_TREE);
2373 	  if (stmts)
2374 	    gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop),
2375 					      stmts);
2376 	}
2377 
2378       initialize_original_copy_tables ();
2379 
2380       /* We assume that the loop usually iterates a lot.  */
2381       loop_version (loop, many_iterations_cond, NULL,
2382 		    profile_probability::likely (),
2383 		    profile_probability::unlikely (),
2384 		    profile_probability::likely (),
2385 		    profile_probability::unlikely (), true);
2386       update_ssa (TODO_update_ssa);
2387       free_original_copy_tables ();
2388     }
2389 
2390   /* Base all the induction variables in LOOP on a single control one.  */
2391   canonicalize_loop_ivs (loop, &nit, true);
2392   if (num_phis (loop->header, false) != reduction_list->elements () + 1)
2393     {
2394       /* The call to canonicalize_loop_ivs above failed to "base all the
2395 	 induction variables in LOOP on a single control one".  Do damage
2396 	 control.  */
2397       basic_block preheader = loop_preheader_edge (loop)->src;
2398       basic_block cond_bb = single_pred (preheader);
2399       gcond *cond = as_a <gcond *> (gsi_stmt (gsi_last_bb (cond_bb)));
2400       gimple_cond_make_true (cond);
2401       update_stmt (cond);
2402       /* We've gotten rid of the duplicate loop created by loop_version, but
2403 	 we can't undo whatever canonicalize_loop_ivs has done.
2404 	 TODO: Fix this properly by ensuring that the call to
2405 	 canonicalize_loop_ivs succeeds.  */
2406       if (dump_file
2407 	  && (dump_flags & TDF_DETAILS))
2408 	fprintf (dump_file, "canonicalize_loop_ivs failed for loop %d,"
2409 		 " aborting transformation\n", loop->num);
2410       return;
2411     }
2412 
2413   /* Ensure that the exit condition is the first statement in the loop.
2414      The common case is that latch of the loop is empty (apart from the
2415      increment) and immediately follows the loop exit test.  Attempt to move the
2416      entry of the loop directly before the exit check and increase the number of
2417      iterations of the loop by one.  */
2418   if (try_transform_to_exit_first_loop_alt (loop, reduction_list, nit))
2419     {
2420       if (dump_file
2421 	  && (dump_flags & TDF_DETAILS))
2422 	fprintf (dump_file,
2423 		 "alternative exit-first loop transform succeeded"
2424 		 " for loop %d\n", loop->num);
2425     }
2426   else
2427     {
2428       if (oacc_kernels_p)
2429 	n_threads = 1;
2430 
2431       /* Fall back on the method that handles more cases, but duplicates the
2432 	 loop body: move the exit condition of LOOP to the beginning of its
2433 	 header, and duplicate the part of the last iteration that gets disabled
2434 	 to the exit of the loop.  */
2435       transform_to_exit_first_loop (loop, reduction_list, nit);
2436     }
2437 
2438   /* Generate initializations for reductions.  */
2439   if (reduction_list->elements () > 0)
2440     reduction_list->traverse <struct loop *, initialize_reductions> (loop);
2441 
2442   /* Eliminate the references to local variables from the loop.  */
2443   gcc_assert (single_exit (loop));
2444   entry = loop_preheader_edge (loop);
2445   exit = single_dom_exit (loop);
2446 
2447   /* This rewrites the body in terms of new variables.  This has already
2448      been done for oacc_kernels_p in pass_lower_omp/lower_omp ().  */
2449   if (!oacc_kernels_p)
2450     {
2451       eliminate_local_variables (entry, exit);
2452       /* In the old loop, move all variables non-local to the loop to a
2453 	 structure and back, and create separate decls for the variables used in
2454 	 loop.  */
2455       separate_decls_in_region (entry, exit, reduction_list, &arg_struct,
2456 				&new_arg_struct, &clsn_data);
2457     }
2458   else
2459     {
2460       arg_struct = NULL_TREE;
2461       new_arg_struct = NULL_TREE;
2462       clsn_data.load = NULL_TREE;
2463       clsn_data.load_bb = exit->dest;
2464       clsn_data.store = NULL_TREE;
2465       clsn_data.store_bb = NULL;
2466     }
2467 
2468   /* Create the parallel constructs.  */
2469   loc = UNKNOWN_LOCATION;
2470   cond_stmt = last_stmt (loop->header);
2471   if (cond_stmt)
2472     loc = gimple_location (cond_stmt);
2473   create_parallel_loop (loop, create_loop_fn (loc), arg_struct, new_arg_struct,
2474 			n_threads, loc, oacc_kernels_p);
2475   if (reduction_list->elements () > 0)
2476     create_call_for_reduction (loop, reduction_list, &clsn_data);
2477 
2478   scev_reset ();
2479 
2480   /* Free loop bound estimations that could contain references to
2481      removed statements.  */
2482   free_numbers_of_iterations_estimates (cfun);
2483 }
2484 
2485 /* Returns true when LOOP contains vector phi nodes.  */
2486 
2487 static bool
loop_has_vector_phi_nodes(struct loop * loop ATTRIBUTE_UNUSED)2488 loop_has_vector_phi_nodes (struct loop *loop ATTRIBUTE_UNUSED)
2489 {
2490   unsigned i;
2491   basic_block *bbs = get_loop_body_in_dom_order (loop);
2492   gphi_iterator gsi;
2493   bool res = true;
2494 
2495   for (i = 0; i < loop->num_nodes; i++)
2496     for (gsi = gsi_start_phis (bbs[i]); !gsi_end_p (gsi); gsi_next (&gsi))
2497       if (TREE_CODE (TREE_TYPE (PHI_RESULT (gsi.phi ()))) == VECTOR_TYPE)
2498 	goto end;
2499 
2500   res = false;
2501  end:
2502   free (bbs);
2503   return res;
2504 }
2505 
2506 /* Create a reduction_info struct, initialize it with REDUC_STMT
2507    and PHI, insert it to the REDUCTION_LIST.  */
2508 
2509 static void
build_new_reduction(reduction_info_table_type * reduction_list,gimple * reduc_stmt,gphi * phi)2510 build_new_reduction (reduction_info_table_type *reduction_list,
2511 		     gimple *reduc_stmt, gphi *phi)
2512 {
2513   reduction_info **slot;
2514   struct reduction_info *new_reduction;
2515   enum tree_code reduction_code;
2516 
2517   gcc_assert (reduc_stmt);
2518 
2519   if (gimple_code (reduc_stmt) == GIMPLE_PHI)
2520     {
2521       tree op1 = PHI_ARG_DEF (reduc_stmt, 0);
2522       gimple *def1 = SSA_NAME_DEF_STMT (op1);
2523       reduction_code = gimple_assign_rhs_code (def1);
2524     }
2525   else
2526     reduction_code = gimple_assign_rhs_code (reduc_stmt);
2527   /* Check for OpenMP supported reduction.  */
2528   switch (reduction_code)
2529     {
2530     case PLUS_EXPR:
2531     case MULT_EXPR:
2532     case MAX_EXPR:
2533     case MIN_EXPR:
2534     case BIT_IOR_EXPR:
2535     case BIT_XOR_EXPR:
2536     case BIT_AND_EXPR:
2537     case TRUTH_OR_EXPR:
2538     case TRUTH_XOR_EXPR:
2539     case TRUTH_AND_EXPR:
2540       break;
2541     default:
2542       return;
2543     }
2544 
2545   if (dump_file && (dump_flags & TDF_DETAILS))
2546     {
2547       fprintf (dump_file,
2548 	       "Detected reduction. reduction stmt is:\n");
2549       print_gimple_stmt (dump_file, reduc_stmt, 0);
2550       fprintf (dump_file, "\n");
2551     }
2552 
2553   new_reduction = XCNEW (struct reduction_info);
2554 
2555   new_reduction->reduc_stmt = reduc_stmt;
2556   new_reduction->reduc_phi = phi;
2557   new_reduction->reduc_version = SSA_NAME_VERSION (gimple_phi_result (phi));
2558   new_reduction->reduction_code = reduction_code;
2559   slot = reduction_list->find_slot (new_reduction, INSERT);
2560   *slot = new_reduction;
2561 }
2562 
2563 /* Callback for htab_traverse.  Sets gimple_uid of reduc_phi stmts.  */
2564 
2565 int
set_reduc_phi_uids(reduction_info ** slot,void * data ATTRIBUTE_UNUSED)2566 set_reduc_phi_uids (reduction_info **slot, void *data ATTRIBUTE_UNUSED)
2567 {
2568   struct reduction_info *const red = *slot;
2569   gimple_set_uid (red->reduc_phi, red->reduc_version);
2570   return 1;
2571 }
2572 
2573 /* Return true if the type of reduction performed by STMT is suitable
2574    for this pass.  */
2575 
2576 static bool
valid_reduction_p(gimple * stmt)2577 valid_reduction_p (gimple *stmt)
2578 {
2579   /* Parallelization would reassociate the operation, which isn't
2580      allowed for in-order reductions.  */
2581   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2582   vect_reduction_type reduc_type = STMT_VINFO_REDUC_TYPE (stmt_info);
2583   return reduc_type != FOLD_LEFT_REDUCTION;
2584 }
2585 
2586 /* Detect all reductions in the LOOP, insert them into REDUCTION_LIST.  */
2587 
2588 static void
gather_scalar_reductions(loop_p loop,reduction_info_table_type * reduction_list)2589 gather_scalar_reductions (loop_p loop, reduction_info_table_type *reduction_list)
2590 {
2591   gphi_iterator gsi;
2592   loop_vec_info simple_loop_info;
2593   auto_vec<gphi *, 4> double_reduc_phis;
2594   auto_vec<gimple *, 4> double_reduc_stmts;
2595 
2596   if (!stmt_vec_info_vec.exists ())
2597     init_stmt_vec_info_vec ();
2598 
2599   simple_loop_info = vect_analyze_loop_form (loop);
2600   if (simple_loop_info == NULL)
2601     goto gather_done;
2602 
2603   for (gsi = gsi_start_phis (loop->header); !gsi_end_p (gsi); gsi_next (&gsi))
2604     {
2605       gphi *phi = gsi.phi ();
2606       affine_iv iv;
2607       tree res = PHI_RESULT (phi);
2608       bool double_reduc;
2609 
2610       if (virtual_operand_p (res))
2611 	continue;
2612 
2613       if (simple_iv (loop, loop, res, &iv, true))
2614 	continue;
2615 
2616       gimple *reduc_stmt
2617 	= vect_force_simple_reduction (simple_loop_info, phi,
2618 				       &double_reduc, true);
2619       if (!reduc_stmt || !valid_reduction_p (reduc_stmt))
2620 	continue;
2621 
2622       if (double_reduc)
2623 	{
2624 	  if (loop->inner->inner != NULL)
2625 	    continue;
2626 
2627 	  double_reduc_phis.safe_push (phi);
2628 	  double_reduc_stmts.safe_push (reduc_stmt);
2629 	  continue;
2630 	}
2631 
2632       build_new_reduction (reduction_list, reduc_stmt, phi);
2633     }
2634   delete simple_loop_info;
2635 
2636   if (!double_reduc_phis.is_empty ())
2637     {
2638       simple_loop_info = vect_analyze_loop_form (loop->inner);
2639       if (simple_loop_info)
2640 	{
2641 	  gphi *phi;
2642 	  unsigned int i;
2643 
2644 	  FOR_EACH_VEC_ELT (double_reduc_phis, i, phi)
2645 	    {
2646 	      affine_iv iv;
2647 	      tree res = PHI_RESULT (phi);
2648 	      bool double_reduc;
2649 
2650 	      use_operand_p use_p;
2651 	      gimple *inner_stmt;
2652 	      bool single_use_p = single_imm_use (res, &use_p, &inner_stmt);
2653 	      gcc_assert (single_use_p);
2654 	      if (gimple_code (inner_stmt) != GIMPLE_PHI)
2655 		continue;
2656 	      gphi *inner_phi = as_a <gphi *> (inner_stmt);
2657 	      if (simple_iv (loop->inner, loop->inner, PHI_RESULT (inner_phi),
2658 			     &iv, true))
2659 		continue;
2660 
2661 	      gimple *inner_reduc_stmt
2662 		= vect_force_simple_reduction (simple_loop_info, inner_phi,
2663 					       &double_reduc, true);
2664 	      gcc_assert (!double_reduc);
2665 	      if (inner_reduc_stmt == NULL
2666 		  || !valid_reduction_p (inner_reduc_stmt))
2667 		continue;
2668 
2669 	      build_new_reduction (reduction_list, double_reduc_stmts[i], phi);
2670 	    }
2671 	  delete simple_loop_info;
2672 	}
2673     }
2674 
2675  gather_done:
2676   /* Release the claim on gimple_uid.  */
2677   free_stmt_vec_info_vec ();
2678 
2679   if (reduction_list->elements () == 0)
2680     return;
2681 
2682   /* As gimple_uid is used by the vectorizer in between vect_analyze_loop_form
2683      and free_stmt_vec_info_vec, we can set gimple_uid of reduc_phi stmts only
2684      now.  */
2685   basic_block bb;
2686   FOR_EACH_BB_FN (bb, cfun)
2687     for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
2688       gimple_set_uid (gsi_stmt (gsi), (unsigned int)-1);
2689   reduction_list->traverse <void *, set_reduc_phi_uids> (NULL);
2690 }
2691 
2692 /* Try to initialize NITER for code generation part.  */
2693 
2694 static bool
try_get_loop_niter(loop_p loop,struct tree_niter_desc * niter)2695 try_get_loop_niter (loop_p loop, struct tree_niter_desc *niter)
2696 {
2697   edge exit = single_dom_exit (loop);
2698 
2699   gcc_assert (exit);
2700 
2701   /* We need to know # of iterations, and there should be no uses of values
2702      defined inside loop outside of it, unless the values are invariants of
2703      the loop.  */
2704   if (!number_of_iterations_exit (loop, exit, niter, false))
2705     {
2706       if (dump_file && (dump_flags & TDF_DETAILS))
2707 	fprintf (dump_file, "  FAILED: number of iterations not known\n");
2708       return false;
2709     }
2710 
2711   return true;
2712 }
2713 
2714 /* Return the default def of the first function argument.  */
2715 
2716 static tree
get_omp_data_i_param(void)2717 get_omp_data_i_param (void)
2718 {
2719   tree decl = DECL_ARGUMENTS (cfun->decl);
2720   gcc_assert (DECL_CHAIN (decl) == NULL_TREE);
2721   return ssa_default_def (cfun, decl);
2722 }
2723 
2724 /* For PHI in loop header of LOOP, look for pattern:
2725 
2726    <bb preheader>
2727    .omp_data_i = &.omp_data_arr;
2728    addr = .omp_data_i->sum;
2729    sum_a = *addr;
2730 
2731    <bb header>:
2732    sum_b = PHI <sum_a (preheader), sum_c (latch)>
2733 
2734    and return addr.  Otherwise, return NULL_TREE.  */
2735 
2736 static tree
find_reduc_addr(struct loop * loop,gphi * phi)2737 find_reduc_addr (struct loop *loop, gphi *phi)
2738 {
2739   edge e = loop_preheader_edge (loop);
2740   tree arg = PHI_ARG_DEF_FROM_EDGE (phi, e);
2741   gimple *stmt = SSA_NAME_DEF_STMT (arg);
2742   if (!gimple_assign_single_p (stmt))
2743     return NULL_TREE;
2744   tree memref = gimple_assign_rhs1 (stmt);
2745   if (TREE_CODE (memref) != MEM_REF)
2746     return NULL_TREE;
2747   tree addr = TREE_OPERAND (memref, 0);
2748 
2749   gimple *stmt2 = SSA_NAME_DEF_STMT (addr);
2750   if (!gimple_assign_single_p (stmt2))
2751     return NULL_TREE;
2752   tree compref = gimple_assign_rhs1 (stmt2);
2753   if (TREE_CODE (compref) != COMPONENT_REF)
2754     return NULL_TREE;
2755   tree addr2 = TREE_OPERAND (compref, 0);
2756   if (TREE_CODE (addr2) != MEM_REF)
2757     return NULL_TREE;
2758   addr2 = TREE_OPERAND (addr2, 0);
2759   if (TREE_CODE (addr2) != SSA_NAME
2760       || addr2 != get_omp_data_i_param ())
2761     return NULL_TREE;
2762 
2763   return addr;
2764 }
2765 
2766 /* Try to initialize REDUCTION_LIST for code generation part.
2767    REDUCTION_LIST describes the reductions.  */
2768 
2769 static bool
try_create_reduction_list(loop_p loop,reduction_info_table_type * reduction_list,bool oacc_kernels_p)2770 try_create_reduction_list (loop_p loop,
2771 			   reduction_info_table_type *reduction_list,
2772 			   bool oacc_kernels_p)
2773 {
2774   edge exit = single_dom_exit (loop);
2775   gphi_iterator gsi;
2776 
2777   gcc_assert (exit);
2778 
2779   /* Try to get rid of exit phis.  */
2780   final_value_replacement_loop (loop);
2781 
2782   gather_scalar_reductions (loop, reduction_list);
2783 
2784 
2785   for (gsi = gsi_start_phis (exit->dest); !gsi_end_p (gsi); gsi_next (&gsi))
2786     {
2787       gphi *phi = gsi.phi ();
2788       struct reduction_info *red;
2789       imm_use_iterator imm_iter;
2790       use_operand_p use_p;
2791       gimple *reduc_phi;
2792       tree val = PHI_ARG_DEF_FROM_EDGE (phi, exit);
2793 
2794       if (!virtual_operand_p (val))
2795 	{
2796 	  if (dump_file && (dump_flags & TDF_DETAILS))
2797 	    {
2798 	      fprintf (dump_file, "phi is ");
2799 	      print_gimple_stmt (dump_file, phi, 0);
2800 	      fprintf (dump_file, "arg of phi to exit:   value ");
2801 	      print_generic_expr (dump_file, val);
2802 	      fprintf (dump_file, " used outside loop\n");
2803 	      fprintf (dump_file,
2804 		       "  checking if it is part of reduction pattern:\n");
2805 	    }
2806 	  if (reduction_list->elements () == 0)
2807 	    {
2808 	      if (dump_file && (dump_flags & TDF_DETAILS))
2809 		fprintf (dump_file,
2810 			 "  FAILED: it is not a part of reduction.\n");
2811 	      return false;
2812 	    }
2813 	  reduc_phi = NULL;
2814 	  FOR_EACH_IMM_USE_FAST (use_p, imm_iter, val)
2815 	    {
2816 	      if (!gimple_debug_bind_p (USE_STMT (use_p))
2817 		  && flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
2818 		{
2819 		  reduc_phi = USE_STMT (use_p);
2820 		  break;
2821 		}
2822 	    }
2823 	  red = reduction_phi (reduction_list, reduc_phi);
2824 	  if (red == NULL)
2825 	    {
2826 	      if (dump_file && (dump_flags & TDF_DETAILS))
2827 		fprintf (dump_file,
2828 			 "  FAILED: it is not a part of reduction.\n");
2829 	      return false;
2830 	    }
2831 	  if (red->keep_res != NULL)
2832 	    {
2833 	      if (dump_file && (dump_flags & TDF_DETAILS))
2834 		fprintf (dump_file,
2835 			 "  FAILED: reduction has multiple exit phis.\n");
2836 	      return false;
2837 	    }
2838 	  red->keep_res = phi;
2839 	  if (dump_file && (dump_flags & TDF_DETAILS))
2840 	    {
2841 	      fprintf (dump_file, "reduction phi is  ");
2842 	      print_gimple_stmt (dump_file, red->reduc_phi, 0);
2843 	      fprintf (dump_file, "reduction stmt is  ");
2844 	      print_gimple_stmt (dump_file, red->reduc_stmt, 0);
2845 	    }
2846 	}
2847     }
2848 
2849   /* The iterations of the loop may communicate only through bivs whose
2850      iteration space can be distributed efficiently.  */
2851   for (gsi = gsi_start_phis (loop->header); !gsi_end_p (gsi); gsi_next (&gsi))
2852     {
2853       gphi *phi = gsi.phi ();
2854       tree def = PHI_RESULT (phi);
2855       affine_iv iv;
2856 
2857       if (!virtual_operand_p (def) && !simple_iv (loop, loop, def, &iv, true))
2858 	{
2859 	  struct reduction_info *red;
2860 
2861 	  red = reduction_phi (reduction_list, phi);
2862 	  if (red == NULL)
2863 	    {
2864 	      if (dump_file && (dump_flags & TDF_DETAILS))
2865 		fprintf (dump_file,
2866 			 "  FAILED: scalar dependency between iterations\n");
2867 	      return false;
2868 	    }
2869 	}
2870     }
2871 
2872   if (oacc_kernels_p)
2873     {
2874       for (gsi = gsi_start_phis (loop->header); !gsi_end_p (gsi);
2875 	   gsi_next (&gsi))
2876 	{
2877 	  gphi *phi = gsi.phi ();
2878 	  tree def = PHI_RESULT (phi);
2879 	  affine_iv iv;
2880 
2881 	  if (!virtual_operand_p (def)
2882 	      && !simple_iv (loop, loop, def, &iv, true))
2883 	    {
2884 	      tree addr = find_reduc_addr (loop, phi);
2885 	      if (addr == NULL_TREE)
2886 		return false;
2887 	      struct reduction_info *red = reduction_phi (reduction_list, phi);
2888 	      red->reduc_addr = addr;
2889 	    }
2890 	}
2891     }
2892 
2893   return true;
2894 }
2895 
2896 /* Return true if LOOP contains phis with ADDR_EXPR in args.  */
2897 
2898 static bool
loop_has_phi_with_address_arg(struct loop * loop)2899 loop_has_phi_with_address_arg (struct loop *loop)
2900 {
2901   basic_block *bbs = get_loop_body (loop);
2902   bool res = false;
2903 
2904   unsigned i, j;
2905   gphi_iterator gsi;
2906   for (i = 0; i < loop->num_nodes; i++)
2907     for (gsi = gsi_start_phis (bbs[i]); !gsi_end_p (gsi); gsi_next (&gsi))
2908       {
2909 	gphi *phi = gsi.phi ();
2910 	for (j = 0; j < gimple_phi_num_args (phi); j++)
2911 	  {
2912 	    tree arg = gimple_phi_arg_def (phi, j);
2913 	    if (TREE_CODE (arg) == ADDR_EXPR)
2914 	      {
2915 		/* This should be handled by eliminate_local_variables, but that
2916 		   function currently ignores phis.  */
2917 		res = true;
2918 		goto end;
2919 	      }
2920 	  }
2921       }
2922  end:
2923   free (bbs);
2924 
2925   return res;
2926 }
2927 
2928 /* Return true if memory ref REF (corresponding to the stmt at GSI in
2929    REGIONS_BB[I]) conflicts with the statements in REGIONS_BB[I] after gsi,
2930    or the statements in REGIONS_BB[I + n].  REF_IS_STORE indicates if REF is a
2931    store.  Ignore conflicts with SKIP_STMT.  */
2932 
2933 static bool
ref_conflicts_with_region(gimple_stmt_iterator gsi,ao_ref * ref,bool ref_is_store,vec<basic_block> region_bbs,unsigned int i,gimple * skip_stmt)2934 ref_conflicts_with_region (gimple_stmt_iterator gsi, ao_ref *ref,
2935 			   bool ref_is_store, vec<basic_block> region_bbs,
2936 			   unsigned int i, gimple *skip_stmt)
2937 {
2938   basic_block bb = region_bbs[i];
2939   gsi_next (&gsi);
2940 
2941   while (true)
2942     {
2943       for (; !gsi_end_p (gsi);
2944 	   gsi_next (&gsi))
2945 	{
2946 	  gimple *stmt = gsi_stmt (gsi);
2947 	  if (stmt == skip_stmt)
2948 	    {
2949 	      if (dump_file)
2950 		{
2951 		  fprintf (dump_file, "skipping reduction store: ");
2952 		  print_gimple_stmt (dump_file, stmt, 0);
2953 		}
2954 	      continue;
2955 	    }
2956 
2957 	  if (!gimple_vdef (stmt)
2958 	      && !gimple_vuse (stmt))
2959 	    continue;
2960 
2961 	  if (gimple_code (stmt) == GIMPLE_RETURN)
2962 	    continue;
2963 
2964 	  if (ref_is_store)
2965 	    {
2966 	      if (ref_maybe_used_by_stmt_p (stmt, ref))
2967 		{
2968 		  if (dump_file)
2969 		    {
2970 		      fprintf (dump_file, "Stmt ");
2971 		      print_gimple_stmt (dump_file, stmt, 0);
2972 		    }
2973 		  return true;
2974 		}
2975 	    }
2976 	  else
2977 	    {
2978 	      if (stmt_may_clobber_ref_p_1 (stmt, ref))
2979 		{
2980 		  if (dump_file)
2981 		    {
2982 		      fprintf (dump_file, "Stmt ");
2983 		      print_gimple_stmt (dump_file, stmt, 0);
2984 		    }
2985 		  return true;
2986 		}
2987 	    }
2988 	}
2989       i++;
2990       if (i == region_bbs.length ())
2991 	break;
2992       bb = region_bbs[i];
2993       gsi = gsi_start_bb (bb);
2994     }
2995 
2996   return false;
2997 }
2998 
2999 /* Return true if the bbs in REGION_BBS but not in in_loop_bbs can be executed
3000    in parallel with REGION_BBS containing the loop.  Return the stores of
3001    reduction results in REDUCTION_STORES.  */
3002 
3003 static bool
oacc_entry_exit_ok_1(bitmap in_loop_bbs,vec<basic_block> region_bbs,reduction_info_table_type * reduction_list,bitmap reduction_stores)3004 oacc_entry_exit_ok_1 (bitmap in_loop_bbs, vec<basic_block> region_bbs,
3005 		      reduction_info_table_type *reduction_list,
3006 		      bitmap reduction_stores)
3007 {
3008   tree omp_data_i = get_omp_data_i_param ();
3009 
3010   unsigned i;
3011   basic_block bb;
3012   FOR_EACH_VEC_ELT (region_bbs, i, bb)
3013     {
3014       if (bitmap_bit_p (in_loop_bbs, bb->index))
3015 	continue;
3016 
3017       gimple_stmt_iterator gsi;
3018       for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
3019 	   gsi_next (&gsi))
3020 	{
3021 	  gimple *stmt = gsi_stmt (gsi);
3022 	  gimple *skip_stmt = NULL;
3023 
3024 	  if (is_gimple_debug (stmt)
3025 	      || gimple_code (stmt) == GIMPLE_COND)
3026 	    continue;
3027 
3028 	  ao_ref ref;
3029 	  bool ref_is_store = false;
3030 	  if (gimple_assign_load_p (stmt))
3031 	    {
3032 	      tree rhs = gimple_assign_rhs1 (stmt);
3033 	      tree base = get_base_address (rhs);
3034 	      if (TREE_CODE (base) == MEM_REF
3035 		  && operand_equal_p (TREE_OPERAND (base, 0), omp_data_i, 0))
3036 		continue;
3037 
3038 	      tree lhs = gimple_assign_lhs (stmt);
3039 	      if (TREE_CODE (lhs) == SSA_NAME
3040 		  && has_single_use (lhs))
3041 		{
3042 		  use_operand_p use_p;
3043 		  gimple *use_stmt;
3044 		  single_imm_use (lhs, &use_p, &use_stmt);
3045 		  if (gimple_code (use_stmt) == GIMPLE_PHI)
3046 		    {
3047 		      struct reduction_info *red;
3048 		      red = reduction_phi (reduction_list, use_stmt);
3049 		      tree val = PHI_RESULT (red->keep_res);
3050 		      if (has_single_use (val))
3051 			{
3052 			  single_imm_use (val, &use_p, &use_stmt);
3053 			  if (gimple_store_p (use_stmt))
3054 			    {
3055 			      unsigned int id
3056 				= SSA_NAME_VERSION (gimple_vdef (use_stmt));
3057 			      bitmap_set_bit (reduction_stores, id);
3058 			      skip_stmt = use_stmt;
3059 			      if (dump_file)
3060 				{
3061 				  fprintf (dump_file, "found reduction load: ");
3062 				  print_gimple_stmt (dump_file, stmt, 0);
3063 				}
3064 			    }
3065 			}
3066 		    }
3067 		}
3068 
3069 	      ao_ref_init (&ref, rhs);
3070 	    }
3071 	  else if (gimple_store_p (stmt))
3072 	    {
3073 	      ao_ref_init (&ref, gimple_assign_lhs (stmt));
3074 	      ref_is_store = true;
3075 	    }
3076 	  else if (gimple_code (stmt) == GIMPLE_OMP_RETURN)
3077 	    continue;
3078 	  else if (!gimple_has_side_effects (stmt)
3079 		   && !gimple_could_trap_p (stmt)
3080 		   && !stmt_could_throw_p (stmt)
3081 		   && !gimple_vdef (stmt)
3082 		   && !gimple_vuse (stmt))
3083 	    continue;
3084 	  else if (gimple_call_internal_p (stmt, IFN_GOACC_DIM_POS))
3085 	    continue;
3086 	  else if (gimple_code (stmt) == GIMPLE_RETURN)
3087 	    continue;
3088 	  else
3089 	    {
3090 	      if (dump_file)
3091 		{
3092 		  fprintf (dump_file, "Unhandled stmt in entry/exit: ");
3093 		  print_gimple_stmt (dump_file, stmt, 0);
3094 		}
3095 	      return false;
3096 	    }
3097 
3098 	  if (ref_conflicts_with_region (gsi, &ref, ref_is_store, region_bbs,
3099 					 i, skip_stmt))
3100 	    {
3101 	      if (dump_file)
3102 		{
3103 		  fprintf (dump_file, "conflicts with entry/exit stmt: ");
3104 		  print_gimple_stmt (dump_file, stmt, 0);
3105 		}
3106 	      return false;
3107 	    }
3108 	}
3109     }
3110 
3111   return true;
3112 }
3113 
3114 /* Find stores inside REGION_BBS and outside IN_LOOP_BBS, and guard them with
3115    gang_pos == 0, except when the stores are REDUCTION_STORES.  Return true
3116    if any changes were made.  */
3117 
3118 static bool
oacc_entry_exit_single_gang(bitmap in_loop_bbs,vec<basic_block> region_bbs,bitmap reduction_stores)3119 oacc_entry_exit_single_gang (bitmap in_loop_bbs, vec<basic_block> region_bbs,
3120 			     bitmap reduction_stores)
3121 {
3122   tree gang_pos = NULL_TREE;
3123   bool changed = false;
3124 
3125   unsigned i;
3126   basic_block bb;
3127   FOR_EACH_VEC_ELT (region_bbs, i, bb)
3128     {
3129       if (bitmap_bit_p (in_loop_bbs, bb->index))
3130 	continue;
3131 
3132       gimple_stmt_iterator gsi;
3133       for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi);)
3134 	{
3135 	  gimple *stmt = gsi_stmt (gsi);
3136 
3137 	  if (!gimple_store_p (stmt))
3138 	    {
3139 	      /* Update gsi to point to next stmt.  */
3140 	      gsi_next (&gsi);
3141 	      continue;
3142 	    }
3143 
3144 	  if (bitmap_bit_p (reduction_stores,
3145 			    SSA_NAME_VERSION (gimple_vdef (stmt))))
3146 	    {
3147 	      if (dump_file)
3148 		{
3149 		  fprintf (dump_file,
3150 			   "skipped reduction store for single-gang"
3151 			   " neutering: ");
3152 		  print_gimple_stmt (dump_file, stmt, 0);
3153 		}
3154 
3155 	      /* Update gsi to point to next stmt.  */
3156 	      gsi_next (&gsi);
3157 	      continue;
3158 	    }
3159 
3160 	  changed = true;
3161 
3162 	  if (gang_pos == NULL_TREE)
3163 	    {
3164 	      tree arg = build_int_cst (integer_type_node, GOMP_DIM_GANG);
3165 	      gcall *gang_single
3166 		= gimple_build_call_internal (IFN_GOACC_DIM_POS, 1, arg);
3167 	      gang_pos = make_ssa_name (integer_type_node);
3168 	      gimple_call_set_lhs (gang_single, gang_pos);
3169 	      gimple_stmt_iterator start
3170 		= gsi_start_bb (single_succ (ENTRY_BLOCK_PTR_FOR_FN (cfun)));
3171 	      tree vuse = ssa_default_def (cfun, gimple_vop (cfun));
3172 	      gimple_set_vuse (gang_single, vuse);
3173 	      gsi_insert_before (&start, gang_single, GSI_SAME_STMT);
3174 	    }
3175 
3176 	  if (dump_file)
3177 	    {
3178 	      fprintf (dump_file,
3179 		       "found store that needs single-gang neutering: ");
3180 	      print_gimple_stmt (dump_file, stmt, 0);
3181 	    }
3182 
3183 	  {
3184 	    /* Split block before store.  */
3185 	    gimple_stmt_iterator gsi2 = gsi;
3186 	    gsi_prev (&gsi2);
3187 	    edge e;
3188 	    if (gsi_end_p (gsi2))
3189 	      {
3190 		e = split_block_after_labels (bb);
3191 		gsi2 = gsi_last_bb (bb);
3192 	      }
3193 	    else
3194 	      e = split_block (bb, gsi_stmt (gsi2));
3195 	    basic_block bb2 = e->dest;
3196 
3197 	    /* Split block after store.  */
3198 	    gimple_stmt_iterator gsi3 = gsi_start_bb (bb2);
3199 	    edge e2 = split_block (bb2, gsi_stmt (gsi3));
3200 	    basic_block bb3 = e2->dest;
3201 
3202 	    gimple *cond
3203 	      = gimple_build_cond (EQ_EXPR, gang_pos, integer_zero_node,
3204 				   NULL_TREE, NULL_TREE);
3205 	    gsi_insert_after (&gsi2, cond, GSI_NEW_STMT);
3206 
3207 	    edge e3 = make_edge (bb, bb3, EDGE_FALSE_VALUE);
3208 	    /* FIXME: What is the probability?  */
3209 	    e3->probability = profile_probability::guessed_never ();
3210 	    e->flags = EDGE_TRUE_VALUE;
3211 
3212 	    tree vdef = gimple_vdef (stmt);
3213 	    tree vuse = gimple_vuse (stmt);
3214 
3215 	    tree phi_res = copy_ssa_name (vdef);
3216 	    gphi *new_phi = create_phi_node (phi_res, bb3);
3217 	    replace_uses_by (vdef, phi_res);
3218 	    add_phi_arg (new_phi, vuse, e3, UNKNOWN_LOCATION);
3219 	    add_phi_arg (new_phi, vdef, e2, UNKNOWN_LOCATION);
3220 
3221 	    /* Update gsi to point to next stmt.  */
3222 	    bb = bb3;
3223 	    gsi = gsi_start_bb (bb);
3224 	  }
3225 	}
3226     }
3227 
3228   return changed;
3229 }
3230 
3231 /* Return true if the statements before and after the LOOP can be executed in
3232    parallel with the function containing the loop.  Resolve conflicting stores
3233    outside LOOP by guarding them such that only a single gang executes them.  */
3234 
3235 static bool
oacc_entry_exit_ok(struct loop * loop,reduction_info_table_type * reduction_list)3236 oacc_entry_exit_ok (struct loop *loop,
3237 		    reduction_info_table_type *reduction_list)
3238 {
3239   basic_block *loop_bbs = get_loop_body_in_dom_order (loop);
3240   vec<basic_block> region_bbs
3241     = get_all_dominated_blocks (CDI_DOMINATORS, ENTRY_BLOCK_PTR_FOR_FN (cfun));
3242 
3243   bitmap in_loop_bbs = BITMAP_ALLOC (NULL);
3244   bitmap_clear (in_loop_bbs);
3245   for (unsigned int i = 0; i < loop->num_nodes; i++)
3246     bitmap_set_bit (in_loop_bbs, loop_bbs[i]->index);
3247 
3248   bitmap reduction_stores = BITMAP_ALLOC (NULL);
3249   bool res = oacc_entry_exit_ok_1 (in_loop_bbs, region_bbs, reduction_list,
3250 				   reduction_stores);
3251 
3252   if (res)
3253     {
3254       bool changed = oacc_entry_exit_single_gang (in_loop_bbs, region_bbs,
3255 						  reduction_stores);
3256       if (changed)
3257 	{
3258 	  free_dominance_info (CDI_DOMINATORS);
3259 	  calculate_dominance_info (CDI_DOMINATORS);
3260 	}
3261     }
3262 
3263   region_bbs.release ();
3264   free (loop_bbs);
3265 
3266   BITMAP_FREE (in_loop_bbs);
3267   BITMAP_FREE (reduction_stores);
3268 
3269   return res;
3270 }
3271 
3272 /* Detect parallel loops and generate parallel code using libgomp
3273    primitives.  Returns true if some loop was parallelized, false
3274    otherwise.  */
3275 
3276 static bool
parallelize_loops(bool oacc_kernels_p)3277 parallelize_loops (bool oacc_kernels_p)
3278 {
3279   unsigned n_threads;
3280   bool changed = false;
3281   struct loop *loop;
3282   struct loop *skip_loop = NULL;
3283   struct tree_niter_desc niter_desc;
3284   struct obstack parloop_obstack;
3285   HOST_WIDE_INT estimated;
3286   source_location loop_loc;
3287 
3288   /* Do not parallelize loops in the functions created by parallelization.  */
3289   if (!oacc_kernels_p
3290       && parallelized_function_p (cfun->decl))
3291     return false;
3292 
3293   /* Do not parallelize loops in offloaded functions.  */
3294   if (!oacc_kernels_p
3295       && oacc_get_fn_attrib (cfun->decl) != NULL)
3296      return false;
3297 
3298   if (cfun->has_nonlocal_label)
3299     return false;
3300 
3301   /* For OpenACC kernels, n_threads will be determined later; otherwise, it's
3302      the argument to -ftree-parallelize-loops.  */
3303   if (oacc_kernels_p)
3304     n_threads = 0;
3305   else
3306     n_threads = flag_tree_parallelize_loops;
3307 
3308   gcc_obstack_init (&parloop_obstack);
3309   reduction_info_table_type reduction_list (10);
3310 
3311   calculate_dominance_info (CDI_DOMINATORS);
3312 
3313   FOR_EACH_LOOP (loop, 0)
3314     {
3315       if (loop == skip_loop)
3316 	{
3317 	  if (!loop->in_oacc_kernels_region
3318 	      && dump_file && (dump_flags & TDF_DETAILS))
3319 	    fprintf (dump_file,
3320 		     "Skipping loop %d as inner loop of parallelized loop\n",
3321 		     loop->num);
3322 
3323 	  skip_loop = loop->inner;
3324 	  continue;
3325 	}
3326       else
3327 	skip_loop = NULL;
3328 
3329       reduction_list.empty ();
3330 
3331       if (oacc_kernels_p)
3332 	{
3333 	  if (!loop->in_oacc_kernels_region)
3334 	    continue;
3335 
3336 	  /* Don't try to parallelize inner loops in an oacc kernels region.  */
3337 	  if (loop->inner)
3338 	    skip_loop = loop->inner;
3339 
3340 	  if (dump_file && (dump_flags & TDF_DETAILS))
3341 	    fprintf (dump_file,
3342 		     "Trying loop %d with header bb %d in oacc kernels"
3343 		     " region\n", loop->num, loop->header->index);
3344 	}
3345 
3346       if (dump_file && (dump_flags & TDF_DETAILS))
3347       {
3348         fprintf (dump_file, "Trying loop %d as candidate\n",loop->num);
3349 	if (loop->inner)
3350 	  fprintf (dump_file, "loop %d is not innermost\n",loop->num);
3351 	else
3352 	  fprintf (dump_file, "loop %d is innermost\n",loop->num);
3353       }
3354 
3355       if (!single_dom_exit (loop))
3356       {
3357 
3358         if (dump_file && (dump_flags & TDF_DETAILS))
3359 	  fprintf (dump_file, "loop is !single_dom_exit\n");
3360 
3361 	continue;
3362       }
3363 
3364       if (/* And of course, the loop must be parallelizable.  */
3365 	  !can_duplicate_loop_p (loop)
3366 	  || loop_has_blocks_with_irreducible_flag (loop)
3367 	  || (loop_preheader_edge (loop)->src->flags & BB_IRREDUCIBLE_LOOP)
3368 	  /* FIXME: the check for vector phi nodes could be removed.  */
3369 	  || loop_has_vector_phi_nodes (loop))
3370 	continue;
3371 
3372       estimated = estimated_loop_iterations_int (loop);
3373       if (estimated == -1)
3374 	estimated = get_likely_max_loop_iterations_int (loop);
3375       /* FIXME: Bypass this check as graphite doesn't update the
3376 	 count and frequency correctly now.  */
3377       if (!flag_loop_parallelize_all
3378 	  && !oacc_kernels_p
3379 	  && ((estimated != -1
3380 	       && (estimated
3381 		   < ((HOST_WIDE_INT) n_threads
3382 		      * (loop->inner ? 2 : MIN_PER_THREAD) - 1)))
3383 	      /* Do not bother with loops in cold areas.  */
3384 	      || optimize_loop_nest_for_size_p (loop)))
3385 	continue;
3386 
3387       if (!try_get_loop_niter (loop, &niter_desc))
3388 	continue;
3389 
3390       if (!try_create_reduction_list (loop, &reduction_list, oacc_kernels_p))
3391 	continue;
3392 
3393       if (loop_has_phi_with_address_arg (loop))
3394 	continue;
3395 
3396       if (!loop->can_be_parallel
3397 	  && !loop_parallel_p (loop, &parloop_obstack))
3398 	continue;
3399 
3400       if (oacc_kernels_p
3401 	&& !oacc_entry_exit_ok (loop, &reduction_list))
3402 	{
3403 	  if (dump_file)
3404 	    fprintf (dump_file, "entry/exit not ok: FAILED\n");
3405 	  continue;
3406 	}
3407 
3408       changed = true;
3409       skip_loop = loop->inner;
3410 
3411       loop_loc = find_loop_location (loop);
3412       if (loop->inner)
3413 	dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, loop_loc,
3414 			 "parallelizing outer loop %d\n", loop->num);
3415       else
3416 	dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, loop_loc,
3417 			 "parallelizing inner loop %d\n", loop->num);
3418 
3419       gen_parallel_loop (loop, &reduction_list,
3420 			 n_threads, &niter_desc, oacc_kernels_p);
3421     }
3422 
3423   obstack_free (&parloop_obstack, NULL);
3424 
3425   /* Parallelization will cause new function calls to be inserted through
3426      which local variables will escape.  Reset the points-to solution
3427      for ESCAPED.  */
3428   if (changed)
3429     pt_solution_reset (&cfun->gimple_df->escaped);
3430 
3431   return changed;
3432 }
3433 
3434 /* Parallelization.  */
3435 
3436 namespace {
3437 
3438 const pass_data pass_data_parallelize_loops =
3439 {
3440   GIMPLE_PASS, /* type */
3441   "parloops", /* name */
3442   OPTGROUP_LOOP, /* optinfo_flags */
3443   TV_TREE_PARALLELIZE_LOOPS, /* tv_id */
3444   ( PROP_cfg | PROP_ssa ), /* properties_required */
3445   0, /* properties_provided */
3446   0, /* properties_destroyed */
3447   0, /* todo_flags_start */
3448   0, /* todo_flags_finish */
3449 };
3450 
3451 class pass_parallelize_loops : public gimple_opt_pass
3452 {
3453 public:
pass_parallelize_loops(gcc::context * ctxt)3454   pass_parallelize_loops (gcc::context *ctxt)
3455     : gimple_opt_pass (pass_data_parallelize_loops, ctxt),
3456       oacc_kernels_p (false)
3457   {}
3458 
3459   /* opt_pass methods: */
gate(function *)3460   virtual bool gate (function *)
3461   {
3462     if (oacc_kernels_p)
3463       return flag_openacc;
3464     else
3465       return flag_tree_parallelize_loops > 1;
3466   }
3467   virtual unsigned int execute (function *);
clone()3468   opt_pass * clone () { return new pass_parallelize_loops (m_ctxt); }
set_pass_param(unsigned int n,bool param)3469   void set_pass_param (unsigned int n, bool param)
3470     {
3471       gcc_assert (n == 0);
3472       oacc_kernels_p = param;
3473     }
3474 
3475  private:
3476   bool oacc_kernels_p;
3477 }; // class pass_parallelize_loops
3478 
3479 unsigned
execute(function * fun)3480 pass_parallelize_loops::execute (function *fun)
3481 {
3482   tree nthreads = builtin_decl_explicit (BUILT_IN_OMP_GET_NUM_THREADS);
3483   if (nthreads == NULL_TREE)
3484     return 0;
3485 
3486   bool in_loop_pipeline = scev_initialized_p ();
3487   if (!in_loop_pipeline)
3488     loop_optimizer_init (LOOPS_NORMAL
3489 			 | LOOPS_HAVE_RECORDED_EXITS);
3490 
3491   if (number_of_loops (fun) <= 1)
3492     return 0;
3493 
3494   if (!in_loop_pipeline)
3495     {
3496       rewrite_into_loop_closed_ssa (NULL, TODO_update_ssa);
3497       scev_initialize ();
3498     }
3499 
3500   unsigned int todo = 0;
3501   if (parallelize_loops (oacc_kernels_p))
3502     {
3503       fun->curr_properties &= ~(PROP_gimple_eomp);
3504 
3505       checking_verify_loop_structure ();
3506 
3507       todo |= TODO_update_ssa;
3508     }
3509 
3510   if (!in_loop_pipeline)
3511     {
3512       scev_finalize ();
3513       loop_optimizer_finalize ();
3514     }
3515 
3516   return todo;
3517 }
3518 
3519 } // anon namespace
3520 
3521 gimple_opt_pass *
make_pass_parallelize_loops(gcc::context * ctxt)3522 make_pass_parallelize_loops (gcc::context *ctxt)
3523 {
3524   return new pass_parallelize_loops (ctxt);
3525 }
3526