/* Loop autoparallelization.
   Copyright (C) 2006, 2007, 2008, 2009, 2010, 2011, 2012
   Free Software Foundation, Inc.
   Contributed by Sebastian Pop <pop@cri.ensmp.fr> and
   Zdenek Dvorak <dvorakz@suse.cz>.

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free
Software Foundation; either version 3, or (at your option) any later
version.

GCC is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING3.  If not see
<http://www.gnu.org/licenses/>.  */

#include "config.h"
#include "system.h"
#include "coretypes.h"
#include "tree-flow.h"
#include "cfgloop.h"
#include "tree-data-ref.h"
#include "tree-scalar-evolution.h"
#include "gimple-pretty-print.h"
#include "tree-pass.h"
#include "langhooks.h"
#include "tree-vectorizer.h"

/* This pass tries to distribute iterations of loops into several threads.
   The implementation is straightforward -- for each loop we test whether
   its iterations are independent, and if that is the case (and some
   additional conditions regarding profitability and correctness are
   satisfied), we add GIMPLE_OMP_PARALLEL and GIMPLE_OMP_FOR codes and let
   the omp expansion machinery do its job.

   Most of the complexity is in bringing the code into the shape expected
   by the omp expanders:
   -- for GIMPLE_OMP_FOR, ensuring that the loop has only one induction
      variable and that the exit test is at the start of the loop body
   -- for GIMPLE_OMP_PARALLEL, replacing the references to local addressable
      variables by accesses through pointers, and breaking up ssa chains
      by storing the values incoming to the parallelized loop to a structure
      passed to the new function as an argument (something similar is done
      in omp gimplification, unfortunately only a small part of the code
      can be shared).

   TODO:
   -- if there are several parallelizable loops in a function, it may be
      possible to generate the threads just once (using synchronization to
      ensure that cross-loop dependences are obeyed).
   -- handling of common scalar dependence patterns (accumulation, ...)
   -- handling of non-innermost loops  */
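
/* The pass is enabled by the -ftree-parallelize-loops=N command-line
   option (flag_tree_parallelize_loops); N is the number of threads the
   generated GIMPLE_OMP_PARALLEL regions request.  */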

/*
  Reduction handling:
  Currently we use vect_force_simple_reduction() to detect reduction patterns.
  The code transformation is illustrated by the example below.


parloop
{
  int sum=1;

  for (i = 0; i < N; i++)
   {
    x[i] = i + 3;
    sum+=x[i];
   }
}

gimple-like code:
header_bb:

  # sum_29 = PHI <sum_11(5), 1(3)>
  # i_28 = PHI <i_12(5), 0(3)>
  D.1795_8 = i_28 + 3;
  x[i_28] = D.1795_8;
  sum_11 = D.1795_8 + sum_29;
  i_12 = i_28 + 1;
  if (N_6(D) > i_12)
    goto header_bb;


exit_bb:

  # sum_21 = PHI <sum_11(4)>
  printf (&"%d"[0], sum_21);


after reduction transformation (only relevant parts):

parloop
{

....


  # Storing the initial value given by the user.  #

  .paral_data_store.32.sum.27 = 1;

  #pragma omp parallel num_threads(4)

  #pragma omp for schedule(static)

  # The neutral element corresponding to the particular
  reduction's operation, e.g. 0 for PLUS_EXPR,
  1 for MULT_EXPR, etc., replaces the user's initial value.  #

  # sum.27_29 = PHI <sum.27_11, 0>

  sum.27_11 = D.1827_8 + sum.27_29;

  GIMPLE_OMP_CONTINUE

  # Adding this reduction phi is done at create_phi_for_local_result() #
  # sum.27_56 = PHI <sum.27_11, 0>
  GIMPLE_OMP_RETURN

  # Creating the atomic operation is done at
  create_call_for_reduction_1()  #

  #pragma omp atomic_load
  D.1839_59 = *&.paral_data_load.33_51->reduction.23;
  D.1840_60 = sum.27_56 + D.1839_59;
  #pragma omp atomic_store (D.1840_60);

  GIMPLE_OMP_RETURN

  # Collecting the result after the join of the threads is done at
  create_loads_for_reductions().
  The value computed by the threads is loaded from the
  shared struct.  #


  .paral_data_load.33_52 = &.paral_data_store.32;
  sum_37 = .paral_data_load.33_52->sum.27;
  sum_43 = D.1795_41 + sum_37;

  exit bb:
  # sum_21 = PHI <sum_43, sum_26>
  printf (&"%d"[0], sum_21);

...

}

*/

/* Minimal number of iterations of a loop that should be executed in each
   thread.  */
#define MIN_PER_THREAD 100
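
/* Note: this is a profitability heuristic; unless a loop is expected to
   iterate at least MIN_PER_THREAD times per requested thread, the cost
   of spawning the threads is assumed to outweigh the benefit and the
   loop is left alone.  */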

/* Element of the hashtable, representing a
   reduction in the current loop.  */
struct reduction_info
{
  gimple reduc_stmt;		/* The reduction statement.  */
  gimple reduc_phi;		/* The phi node defining the reduction.  */
  enum tree_code reduction_code;/* The code of the reduction operation.  */
  unsigned reduc_version;	/* SSA_NAME_VERSION of original reduc_phi
				   result.  */
  gimple keep_res;		/* The PHI_RESULT of this phi is the resulting value
				   of the reduction variable when exiting the loop.  */
  tree initial_value;		/* The initial value of the reduction var before entering the loop.  */
  tree field;			/* The field in the parloop data structure used for this reduction.  */
  tree init;			/* Reduction initialization value.  */
  gimple new_phi;		/* (helper field) Newly created phi node whose result
				   will be passed to the atomic operation.  Represents
				   the local result each thread computed for the reduction
				   operation.  */
};

/* Equality and hash functions for hashtab code.  */

static int
reduction_info_eq (const void *aa, const void *bb)
{
  const struct reduction_info *a = (const struct reduction_info *) aa;
  const struct reduction_info *b = (const struct reduction_info *) bb;

  return (a->reduc_phi == b->reduc_phi);
}

static hashval_t
reduction_info_hash (const void *aa)
{
  const struct reduction_info *a = (const struct reduction_info *) aa;

  return a->reduc_version;
}

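/* Returns the reduction info recorded in REDUCTION_LIST for the reduction
   defined by PHI, or NULL if no reduction was recorded for it.  The hash
   is taken from PHI's gimple uid, which is expected to cache the SSA name
   version of the original reduction phi result (see reduc_version above);
   equality is checked on the phi node itself.  */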
static struct reduction_info *
reduction_phi (htab_t reduction_list, gimple phi)
{
  struct reduction_info tmpred, *red;

  if (htab_elements (reduction_list) == 0 || phi == NULL)
    return NULL;

  tmpred.reduc_phi = phi;
  tmpred.reduc_version = gimple_uid (phi);
  red = (struct reduction_info *) htab_find (reduction_list, &tmpred);

  return red;
}

/* Element of hashtable of names to copy.  */

struct name_to_copy_elt
{
  unsigned version;	/* The version of the name to copy.  */
  tree new_name;	/* The new name used in the copy.  */
  tree field;		/* The field of the structure used to pass the
			   value.  */
};

/* Equality and hash functions for hashtab code.  */

static int
name_to_copy_elt_eq (const void *aa, const void *bb)
{
  const struct name_to_copy_elt *a = (const struct name_to_copy_elt *) aa;
  const struct name_to_copy_elt *b = (const struct name_to_copy_elt *) bb;

  return a->version == b->version;
}

static hashval_t
name_to_copy_elt_hash (const void *aa)
{
  const struct name_to_copy_elt *a = (const struct name_to_copy_elt *) aa;

  return (hashval_t) a->version;
}

/* A transformation matrix, which is a self-contained ROWSIZE x COLSIZE
   matrix.  Rather than use floats, we simply keep a single DENOMINATOR that
   represents the denominator for every element in the matrix.  */
typedef struct lambda_trans_matrix_s
{
  lambda_matrix matrix;
  int rowsize;
  int colsize;
  int denominator;
} *lambda_trans_matrix;
#define LTM_MATRIX(T) ((T)->matrix)
#define LTM_ROWSIZE(T) ((T)->rowsize)
#define LTM_COLSIZE(T) ((T)->colsize)
#define LTM_DENOMINATOR(T) ((T)->denominator)

/* Allocate a new transformation matrix.  */

static lambda_trans_matrix
lambda_trans_matrix_new (int colsize, int rowsize,
			 struct obstack * lambda_obstack)
{
  lambda_trans_matrix ret;

  ret = (lambda_trans_matrix)
    obstack_alloc (lambda_obstack, sizeof (struct lambda_trans_matrix_s));
  LTM_MATRIX (ret) = lambda_matrix_new (rowsize, colsize, lambda_obstack);
  LTM_ROWSIZE (ret) = rowsize;
  LTM_COLSIZE (ret) = colsize;
  LTM_DENOMINATOR (ret) = 1;
  return ret;
}

/* Multiply a vector VEC by a matrix MATRIX.  MATRIX is an M*N matrix, and
   VEC is a vector with length N.  The result is stored in DEST, which must
   be a vector of length M.  */

static void
lambda_matrix_vector_mult (lambda_matrix matrix, int m, int n,
			   lambda_vector vec, lambda_vector dest)
{
  int i, j;

  lambda_vector_clear (dest, m);
  for (i = 0; i < m; i++)
    for (j = 0; j < n; j++)
      dest[i] += matrix[i][j] * vec[j];
}

/* Return true if TRANS is a legal transformation matrix that respects
   the dependence vectors in DISTS and DIRS.  The conservative answer
   is false.

   "Wolfe proves that a unimodular transformation represented by the
   matrix T is legal when applied to a loop nest with a set of
   lexicographically non-negative distance vectors RDG if and only if
   for each vector d in RDG, (T.d >= 0) is lexicographically positive.
   i.e.: if and only if it transforms the lexicographically positive
   distance vectors to lexicographically positive vectors.  Note that
   a unimodular matrix must transform the zero vector (and only it) to
   the zero vector." S.Muchnick.  */
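
/* For example, with the 1x1 reversal matrix T = [-1] used by
   loop_parallel_p below, a loop-carried dependence with distance
   vector d = (1) is transformed to T.d = (-1), which is not
   lexicographically positive, so the reversal is rejected.  If the
   loop carries no dependence, there is no distance vector that could
   violate the condition, and the transform is accepted.  */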

static bool
lambda_transform_legal_p (lambda_trans_matrix trans,
			  int nb_loops,
			  VEC (ddr_p, heap) *dependence_relations)
{
  unsigned int i, j;
  lambda_vector distres;
  struct data_dependence_relation *ddr;

  gcc_assert (LTM_COLSIZE (trans) == nb_loops
	      && LTM_ROWSIZE (trans) == nb_loops);

  /* When there are no dependences, the transformation is correct.  */
  if (VEC_length (ddr_p, dependence_relations) == 0)
    return true;

  ddr = VEC_index (ddr_p, dependence_relations, 0);
  if (ddr == NULL)
    return true;

  /* When there is an unknown relation in the dependence_relations, we
     know that it is not worth looking at this loop nest: give up.  */
  if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
    return false;

  distres = lambda_vector_new (nb_loops);

  /* For each distance vector in the dependence graph.  */
  FOR_EACH_VEC_ELT (ddr_p, dependence_relations, i, ddr)
    {
      /* Don't care about relations for which we know that there is no
	 dependence, nor about read-read (a.k.a. input dependences):
	 these data accesses can happen in any order.  */
      if (DDR_ARE_DEPENDENT (ddr) == chrec_known
	  || (DR_IS_READ (DDR_A (ddr)) && DR_IS_READ (DDR_B (ddr))))
	continue;

      /* Conservatively answer: "this transformation is not valid".  */
      if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
	return false;

      /* If the dependence could not be captured by a distance vector,
	 conservatively answer that the transform is not valid.  */
      if (DDR_NUM_DIST_VECTS (ddr) == 0)
	return false;

      /* Compute trans.dist_vect.  */
      for (j = 0; j < DDR_NUM_DIST_VECTS (ddr); j++)
	{
	  lambda_matrix_vector_mult (LTM_MATRIX (trans), nb_loops, nb_loops,
				     DDR_DIST_VECT (ddr, j), distres);

	  if (!lambda_vector_lexico_pos (distres, nb_loops))
	    return false;
	}
    }
  return true;
}

/* Data dependence analysis.  Returns true if the iterations of LOOP
   are independent of each other (that is, if we can execute them
   in parallel).  */

static bool
loop_parallel_p (struct loop *loop, struct obstack * parloop_obstack)
{
  VEC (loop_p, heap) *loop_nest;
  VEC (ddr_p, heap) *dependence_relations;
  VEC (data_reference_p, heap) *datarefs;
  lambda_trans_matrix trans;
  bool ret = false;

  if (dump_file && (dump_flags & TDF_DETAILS))
    {
      fprintf (dump_file, "Considering loop %d\n", loop->num);
      if (!loop->inner)
	fprintf (dump_file, "loop is innermost\n");
      else
	fprintf (dump_file, "loop NOT innermost\n");
    }

  /* Check for problems with dependences.  If the loop can be reversed,
     the iterations are independent.  */
  datarefs = VEC_alloc (data_reference_p, heap, 10);
  dependence_relations = VEC_alloc (ddr_p, heap, 10 * 10);
  loop_nest = VEC_alloc (loop_p, heap, 3);
  if (! compute_data_dependences_for_loop (loop, true, &loop_nest, &datarefs,
					   &dependence_relations))
    {
      if (dump_file && (dump_flags & TDF_DETAILS))
	fprintf (dump_file, "  FAILED: cannot analyze data dependencies\n");
      ret = false;
      goto end;
    }
  if (dump_file && (dump_flags & TDF_DETAILS))
    dump_data_dependence_relations (dump_file, dependence_relations);

  trans = lambda_trans_matrix_new (1, 1, parloop_obstack);
  LTM_MATRIX (trans)[0][0] = -1;

  if (lambda_transform_legal_p (trans, 1, dependence_relations))
    {
      ret = true;
      if (dump_file && (dump_flags & TDF_DETAILS))
	fprintf (dump_file, "  SUCCESS: may be parallelized\n");
    }
  else if (dump_file && (dump_flags & TDF_DETAILS))
    fprintf (dump_file,
	     "  FAILED: data dependencies exist across iterations\n");

 end:
  VEC_free (loop_p, heap, loop_nest);
  free_dependence_relations (dependence_relations);
  free_data_refs (datarefs);

  return ret;
}

/* Return true when LOOP contains basic blocks marked with the
   BB_IRREDUCIBLE_LOOP flag.  */

static inline bool
loop_has_blocks_with_irreducible_flag (struct loop *loop)
{
  unsigned i;
  basic_block *bbs = get_loop_body_in_dom_order (loop);
  bool res = true;

  for (i = 0; i < loop->num_nodes; i++)
    if (bbs[i]->flags & BB_IRREDUCIBLE_LOOP)
      goto end;

  res = false;
 end:
  free (bbs);
  return res;
}

/* Assigns the address of OBJ in TYPE to an ssa name, and returns this name.
   The assignment statement is placed on edge ENTRY.  DECL_ADDRESS maps decls
   to their addresses that can be reused.  The address of OBJ is known to
   be invariant in the whole function.  Other needed statements are placed
   right before GSI.  */

static tree
take_address_of (tree obj, tree type, edge entry, htab_t decl_address,
		 gimple_stmt_iterator *gsi)
{
  int uid;
  void **dslot;
  struct int_tree_map ielt, *nielt;
  tree *var_p, name, bvar, addr;
  gimple stmt;
  gimple_seq stmts;

  /* Since the address of OBJ is invariant, the trees may be shared.
     Avoid rewriting unrelated parts of the code.  */
  obj = unshare_expr (obj);
  for (var_p = &obj;
       handled_component_p (*var_p);
       var_p = &TREE_OPERAND (*var_p, 0))
    continue;

  /* Canonicalize the access to base on a MEM_REF.  */
  if (DECL_P (*var_p))
    *var_p = build_simple_mem_ref (build_fold_addr_expr (*var_p));

  /* Assign a canonical SSA name to the address of the base decl used
     in the address and share it for all accesses and addresses based
     on it.  */
  uid = DECL_UID (TREE_OPERAND (TREE_OPERAND (*var_p, 0), 0));
  ielt.uid = uid;
  dslot = htab_find_slot_with_hash (decl_address, &ielt, uid, INSERT);
  if (!*dslot)
    {
      if (gsi == NULL)
	return NULL;
      addr = TREE_OPERAND (*var_p, 0);
      bvar = create_tmp_var (TREE_TYPE (addr),
			     get_name (TREE_OPERAND
				         (TREE_OPERAND (*var_p, 0), 0)));
      add_referenced_var (bvar);
      stmt = gimple_build_assign (bvar, addr);
      name = make_ssa_name (bvar, stmt);
      gimple_assign_set_lhs (stmt, name);
      gsi_insert_on_edge_immediate (entry, stmt);

      nielt = XNEW (struct int_tree_map);
      nielt->uid = uid;
      nielt->to = name;
      *dslot = nielt;
    }
  else
    name = ((struct int_tree_map *) *dslot)->to;

  /* Express the address in terms of the canonical SSA name.  */
  TREE_OPERAND (*var_p, 0) = name;
  if (gsi == NULL)
    return build_fold_addr_expr_with_type (obj, type);

  name = force_gimple_operand (build_addr (obj, current_function_decl),
			       &stmts, true, NULL_TREE);
  if (!gimple_seq_empty_p (stmts))
    gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);

  if (!useless_type_conversion_p (type, TREE_TYPE (name)))
    {
      name = force_gimple_operand (fold_convert (type, name), &stmts, true,
				   NULL_TREE);
      if (!gimple_seq_empty_p (stmts))
	gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
    }

  return name;
}

/* Callback for htab_traverse.  Create the initialization statement
   for the reduction described in SLOT, and place it at the preheader of
   the loop described in DATA.  */

static int
initialize_reductions (void **slot, void *data)
{
  tree init, c;
  tree bvar, type, arg;
  edge e;

  struct reduction_info *const reduc = (struct reduction_info *) *slot;
  struct loop *loop = (struct loop *) data;

  /* Create initialization in preheader:
     reduction_variable = initialization value of reduction.  */

  /* In the phi node at the header, replace the argument coming
     from the preheader with the reduction initialization value.  */

  /* Create a new variable to initialize the reduction.  */
  type = TREE_TYPE (PHI_RESULT (reduc->reduc_phi));
  bvar = create_tmp_var (type, "reduction");
  add_referenced_var (bvar);

  c = build_omp_clause (gimple_location (reduc->reduc_stmt),
			OMP_CLAUSE_REDUCTION);
  OMP_CLAUSE_REDUCTION_CODE (c) = reduc->reduction_code;
  OMP_CLAUSE_DECL (c) = SSA_NAME_VAR (gimple_assign_lhs (reduc->reduc_stmt));

  init = omp_reduction_init (c, TREE_TYPE (bvar));
  reduc->init = init;

  /* Replace the argument representing the initialization value
     with the initialization value for the reduction (neutral
     element for the particular operation, e.g. 0 for PLUS_EXPR,
     1 for MULT_EXPR, etc).
     Keep the old value in a new variable "reduction_initial",
     that will be taken into consideration after the parallel
     computation is done.  */

  e = loop_preheader_edge (loop);
  arg = PHI_ARG_DEF_FROM_EDGE (reduc->reduc_phi, e);
  /* Remember the initial value; it is needed to compute the final
     result of the reduction after the threads join.  */

  SET_USE (PHI_ARG_DEF_PTR_FROM_EDGE
	   (reduc->reduc_phi, loop_preheader_edge (loop)), init);
  reduc->initial_value = arg;
  return 1;
}

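/* Data used by the walk_tree callback eliminate_local_variables_1: the
   entry edge of the region, the table of canonicalized addresses (see
   take_address_of), the iterator at which new statements may be inserted
   (NULL when processing a debug statement), and flags recording whether
   the expression was changed and whether a debug value must be reset.  */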
struct elv_data
{
  struct walk_stmt_info info;
  edge entry;
  htab_t decl_address;
  gimple_stmt_iterator *gsi;
  bool changed;
  bool reset;
};

/* Eliminates references to local variables in *TP out of the single
   entry single exit region starting at DTA->ENTRY.
   DECL_ADDRESS contains addresses of the references that had their
   address taken already.  If the expression is changed, CHANGED is
   set to true.  Callback for walk_tree.  */

static tree
eliminate_local_variables_1 (tree *tp, int *walk_subtrees, void *data)
{
  struct elv_data *const dta = (struct elv_data *) data;
  tree t = *tp, var, addr, addr_type, type, obj;

  if (DECL_P (t))
    {
      *walk_subtrees = 0;

      if (!SSA_VAR_P (t) || DECL_EXTERNAL (t))
	return NULL_TREE;

      type = TREE_TYPE (t);
      addr_type = build_pointer_type (type);
      addr = take_address_of (t, addr_type, dta->entry, dta->decl_address,
			      dta->gsi);
      if (dta->gsi == NULL && addr == NULL_TREE)
	{
	  dta->reset = true;
	  return NULL_TREE;
	}

      *tp = build_simple_mem_ref (addr);

      dta->changed = true;
      return NULL_TREE;
    }

  if (TREE_CODE (t) == ADDR_EXPR)
    {
      /* ADDR_EXPR may appear in two contexts:
	 -- as a gimple operand, when the address taken is a function invariant
	 -- as gimple rhs, when the resulting address is not a function
	    invariant
	 We do not need to do anything special in the latter case (the base of
	 the memory reference whose address is taken may be replaced in the
	 DECL_P case).  The former case is more complicated, as we need to
	 ensure that the new address is still a gimple operand.  Thus, it
	 is not sufficient to replace just the base of the memory reference --
	 we need to move the whole computation of the address out of the
	 loop.  */
      if (!is_gimple_val (t))
	return NULL_TREE;

      *walk_subtrees = 0;
      obj = TREE_OPERAND (t, 0);
      var = get_base_address (obj);
      if (!var || !SSA_VAR_P (var) || DECL_EXTERNAL (var))
	return NULL_TREE;

      addr_type = TREE_TYPE (t);
      addr = take_address_of (obj, addr_type, dta->entry, dta->decl_address,
			      dta->gsi);
      if (dta->gsi == NULL && addr == NULL_TREE)
	{
	  dta->reset = true;
	  return NULL_TREE;
	}
      *tp = addr;

      dta->changed = true;
      return NULL_TREE;
    }

  if (!EXPR_P (t))
    *walk_subtrees = 0;

  return NULL_TREE;
}

/* Moves the references to local variables in STMT at *GSI out of the single
   entry single exit region starting at ENTRY.  DECL_ADDRESS contains
   addresses of the references that had their address taken
   already.  */

static void
eliminate_local_variables_stmt (edge entry, gimple_stmt_iterator *gsi,
				htab_t decl_address)
{
  struct elv_data dta;
  gimple stmt = gsi_stmt (*gsi);

  memset (&dta.info, '\0', sizeof (dta.info));
  dta.entry = entry;
  dta.decl_address = decl_address;
  dta.changed = false;
  dta.reset = false;

  if (gimple_debug_bind_p (stmt))
    {
      dta.gsi = NULL;
      walk_tree (gimple_debug_bind_get_value_ptr (stmt),
		 eliminate_local_variables_1, &dta.info, NULL);
      if (dta.reset)
	{
	  gimple_debug_bind_reset_value (stmt);
	  dta.changed = true;
	}
    }
  else
    {
      dta.gsi = gsi;
      walk_gimple_op (stmt, eliminate_local_variables_1, &dta.info);
    }

  if (dta.changed)
    update_stmt (stmt);
}

/* Eliminates the references to local variables from the single entry
   single exit region between the ENTRY and EXIT edges.

   This includes:
   1) Taking address of a local variable -- these are moved out of the
   region (and a temporary variable is created to hold the address if
   necessary).

   2) Dereferencing a local variable -- these are replaced with indirect
   references.  */
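
/* For instance (a sketch): for a local variable x used in the region,
   an assignment "p_1 = &x" is emitted on the entry edge, and the uses
   of x inside the region are rewritten to "*p_1" (and uses of "&x" to
   "p_1"), so that the region no longer refers to the local directly.  */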

static void
eliminate_local_variables (edge entry, edge exit)
{
  basic_block bb;
  VEC (basic_block, heap) *body = VEC_alloc (basic_block, heap, 3);
  unsigned i;
  gimple_stmt_iterator gsi;
  bool has_debug_stmt = false;
  htab_t decl_address = htab_create (10, int_tree_map_hash, int_tree_map_eq,
				     free);
  basic_block entry_bb = entry->src;
  basic_block exit_bb = exit->dest;

  gather_blocks_in_sese_region (entry_bb, exit_bb, &body);

  FOR_EACH_VEC_ELT (basic_block, body, i, bb)
    if (bb != entry_bb && bb != exit_bb)
      for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
	if (is_gimple_debug (gsi_stmt (gsi)))
	  {
	    if (gimple_debug_bind_p (gsi_stmt (gsi)))
	      has_debug_stmt = true;
	  }
	else
	  eliminate_local_variables_stmt (entry, &gsi, decl_address);

  if (has_debug_stmt)
    FOR_EACH_VEC_ELT (basic_block, body, i, bb)
      if (bb != entry_bb && bb != exit_bb)
	for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
	  if (gimple_debug_bind_p (gsi_stmt (gsi)))
	    eliminate_local_variables_stmt (entry, &gsi, decl_address);

  htab_delete (decl_address);
  VEC_free (basic_block, heap, body);
}

/* Returns true if expression EXPR is not defined between ENTRY and
   EXIT, i.e. if all its operands are defined outside of the region.  */

static bool
expr_invariant_in_region_p (edge entry, edge exit, tree expr)
{
  basic_block entry_bb = entry->src;
  basic_block exit_bb = exit->dest;
  basic_block def_bb;

  if (is_gimple_min_invariant (expr))
    return true;

  if (TREE_CODE (expr) == SSA_NAME)
    {
      def_bb = gimple_bb (SSA_NAME_DEF_STMT (expr));
      if (def_bb
	  && dominated_by_p (CDI_DOMINATORS, def_bb, entry_bb)
	  && !dominated_by_p (CDI_DOMINATORS, def_bb, exit_bb))
	return false;

      return true;
    }

  return false;
}

/* If COPY_NAME_P is true, creates and returns a duplicate of NAME.
   The copies are stored to NAME_COPIES; if NAME was already duplicated,
   its duplicate stored in NAME_COPIES is returned.

   Regardless of COPY_NAME_P, the decl used as a base of the ssa name is also
   duplicated, storing the copies in DECL_COPIES.  */

static tree
separate_decls_in_region_name (tree name,
			       htab_t name_copies, htab_t decl_copies,
			       bool copy_name_p)
{
  tree copy, var, var_copy;
  unsigned idx, uid, nuid;
  struct int_tree_map ielt, *nielt;
  struct name_to_copy_elt elt, *nelt;
  void **slot, **dslot;

  if (TREE_CODE (name) != SSA_NAME)
    return name;

  idx = SSA_NAME_VERSION (name);
  elt.version = idx;
  slot = htab_find_slot_with_hash (name_copies, &elt, idx,
				   copy_name_p ? INSERT : NO_INSERT);
  if (slot && *slot)
    return ((struct name_to_copy_elt *) *slot)->new_name;

  var = SSA_NAME_VAR (name);
  uid = DECL_UID (var);
  ielt.uid = uid;
  dslot = htab_find_slot_with_hash (decl_copies, &ielt, uid, INSERT);
  if (!*dslot)
    {
      var_copy = create_tmp_var (TREE_TYPE (var), get_name (var));
      DECL_GIMPLE_REG_P (var_copy) = DECL_GIMPLE_REG_P (var);
      add_referenced_var (var_copy);
      nielt = XNEW (struct int_tree_map);
      nielt->uid = uid;
      nielt->to = var_copy;
      *dslot = nielt;

      /* Ensure that when we meet this decl next time, we won't duplicate
         it again.  */
      nuid = DECL_UID (var_copy);
      ielt.uid = nuid;
      dslot = htab_find_slot_with_hash (decl_copies, &ielt, nuid, INSERT);
      gcc_assert (!*dslot);
      nielt = XNEW (struct int_tree_map);
      nielt->uid = nuid;
      nielt->to = var_copy;
      *dslot = nielt;
    }
  else
    var_copy = ((struct int_tree_map *) *dslot)->to;

  if (copy_name_p)
    {
      copy = duplicate_ssa_name (name, NULL);
      nelt = XNEW (struct name_to_copy_elt);
      nelt->version = idx;
      nelt->new_name = copy;
      nelt->field = NULL_TREE;
      *slot = nelt;
    }
  else
    {
      gcc_assert (!slot);
      copy = name;
    }

  SSA_NAME_VAR (copy) = var_copy;
  return copy;
}

/* Finds the ssa names used in STMT that are defined outside the
   region between ENTRY and EXIT and replaces such ssa names with
   their duplicates.  The duplicates are stored to NAME_COPIES.  Base
   decls of all ssa names used in STMT (including those defined in
   LOOP) are replaced with the new temporary variables; the
   replacement decls are stored in DECL_COPIES.  */

static void
separate_decls_in_region_stmt (edge entry, edge exit, gimple stmt,
			       htab_t name_copies, htab_t decl_copies)
{
  use_operand_p use;
  def_operand_p def;
  ssa_op_iter oi;
  tree name, copy;
  bool copy_name_p;

  mark_virtual_ops_for_renaming (stmt);

  FOR_EACH_PHI_OR_STMT_DEF (def, stmt, oi, SSA_OP_DEF)
  {
    name = DEF_FROM_PTR (def);
    gcc_assert (TREE_CODE (name) == SSA_NAME);
    copy = separate_decls_in_region_name (name, name_copies, decl_copies,
					  false);
    gcc_assert (copy == name);
  }

  FOR_EACH_PHI_OR_STMT_USE (use, stmt, oi, SSA_OP_USE)
  {
    name = USE_FROM_PTR (use);
    if (TREE_CODE (name) != SSA_NAME)
      continue;

    copy_name_p = expr_invariant_in_region_p (entry, exit, name);
    copy = separate_decls_in_region_name (name, name_copies, decl_copies,
					  copy_name_p);
    SET_USE (use, copy);
  }
}

/* Finds the ssa names used in the debug stmt STMT and replaces them
   with their duplicates recorded in NAME_COPIES; the variable the
   statement binds is replaced with its duplicate from DECL_COPIES.
   If a used ssa name has no duplicate, the value of the debug bind
   is reset instead.  Returns true if STMT should be removed (its
   variable has no recorded duplicate or is not suitable), false
   otherwise.  */

static bool
separate_decls_in_region_debug (gimple stmt, htab_t name_copies,
				htab_t decl_copies)
{
  use_operand_p use;
  ssa_op_iter oi;
  tree var, name;
  struct int_tree_map ielt;
  struct name_to_copy_elt elt;
  void **slot, **dslot;

  if (gimple_debug_bind_p (stmt))
    var = gimple_debug_bind_get_var (stmt);
  else if (gimple_debug_source_bind_p (stmt))
    var = gimple_debug_source_bind_get_var (stmt);
  else
    return true;
  if (TREE_CODE (var) == DEBUG_EXPR_DECL || TREE_CODE (var) == LABEL_DECL)
    return true;
  gcc_assert (DECL_P (var) && SSA_VAR_P (var));
  ielt.uid = DECL_UID (var);
  dslot = htab_find_slot_with_hash (decl_copies, &ielt, ielt.uid, NO_INSERT);
  if (!dslot)
    return true;
  if (gimple_debug_bind_p (stmt))
    gimple_debug_bind_set_var (stmt, ((struct int_tree_map *) *dslot)->to);
  else if (gimple_debug_source_bind_p (stmt))
    gimple_debug_source_bind_set_var (stmt, ((struct int_tree_map *) *dslot)->to);

  FOR_EACH_PHI_OR_STMT_USE (use, stmt, oi, SSA_OP_USE)
  {
    name = USE_FROM_PTR (use);
    if (TREE_CODE (name) != SSA_NAME)
      continue;

    elt.version = SSA_NAME_VERSION (name);
    slot = htab_find_slot_with_hash (name_copies, &elt, elt.version, NO_INSERT);
    if (!slot)
      {
	gimple_debug_bind_reset_value (stmt);
	update_stmt (stmt);
	break;
      }

    SET_USE (use, ((struct name_to_copy_elt *) *slot)->new_name);
  }

  return false;
}

/* Callback for htab_traverse.  Adds a field corresponding to the reduction
   specified in SLOT.  The type is passed in DATA.  */

static int
add_field_for_reduction (void **slot, void *data)
{
  struct reduction_info *const red = (struct reduction_info *) *slot;
  tree const type = (tree) data;
  tree var = SSA_NAME_VAR (gimple_assign_lhs (red->reduc_stmt));
  tree field = build_decl (gimple_location (red->reduc_stmt),
			   FIELD_DECL, DECL_NAME (var), TREE_TYPE (var));

  insert_field_into_struct (type, field);

  red->field = field;

  return 1;
}

/* Callback for htab_traverse.  Adds a field corresponding to a ssa name
   described in SLOT.  The type is passed in DATA.  */

static int
add_field_for_name (void **slot, void *data)
{
  struct name_to_copy_elt *const elt = (struct name_to_copy_elt *) *slot;
  tree type = (tree) data;
  tree name = ssa_name (elt->version);
  tree var = SSA_NAME_VAR (name);
  tree field = build_decl (DECL_SOURCE_LOCATION (var),
			   FIELD_DECL, DECL_NAME (var), TREE_TYPE (var));

  insert_field_into_struct (type, field);
  elt->field = field;

  return 1;
}

/* Callback for htab_traverse.  A local result is the intermediate result
   computed by a single thread, or the initial value in case no iteration
   was executed.  This function creates a phi node reflecting these values.
   The phi's result will be stored in the NEW_PHI field of the reduction's
   data structure.  */

static int
create_phi_for_local_result (void **slot, void *data)
{
  struct reduction_info *const reduc = (struct reduction_info *) *slot;
  const struct loop *const loop = (const struct loop *) data;
  edge e;
  gimple new_phi;
  basic_block store_bb;
  tree local_res;
  source_location locus;

  /* STORE_BB is the block where the phi
     should be stored.  It is the destination of the loop exit.
     (Find the fallthru edge from GIMPLE_OMP_CONTINUE).  */
  store_bb = FALLTHRU_EDGE (loop->latch)->dest;

  /* STORE_BB has two predecessors.  One coming from the loop
     (the reduction's result is computed at the loop), and another coming
     from a block preceding the loop, when no iterations are executed
     (the initial value should be taken).  */
  if (EDGE_PRED (store_bb, 0) == FALLTHRU_EDGE (loop->latch))
    e = EDGE_PRED (store_bb, 1);
  else
    e = EDGE_PRED (store_bb, 0);
  local_res
    = make_ssa_name (SSA_NAME_VAR (gimple_assign_lhs (reduc->reduc_stmt)),
		     NULL);
  locus = gimple_location (reduc->reduc_stmt);
  new_phi = create_phi_node (local_res, store_bb);
  SSA_NAME_DEF_STMT (local_res) = new_phi;
  add_phi_arg (new_phi, reduc->init, e, locus);
  add_phi_arg (new_phi, gimple_assign_lhs (reduc->reduc_stmt),
	       FALLTHRU_EDGE (loop->latch), locus);
  reduc->new_phi = new_phi;

  return 1;
}

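/* Describes the shared data structure that communicates values between
   the parent and the child threads: the variable the data is stored to,
   the (pointer) variable it is loaded back through, and the basic blocks
   in which the stores resp. loads are placed.  */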
1030 struct clsn_data
1031 {
1032   tree store;
1033   tree load;
1034 
1035   basic_block store_bb;
1036   basic_block load_bb;
1037 };
1038 
1039 /* Callback for htab_traverse.  Create an atomic instruction for the
1040    reduction described in SLOT.
1041    DATA annotates the place in memory the atomic operation relates to,
1042    and the basic block it needs to be generated in.  */
1043 
1044 static int
1045 create_call_for_reduction_1 (void **slot, void *data)
1046 {
1047   struct reduction_info *const reduc = (struct reduction_info *) *slot;
1048   struct clsn_data *const clsn_data = (struct clsn_data *) data;
1049   gimple_stmt_iterator gsi;
1050   tree type = TREE_TYPE (PHI_RESULT (reduc->reduc_phi));
1051   tree load_struct;
1052   basic_block bb;
1053   basic_block new_bb;
1054   edge e;
1055   tree t, addr, ref, x;
1056   tree tmp_load, name;
1057   gimple load;
1058 
1059   load_struct = build_simple_mem_ref (clsn_data->load);
1060   t = build3 (COMPONENT_REF, type, load_struct, reduc->field, NULL_TREE);
1061 
1062   addr = build_addr (t, current_function_decl);
1063 
1064   /* Create phi node.  */
1065   bb = clsn_data->load_bb;
1066 
1067   e = split_block (bb, t);
1068   new_bb = e->dest;
1069 
1070   tmp_load = create_tmp_var (TREE_TYPE (TREE_TYPE (addr)), NULL);
1071   add_referenced_var (tmp_load);
1072   tmp_load = make_ssa_name (tmp_load, NULL);
1073   load = gimple_build_omp_atomic_load (tmp_load, addr);
1074   SSA_NAME_DEF_STMT (tmp_load) = load;
1075   gsi = gsi_start_bb (new_bb);
1076   gsi_insert_after (&gsi, load, GSI_NEW_STMT);
1077 
1078   e = split_block (new_bb, load);
1079   new_bb = e->dest;
1080   gsi = gsi_start_bb (new_bb);
1081   ref = tmp_load;
1082   x = fold_build2 (reduc->reduction_code,
1083 		   TREE_TYPE (PHI_RESULT (reduc->new_phi)), ref,
1084 		   PHI_RESULT (reduc->new_phi));
1085 
1086   name = force_gimple_operand_gsi (&gsi, x, true, NULL_TREE, true,
1087 				   GSI_CONTINUE_LINKING);
1088 
1089   gsi_insert_after (&gsi, gimple_build_omp_atomic_store (name), GSI_NEW_STMT);
1090   return 1;
1091 }
1092 
1093 /* Create the atomic operation at the join point of the threads.
1094    REDUCTION_LIST describes the reductions in the LOOP.
1095    LD_ST_DATA describes the shared data structure where
1096    shared data is stored in and loaded from.  */
1097 static void
1098 create_call_for_reduction (struct loop *loop, htab_t reduction_list,
1099 			   struct clsn_data *ld_st_data)
1100 {
1101   htab_traverse (reduction_list, create_phi_for_local_result, loop);
1102   /* Find the fallthru edge from GIMPLE_OMP_CONTINUE.  */
1103   ld_st_data->load_bb = FALLTHRU_EDGE (loop->latch)->dest;
1104   htab_traverse (reduction_list, create_call_for_reduction_1, ld_st_data);
1105 }
1106 
1107 /* Callback for htab_traverse.  Loads the final reduction value at the
1108    join point of all threads, and inserts it in the right place.  */
1109 
1110 static int
1111 create_loads_for_reductions (void **slot, void *data)
1112 {
1113   struct reduction_info *const red = (struct reduction_info *) *slot;
1114   struct clsn_data *const clsn_data = (struct clsn_data *) data;
1115   gimple stmt;
1116   gimple_stmt_iterator gsi;
1117   tree type = TREE_TYPE (gimple_assign_lhs (red->reduc_stmt));
1118   tree load_struct;
1119   tree name;
1120   tree x;
1121 
1122   gsi = gsi_after_labels (clsn_data->load_bb);
1123   load_struct = build_simple_mem_ref (clsn_data->load);
1124   load_struct = build3 (COMPONENT_REF, type, load_struct, red->field,
1125 			NULL_TREE);
1126 
1127   x = load_struct;
1128   name = PHI_RESULT (red->keep_res);
1129   stmt = gimple_build_assign (name, x);
1130   SSA_NAME_DEF_STMT (name) = stmt;
1131 
1132   gsi_insert_after (&gsi, stmt, GSI_NEW_STMT);
1133 
1134   for (gsi = gsi_start_phis (gimple_bb (red->keep_res));
1135        !gsi_end_p (gsi); gsi_next (&gsi))
1136     if (gsi_stmt (gsi) == red->keep_res)
1137       {
1138 	remove_phi_node (&gsi, false);
1139 	return 1;
1140       }
1141   gcc_unreachable ();
1142 }
1143 
1144 /* Load the reduction result that was stored in LD_ST_DATA.
1145    REDUCTION_LIST describes the list of reductions that the
1146    loads should be generated for.  */
1147 static void
1148 create_final_loads_for_reduction (htab_t reduction_list,
1149 				  struct clsn_data *ld_st_data)
1150 {
1151   gimple_stmt_iterator gsi;
1152   tree t;
1153   gimple stmt;
1154 
1155   gsi = gsi_after_labels (ld_st_data->load_bb);
1156   t = build_fold_addr_expr (ld_st_data->store);
1157   stmt = gimple_build_assign (ld_st_data->load, t);
1158 
1159   gsi_insert_before (&gsi, stmt, GSI_NEW_STMT);
1160   SSA_NAME_DEF_STMT (ld_st_data->load) = stmt;
1161 
1162   htab_traverse (reduction_list, create_loads_for_reductions, ld_st_data);
1163 
1164 }
1165 
1166 /* Callback for htab_traverse.  Store the neutral value for the
1167   particular reduction's operation, e.g. 0 for PLUS_EXPR,
1168   1 for MULT_EXPR, etc. into the reduction field.
1169   The reduction is specified in SLOT. The store information is
1170   passed in DATA.  */
1171 
1172 static int
1173 create_stores_for_reduction (void **slot, void *data)
1174 {
1175   struct reduction_info *const red = (struct reduction_info *) *slot;
1176   struct clsn_data *const clsn_data = (struct clsn_data *) data;
1177   tree t;
1178   gimple stmt;
1179   gimple_stmt_iterator gsi;
1180   tree type = TREE_TYPE (gimple_assign_lhs (red->reduc_stmt));
1181 
1182   gsi = gsi_last_bb (clsn_data->store_bb);
1183   t = build3 (COMPONENT_REF, type, clsn_data->store, red->field, NULL_TREE);
1184   stmt = gimple_build_assign (t, red->initial_value);
1185   mark_virtual_ops_for_renaming (stmt);
1186   gsi_insert_after (&gsi, stmt, GSI_NEW_STMT);
1187 
1188   return 1;
1189 }
1190 
1191 /* Callback for htab_traverse.  Creates loads to a field of LOAD in LOAD_BB and
1192    store to a field of STORE in STORE_BB for the ssa name and its duplicate
1193    specified in SLOT.  */
1194 
1195 static int
1196 create_loads_and_stores_for_name (void **slot, void *data)
1197 {
1198   struct name_to_copy_elt *const elt = (struct name_to_copy_elt *) *slot;
1199   struct clsn_data *const clsn_data = (struct clsn_data *) data;
1200   tree t;
1201   gimple stmt;
1202   gimple_stmt_iterator gsi;
1203   tree type = TREE_TYPE (elt->new_name);
1204   tree load_struct;
1205 
1206   gsi = gsi_last_bb (clsn_data->store_bb);
1207   t = build3 (COMPONENT_REF, type, clsn_data->store, elt->field, NULL_TREE);
1208   stmt = gimple_build_assign (t, ssa_name (elt->version));
1209   mark_virtual_ops_for_renaming (stmt);
1210   gsi_insert_after (&gsi, stmt, GSI_NEW_STMT);
1211 
1212   gsi = gsi_last_bb (clsn_data->load_bb);
1213   load_struct = build_simple_mem_ref (clsn_data->load);
1214   t = build3 (COMPONENT_REF, type, load_struct, elt->field, NULL_TREE);
1215   stmt = gimple_build_assign (elt->new_name, t);
1216   SSA_NAME_DEF_STMT (elt->new_name) = stmt;
1217   gsi_insert_after (&gsi, stmt, GSI_NEW_STMT);
1218 
1219   return 1;
1220 }
1221 
1222 /* Moves all the variables used in LOOP and defined outside of it (including
1223    the initial values of loop phi nodes, and *PER_THREAD if it is a ssa
1224    name) to a structure created for this purpose.  The code
1225 
1226    while (1)
1227      {
1228        use (a);
1229        use (b);
1230      }
1231 
1232    is transformed this way:
1233 
1234    bb0:
1235    old.a = a;
1236    old.b = b;
1237 
1238    bb1:
1239    a' = new->a;
1240    b' = new->b;
1241    while (1)
1242      {
1243        use (a');
1244        use (b');
1245      }
1246 
1247    `old' is stored to *ARG_STRUCT and `new' is stored to NEW_ARG_STRUCT.  The
1248    pointer `new' is intentionally not initialized (the loop will be split to a
1249    separate function later, and `new' will be initialized from its arguments).
1250    LD_ST_DATA holds information about the shared data structure used to pass
1251    information among the threads.  It is initialized here, and
1252    gen_parallel_loop will pass it to create_call_for_reduction that
1253    needs this information.  REDUCTION_LIST describes the reductions
1254    in LOOP.  */
1255 
1256 static void
1257 separate_decls_in_region (edge entry, edge exit, htab_t reduction_list,
1258 			  tree *arg_struct, tree *new_arg_struct,
1259 			  struct clsn_data *ld_st_data)
1260 
1261 {
1262   basic_block bb1 = split_edge (entry);
1263   basic_block bb0 = single_pred (bb1);
1264   htab_t name_copies = htab_create (10, name_to_copy_elt_hash,
1265 				    name_to_copy_elt_eq, free);
1266   htab_t decl_copies = htab_create (10, int_tree_map_hash, int_tree_map_eq,
1267 				    free);
1268   unsigned i;
1269   tree type, type_name, nvar;
1270   gimple_stmt_iterator gsi;
1271   struct clsn_data clsn_data;
1272   VEC (basic_block, heap) *body = VEC_alloc (basic_block, heap, 3);
1273   basic_block bb;
1274   basic_block entry_bb = bb1;
1275   basic_block exit_bb = exit->dest;
1276   bool has_debug_stmt = false;
1277 
1278   entry = single_succ_edge (entry_bb);
1279   gather_blocks_in_sese_region (entry_bb, exit_bb, &body);
1280 
1281   FOR_EACH_VEC_ELT (basic_block, body, i, bb)
1282     {
1283       if (bb != entry_bb && bb != exit_bb)
1284 	{
1285 	  for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
1286 	    separate_decls_in_region_stmt (entry, exit, gsi_stmt (gsi),
1287 					   name_copies, decl_copies);
1288 
1289 	  for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
1290 	    {
1291 	      gimple stmt = gsi_stmt (gsi);
1292 
1293 	      if (is_gimple_debug (stmt))
1294 		has_debug_stmt = true;
1295 	      else
1296 		separate_decls_in_region_stmt (entry, exit, stmt,
1297 					       name_copies, decl_copies);
1298 	    }
1299 	}
1300     }
1301 
1302   /* Now process debug bind stmts.  We must not create decls while
1303      processing debug stmts, so we defer their processing so as to
1304      make sure we will have debug info for as many variables as
1305      possible (all of those that were dealt with in the loop above),
1306      and discard those for which we know there's nothing we can
1307      do.  */
1308   if (has_debug_stmt)
1309     FOR_EACH_VEC_ELT (basic_block, body, i, bb)
1310       if (bb != entry_bb && bb != exit_bb)
1311 	{
1312 	  for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi);)
1313 	    {
1314 	      gimple stmt = gsi_stmt (gsi);
1315 
1316 	      if (is_gimple_debug (stmt))
1317 		{
1318 		  if (separate_decls_in_region_debug (stmt, name_copies,
1319 						      decl_copies))
1320 		    {
1321 		      gsi_remove (&gsi, true);
1322 		      continue;
1323 		    }
1324 		}
1325 
1326 	      gsi_next (&gsi);
1327 	    }
1328 	}
1329 
1330   VEC_free (basic_block, heap, body);
1331 
1332   if (htab_elements (name_copies) == 0 && htab_elements (reduction_list) == 0)
1333     {
1334       /* It may happen that there is nothing to copy (if there are only
1335          loop carried and external variables in the loop).  */
1336       *arg_struct = NULL;
1337       *new_arg_struct = NULL;
1338     }
1339   else
1340     {
1341       /* Create the type for the structure to store the ssa names to.  */
1342       type = lang_hooks.types.make_type (RECORD_TYPE);
1343       type_name = build_decl (UNKNOWN_LOCATION,
1344 			      TYPE_DECL, create_tmp_var_name (".paral_data"),
1345 			      type);
1346       TYPE_NAME (type) = type_name;
1347 
1348       htab_traverse (name_copies, add_field_for_name, type);
1349       if (reduction_list && htab_elements (reduction_list) > 0)
1350 	{
1351 	  /* Create the fields for reductions.  */
1352 	  htab_traverse (reduction_list, add_field_for_reduction,
1353                          type);
1354 	}
1355       layout_type (type);
1356 
1357       /* Create the loads and stores.  */
1358       *arg_struct = create_tmp_var (type, ".paral_data_store");
1359       add_referenced_var (*arg_struct);
1360       nvar = create_tmp_var (build_pointer_type (type), ".paral_data_load");
1361       add_referenced_var (nvar);
1362       *new_arg_struct = make_ssa_name (nvar, NULL);
1363 
1364       ld_st_data->store = *arg_struct;
1365       ld_st_data->load = *new_arg_struct;
1366       ld_st_data->store_bb = bb0;
1367       ld_st_data->load_bb = bb1;
1368 
1369       htab_traverse (name_copies, create_loads_and_stores_for_name,
1370 		     ld_st_data);
1371 
1372       /* Load the calculation from memory (after the join of the threads).  */
1373 
1374       if (reduction_list && htab_elements (reduction_list) > 0)
1375 	{
1376 	  htab_traverse (reduction_list, create_stores_for_reduction,
1377                         ld_st_data);
1378 	  clsn_data.load = make_ssa_name (nvar, NULL);
1379 	  clsn_data.load_bb = exit->dest;
1380 	  clsn_data.store = ld_st_data->store;
1381 	  create_final_loads_for_reduction (reduction_list, &clsn_data);
1382 	}
1383     }
1384 
1385   htab_delete (decl_copies);
1386   htab_delete (name_copies);
1387 }
1388 
1389 /* Bitmap containing uids of functions created by parallelization.  We cannot
1390    allocate it from the default obstack, as it must live across compilation
1391    of several functions; we make it gc allocated instead.  */
1392 
1393 static GTY(()) bitmap parallelized_functions;
1394 
1395 /* Returns true if FN was created by create_loop_fn.  */
1396 
1397 static bool
1398 parallelized_function_p (tree fn)
1399 {
1400   if (!parallelized_functions || !DECL_ARTIFICIAL (fn))
1401     return false;
1402 
1403   return bitmap_bit_p (parallelized_functions, DECL_UID (fn));
1404 }
1405 
1406 /* Creates and returns an empty function that will receive the body of
1407    a parallelized loop.  */
1408 
1409 static tree
1410 create_loop_fn (location_t loc)
1411 {
1412   char buf[100];
1413   char *tname;
1414   tree decl, type, name, t;
1415   struct function *act_cfun = cfun;
1416   static unsigned loopfn_num;
1417 
1418   snprintf (buf, 100, "%s.$loopfn", current_function_name ());
1419   ASM_FORMAT_PRIVATE_NAME (tname, buf, loopfn_num++);
1420   clean_symbol_name (tname);
1421   name = get_identifier (tname);
1422   type = build_function_type_list (void_type_node, ptr_type_node, NULL_TREE);
1423 
1424   decl = build_decl (loc, FUNCTION_DECL, name, type);
1425   if (!parallelized_functions)
1426     parallelized_functions = BITMAP_GGC_ALLOC ();
1427   bitmap_set_bit (parallelized_functions, DECL_UID (decl));
1428 
1429   TREE_STATIC (decl) = 1;
1430   TREE_USED (decl) = 1;
1431   DECL_ARTIFICIAL (decl) = 1;
1432   DECL_IGNORED_P (decl) = 0;
1433   TREE_PUBLIC (decl) = 0;
1434   DECL_UNINLINABLE (decl) = 1;
1435   DECL_EXTERNAL (decl) = 0;
1436   DECL_CONTEXT (decl) = NULL_TREE;
1437   DECL_INITIAL (decl) = make_node (BLOCK);
1438 
1439   t = build_decl (loc, RESULT_DECL, NULL_TREE, void_type_node);
1440   DECL_ARTIFICIAL (t) = 1;
1441   DECL_IGNORED_P (t) = 1;
1442   DECL_RESULT (decl) = t;
1443 
1444   t = build_decl (loc, PARM_DECL, get_identifier (".paral_data_param"),
1445 		  ptr_type_node);
1446   DECL_ARTIFICIAL (t) = 1;
1447   DECL_ARG_TYPE (t) = ptr_type_node;
1448   DECL_CONTEXT (t) = decl;
1449   TREE_USED (t) = 1;
1450   DECL_ARGUMENTS (decl) = t;
1451 
1452   allocate_struct_function (decl, false);
1453 
1454   /* The call to allocate_struct_function clobbers CFUN, so we need to restore
1455      it.  */
1456   set_cfun (act_cfun);
1457 
1458   return decl;
1459 }
1460 
1461 /* Moves the exit condition of LOOP to the beginning of its header, and
1462    duplicates the part of the last iteration that gets disabled to the
1463    exit of the loop.  NIT is the number of iterations of the loop
1464    (used to initialize the variables in the duplicated part).
1465 
1466    TODO: the common case is that latch of the loop is empty and immediately
1467    follows the loop exit.  In this case, it would be better not to copy the
1468    body of the loop, but only move the entry of the loop directly before the
1469    exit check and increase the number of iterations of the loop by one.
1470    This may need some additional preconditioning in case NIT = ~0.
1471    REDUCTION_LIST describes the reductions in LOOP.  */
1472 
1473 static void
1474 transform_to_exit_first_loop (struct loop *loop, htab_t reduction_list, tree nit)
1475 {
1476   basic_block *bbs, *nbbs, ex_bb, orig_header;
1477   unsigned n;
1478   bool ok;
1479   edge exit = single_dom_exit (loop), hpred;
1480   tree control, control_name, res, t;
1481   gimple phi, nphi, cond_stmt, stmt, cond_nit;
1482   gimple_stmt_iterator gsi;
1483   tree nit_1;
1484   edge exit_1;
1485   tree new_rhs;
1486 
1487   split_block_after_labels (loop->header);
1488   orig_header = single_succ (loop->header);
1489   hpred = single_succ_edge (loop->header);
1490 
1491   cond_stmt = last_stmt (exit->src);
1492   control = gimple_cond_lhs (cond_stmt);
1493   gcc_assert (gimple_cond_rhs (cond_stmt) == nit);
1494 
1495   /* Make sure that we have phi nodes on exit for all loop header phis
1496      (create_parallel_loop requires that).  */
1497   for (gsi = gsi_start_phis (loop->header); !gsi_end_p (gsi); gsi_next (&gsi))
1498     {
1499       phi = gsi_stmt (gsi);
1500       res = PHI_RESULT (phi);
1501       t = make_ssa_name (SSA_NAME_VAR (res), phi);
1502       SET_PHI_RESULT (phi, t);
1503       nphi = create_phi_node (res, orig_header);
1504       SSA_NAME_DEF_STMT (res) = nphi;
1505       add_phi_arg (nphi, t, hpred, UNKNOWN_LOCATION);
1506 
1507       if (res == control)
1508 	{
1509 	  gimple_cond_set_lhs (cond_stmt, t);
1510 	  update_stmt (cond_stmt);
1511 	  control = t;
1512 	}
1513     }
1514 
1515  /* Setting the condition towards peeling the last iteration:
1516     If the block consisting of the exit condition has the latch as
1517     successor, then the body of the loop is executed before
1518     the exit condition is tested.  In such case, moving the
1519     condition to the entry, causes that the loop will iterate
1520     one less iteration (which is the wanted outcome, since we
1521     peel out the last iteration).  If the body is executed after
1522     the condition, moving the condition to the entry requires
1523     decrementing one iteration.  */
1524   exit_1 = EDGE_SUCC (exit->src, EDGE_SUCC (exit->src, 0) == exit);
1525   if (exit_1->dest == loop->latch)
1526     new_rhs = gimple_cond_rhs (cond_stmt);
1527   else
1528   {
1529     new_rhs = fold_build2 (MINUS_EXPR, TREE_TYPE (gimple_cond_rhs (cond_stmt)),
1530 			   gimple_cond_rhs (cond_stmt),
1531 			   build_int_cst (TREE_TYPE (gimple_cond_rhs (cond_stmt)), 1));
1532     if (TREE_CODE (gimple_cond_rhs (cond_stmt)) == SSA_NAME)
1533       {
1534  	basic_block preheader;
1535   	gimple_stmt_iterator gsi1;
1536 
1537   	preheader = loop_preheader_edge(loop)->src;
1538     	gsi1 = gsi_after_labels (preheader);
1539 	new_rhs = force_gimple_operand_gsi (&gsi1, new_rhs, true,
1540 					    NULL_TREE,false,GSI_CONTINUE_LINKING);
1541       }
1542   }
1543   gimple_cond_set_rhs (cond_stmt, unshare_expr (new_rhs));
1544   gimple_cond_set_lhs (cond_stmt, unshare_expr (gimple_cond_lhs (cond_stmt)));
1545 
1546   bbs = get_loop_body_in_dom_order (loop);
1547 
1548   for (n = 0; bbs[n] != loop->latch; n++)
1549     continue;
1550   nbbs = XNEWVEC (basic_block, n);
1551   ok = gimple_duplicate_sese_tail (single_succ_edge (loop->header), exit,
1552 				   bbs + 1, n, nbbs);
1553   gcc_assert (ok);
1554   free (bbs);
1555   ex_bb = nbbs[0];
1556   free (nbbs);
1557 
1558   /* Other than reductions, the only gimple reg that should be copied
1559      out of the loop is the control variable.  */
1560 
1561   control_name = NULL_TREE;
1562   for (gsi = gsi_start_phis (ex_bb); !gsi_end_p (gsi); )
1563     {
1564       phi = gsi_stmt (gsi);
1565       res = PHI_RESULT (phi);
1566       if (!is_gimple_reg (res))
1567 	{
1568 	  gsi_next (&gsi);
1569 	  continue;
1570 	}
1571 
1572       /* Check if it is a part of reduction.  If it is,
1573          keep the phi at the reduction's keep_res field.  The
1574          PHI_RESULT of this phi is the resulting value of the reduction
1575          variable when exiting the loop.  */
1576 
1577       exit = single_dom_exit (loop);
1578 
1579       if (htab_elements (reduction_list) > 0)
1580 	{
1581 	  struct reduction_info *red;
1582 
1583 	  tree val = PHI_ARG_DEF_FROM_EDGE (phi, exit);
1584 	  red = reduction_phi (reduction_list, SSA_NAME_DEF_STMT (val));
1585 	  if (red)
1586 	    {
1587 	      red->keep_res = phi;
1588 	      gsi_next (&gsi);
1589 	      continue;
1590 	    }
1591 	}
1592       gcc_assert (control_name == NULL_TREE
1593 		  && SSA_NAME_VAR (res) == SSA_NAME_VAR (control));
1594       control_name = res;
1595       remove_phi_node (&gsi, false);
1596     }
1597   gcc_assert (control_name != NULL_TREE);
1598 
1599   /* Initialize the control variable to number of iterations
1600      according to the rhs of the exit condition.  */
1601   gsi = gsi_after_labels (ex_bb);
1602   cond_nit = last_stmt (exit->src);
1603   nit_1 =  gimple_cond_rhs (cond_nit);
1604   nit_1 = force_gimple_operand_gsi (&gsi,
1605 				  fold_convert (TREE_TYPE (control_name), nit_1),
1606 				  false, NULL_TREE, false, GSI_SAME_STMT);
1607   stmt = gimple_build_assign (control_name, nit_1);
1608   gsi_insert_before (&gsi, stmt, GSI_NEW_STMT);
1609   SSA_NAME_DEF_STMT (control_name) = stmt;
1610 }

/* Create the parallel constructs for LOOP as described in gen_parallel_loop.
   LOOP_FN and DATA are the arguments of GIMPLE_OMP_PARALLEL.
   NEW_DATA is the variable that should be initialized from the argument
   of LOOP_FN.  N_THREADS is the requested number of threads.  Returns the
   basic block containing the GIMPLE_OMP_PARALLEL statement.  */

static basic_block
create_parallel_loop (struct loop *loop, tree loop_fn, tree data,
		      tree new_data, unsigned n_threads, location_t loc)
{
  gimple_stmt_iterator gsi;
  basic_block bb, paral_bb, for_bb, ex_bb;
  tree t, param;
  gimple stmt, for_stmt, phi, cond_stmt;
  tree cvar, cvar_init, initvar, cvar_next, cvar_base, type;
  edge exit, nexit, guard, end, e;

  /* Prepare the GIMPLE_OMP_PARALLEL statement.  */
  bb = loop_preheader_edge (loop)->src;
  paral_bb = single_pred (bb);
  gsi = gsi_last_bb (paral_bb);

  t = build_omp_clause (loc, OMP_CLAUSE_NUM_THREADS);
  OMP_CLAUSE_NUM_THREADS_EXPR (t)
    = build_int_cst (integer_type_node, n_threads);
  stmt = gimple_build_omp_parallel (NULL, t, loop_fn, data);
  gimple_set_location (stmt, loc);

  gsi_insert_after (&gsi, stmt, GSI_NEW_STMT);
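
  /* The statement built above corresponds, roughly, to

       #pragma omp parallel num_threads (N_THREADS)

     with LOOP_FN as the function to be outlined and DATA as its
     argument.  */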

  /* Initialize NEW_DATA.  */
  if (data)
    {
      gsi = gsi_after_labels (bb);

      param = make_ssa_name (DECL_ARGUMENTS (loop_fn), NULL);
      stmt = gimple_build_assign (param, build_fold_addr_expr (data));
      gsi_insert_before (&gsi, stmt, GSI_SAME_STMT);
      SSA_NAME_DEF_STMT (param) = stmt;

      stmt = gimple_build_assign (new_data,
				  fold_convert (TREE_TYPE (new_data), param));
      gsi_insert_before (&gsi, stmt, GSI_SAME_STMT);
      SSA_NAME_DEF_STMT (new_data) = stmt;
    }
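
  /* The block above amounts, in effect, to NEW_DATA = (T) &DATA, where
     T is the type of NEW_DATA; a sketch of the intent, not a literal
     translation -- when omp expansion later outlines the region into
     LOOP_FN, this becomes a load from the function's argument.  */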

  /* Emit GIMPLE_OMP_RETURN for GIMPLE_OMP_PARALLEL.  */
  bb = split_loop_exit_edge (single_dom_exit (loop));
  gsi = gsi_last_bb (bb);
  stmt = gimple_build_omp_return (false);
  gimple_set_location (stmt, loc);
  gsi_insert_after (&gsi, stmt, GSI_NEW_STMT);

  /* Extract data for GIMPLE_OMP_FOR.  */
  gcc_assert (loop->header == single_dom_exit (loop)->src);
  cond_stmt = last_stmt (loop->header);

  cvar = gimple_cond_lhs (cond_stmt);
  cvar_base = SSA_NAME_VAR (cvar);
  phi = SSA_NAME_DEF_STMT (cvar);
  cvar_init = PHI_ARG_DEF_FROM_EDGE (phi, loop_preheader_edge (loop));
  initvar = make_ssa_name (cvar_base, NULL);
  SET_USE (PHI_ARG_DEF_PTR_FROM_EDGE (phi, loop_preheader_edge (loop)),
	   initvar);
  cvar_next = PHI_ARG_DEF_FROM_EDGE (phi, loop_latch_edge (loop));

  gsi = gsi_last_nondebug_bb (loop->latch);
  gcc_assert (gsi_stmt (gsi) == SSA_NAME_DEF_STMT (cvar_next));
  gsi_remove (&gsi, true);

  /* Prepare cfg.  */
  for_bb = split_edge (loop_preheader_edge (loop));
  ex_bb = split_loop_exit_edge (single_dom_exit (loop));
  extract_true_false_edges_from_block (loop->header, &nexit, &exit);
  gcc_assert (exit == single_dom_exit (loop));

  guard = make_edge (for_bb, ex_bb, 0);
  single_succ_edge (loop->latch)->flags = 0;
  end = make_edge (loop->latch, ex_bb, EDGE_FALLTHRU);
  for (gsi = gsi_start_phis (ex_bb); !gsi_end_p (gsi); gsi_next (&gsi))
    {
      source_location locus;
      tree def;
      phi = gsi_stmt (gsi);
      stmt = SSA_NAME_DEF_STMT (PHI_ARG_DEF_FROM_EDGE (phi, exit));

      def = PHI_ARG_DEF_FROM_EDGE (stmt, loop_preheader_edge (loop));
      locus = gimple_phi_arg_location_from_edge (stmt,
						 loop_preheader_edge (loop));
      add_phi_arg (phi, def, guard, locus);

      def = PHI_ARG_DEF_FROM_EDGE (stmt, loop_latch_edge (loop));
      locus = gimple_phi_arg_location_from_edge (stmt, loop_latch_edge (loop));
      add_phi_arg (phi, def, end, locus);
    }
  e = redirect_edge_and_branch (exit, nexit->dest);
  PENDING_STMT (e) = NULL;

  /* Emit GIMPLE_OMP_FOR.  */
  gimple_cond_set_lhs (cond_stmt, cvar_base);
  type = TREE_TYPE (cvar);
  t = build_omp_clause (loc, OMP_CLAUSE_SCHEDULE);
  OMP_CLAUSE_SCHEDULE_KIND (t) = OMP_CLAUSE_SCHEDULE_STATIC;

  for_stmt = gimple_build_omp_for (NULL, t, 1, NULL);
  gimple_set_location (for_stmt, loc);
  gimple_omp_for_set_index (for_stmt, 0, initvar);
  gimple_omp_for_set_initial (for_stmt, 0, cvar_init);
  gimple_omp_for_set_final (for_stmt, 0, gimple_cond_rhs (cond_stmt));
  gimple_omp_for_set_cond (for_stmt, 0, gimple_cond_code (cond_stmt));
  gimple_omp_for_set_incr (for_stmt, 0, build2 (PLUS_EXPR, type,
						cvar_base,
						build_int_cst (type, 1)));
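
  /* Taken together, the setters above describe, roughly,

       #pragma omp for schedule (static)
       for (initvar = CVAR_INIT; initvar COND FINAL; initvar += 1)

     where COND and FINAL come from the loop's exit condition.  */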

  gsi = gsi_last_bb (for_bb);
  gsi_insert_after (&gsi, for_stmt, GSI_NEW_STMT);
  SSA_NAME_DEF_STMT (initvar) = for_stmt;

  /* Emit GIMPLE_OMP_CONTINUE.  */
  gsi = gsi_last_bb (loop->latch);
  stmt = gimple_build_omp_continue (cvar_next, cvar);
  gimple_set_location (stmt, loc);
  gsi_insert_after (&gsi, stmt, GSI_NEW_STMT);
  SSA_NAME_DEF_STMT (cvar_next) = stmt;

  /* Emit GIMPLE_OMP_RETURN for GIMPLE_OMP_FOR.  */
  gsi = gsi_last_bb (ex_bb);
  stmt = gimple_build_omp_return (true);
  gimple_set_location (stmt, loc);
  gsi_insert_after (&gsi, stmt, GSI_NEW_STMT);

  return paral_bb;
}

/* Generates code to execute the iterations of LOOP in N_THREADS
   threads in parallel.

   NITER describes the number of iterations of LOOP.
   REDUCTION_LIST describes the reductions present in LOOP.  */

static void
gen_parallel_loop (struct loop *loop, htab_t reduction_list,
		   unsigned n_threads, struct tree_niter_desc *niter)
{
  loop_iterator li;
  tree many_iterations_cond, type, nit;
  tree arg_struct, new_arg_struct;
  gimple_seq stmts;
  basic_block parallel_head;
  edge entry, exit;
  struct clsn_data clsn_data;
  unsigned prob;
  location_t loc;
  gimple cond_stmt;

  /* From

     ---------------------------------------------------------------------
     loop
       {
	 IV = phi (INIT, IV + STEP)
	 BODY1;
	 if (COND)
	   break;
	 BODY2;
       }
     ---------------------------------------------------------------------

     with # of iterations NITER (possibly with MAY_BE_ZERO assumption),
     we generate the following code:

     ---------------------------------------------------------------------

     if (MAY_BE_ZERO
	 || NITER < MIN_PER_THREAD * N_THREADS)
       goto original;

     BODY1;
     store all local loop-invariant variables used in the body of the loop
     to DATA.
     GIMPLE_OMP_PARALLEL (OMP_CLAUSE_NUM_THREADS (N_THREADS), LOOPFN, DATA);
     load the variables from DATA.
     GIMPLE_OMP_FOR (IV = INIT; COND; IV += STEP) (OMP_CLAUSE_SCHEDULE (static))
     BODY2;
     BODY1;
     GIMPLE_OMP_CONTINUE;
     GIMPLE_OMP_RETURN         -- GIMPLE_OMP_FOR
     GIMPLE_OMP_RETURN         -- GIMPLE_OMP_PARALLEL
     goto end;

     original:
     loop
       {
	 IV = phi (INIT, IV + STEP)
	 BODY1;
	 if (COND)
	   break;
	 BODY2;
       }

     end:

   */

  /* Create two versions of the loop -- in the old one, we know that the
     number of iterations is large enough, and we will transform it into
     the loop that will be split to loop_fn; the new one will be used for
     the remaining iterations.  */

  type = TREE_TYPE (niter->niter);
  nit = force_gimple_operand (unshare_expr (niter->niter), &stmts, true,
			      NULL_TREE);
  if (stmts)
    gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), stmts);

  many_iterations_cond =
    fold_build2 (GE_EXPR, boolean_type_node,
		 nit, build_int_cst (type, MIN_PER_THREAD * n_threads));
  many_iterations_cond
    = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
		   invert_truthvalue (unshare_expr (niter->may_be_zero)),
		   many_iterations_cond);
  many_iterations_cond
    = force_gimple_operand (many_iterations_cond, &stmts, false, NULL_TREE);
  if (stmts)
    gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), stmts);
  if (!is_gimple_condexpr (many_iterations_cond))
    {
      many_iterations_cond
	= force_gimple_operand (many_iterations_cond, &stmts,
				true, NULL_TREE);
      if (stmts)
	gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), stmts);
    }

  initialize_original_copy_tables ();

  /* We assume that the loop usually iterates a lot, so give the branch
     to the parallelized version an 80% probability.  */
  prob = 4 * REG_BR_PROB_BASE / 5;
  loop_version (loop, many_iterations_cond, NULL,
		prob, prob, REG_BR_PROB_BASE - prob, true);
  update_ssa (TODO_update_ssa);
  free_original_copy_tables ();

  /* Base all the induction variables in LOOP on a single control one.  */
  canonicalize_loop_ivs (loop, &nit, true);

  /* Ensure that the exit condition is the first statement in the loop.  */
  transform_to_exit_first_loop (loop, reduction_list, nit);

  /* Generate initializations for reductions.  */
  if (htab_elements (reduction_list) > 0)
    htab_traverse (reduction_list, initialize_reductions, loop);

  /* Eliminate the references to local variables from the loop.  */
  gcc_assert (single_exit (loop));
  entry = loop_preheader_edge (loop);
  exit = single_dom_exit (loop);

  eliminate_local_variables (entry, exit);
  /* In the old loop, move all variables non-local to the loop to a
     structure and back, and create separate decls for the variables used
     in the loop.  */
  separate_decls_in_region (entry, exit, reduction_list, &arg_struct,
			    &new_arg_struct, &clsn_data);

  /* Create the parallel constructs.  */
  loc = UNKNOWN_LOCATION;
  cond_stmt = last_stmt (loop->header);
  if (cond_stmt)
    loc = gimple_location (cond_stmt);
  parallel_head = create_parallel_loop (loop, create_loop_fn (loc), arg_struct,
					new_arg_struct, n_threads, loc);
  if (htab_elements (reduction_list) > 0)
    create_call_for_reduction (loop, reduction_list, &clsn_data);

  scev_reset ();

  /* Cancel the loop (it is simpler to do it here rather than to teach the
     expander to do it).  */
  cancel_loop_tree (loop);

  /* Free loop bound estimations that could contain references to
     removed statements.  */
  FOR_EACH_LOOP (li, loop, 0)
    free_numbers_of_iterations_estimates_loop (loop);

  /* Expand the parallel constructs.  We do it directly here instead of
     running a separate expand_omp pass, since it is more efficient, and
     less likely to cause trouble with further analyses not being able to
     deal with the OMP trees.  */

  omp_expand_local (parallel_head);
}

/* Returns true when LOOP contains vector phi nodes.  */

static bool
loop_has_vector_phi_nodes (struct loop *loop)
{
  unsigned i;
  basic_block *bbs = get_loop_body_in_dom_order (loop);
  gimple_stmt_iterator gsi;
  bool res = true;

  for (i = 0; i < loop->num_nodes; i++)
    for (gsi = gsi_start_phis (bbs[i]); !gsi_end_p (gsi); gsi_next (&gsi))
      if (TREE_CODE (TREE_TYPE (PHI_RESULT (gsi_stmt (gsi)))) == VECTOR_TYPE)
	goto end;

  res = false;
 end:
  free (bbs);
  return res;
}

/* Create a reduction_info struct, initialize it with REDUC_STMT
   and PHI, and insert it into REDUCTION_LIST.  */

static void
build_new_reduction (htab_t reduction_list, gimple reduc_stmt, gimple phi)
{
  PTR *slot;
  struct reduction_info *new_reduction;

  gcc_assert (reduc_stmt);

  if (dump_file && (dump_flags & TDF_DETAILS))
    {
      fprintf (dump_file,
	       "Detected reduction.  Reduction stmt is:\n");
      print_gimple_stmt (dump_file, reduc_stmt, 0, 0);
      fprintf (dump_file, "\n");
    }

  new_reduction = XCNEW (struct reduction_info);

  new_reduction->reduc_stmt = reduc_stmt;
  new_reduction->reduc_phi = phi;
  new_reduction->reduc_version = SSA_NAME_VERSION (gimple_phi_result (phi));
  new_reduction->reduction_code = gimple_assign_rhs_code (reduc_stmt);
  slot = htab_find_slot (reduction_list, new_reduction, INSERT);
  *slot = new_reduction;
}

/* Callback for htab_traverse.  Sets gimple_uid of reduc_phi stmts.  */

static int
set_reduc_phi_uids (void **slot, void *data ATTRIBUTE_UNUSED)
{
  struct reduction_info *const red = (struct reduction_info *) *slot;
  gimple_set_uid (red->reduc_phi, red->reduc_version);
  return 1;
}

/* Detect all reductions in the LOOP, insert them into REDUCTION_LIST.  */

static void
gather_scalar_reductions (loop_p loop, htab_t reduction_list)
{
  gimple_stmt_iterator gsi;
  loop_vec_info simple_loop_info;

  vect_dump = NULL;
  simple_loop_info = vect_analyze_loop_form (loop);

  for (gsi = gsi_start_phis (loop->header); !gsi_end_p (gsi); gsi_next (&gsi))
    {
      gimple phi = gsi_stmt (gsi);
      affine_iv iv;
      tree res = PHI_RESULT (phi);
      bool double_reduc;

      if (!is_gimple_reg (res))
	continue;

      /* A header phi that is not a simple induction variable may still
	 be a reduction; try to detect it.  */
      if (!simple_iv (loop, loop, res, &iv, true)
	  && simple_loop_info)
	{
	  gimple reduc_stmt = vect_force_simple_reduction (simple_loop_info,
							   phi, true,
							   &double_reduc);
	  if (reduc_stmt && !double_reduc)
	    build_new_reduction (reduction_list, reduc_stmt, phi);
	}
    }
  destroy_loop_vec_info (simple_loop_info, true);

  /* As gimple_uid is used by the vectorizer in between
     vect_analyze_loop_form and destroy_loop_vec_info, we can set
     gimple_uid of reduc_phi stmts only now.  */
  htab_traverse (reduction_list, set_reduc_phi_uids, NULL);
}

/* Try to initialize NITER for the code generation part.  */

static bool
try_get_loop_niter (loop_p loop, struct tree_niter_desc *niter)
{
  edge exit = single_dom_exit (loop);

  gcc_assert (exit);

  /* We need to know the number of iterations, and there should be no
     uses of values defined inside the loop outside of it, unless the
     values are invariants of the loop.  */
  if (!number_of_iterations_exit (loop, exit, niter, false))
    {
      if (dump_file && (dump_flags & TDF_DETAILS))
	fprintf (dump_file, "  FAILED: number of iterations not known\n");
      return false;
    }

  return true;
}

/* Try to initialize REDUCTION_LIST for the code generation part.
   REDUCTION_LIST describes the reductions.  */

static bool
try_create_reduction_list (loop_p loop, htab_t reduction_list)
{
  edge exit = single_dom_exit (loop);
  gimple_stmt_iterator gsi;

  gcc_assert (exit);

  gather_scalar_reductions (loop, reduction_list);

  for (gsi = gsi_start_phis (exit->dest); !gsi_end_p (gsi); gsi_next (&gsi))
    {
      gimple phi = gsi_stmt (gsi);
      struct reduction_info *red;
      imm_use_iterator imm_iter;
      use_operand_p use_p;
      gimple reduc_phi;
      tree val = PHI_ARG_DEF_FROM_EDGE (phi, exit);

      if (is_gimple_reg (val))
	{
	  if (dump_file && (dump_flags & TDF_DETAILS))
	    {
	      fprintf (dump_file, "phi is ");
	      print_gimple_stmt (dump_file, phi, 0, 0);
	      fprintf (dump_file, "arg of phi to exit:   value ");
	      print_generic_expr (dump_file, val, 0);
	      fprintf (dump_file, " used outside loop\n");
	      fprintf (dump_file,
		       "  checking if it is part of a reduction pattern:\n");
	    }
	  if (htab_elements (reduction_list) == 0)
	    {
	      if (dump_file && (dump_flags & TDF_DETAILS))
		fprintf (dump_file,
			 "  FAILED: it is not part of a reduction.\n");
	      return false;
	    }
	  reduc_phi = NULL;
	  FOR_EACH_IMM_USE_FAST (use_p, imm_iter, val)
	    {
	      if (!gimple_debug_bind_p (USE_STMT (use_p))
		  && flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
		{
		  reduc_phi = USE_STMT (use_p);
		  break;
		}
	    }
	  red = reduction_phi (reduction_list, reduc_phi);
	  if (red == NULL)
	    {
	      if (dump_file && (dump_flags & TDF_DETAILS))
		fprintf (dump_file,
			 "  FAILED: it is not part of a reduction.\n");
	      return false;
	    }
	  if (dump_file && (dump_flags & TDF_DETAILS))
	    {
	      fprintf (dump_file, "reduction phi is  ");
	      print_gimple_stmt (dump_file, red->reduc_phi, 0, 0);
	      fprintf (dump_file, "reduction stmt is  ");
	      print_gimple_stmt (dump_file, red->reduc_stmt, 0, 0);
	    }
	}
    }

  /* The iterations of the loop may communicate only through bivs whose
     iteration space can be distributed efficiently.  */
  for (gsi = gsi_start_phis (loop->header); !gsi_end_p (gsi); gsi_next (&gsi))
    {
      gimple phi = gsi_stmt (gsi);
      tree def = PHI_RESULT (phi);
      affine_iv iv;

      if (is_gimple_reg (def) && !simple_iv (loop, loop, def, &iv, true))
	{
	  struct reduction_info *red;

	  red = reduction_phi (reduction_list, phi);
	  if (red == NULL)
	    {
	      if (dump_file && (dump_flags & TDF_DETAILS))
		fprintf (dump_file,
			 "  FAILED: scalar dependency between iterations\n");
	      return false;
	    }
	}
    }

  return true;
}
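
/* For illustration, a sketch of a loop that the checks above reject:

     for (i = 0; i < n; i++)
       if (a[i])
	 last = i;
     ... use of LAST after the loop ...

   The header phi for LAST is neither a simple induction variable nor a
   reduction recognized by gather_scalar_reductions, so the function
   fails with one of the dump messages above.  */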

/* Detect parallel loops and generate parallel code using libgomp
   primitives.  Returns true if some loop was parallelized, false
   otherwise.  */

bool
parallelize_loops (void)
{
  unsigned n_threads = flag_tree_parallelize_loops;
  bool changed = false;
  struct loop *loop;
  struct tree_niter_desc niter_desc;
  loop_iterator li;
  htab_t reduction_list;
  struct obstack parloop_obstack;
  HOST_WIDE_INT estimated;
  LOC loop_loc;

  /* Do not parallelize loops in the functions created by parallelization.  */
  if (parallelized_function_p (cfun->decl))
    return false;
  if (cfun->has_nonlocal_label)
    return false;

  gcc_obstack_init (&parloop_obstack);
  reduction_list = htab_create (10, reduction_info_hash,
				reduction_info_eq, free);
  init_stmt_vec_info_vec ();

  FOR_EACH_LOOP (li, loop, 0)
    {
      htab_empty (reduction_list);
      if (dump_file && (dump_flags & TDF_DETAILS))
	{
	  fprintf (dump_file, "Trying loop %d as candidate\n", loop->num);
	  if (loop->inner)
	    fprintf (dump_file, "loop %d is not innermost\n", loop->num);
	  else
	    fprintf (dump_file, "loop %d is innermost\n", loop->num);
	}

      /* If we use autopar in the graphite pass, we use its marked
	 dependency checking results.  */
      if (flag_loop_parallelize_all && !loop->can_be_parallel)
	{
	  if (dump_file && (dump_flags & TDF_DETAILS))
	    fprintf (dump_file,
		     "loop is not parallel according to graphite\n");
	  continue;
	}

      if (!single_dom_exit (loop))
	{
	  if (dump_file && (dump_flags & TDF_DETAILS))
	    fprintf (dump_file, "loop is !single_dom_exit\n");

	  continue;
	}

      if (/* And of course, the loop must be parallelizable.  */
	  !can_duplicate_loop_p (loop)
	  || loop_has_blocks_with_irreducible_flag (loop)
	  || (loop_preheader_edge (loop)->src->flags & BB_IRREDUCIBLE_LOOP)
	  /* FIXME: the check for vector phi nodes could be removed.  */
	  || loop_has_vector_phi_nodes (loop)
	  /* FIXME: transform_to_exit_first_loop does not correctly handle
	     loops whose header has not been copied - see PR46886.  */
	  || !do_while_loop_p (loop))
	continue;

      estimated = max_stmt_executions_int (loop, false);
      /* FIXME: Bypass this check, as graphite doesn't update the count
	 and frequency correctly at the moment.  */
      if (!flag_loop_parallelize_all
	  && ((estimated != -1
	       && estimated <= (HOST_WIDE_INT) n_threads * MIN_PER_THREAD)
	      /* Do not bother with loops in cold areas.  */
	      || optimize_loop_nest_for_size_p (loop)))
	continue;

      if (!try_get_loop_niter (loop, &niter_desc))
	continue;

      if (!try_create_reduction_list (loop, reduction_list))
	continue;

      if (!flag_loop_parallelize_all
	  && !loop_parallel_p (loop, &parloop_obstack))
	continue;

      changed = true;
      if (dump_file && (dump_flags & TDF_DETAILS))
	{
	  if (loop->inner)
	    fprintf (dump_file, "parallelizing outer loop %d\n",
		     loop->header->index);
	  else
	    fprintf (dump_file, "parallelizing inner loop %d\n",
		     loop->header->index);
	  loop_loc = find_loop_location (loop);
	  if (loop_loc != UNKNOWN_LOC)
	    fprintf (dump_file, "\nloop at %s:%d: ",
		     LOC_FILE (loop_loc), LOC_LINE (loop_loc));
	}
      gen_parallel_loop (loop, reduction_list,
			 n_threads, &niter_desc);
      verify_flow_info ();
      verify_dominators (CDI_DOMINATORS);
      verify_loop_structure ();
      verify_loop_closed_ssa (true);
    }

  free_stmt_vec_info_vec ();
  htab_delete (reduction_list);
  obstack_free (&parloop_obstack, NULL);

  /* Parallelization will cause new function calls to be inserted through
     which local variables will escape.  Reset the points-to solution
     for ESCAPED.  */
  if (changed)
    pt_solution_reset (&cfun->gimple_df->escaped);

  return changed;
}
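
/* Usage note (a sketch, not part of the pass itself): this pass is
   driven by -ftree-parallelize-loops=N, which sets
   flag_tree_parallelize_loops above, e.g.

     gcc -O2 -ftree-parallelize-loops=4 file.c

   attempts to split eligible loops in file.c across four threads using
   libgomp.  -floop-parallelize-all (flag_loop_parallelize_all) instead
   trusts graphite's dependence analysis, as checked above.  */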

#include "gt-tree-parloops.h"