1 /* Loop autoparallelization.
2    Copyright (C) 2006-2014 Free Software Foundation, Inc.
3    Contributed by Sebastian Pop <pop@cri.ensmp.fr>
4    Zdenek Dvorak <dvorakz@suse.cz> and Razya Ladelsky <razya@il.ibm.com>.
5 
6 This file is part of GCC.
7 
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
12 
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
16 for more details.
17 
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3.  If not see
20 <http://www.gnu.org/licenses/>.  */
21 
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "tree.h"
26 #include "basic-block.h"
27 #include "tree-ssa-alias.h"
28 #include "internal-fn.h"
29 #include "gimple-expr.h"
30 #include "is-a.h"
31 #include "gimple.h"
32 #include "gimplify.h"
33 #include "gimple-iterator.h"
34 #include "gimplify-me.h"
35 #include "gimple-walk.h"
36 #include "stor-layout.h"
37 #include "tree-nested.h"
38 #include "gimple-ssa.h"
39 #include "tree-cfg.h"
40 #include "tree-phinodes.h"
41 #include "ssa-iterators.h"
42 #include "stringpool.h"
43 #include "tree-ssanames.h"
44 #include "tree-ssa-loop-ivopts.h"
45 #include "tree-ssa-loop-manip.h"
46 #include "tree-ssa-loop-niter.h"
47 #include "tree-ssa-loop.h"
48 #include "tree-into-ssa.h"
49 #include "cfgloop.h"
50 #include "tree-data-ref.h"
51 #include "tree-scalar-evolution.h"
52 #include "gimple-pretty-print.h"
53 #include "tree-pass.h"
54 #include "langhooks.h"
55 #include "tree-vectorizer.h"
56 #include "tree-hasher.h"
57 #include "tree-parloops.h"
58 #include "omp-low.h"
59 #include "tree-nested.h"
60 
61 /* This pass tries to distribute iterations of loops into several threads.
62    The implementation is straightforward -- for each loop we test whether its
63    iterations are independent, and if it is the case (and some additional
64    conditions regarding profitability and correctness are satisfied), we
65    add GIMPLE_OMP_PARALLEL and GIMPLE_OMP_FOR codes and let omp expansion
66    machinery do its job.
67 
68    The most of the complexity is in bringing the code into shape expected
69    by the omp expanders:
70    -- for GIMPLE_OMP_FOR, ensuring that the loop has only one induction
71       variable and that the exit test is at the start of the loop body
72    -- for GIMPLE_OMP_PARALLEL, replacing the references to local addressable
73       variables by accesses through pointers, and breaking up ssa chains
74       by storing the values incoming to the parallelized loop to a structure
75       passed to the new function as an argument (something similar is done
76       in omp gimplification, unfortunately only a small part of the code
77       can be shared).
78 
79    TODO:
80    -- if there are several parallelizable loops in a function, it may be
81       possible to generate the threads just once (using synchronization to
82       ensure that cross-loop dependences are obeyed).
83    -- handling of common reduction patterns for outer loops.
84 
85    More info can also be found at http://gcc.gnu.org/wiki/AutoParInGCC  */
86 /*
87   Reduction handling:
88   currently we use vect_force_simple_reduction() to detect reduction patterns.
89   The code transformation will be introduced by an example.
90 
91 
92 parloop
93 {
94   int sum=1;
95 
96   for (i = 0; i < N; i++)
97    {
98     x[i] = i + 3;
99     sum+=x[i];
100    }
101 }
102 
103 gimple-like code:
104 header_bb:
105 
106   # sum_29 = PHI <sum_11(5), 1(3)>
107   # i_28 = PHI <i_12(5), 0(3)>
108   D.1795_8 = i_28 + 3;
109   x[i_28] = D.1795_8;
110   sum_11 = D.1795_8 + sum_29;
111   i_12 = i_28 + 1;
112   if (N_6(D) > i_12)
113     goto header_bb;
114 
115 
116 exit_bb:
117 
118   # sum_21 = PHI <sum_11(4)>
119   printf (&"%d"[0], sum_21);
120 
121 
122 after reduction transformation (only relevant parts):
123 
124 parloop
125 {
126 
127 ....
128 
129 
130   # Storing the initial value given by the user.  #
131 
132   .paral_data_store.32.sum.27 = 1;
133 
134   #pragma omp parallel num_threads(4)
135 
136   #pragma omp for schedule(static)
137 
138   # The neutral element corresponding to the particular
139   reduction's operation, e.g. 0 for PLUS_EXPR,
140   1 for MULT_EXPR, etc. replaces the user's initial value.  #
141 
142   # sum.27_29 = PHI <sum.27_11, 0>
143 
144   sum.27_11 = D.1827_8 + sum.27_29;
145 
146   GIMPLE_OMP_CONTINUE
147 
148   # Adding this reduction phi is done at create_phi_for_local_result() #
149   # sum.27_56 = PHI <sum.27_11, 0>
150   GIMPLE_OMP_RETURN
151 
152   # Creating the atomic operation is done at
153   create_call_for_reduction_1()  #
154 
155   #pragma omp atomic_load
156   D.1839_59 = *&.paral_data_load.33_51->reduction.23;
157   D.1840_60 = sum.27_56 + D.1839_59;
158   #pragma omp atomic_store (D.1840_60);
159 
160   GIMPLE_OMP_RETURN
161 
162  # collecting the result after the join of the threads is done at
163   create_loads_for_reductions().
164   The value computed by the threads is loaded from the
165   shared struct.  #
166 
167 
168   .paral_data_load.33_52 = &.paral_data_store.32;
169   sum_37 =  .paral_data_load.33_52->sum.27;
170   sum_43 = D.1795_41 + sum_37;
171 
172   exit bb:
173   # sum_21 = PHI <sum_43, sum_26>
174   printf (&"%d"[0], sum_21);
175 
176 ...
177 
178 }
179 
180 */
181 
182 /* Minimal number of iterations of a loop that should be executed in each
183    thread.  */
184 #define MIN_PER_THREAD 100
185 
186 /* Element of the hashtable, representing a
187    reduction in the current loop.  */
188 struct reduction_info
189 {
190   gimple reduc_stmt;		/* reduction statement.  */
191   gimple reduc_phi;		/* The phi node defining the reduction.  */
192   enum tree_code reduction_code;/* code for the reduction operation.  */
193   unsigned reduc_version;	/* SSA_NAME_VERSION of original reduc_phi
194 				   result.  */
195   gimple keep_res;		/* The PHI_RESULT of this phi is the resulting value
196 				   of the reduction variable when existing the loop. */
197   tree initial_value;		/* The initial value of the reduction var before entering the loop.  */
198   tree field;			/*  the name of the field in the parloop data structure intended for reduction.  */
199   tree init;			/* reduction initialization value.  */
200   gimple new_phi;		/* (helper field) Newly created phi node whose result
201 				   will be passed to the atomic operation.  Represents
202 				   the local result each thread computed for the reduction
203 				   operation.  */
204 };
205 
206 /* Reduction info hashtable helpers.  */
207 
208 struct reduction_hasher : typed_free_remove <reduction_info>
209 {
210   typedef reduction_info value_type;
211   typedef reduction_info compare_type;
212   static inline hashval_t hash (const value_type *);
213   static inline bool equal (const value_type *, const compare_type *);
214 };
215 
216 /* Equality and hash functions for hashtab code.  */
217 
218 inline bool
equal(const value_type * a,const compare_type * b)219 reduction_hasher::equal (const value_type *a, const compare_type *b)
220 {
221   return (a->reduc_phi == b->reduc_phi);
222 }
223 
224 inline hashval_t
hash(const value_type * a)225 reduction_hasher::hash (const value_type *a)
226 {
227   return a->reduc_version;
228 }
229 
230 typedef hash_table <reduction_hasher> reduction_info_table_type;
231 
232 
233 static struct reduction_info *
reduction_phi(reduction_info_table_type reduction_list,gimple phi)234 reduction_phi (reduction_info_table_type reduction_list, gimple phi)
235 {
236   struct reduction_info tmpred, *red;
237 
238   if (reduction_list.elements () == 0 || phi == NULL)
239     return NULL;
240 
241   tmpred.reduc_phi = phi;
242   tmpred.reduc_version = gimple_uid (phi);
243   red = reduction_list.find (&tmpred);
244 
245   return red;
246 }
247 
248 /* Element of hashtable of names to copy.  */
249 
250 struct name_to_copy_elt
251 {
252   unsigned version;	/* The version of the name to copy.  */
253   tree new_name;	/* The new name used in the copy.  */
254   tree field;		/* The field of the structure used to pass the
255 			   value.  */
256 };
257 
258 /* Name copies hashtable helpers.  */
259 
260 struct name_to_copy_hasher : typed_free_remove <name_to_copy_elt>
261 {
262   typedef name_to_copy_elt value_type;
263   typedef name_to_copy_elt compare_type;
264   static inline hashval_t hash (const value_type *);
265   static inline bool equal (const value_type *, const compare_type *);
266 };
267 
268 /* Equality and hash functions for hashtab code.  */
269 
270 inline bool
equal(const value_type * a,const compare_type * b)271 name_to_copy_hasher::equal (const value_type *a, const compare_type *b)
272 {
273   return a->version == b->version;
274 }
275 
276 inline hashval_t
hash(const value_type * a)277 name_to_copy_hasher::hash (const value_type *a)
278 {
279   return (hashval_t) a->version;
280 }
281 
282 typedef hash_table <name_to_copy_hasher> name_to_copy_table_type;
283 
284 /* A transformation matrix, which is a self-contained ROWSIZE x COLSIZE
285    matrix.  Rather than use floats, we simply keep a single DENOMINATOR that
286    represents the denominator for every element in the matrix.  */
287 typedef struct lambda_trans_matrix_s
288 {
289   lambda_matrix matrix;
290   int rowsize;
291   int colsize;
292   int denominator;
293 } *lambda_trans_matrix;
294 #define LTM_MATRIX(T) ((T)->matrix)
295 #define LTM_ROWSIZE(T) ((T)->rowsize)
296 #define LTM_COLSIZE(T) ((T)->colsize)
297 #define LTM_DENOMINATOR(T) ((T)->denominator)
298 
299 /* Allocate a new transformation matrix.  */
300 
301 static lambda_trans_matrix
lambda_trans_matrix_new(int colsize,int rowsize,struct obstack * lambda_obstack)302 lambda_trans_matrix_new (int colsize, int rowsize,
303 			 struct obstack * lambda_obstack)
304 {
305   lambda_trans_matrix ret;
306 
307   ret = (lambda_trans_matrix)
308     obstack_alloc (lambda_obstack, sizeof (struct lambda_trans_matrix_s));
309   LTM_MATRIX (ret) = lambda_matrix_new (rowsize, colsize, lambda_obstack);
310   LTM_ROWSIZE (ret) = rowsize;
311   LTM_COLSIZE (ret) = colsize;
312   LTM_DENOMINATOR (ret) = 1;
313   return ret;
314 }
315 
316 /* Multiply a vector VEC by a matrix MAT.
317    MAT is an M*N matrix, and VEC is a vector with length N.  The result
318    is stored in DEST which must be a vector of length M.  */
319 
320 static void
lambda_matrix_vector_mult(lambda_matrix matrix,int m,int n,lambda_vector vec,lambda_vector dest)321 lambda_matrix_vector_mult (lambda_matrix matrix, int m, int n,
322 			   lambda_vector vec, lambda_vector dest)
323 {
324   int i, j;
325 
326   lambda_vector_clear (dest, m);
327   for (i = 0; i < m; i++)
328     for (j = 0; j < n; j++)
329       dest[i] += matrix[i][j] * vec[j];
330 }
331 
332 /* Return true if TRANS is a legal transformation matrix that respects
333    the dependence vectors in DISTS and DIRS.  The conservative answer
334    is false.
335 
336    "Wolfe proves that a unimodular transformation represented by the
337    matrix T is legal when applied to a loop nest with a set of
338    lexicographically non-negative distance vectors RDG if and only if
339    for each vector d in RDG, (T.d >= 0) is lexicographically positive.
340    i.e.: if and only if it transforms the lexicographically positive
341    distance vectors to lexicographically positive vectors.  Note that
342    a unimodular matrix must transform the zero vector (and only it) to
343    the zero vector." S.Muchnick.  */
344 
345 static bool
lambda_transform_legal_p(lambda_trans_matrix trans,int nb_loops,vec<ddr_p> dependence_relations)346 lambda_transform_legal_p (lambda_trans_matrix trans,
347 			  int nb_loops,
348 			  vec<ddr_p> dependence_relations)
349 {
350   unsigned int i, j;
351   lambda_vector distres;
352   struct data_dependence_relation *ddr;
353 
354   gcc_assert (LTM_COLSIZE (trans) == nb_loops
355 	      && LTM_ROWSIZE (trans) == nb_loops);
356 
357   /* When there are no dependences, the transformation is correct.  */
358   if (dependence_relations.length () == 0)
359     return true;
360 
361   ddr = dependence_relations[0];
362   if (ddr == NULL)
363     return true;
364 
365   /* When there is an unknown relation in the dependence_relations, we
366      know that it is no worth looking at this loop nest: give up.  */
367   if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
368     return false;
369 
370   distres = lambda_vector_new (nb_loops);
371 
372   /* For each distance vector in the dependence graph.  */
373   FOR_EACH_VEC_ELT (dependence_relations, i, ddr)
374     {
375       /* Don't care about relations for which we know that there is no
376 	 dependence, nor about read-read (aka. output-dependences):
377 	 these data accesses can happen in any order.  */
378       if (DDR_ARE_DEPENDENT (ddr) == chrec_known
379 	  || (DR_IS_READ (DDR_A (ddr)) && DR_IS_READ (DDR_B (ddr))))
380 	continue;
381 
382       /* Conservatively answer: "this transformation is not valid".  */
383       if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
384 	return false;
385 
386       /* If the dependence could not be captured by a distance vector,
387 	 conservatively answer that the transform is not valid.  */
388       if (DDR_NUM_DIST_VECTS (ddr) == 0)
389 	return false;
390 
391       /* Compute trans.dist_vect */
392       for (j = 0; j < DDR_NUM_DIST_VECTS (ddr); j++)
393 	{
394 	  lambda_matrix_vector_mult (LTM_MATRIX (trans), nb_loops, nb_loops,
395 				     DDR_DIST_VECT (ddr, j), distres);
396 
397 	  if (!lambda_vector_lexico_pos (distres, nb_loops))
398 	    return false;
399 	}
400     }
401   return true;
402 }
403 
404 /* Data dependency analysis. Returns true if the iterations of LOOP
405    are independent on each other (that is, if we can execute them
406    in parallel).  */
407 
408 static bool
loop_parallel_p(struct loop * loop,struct obstack * parloop_obstack)409 loop_parallel_p (struct loop *loop, struct obstack * parloop_obstack)
410 {
411   vec<ddr_p> dependence_relations;
412   vec<data_reference_p> datarefs;
413   lambda_trans_matrix trans;
414   bool ret = false;
415 
416   if (dump_file && (dump_flags & TDF_DETAILS))
417   {
418     fprintf (dump_file, "Considering loop %d\n", loop->num);
419     if (!loop->inner)
420       fprintf (dump_file, "loop is innermost\n");
421     else
422       fprintf (dump_file, "loop NOT innermost\n");
423    }
424 
425   /* Check for problems with dependences.  If the loop can be reversed,
426      the iterations are independent.  */
427   auto_vec<loop_p, 3> loop_nest;
428   datarefs.create (10);
429   dependence_relations.create (100);
430   if (! compute_data_dependences_for_loop (loop, true, &loop_nest, &datarefs,
431 					   &dependence_relations))
432     {
433       if (dump_file && (dump_flags & TDF_DETAILS))
434 	fprintf (dump_file, "  FAILED: cannot analyze data dependencies\n");
435       ret = false;
436       goto end;
437     }
438   if (dump_file && (dump_flags & TDF_DETAILS))
439     dump_data_dependence_relations (dump_file, dependence_relations);
440 
441   trans = lambda_trans_matrix_new (1, 1, parloop_obstack);
442   LTM_MATRIX (trans)[0][0] = -1;
443 
444   if (lambda_transform_legal_p (trans, 1, dependence_relations))
445     {
446       ret = true;
447       if (dump_file && (dump_flags & TDF_DETAILS))
448 	fprintf (dump_file, "  SUCCESS: may be parallelized\n");
449     }
450   else if (dump_file && (dump_flags & TDF_DETAILS))
451     fprintf (dump_file,
452 	     "  FAILED: data dependencies exist across iterations\n");
453 
454  end:
455   free_dependence_relations (dependence_relations);
456   free_data_refs (datarefs);
457 
458   return ret;
459 }
460 
461 /* Return true when LOOP contains basic blocks marked with the
462    BB_IRREDUCIBLE_LOOP flag.  */
463 
464 static inline bool
loop_has_blocks_with_irreducible_flag(struct loop * loop)465 loop_has_blocks_with_irreducible_flag (struct loop *loop)
466 {
467   unsigned i;
468   basic_block *bbs = get_loop_body_in_dom_order (loop);
469   bool res = true;
470 
471   for (i = 0; i < loop->num_nodes; i++)
472     if (bbs[i]->flags & BB_IRREDUCIBLE_LOOP)
473       goto end;
474 
475   res = false;
476  end:
477   free (bbs);
478   return res;
479 }
480 
481 /* Assigns the address of OBJ in TYPE to an ssa name, and returns this name.
482    The assignment statement is placed on edge ENTRY.  DECL_ADDRESS maps decls
483    to their addresses that can be reused.  The address of OBJ is known to
484    be invariant in the whole function.  Other needed statements are placed
485    right before GSI.  */
486 
487 static tree
take_address_of(tree obj,tree type,edge entry,int_tree_htab_type decl_address,gimple_stmt_iterator * gsi)488 take_address_of (tree obj, tree type, edge entry,
489 		 int_tree_htab_type decl_address, gimple_stmt_iterator *gsi)
490 {
491   int uid;
492   int_tree_map **dslot;
493   struct int_tree_map ielt, *nielt;
494   tree *var_p, name, addr;
495   gimple stmt;
496   gimple_seq stmts;
497 
498   /* Since the address of OBJ is invariant, the trees may be shared.
499      Avoid rewriting unrelated parts of the code.  */
500   obj = unshare_expr (obj);
501   for (var_p = &obj;
502        handled_component_p (*var_p);
503        var_p = &TREE_OPERAND (*var_p, 0))
504     continue;
505 
506   /* Canonicalize the access to base on a MEM_REF.  */
507   if (DECL_P (*var_p))
508     *var_p = build_simple_mem_ref (build_fold_addr_expr (*var_p));
509 
510   /* Assign a canonical SSA name to the address of the base decl used
511      in the address and share it for all accesses and addresses based
512      on it.  */
513   uid = DECL_UID (TREE_OPERAND (TREE_OPERAND (*var_p, 0), 0));
514   ielt.uid = uid;
515   dslot = decl_address.find_slot_with_hash (&ielt, uid, INSERT);
516   if (!*dslot)
517     {
518       if (gsi == NULL)
519 	return NULL;
520       addr = TREE_OPERAND (*var_p, 0);
521       const char *obj_name
522 	= get_name (TREE_OPERAND (TREE_OPERAND (*var_p, 0), 0));
523       if (obj_name)
524 	name = make_temp_ssa_name (TREE_TYPE (addr), NULL, obj_name);
525       else
526 	name = make_ssa_name (TREE_TYPE (addr), NULL);
527       stmt = gimple_build_assign (name, addr);
528       gsi_insert_on_edge_immediate (entry, stmt);
529 
530       nielt = XNEW (struct int_tree_map);
531       nielt->uid = uid;
532       nielt->to = name;
533       *dslot = nielt;
534     }
535   else
536     name = (*dslot)->to;
537 
538   /* Express the address in terms of the canonical SSA name.  */
539   TREE_OPERAND (*var_p, 0) = name;
540   if (gsi == NULL)
541     return build_fold_addr_expr_with_type (obj, type);
542 
543   name = force_gimple_operand (build_addr (obj, current_function_decl),
544 			       &stmts, true, NULL_TREE);
545   if (!gimple_seq_empty_p (stmts))
546     gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
547 
548   if (!useless_type_conversion_p (type, TREE_TYPE (name)))
549     {
550       name = force_gimple_operand (fold_convert (type, name), &stmts, true,
551 				   NULL_TREE);
552       if (!gimple_seq_empty_p (stmts))
553 	gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
554     }
555 
556   return name;
557 }
558 
559 /* Callback for htab_traverse.  Create the initialization statement
560    for reduction described in SLOT, and place it at the preheader of
561    the loop described in DATA.  */
562 
563 int
initialize_reductions(reduction_info ** slot,struct loop * loop)564 initialize_reductions (reduction_info **slot, struct loop *loop)
565 {
566   tree init, c;
567   tree bvar, type, arg;
568   edge e;
569 
570   struct reduction_info *const reduc = *slot;
571 
572   /* Create initialization in preheader:
573      reduction_variable = initialization value of reduction.  */
574 
575   /* In the phi node at the header, replace the argument coming
576      from the preheader with the reduction initialization value.  */
577 
578   /* Create a new variable to initialize the reduction.  */
579   type = TREE_TYPE (PHI_RESULT (reduc->reduc_phi));
580   bvar = create_tmp_var (type, "reduction");
581 
582   c = build_omp_clause (gimple_location (reduc->reduc_stmt),
583 			OMP_CLAUSE_REDUCTION);
584   OMP_CLAUSE_REDUCTION_CODE (c) = reduc->reduction_code;
585   OMP_CLAUSE_DECL (c) = SSA_NAME_VAR (gimple_assign_lhs (reduc->reduc_stmt));
586 
587   init = omp_reduction_init (c, TREE_TYPE (bvar));
588   reduc->init = init;
589 
590   /* Replace the argument representing the initialization value
591      with the initialization value for the reduction (neutral
592      element for the particular operation, e.g. 0 for PLUS_EXPR,
593      1 for MULT_EXPR, etc).
594      Keep the old value in a new variable "reduction_initial",
595      that will be taken in consideration after the parallel
596      computing is done.  */
597 
598   e = loop_preheader_edge (loop);
599   arg = PHI_ARG_DEF_FROM_EDGE (reduc->reduc_phi, e);
600   /* Create new variable to hold the initial value.  */
601 
602   SET_USE (PHI_ARG_DEF_PTR_FROM_EDGE
603 	   (reduc->reduc_phi, loop_preheader_edge (loop)), init);
604   reduc->initial_value = arg;
605   return 1;
606 }
607 
608 struct elv_data
609 {
610   struct walk_stmt_info info;
611   edge entry;
612   int_tree_htab_type decl_address;
613   gimple_stmt_iterator *gsi;
614   bool changed;
615   bool reset;
616 };
617 
618 /* Eliminates references to local variables in *TP out of the single
619    entry single exit region starting at DTA->ENTRY.
620    DECL_ADDRESS contains addresses of the references that had their
621    address taken already.  If the expression is changed, CHANGED is
622    set to true.  Callback for walk_tree.  */
623 
624 static tree
eliminate_local_variables_1(tree * tp,int * walk_subtrees,void * data)625 eliminate_local_variables_1 (tree *tp, int *walk_subtrees, void *data)
626 {
627   struct elv_data *const dta = (struct elv_data *) data;
628   tree t = *tp, var, addr, addr_type, type, obj;
629 
630   if (DECL_P (t))
631     {
632       *walk_subtrees = 0;
633 
634       if (!SSA_VAR_P (t) || DECL_EXTERNAL (t))
635 	return NULL_TREE;
636 
637       type = TREE_TYPE (t);
638       addr_type = build_pointer_type (type);
639       addr = take_address_of (t, addr_type, dta->entry, dta->decl_address,
640 			      dta->gsi);
641       if (dta->gsi == NULL && addr == NULL_TREE)
642 	{
643 	  dta->reset = true;
644 	  return NULL_TREE;
645 	}
646 
647       *tp = build_simple_mem_ref (addr);
648 
649       dta->changed = true;
650       return NULL_TREE;
651     }
652 
653   if (TREE_CODE (t) == ADDR_EXPR)
654     {
655       /* ADDR_EXPR may appear in two contexts:
656 	 -- as a gimple operand, when the address taken is a function invariant
657 	 -- as gimple rhs, when the resulting address in not a function
658 	    invariant
659 	 We do not need to do anything special in the latter case (the base of
660 	 the memory reference whose address is taken may be replaced in the
661 	 DECL_P case).  The former case is more complicated, as we need to
662 	 ensure that the new address is still a gimple operand.  Thus, it
663 	 is not sufficient to replace just the base of the memory reference --
664 	 we need to move the whole computation of the address out of the
665 	 loop.  */
666       if (!is_gimple_val (t))
667 	return NULL_TREE;
668 
669       *walk_subtrees = 0;
670       obj = TREE_OPERAND (t, 0);
671       var = get_base_address (obj);
672       if (!var || !SSA_VAR_P (var) || DECL_EXTERNAL (var))
673 	return NULL_TREE;
674 
675       addr_type = TREE_TYPE (t);
676       addr = take_address_of (obj, addr_type, dta->entry, dta->decl_address,
677 			      dta->gsi);
678       if (dta->gsi == NULL && addr == NULL_TREE)
679 	{
680 	  dta->reset = true;
681 	  return NULL_TREE;
682 	}
683       *tp = addr;
684 
685       dta->changed = true;
686       return NULL_TREE;
687     }
688 
689   if (!EXPR_P (t))
690     *walk_subtrees = 0;
691 
692   return NULL_TREE;
693 }
694 
695 /* Moves the references to local variables in STMT at *GSI out of the single
696    entry single exit region starting at ENTRY.  DECL_ADDRESS contains
697    addresses of the references that had their address taken
698    already.  */
699 
700 static void
eliminate_local_variables_stmt(edge entry,gimple_stmt_iterator * gsi,int_tree_htab_type decl_address)701 eliminate_local_variables_stmt (edge entry, gimple_stmt_iterator *gsi,
702 				int_tree_htab_type decl_address)
703 {
704   struct elv_data dta;
705   gimple stmt = gsi_stmt (*gsi);
706 
707   memset (&dta.info, '\0', sizeof (dta.info));
708   dta.entry = entry;
709   dta.decl_address = decl_address;
710   dta.changed = false;
711   dta.reset = false;
712 
713   if (gimple_debug_bind_p (stmt))
714     {
715       dta.gsi = NULL;
716       walk_tree (gimple_debug_bind_get_value_ptr (stmt),
717 		 eliminate_local_variables_1, &dta.info, NULL);
718       if (dta.reset)
719 	{
720 	  gimple_debug_bind_reset_value (stmt);
721 	  dta.changed = true;
722 	}
723     }
724   else if (gimple_clobber_p (stmt))
725     {
726       stmt = gimple_build_nop ();
727       gsi_replace (gsi, stmt, false);
728       dta.changed = true;
729     }
730   else
731     {
732       dta.gsi = gsi;
733       walk_gimple_op (stmt, eliminate_local_variables_1, &dta.info);
734     }
735 
736   if (dta.changed)
737     update_stmt (stmt);
738 }
739 
740 /* Eliminates the references to local variables from the single entry
741    single exit region between the ENTRY and EXIT edges.
742 
743    This includes:
744    1) Taking address of a local variable -- these are moved out of the
745    region (and temporary variable is created to hold the address if
746    necessary).
747 
748    2) Dereferencing a local variable -- these are replaced with indirect
749    references.  */
750 
751 static void
eliminate_local_variables(edge entry,edge exit)752 eliminate_local_variables (edge entry, edge exit)
753 {
754   basic_block bb;
755   auto_vec<basic_block, 3> body;
756   unsigned i;
757   gimple_stmt_iterator gsi;
758   bool has_debug_stmt = false;
759   int_tree_htab_type decl_address;
760   decl_address.create (10);
761   basic_block entry_bb = entry->src;
762   basic_block exit_bb = exit->dest;
763 
764   gather_blocks_in_sese_region (entry_bb, exit_bb, &body);
765 
766   FOR_EACH_VEC_ELT (body, i, bb)
767     if (bb != entry_bb && bb != exit_bb)
768       for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
769 	if (is_gimple_debug (gsi_stmt (gsi)))
770 	  {
771 	    if (gimple_debug_bind_p (gsi_stmt (gsi)))
772 	      has_debug_stmt = true;
773 	  }
774 	else
775 	  eliminate_local_variables_stmt (entry, &gsi, decl_address);
776 
777   if (has_debug_stmt)
778     FOR_EACH_VEC_ELT (body, i, bb)
779       if (bb != entry_bb && bb != exit_bb)
780 	for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
781 	  if (gimple_debug_bind_p (gsi_stmt (gsi)))
782 	    eliminate_local_variables_stmt (entry, &gsi, decl_address);
783 
784   decl_address.dispose ();
785 }
786 
787 /* Returns true if expression EXPR is not defined between ENTRY and
788    EXIT, i.e. if all its operands are defined outside of the region.  */
789 
790 static bool
expr_invariant_in_region_p(edge entry,edge exit,tree expr)791 expr_invariant_in_region_p (edge entry, edge exit, tree expr)
792 {
793   basic_block entry_bb = entry->src;
794   basic_block exit_bb = exit->dest;
795   basic_block def_bb;
796 
797   if (is_gimple_min_invariant (expr))
798     return true;
799 
800   if (TREE_CODE (expr) == SSA_NAME)
801     {
802       def_bb = gimple_bb (SSA_NAME_DEF_STMT (expr));
803       if (def_bb
804 	  && dominated_by_p (CDI_DOMINATORS, def_bb, entry_bb)
805 	  && !dominated_by_p (CDI_DOMINATORS, def_bb, exit_bb))
806 	return false;
807 
808       return true;
809     }
810 
811   return false;
812 }
813 
814 /* If COPY_NAME_P is true, creates and returns a duplicate of NAME.
815    The copies are stored to NAME_COPIES, if NAME was already duplicated,
816    its duplicate stored in NAME_COPIES is returned.
817 
818    Regardless of COPY_NAME_P, the decl used as a base of the ssa name is also
819    duplicated, storing the copies in DECL_COPIES.  */
820 
821 static tree
separate_decls_in_region_name(tree name,name_to_copy_table_type name_copies,int_tree_htab_type decl_copies,bool copy_name_p)822 separate_decls_in_region_name (tree name, name_to_copy_table_type name_copies,
823 			       int_tree_htab_type decl_copies, bool copy_name_p)
824 {
825   tree copy, var, var_copy;
826   unsigned idx, uid, nuid;
827   struct int_tree_map ielt, *nielt;
828   struct name_to_copy_elt elt, *nelt;
829   name_to_copy_elt **slot;
830   int_tree_map **dslot;
831 
832   if (TREE_CODE (name) != SSA_NAME)
833     return name;
834 
835   idx = SSA_NAME_VERSION (name);
836   elt.version = idx;
837   slot = name_copies.find_slot_with_hash (&elt, idx,
838 					  copy_name_p ? INSERT : NO_INSERT);
839   if (slot && *slot)
840     return (*slot)->new_name;
841 
842   if (copy_name_p)
843     {
844       copy = duplicate_ssa_name (name, NULL);
845       nelt = XNEW (struct name_to_copy_elt);
846       nelt->version = idx;
847       nelt->new_name = copy;
848       nelt->field = NULL_TREE;
849       *slot = nelt;
850     }
851   else
852     {
853       gcc_assert (!slot);
854       copy = name;
855     }
856 
857   var = SSA_NAME_VAR (name);
858   if (!var)
859     return copy;
860 
861   uid = DECL_UID (var);
862   ielt.uid = uid;
863   dslot = decl_copies.find_slot_with_hash (&ielt, uid, INSERT);
864   if (!*dslot)
865     {
866       var_copy = create_tmp_var (TREE_TYPE (var), get_name (var));
867       DECL_GIMPLE_REG_P (var_copy) = DECL_GIMPLE_REG_P (var);
868       nielt = XNEW (struct int_tree_map);
869       nielt->uid = uid;
870       nielt->to = var_copy;
871       *dslot = nielt;
872 
873       /* Ensure that when we meet this decl next time, we won't duplicate
874          it again.  */
875       nuid = DECL_UID (var_copy);
876       ielt.uid = nuid;
877       dslot = decl_copies.find_slot_with_hash (&ielt, nuid, INSERT);
878       gcc_assert (!*dslot);
879       nielt = XNEW (struct int_tree_map);
880       nielt->uid = nuid;
881       nielt->to = var_copy;
882       *dslot = nielt;
883     }
884   else
885     var_copy = ((struct int_tree_map *) *dslot)->to;
886 
887   replace_ssa_name_symbol (copy, var_copy);
888   return copy;
889 }
890 
891 /* Finds the ssa names used in STMT that are defined outside the
892    region between ENTRY and EXIT and replaces such ssa names with
893    their duplicates.  The duplicates are stored to NAME_COPIES.  Base
894    decls of all ssa names used in STMT (including those defined in
895    LOOP) are replaced with the new temporary variables; the
896    replacement decls are stored in DECL_COPIES.  */
897 
898 static void
separate_decls_in_region_stmt(edge entry,edge exit,gimple stmt,name_to_copy_table_type name_copies,int_tree_htab_type decl_copies)899 separate_decls_in_region_stmt (edge entry, edge exit, gimple stmt,
900 			       name_to_copy_table_type name_copies,
901 			       int_tree_htab_type decl_copies)
902 {
903   use_operand_p use;
904   def_operand_p def;
905   ssa_op_iter oi;
906   tree name, copy;
907   bool copy_name_p;
908 
909   FOR_EACH_PHI_OR_STMT_DEF (def, stmt, oi, SSA_OP_DEF)
910   {
911     name = DEF_FROM_PTR (def);
912     gcc_assert (TREE_CODE (name) == SSA_NAME);
913     copy = separate_decls_in_region_name (name, name_copies, decl_copies,
914 					  false);
915     gcc_assert (copy == name);
916   }
917 
918   FOR_EACH_PHI_OR_STMT_USE (use, stmt, oi, SSA_OP_USE)
919   {
920     name = USE_FROM_PTR (use);
921     if (TREE_CODE (name) != SSA_NAME)
922       continue;
923 
924     copy_name_p = expr_invariant_in_region_p (entry, exit, name);
925     copy = separate_decls_in_region_name (name, name_copies, decl_copies,
926 					  copy_name_p);
927     SET_USE (use, copy);
928   }
929 }
930 
931 /* Finds the ssa names used in STMT that are defined outside the
932    region between ENTRY and EXIT and replaces such ssa names with
933    their duplicates.  The duplicates are stored to NAME_COPIES.  Base
934    decls of all ssa names used in STMT (including those defined in
935    LOOP) are replaced with the new temporary variables; the
936    replacement decls are stored in DECL_COPIES.  */
937 
938 static bool
separate_decls_in_region_debug(gimple stmt,name_to_copy_table_type name_copies,int_tree_htab_type decl_copies)939 separate_decls_in_region_debug (gimple stmt,
940 				name_to_copy_table_type name_copies,
941 				int_tree_htab_type decl_copies)
942 {
943   use_operand_p use;
944   ssa_op_iter oi;
945   tree var, name;
946   struct int_tree_map ielt;
947   struct name_to_copy_elt elt;
948   name_to_copy_elt **slot;
949   int_tree_map **dslot;
950 
951   if (gimple_debug_bind_p (stmt))
952     var = gimple_debug_bind_get_var (stmt);
953   else if (gimple_debug_source_bind_p (stmt))
954     var = gimple_debug_source_bind_get_var (stmt);
955   else
956     return true;
957   if (TREE_CODE (var) == DEBUG_EXPR_DECL || TREE_CODE (var) == LABEL_DECL)
958     return true;
959   gcc_assert (DECL_P (var) && SSA_VAR_P (var));
960   ielt.uid = DECL_UID (var);
961   dslot = decl_copies.find_slot_with_hash (&ielt, ielt.uid, NO_INSERT);
962   if (!dslot)
963     return true;
964   if (gimple_debug_bind_p (stmt))
965     gimple_debug_bind_set_var (stmt, ((struct int_tree_map *) *dslot)->to);
966   else if (gimple_debug_source_bind_p (stmt))
967     gimple_debug_source_bind_set_var (stmt, ((struct int_tree_map *) *dslot)->to);
968 
969   FOR_EACH_PHI_OR_STMT_USE (use, stmt, oi, SSA_OP_USE)
970   {
971     name = USE_FROM_PTR (use);
972     if (TREE_CODE (name) != SSA_NAME)
973       continue;
974 
975     elt.version = SSA_NAME_VERSION (name);
976     slot = name_copies.find_slot_with_hash (&elt, elt.version, NO_INSERT);
977     if (!slot)
978       {
979 	gimple_debug_bind_reset_value (stmt);
980 	update_stmt (stmt);
981 	break;
982       }
983 
984     SET_USE (use, (*slot)->new_name);
985   }
986 
987   return false;
988 }
989 
990 /* Callback for htab_traverse.  Adds a field corresponding to the reduction
991    specified in SLOT. The type is passed in DATA.  */
992 
993 int
add_field_for_reduction(reduction_info ** slot,tree type)994 add_field_for_reduction (reduction_info **slot, tree type)
995 {
996 
997   struct reduction_info *const red = *slot;
998   tree var = gimple_assign_lhs (red->reduc_stmt);
999   tree field = build_decl (gimple_location (red->reduc_stmt), FIELD_DECL,
1000 			   SSA_NAME_IDENTIFIER (var), TREE_TYPE (var));
1001 
1002   insert_field_into_struct (type, field);
1003 
1004   red->field = field;
1005 
1006   return 1;
1007 }
1008 
1009 /* Callback for htab_traverse.  Adds a field corresponding to a ssa name
1010    described in SLOT. The type is passed in DATA.  */
1011 
1012 int
add_field_for_name(name_to_copy_elt ** slot,tree type)1013 add_field_for_name (name_to_copy_elt **slot, tree type)
1014 {
1015   struct name_to_copy_elt *const elt = *slot;
1016   tree name = ssa_name (elt->version);
1017   tree field = build_decl (UNKNOWN_LOCATION,
1018 			   FIELD_DECL, SSA_NAME_IDENTIFIER (name),
1019 			   TREE_TYPE (name));
1020 
1021   insert_field_into_struct (type, field);
1022   elt->field = field;
1023 
1024   return 1;
1025 }
1026 
1027 /* Callback for htab_traverse.  A local result is the intermediate result
1028    computed by a single
1029    thread, or the initial value in case no iteration was executed.
1030    This function creates a phi node reflecting these values.
1031    The phi's result will be stored in NEW_PHI field of the
1032    reduction's data structure.  */
1033 
1034 int
create_phi_for_local_result(reduction_info ** slot,struct loop * loop)1035 create_phi_for_local_result (reduction_info **slot, struct loop *loop)
1036 {
1037   struct reduction_info *const reduc = *slot;
1038   edge e;
1039   gimple new_phi;
1040   basic_block store_bb;
1041   tree local_res;
1042   source_location locus;
1043 
1044   /* STORE_BB is the block where the phi
1045      should be stored.  It is the destination of the loop exit.
1046      (Find the fallthru edge from GIMPLE_OMP_CONTINUE).  */
1047   store_bb = FALLTHRU_EDGE (loop->latch)->dest;
1048 
1049   /* STORE_BB has two predecessors.  One coming from  the loop
1050      (the reduction's result is computed at the loop),
1051      and another coming from a block preceding the loop,
1052      when no iterations
1053      are executed (the initial value should be taken).  */
1054   if (EDGE_PRED (store_bb, 0) == FALLTHRU_EDGE (loop->latch))
1055     e = EDGE_PRED (store_bb, 1);
1056   else
1057     e = EDGE_PRED (store_bb, 0);
1058   local_res = copy_ssa_name (gimple_assign_lhs (reduc->reduc_stmt), NULL);
1059   locus = gimple_location (reduc->reduc_stmt);
1060   new_phi = create_phi_node (local_res, store_bb);
1061   add_phi_arg (new_phi, reduc->init, e, locus);
1062   add_phi_arg (new_phi, gimple_assign_lhs (reduc->reduc_stmt),
1063 	       FALLTHRU_EDGE (loop->latch), locus);
1064   reduc->new_phi = new_phi;
1065 
1066   return 1;
1067 }
1068 
1069 struct clsn_data
1070 {
1071   tree store;
1072   tree load;
1073 
1074   basic_block store_bb;
1075   basic_block load_bb;
1076 };
1077 
1078 /* Callback for htab_traverse.  Create an atomic instruction for the
1079    reduction described in SLOT.
1080    DATA annotates the place in memory the atomic operation relates to,
1081    and the basic block it needs to be generated in.  */
1082 
1083 int
create_call_for_reduction_1(reduction_info ** slot,struct clsn_data * clsn_data)1084 create_call_for_reduction_1 (reduction_info **slot, struct clsn_data *clsn_data)
1085 {
1086   struct reduction_info *const reduc = *slot;
1087   gimple_stmt_iterator gsi;
1088   tree type = TREE_TYPE (PHI_RESULT (reduc->reduc_phi));
1089   tree load_struct;
1090   basic_block bb;
1091   basic_block new_bb;
1092   edge e;
1093   tree t, addr, ref, x;
1094   tree tmp_load, name;
1095   gimple load;
1096 
1097   load_struct = build_simple_mem_ref (clsn_data->load);
1098   t = build3 (COMPONENT_REF, type, load_struct, reduc->field, NULL_TREE);
1099 
1100   addr = build_addr (t, current_function_decl);
1101 
1102   /* Create phi node.  */
1103   bb = clsn_data->load_bb;
1104 
1105   e = split_block (bb, t);
1106   new_bb = e->dest;
1107 
1108   tmp_load = create_tmp_var (TREE_TYPE (TREE_TYPE (addr)), NULL);
1109   tmp_load = make_ssa_name (tmp_load, NULL);
1110   load = gimple_build_omp_atomic_load (tmp_load, addr);
1111   SSA_NAME_DEF_STMT (tmp_load) = load;
1112   gsi = gsi_start_bb (new_bb);
1113   gsi_insert_after (&gsi, load, GSI_NEW_STMT);
1114 
1115   e = split_block (new_bb, load);
1116   new_bb = e->dest;
1117   gsi = gsi_start_bb (new_bb);
1118   ref = tmp_load;
1119   x = fold_build2 (reduc->reduction_code,
1120 		   TREE_TYPE (PHI_RESULT (reduc->new_phi)), ref,
1121 		   PHI_RESULT (reduc->new_phi));
1122 
1123   name = force_gimple_operand_gsi (&gsi, x, true, NULL_TREE, true,
1124 				   GSI_CONTINUE_LINKING);
1125 
1126   gsi_insert_after (&gsi, gimple_build_omp_atomic_store (name), GSI_NEW_STMT);
1127   return 1;
1128 }
1129 
1130 /* Create the atomic operation at the join point of the threads.
1131    REDUCTION_LIST describes the reductions in the LOOP.
1132    LD_ST_DATA describes the shared data structure where
1133    shared data is stored in and loaded from.  */
1134 static void
create_call_for_reduction(struct loop * loop,reduction_info_table_type reduction_list,struct clsn_data * ld_st_data)1135 create_call_for_reduction (struct loop *loop,
1136 			   reduction_info_table_type reduction_list,
1137 			   struct clsn_data *ld_st_data)
1138 {
1139   reduction_list.traverse <struct loop *, create_phi_for_local_result> (loop);
1140   /* Find the fallthru edge from GIMPLE_OMP_CONTINUE.  */
1141   ld_st_data->load_bb = FALLTHRU_EDGE (loop->latch)->dest;
1142   reduction_list
1143     .traverse <struct clsn_data *, create_call_for_reduction_1> (ld_st_data);
1144 }
1145 
1146 /* Callback for htab_traverse.  Loads the final reduction value at the
1147    join point of all threads, and inserts it in the right place.  */
1148 
1149 int
create_loads_for_reductions(reduction_info ** slot,struct clsn_data * clsn_data)1150 create_loads_for_reductions (reduction_info **slot, struct clsn_data *clsn_data)
1151 {
1152   struct reduction_info *const red = *slot;
1153   gimple stmt;
1154   gimple_stmt_iterator gsi;
1155   tree type = TREE_TYPE (gimple_assign_lhs (red->reduc_stmt));
1156   tree load_struct;
1157   tree name;
1158   tree x;
1159 
1160   gsi = gsi_after_labels (clsn_data->load_bb);
1161   load_struct = build_simple_mem_ref (clsn_data->load);
1162   load_struct = build3 (COMPONENT_REF, type, load_struct, red->field,
1163 			NULL_TREE);
1164 
1165   x = load_struct;
1166   name = PHI_RESULT (red->keep_res);
1167   stmt = gimple_build_assign (name, x);
1168 
1169   gsi_insert_after (&gsi, stmt, GSI_NEW_STMT);
1170 
1171   for (gsi = gsi_start_phis (gimple_bb (red->keep_res));
1172        !gsi_end_p (gsi); gsi_next (&gsi))
1173     if (gsi_stmt (gsi) == red->keep_res)
1174       {
1175 	remove_phi_node (&gsi, false);
1176 	return 1;
1177       }
1178   gcc_unreachable ();
1179 }
1180 
1181 /* Load the reduction result that was stored in LD_ST_DATA.
1182    REDUCTION_LIST describes the list of reductions that the
1183    loads should be generated for.  */
1184 static void
create_final_loads_for_reduction(reduction_info_table_type reduction_list,struct clsn_data * ld_st_data)1185 create_final_loads_for_reduction (reduction_info_table_type reduction_list,
1186 				  struct clsn_data *ld_st_data)
1187 {
1188   gimple_stmt_iterator gsi;
1189   tree t;
1190   gimple stmt;
1191 
1192   gsi = gsi_after_labels (ld_st_data->load_bb);
1193   t = build_fold_addr_expr (ld_st_data->store);
1194   stmt = gimple_build_assign (ld_st_data->load, t);
1195 
1196   gsi_insert_before (&gsi, stmt, GSI_NEW_STMT);
1197 
1198   reduction_list
1199     .traverse <struct clsn_data *, create_loads_for_reductions> (ld_st_data);
1200 
1201 }
1202 
1203 /* Callback for htab_traverse.  Store the neutral value for the
1204   particular reduction's operation, e.g. 0 for PLUS_EXPR,
1205   1 for MULT_EXPR, etc. into the reduction field.
1206   The reduction is specified in SLOT. The store information is
1207   passed in DATA.  */
1208 
1209 int
create_stores_for_reduction(reduction_info ** slot,struct clsn_data * clsn_data)1210 create_stores_for_reduction (reduction_info **slot, struct clsn_data *clsn_data)
1211 {
1212   struct reduction_info *const red = *slot;
1213   tree t;
1214   gimple stmt;
1215   gimple_stmt_iterator gsi;
1216   tree type = TREE_TYPE (gimple_assign_lhs (red->reduc_stmt));
1217 
1218   gsi = gsi_last_bb (clsn_data->store_bb);
1219   t = build3 (COMPONENT_REF, type, clsn_data->store, red->field, NULL_TREE);
1220   stmt = gimple_build_assign (t, red->initial_value);
1221   gsi_insert_after (&gsi, stmt, GSI_NEW_STMT);
1222 
1223   return 1;
1224 }
1225 
1226 /* Callback for htab_traverse.  Creates loads to a field of LOAD in LOAD_BB and
1227    store to a field of STORE in STORE_BB for the ssa name and its duplicate
1228    specified in SLOT.  */
1229 
1230 int
create_loads_and_stores_for_name(name_to_copy_elt ** slot,struct clsn_data * clsn_data)1231 create_loads_and_stores_for_name (name_to_copy_elt **slot,
1232 				  struct clsn_data *clsn_data)
1233 {
1234   struct name_to_copy_elt *const elt = *slot;
1235   tree t;
1236   gimple stmt;
1237   gimple_stmt_iterator gsi;
1238   tree type = TREE_TYPE (elt->new_name);
1239   tree load_struct;
1240 
1241   gsi = gsi_last_bb (clsn_data->store_bb);
1242   t = build3 (COMPONENT_REF, type, clsn_data->store, elt->field, NULL_TREE);
1243   stmt = gimple_build_assign (t, ssa_name (elt->version));
1244   gsi_insert_after (&gsi, stmt, GSI_NEW_STMT);
1245 
1246   gsi = gsi_last_bb (clsn_data->load_bb);
1247   load_struct = build_simple_mem_ref (clsn_data->load);
1248   t = build3 (COMPONENT_REF, type, load_struct, elt->field, NULL_TREE);
1249   stmt = gimple_build_assign (elt->new_name, t);
1250   gsi_insert_after (&gsi, stmt, GSI_NEW_STMT);
1251 
1252   return 1;
1253 }
1254 
1255 /* Moves all the variables used in LOOP and defined outside of it (including
1256    the initial values of loop phi nodes, and *PER_THREAD if it is a ssa
1257    name) to a structure created for this purpose.  The code
1258 
1259    while (1)
1260      {
1261        use (a);
1262        use (b);
1263      }
1264 
1265    is transformed this way:
1266 
1267    bb0:
1268    old.a = a;
1269    old.b = b;
1270 
1271    bb1:
1272    a' = new->a;
1273    b' = new->b;
1274    while (1)
1275      {
1276        use (a');
1277        use (b');
1278      }
1279 
1280    `old' is stored to *ARG_STRUCT and `new' is stored to NEW_ARG_STRUCT.  The
1281    pointer `new' is intentionally not initialized (the loop will be split to a
1282    separate function later, and `new' will be initialized from its arguments).
1283    LD_ST_DATA holds information about the shared data structure used to pass
1284    information among the threads.  It is initialized here, and
1285    gen_parallel_loop will pass it to create_call_for_reduction that
1286    needs this information.  REDUCTION_LIST describes the reductions
1287    in LOOP.  */
1288 
1289 static void
separate_decls_in_region(edge entry,edge exit,reduction_info_table_type reduction_list,tree * arg_struct,tree * new_arg_struct,struct clsn_data * ld_st_data)1290 separate_decls_in_region (edge entry, edge exit,
1291 			  reduction_info_table_type reduction_list,
1292 			  tree *arg_struct, tree *new_arg_struct,
1293 			  struct clsn_data *ld_st_data)
1294 
1295 {
1296   basic_block bb1 = split_edge (entry);
1297   basic_block bb0 = single_pred (bb1);
1298   name_to_copy_table_type name_copies;
1299   name_copies.create (10);
1300   int_tree_htab_type decl_copies;
1301   decl_copies.create (10);
1302   unsigned i;
1303   tree type, type_name, nvar;
1304   gimple_stmt_iterator gsi;
1305   struct clsn_data clsn_data;
1306   auto_vec<basic_block, 3> body;
1307   basic_block bb;
1308   basic_block entry_bb = bb1;
1309   basic_block exit_bb = exit->dest;
1310   bool has_debug_stmt = false;
1311 
1312   entry = single_succ_edge (entry_bb);
1313   gather_blocks_in_sese_region (entry_bb, exit_bb, &body);
1314 
1315   FOR_EACH_VEC_ELT (body, i, bb)
1316     {
1317       if (bb != entry_bb && bb != exit_bb)
1318 	{
1319 	  for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
1320 	    separate_decls_in_region_stmt (entry, exit, gsi_stmt (gsi),
1321 					   name_copies, decl_copies);
1322 
1323 	  for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
1324 	    {
1325 	      gimple stmt = gsi_stmt (gsi);
1326 
1327 	      if (is_gimple_debug (stmt))
1328 		has_debug_stmt = true;
1329 	      else
1330 		separate_decls_in_region_stmt (entry, exit, stmt,
1331 					       name_copies, decl_copies);
1332 	    }
1333 	}
1334     }
1335 
1336   /* Now process debug bind stmts.  We must not create decls while
1337      processing debug stmts, so we defer their processing so as to
1338      make sure we will have debug info for as many variables as
1339      possible (all of those that were dealt with in the loop above),
1340      and discard those for which we know there's nothing we can
1341      do.  */
1342   if (has_debug_stmt)
1343     FOR_EACH_VEC_ELT (body, i, bb)
1344       if (bb != entry_bb && bb != exit_bb)
1345 	{
1346 	  for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi);)
1347 	    {
1348 	      gimple stmt = gsi_stmt (gsi);
1349 
1350 	      if (is_gimple_debug (stmt))
1351 		{
1352 		  if (separate_decls_in_region_debug (stmt, name_copies,
1353 						      decl_copies))
1354 		    {
1355 		      gsi_remove (&gsi, true);
1356 		      continue;
1357 		    }
1358 		}
1359 
1360 	      gsi_next (&gsi);
1361 	    }
1362 	}
1363 
1364   if (name_copies.elements () == 0 && reduction_list.elements () == 0)
1365     {
1366       /* It may happen that there is nothing to copy (if there are only
1367          loop carried and external variables in the loop).  */
1368       *arg_struct = NULL;
1369       *new_arg_struct = NULL;
1370     }
1371   else
1372     {
1373       /* Create the type for the structure to store the ssa names to.  */
1374       type = lang_hooks.types.make_type (RECORD_TYPE);
1375       type_name = build_decl (UNKNOWN_LOCATION,
1376 			      TYPE_DECL, create_tmp_var_name (".paral_data"),
1377 			      type);
1378       TYPE_NAME (type) = type_name;
1379 
1380       name_copies.traverse <tree, add_field_for_name> (type);
1381       if (reduction_list.is_created () && reduction_list.elements () > 0)
1382 	{
1383 	  /* Create the fields for reductions.  */
1384 	  reduction_list.traverse <tree, add_field_for_reduction> (type);
1385 	}
1386       layout_type (type);
1387 
1388       /* Create the loads and stores.  */
1389       *arg_struct = create_tmp_var (type, ".paral_data_store");
1390       nvar = create_tmp_var (build_pointer_type (type), ".paral_data_load");
1391       *new_arg_struct = make_ssa_name (nvar, NULL);
1392 
1393       ld_st_data->store = *arg_struct;
1394       ld_st_data->load = *new_arg_struct;
1395       ld_st_data->store_bb = bb0;
1396       ld_st_data->load_bb = bb1;
1397 
1398       name_copies
1399 	.traverse <struct clsn_data *, create_loads_and_stores_for_name>
1400 		  (ld_st_data);
1401 
1402       /* Load the calculation from memory (after the join of the threads).  */
1403 
1404       if (reduction_list.is_created () && reduction_list.elements () > 0)
1405 	{
1406 	  reduction_list
1407 	    .traverse <struct clsn_data *, create_stores_for_reduction>
1408 		      (ld_st_data);
1409 	  clsn_data.load = make_ssa_name (nvar, NULL);
1410 	  clsn_data.load_bb = exit->dest;
1411 	  clsn_data.store = ld_st_data->store;
1412 	  create_final_loads_for_reduction (reduction_list, &clsn_data);
1413 	}
1414     }
1415 
1416   decl_copies.dispose ();
1417   name_copies.dispose ();
1418 }
1419 
1420 /* Bitmap containing uids of functions created by parallelization.  We cannot
1421    allocate it from the default obstack, as it must live across compilation
1422    of several functions; we make it gc allocated instead.  */
1423 
1424 static GTY(()) bitmap parallelized_functions;
1425 
1426 /* Returns true if FN was created by create_loop_fn.  */
1427 
1428 bool
parallelized_function_p(tree fn)1429 parallelized_function_p (tree fn)
1430 {
1431   if (!parallelized_functions || !DECL_ARTIFICIAL (fn))
1432     return false;
1433 
1434   return bitmap_bit_p (parallelized_functions, DECL_UID (fn));
1435 }
1436 
1437 /* Creates and returns an empty function that will receive the body of
1438    a parallelized loop.  */
1439 
1440 static tree
create_loop_fn(location_t loc)1441 create_loop_fn (location_t loc)
1442 {
1443   char buf[100];
1444   char *tname;
1445   tree decl, type, name, t;
1446   struct function *act_cfun = cfun;
1447   static unsigned loopfn_num;
1448 
1449   loc = LOCATION_LOCUS (loc);
1450   snprintf (buf, 100, "%s.$loopfn", current_function_name ());
1451   ASM_FORMAT_PRIVATE_NAME (tname, buf, loopfn_num++);
1452   clean_symbol_name (tname);
1453   name = get_identifier (tname);
1454   type = build_function_type_list (void_type_node, ptr_type_node, NULL_TREE);
1455 
1456   decl = build_decl (loc, FUNCTION_DECL, name, type);
1457   if (!parallelized_functions)
1458     parallelized_functions = BITMAP_GGC_ALLOC ();
1459   bitmap_set_bit (parallelized_functions, DECL_UID (decl));
1460 
1461   TREE_STATIC (decl) = 1;
1462   TREE_USED (decl) = 1;
1463   DECL_ARTIFICIAL (decl) = 1;
1464   DECL_IGNORED_P (decl) = 0;
1465   TREE_PUBLIC (decl) = 0;
1466   DECL_UNINLINABLE (decl) = 1;
1467   DECL_EXTERNAL (decl) = 0;
1468   DECL_CONTEXT (decl) = NULL_TREE;
1469   DECL_INITIAL (decl) = make_node (BLOCK);
1470 
1471   t = build_decl (loc, RESULT_DECL, NULL_TREE, void_type_node);
1472   DECL_ARTIFICIAL (t) = 1;
1473   DECL_IGNORED_P (t) = 1;
1474   DECL_RESULT (decl) = t;
1475 
1476   t = build_decl (loc, PARM_DECL, get_identifier (".paral_data_param"),
1477 		  ptr_type_node);
1478   DECL_ARTIFICIAL (t) = 1;
1479   DECL_ARG_TYPE (t) = ptr_type_node;
1480   DECL_CONTEXT (t) = decl;
1481   TREE_USED (t) = 1;
1482   DECL_ARGUMENTS (decl) = t;
1483 
1484   allocate_struct_function (decl, false);
1485 
1486   /* The call to allocate_struct_function clobbers CFUN, so we need to restore
1487      it.  */
1488   set_cfun (act_cfun);
1489 
1490   return decl;
1491 }
1492 
1493 /* Moves the exit condition of LOOP to the beginning of its header, and
1494    duplicates the part of the last iteration that gets disabled to the
1495    exit of the loop.  NIT is the number of iterations of the loop
1496    (used to initialize the variables in the duplicated part).
1497 
1498    TODO: the common case is that latch of the loop is empty and immediately
1499    follows the loop exit.  In this case, it would be better not to copy the
1500    body of the loop, but only move the entry of the loop directly before the
1501    exit check and increase the number of iterations of the loop by one.
1502    This may need some additional preconditioning in case NIT = ~0.
1503    REDUCTION_LIST describes the reductions in LOOP.  */
1504 
1505 static void
transform_to_exit_first_loop(struct loop * loop,reduction_info_table_type reduction_list,tree nit)1506 transform_to_exit_first_loop (struct loop *loop,
1507 			      reduction_info_table_type reduction_list,
1508 			      tree nit)
1509 {
1510   basic_block *bbs, *nbbs, ex_bb, orig_header;
1511   unsigned n;
1512   bool ok;
1513   edge exit = single_dom_exit (loop), hpred;
1514   tree control, control_name, res, t;
1515   gimple phi, nphi, cond_stmt, stmt, cond_nit;
1516   gimple_stmt_iterator gsi;
1517   tree nit_1;
1518 
1519   split_block_after_labels (loop->header);
1520   orig_header = single_succ (loop->header);
1521   hpred = single_succ_edge (loop->header);
1522 
1523   cond_stmt = last_stmt (exit->src);
1524   control = gimple_cond_lhs (cond_stmt);
1525   gcc_assert (gimple_cond_rhs (cond_stmt) == nit);
1526 
1527   /* Make sure that we have phi nodes on exit for all loop header phis
1528      (create_parallel_loop requires that).  */
1529   for (gsi = gsi_start_phis (loop->header); !gsi_end_p (gsi); gsi_next (&gsi))
1530     {
1531       phi = gsi_stmt (gsi);
1532       res = PHI_RESULT (phi);
1533       t = copy_ssa_name (res, phi);
1534       SET_PHI_RESULT (phi, t);
1535       nphi = create_phi_node (res, orig_header);
1536       add_phi_arg (nphi, t, hpred, UNKNOWN_LOCATION);
1537 
1538       if (res == control)
1539 	{
1540 	  gimple_cond_set_lhs (cond_stmt, t);
1541 	  update_stmt (cond_stmt);
1542 	  control = t;
1543 	}
1544     }
1545 
1546   bbs = get_loop_body_in_dom_order (loop);
1547 
1548   for (n = 0; bbs[n] != exit->src; n++)
1549    continue;
1550   nbbs = XNEWVEC (basic_block, n);
1551   ok = gimple_duplicate_sese_tail (single_succ_edge (loop->header), exit,
1552 				   bbs + 1, n, nbbs);
1553   gcc_assert (ok);
1554   free (bbs);
1555   ex_bb = nbbs[0];
1556   free (nbbs);
1557 
1558   /* Other than reductions, the only gimple reg that should be copied
1559      out of the loop is the control variable.  */
1560   exit = single_dom_exit (loop);
1561   control_name = NULL_TREE;
1562   for (gsi = gsi_start_phis (ex_bb); !gsi_end_p (gsi); )
1563     {
1564       phi = gsi_stmt (gsi);
1565       res = PHI_RESULT (phi);
1566       if (virtual_operand_p (res))
1567 	{
1568 	  gsi_next (&gsi);
1569 	  continue;
1570 	}
1571 
1572       /* Check if it is a part of reduction.  If it is,
1573          keep the phi at the reduction's keep_res field.  The
1574          PHI_RESULT of this phi is the resulting value of the reduction
1575          variable when exiting the loop.  */
1576 
1577       if (reduction_list.elements () > 0)
1578 	{
1579 	  struct reduction_info *red;
1580 
1581 	  tree val = PHI_ARG_DEF_FROM_EDGE (phi, exit);
1582 	  red = reduction_phi (reduction_list, SSA_NAME_DEF_STMT (val));
1583 	  if (red)
1584 	    {
1585 	      red->keep_res = phi;
1586 	      gsi_next (&gsi);
1587 	      continue;
1588 	    }
1589 	}
1590       gcc_assert (control_name == NULL_TREE
1591 		  && SSA_NAME_VAR (res) == SSA_NAME_VAR (control));
1592       control_name = res;
1593       remove_phi_node (&gsi, false);
1594     }
1595   gcc_assert (control_name != NULL_TREE);
1596 
1597   /* Initialize the control variable to number of iterations
1598      according to the rhs of the exit condition.  */
1599   gsi = gsi_after_labels (ex_bb);
1600   cond_nit = last_stmt (exit->src);
1601   nit_1 =  gimple_cond_rhs (cond_nit);
1602   nit_1 = force_gimple_operand_gsi (&gsi,
1603 				  fold_convert (TREE_TYPE (control_name), nit_1),
1604 				  false, NULL_TREE, false, GSI_SAME_STMT);
1605   stmt = gimple_build_assign (control_name, nit_1);
1606   gsi_insert_before (&gsi, stmt, GSI_NEW_STMT);
1607 }
1608 
1609 /* Create the parallel constructs for LOOP as described in gen_parallel_loop.
1610    LOOP_FN and DATA are the arguments of GIMPLE_OMP_PARALLEL.
1611    NEW_DATA is the variable that should be initialized from the argument
1612    of LOOP_FN.  N_THREADS is the requested number of threads.  Returns the
1613    basic block containing GIMPLE_OMP_PARALLEL tree.  */
1614 
1615 static basic_block
create_parallel_loop(struct loop * loop,tree loop_fn,tree data,tree new_data,unsigned n_threads,location_t loc)1616 create_parallel_loop (struct loop *loop, tree loop_fn, tree data,
1617 		      tree new_data, unsigned n_threads, location_t loc)
1618 {
1619   gimple_stmt_iterator gsi;
1620   basic_block bb, paral_bb, for_bb, ex_bb;
1621   tree t, param;
1622   gimple stmt, for_stmt, phi, cond_stmt;
1623   tree cvar, cvar_init, initvar, cvar_next, cvar_base, type;
1624   edge exit, nexit, guard, end, e;
1625 
1626   /* Prepare the GIMPLE_OMP_PARALLEL statement.  */
1627   bb = loop_preheader_edge (loop)->src;
1628   paral_bb = single_pred (bb);
1629   gsi = gsi_last_bb (paral_bb);
1630 
1631   t = build_omp_clause (loc, OMP_CLAUSE_NUM_THREADS);
1632   OMP_CLAUSE_NUM_THREADS_EXPR (t)
1633     = build_int_cst (integer_type_node, n_threads);
1634   stmt = gimple_build_omp_parallel (NULL, t, loop_fn, data);
1635   gimple_set_location (stmt, loc);
1636 
1637   gsi_insert_after (&gsi, stmt, GSI_NEW_STMT);
1638 
1639   /* Initialize NEW_DATA.  */
1640   if (data)
1641     {
1642       gsi = gsi_after_labels (bb);
1643 
1644       param = make_ssa_name (DECL_ARGUMENTS (loop_fn), NULL);
1645       stmt = gimple_build_assign (param, build_fold_addr_expr (data));
1646       gsi_insert_before (&gsi, stmt, GSI_SAME_STMT);
1647 
1648       stmt = gimple_build_assign (new_data,
1649 				  fold_convert (TREE_TYPE (new_data), param));
1650       gsi_insert_before (&gsi, stmt, GSI_SAME_STMT);
1651     }
1652 
1653   /* Emit GIMPLE_OMP_RETURN for GIMPLE_OMP_PARALLEL.  */
1654   bb = split_loop_exit_edge (single_dom_exit (loop));
1655   gsi = gsi_last_bb (bb);
1656   stmt = gimple_build_omp_return (false);
1657   gimple_set_location (stmt, loc);
1658   gsi_insert_after (&gsi, stmt, GSI_NEW_STMT);
1659 
1660   /* Extract data for GIMPLE_OMP_FOR.  */
1661   gcc_assert (loop->header == single_dom_exit (loop)->src);
1662   cond_stmt = last_stmt (loop->header);
1663 
1664   cvar = gimple_cond_lhs (cond_stmt);
1665   cvar_base = SSA_NAME_VAR (cvar);
1666   phi = SSA_NAME_DEF_STMT (cvar);
1667   cvar_init = PHI_ARG_DEF_FROM_EDGE (phi, loop_preheader_edge (loop));
1668   initvar = copy_ssa_name (cvar, NULL);
1669   SET_USE (PHI_ARG_DEF_PTR_FROM_EDGE (phi, loop_preheader_edge (loop)),
1670 	   initvar);
1671   cvar_next = PHI_ARG_DEF_FROM_EDGE (phi, loop_latch_edge (loop));
1672 
1673   gsi = gsi_last_nondebug_bb (loop->latch);
1674   gcc_assert (gsi_stmt (gsi) == SSA_NAME_DEF_STMT (cvar_next));
1675   gsi_remove (&gsi, true);
1676 
1677   /* Prepare cfg.  */
1678   for_bb = split_edge (loop_preheader_edge (loop));
1679   ex_bb = split_loop_exit_edge (single_dom_exit (loop));
1680   extract_true_false_edges_from_block (loop->header, &nexit, &exit);
1681   gcc_assert (exit == single_dom_exit (loop));
1682 
1683   guard = make_edge (for_bb, ex_bb, 0);
1684   single_succ_edge (loop->latch)->flags = 0;
1685   end = make_edge (loop->latch, ex_bb, EDGE_FALLTHRU);
1686   for (gsi = gsi_start_phis (ex_bb); !gsi_end_p (gsi); gsi_next (&gsi))
1687     {
1688       source_location locus;
1689       tree def;
1690       phi = gsi_stmt (gsi);
1691       stmt = SSA_NAME_DEF_STMT (PHI_ARG_DEF_FROM_EDGE (phi, exit));
1692 
1693       def = PHI_ARG_DEF_FROM_EDGE (stmt, loop_preheader_edge (loop));
1694       locus = gimple_phi_arg_location_from_edge (stmt,
1695 						 loop_preheader_edge (loop));
1696       add_phi_arg (phi, def, guard, locus);
1697 
1698       def = PHI_ARG_DEF_FROM_EDGE (stmt, loop_latch_edge (loop));
1699       locus = gimple_phi_arg_location_from_edge (stmt, loop_latch_edge (loop));
1700       add_phi_arg (phi, def, end, locus);
1701     }
1702   e = redirect_edge_and_branch (exit, nexit->dest);
1703   PENDING_STMT (e) = NULL;
1704 
1705   /* Emit GIMPLE_OMP_FOR.  */
1706   gimple_cond_set_lhs (cond_stmt, cvar_base);
1707   type = TREE_TYPE (cvar);
1708   t = build_omp_clause (loc, OMP_CLAUSE_SCHEDULE);
1709   OMP_CLAUSE_SCHEDULE_KIND (t) = OMP_CLAUSE_SCHEDULE_STATIC;
1710 
1711   for_stmt = gimple_build_omp_for (NULL, GF_OMP_FOR_KIND_FOR, t, 1, NULL);
1712   gimple_set_location (for_stmt, loc);
1713   gimple_omp_for_set_index (for_stmt, 0, initvar);
1714   gimple_omp_for_set_initial (for_stmt, 0, cvar_init);
1715   gimple_omp_for_set_final (for_stmt, 0, gimple_cond_rhs (cond_stmt));
1716   gimple_omp_for_set_cond (for_stmt, 0, gimple_cond_code (cond_stmt));
1717   gimple_omp_for_set_incr (for_stmt, 0, build2 (PLUS_EXPR, type,
1718 						cvar_base,
1719 						build_int_cst (type, 1)));
1720 
1721   gsi = gsi_last_bb (for_bb);
1722   gsi_insert_after (&gsi, for_stmt, GSI_NEW_STMT);
1723   SSA_NAME_DEF_STMT (initvar) = for_stmt;
1724 
1725   /* Emit GIMPLE_OMP_CONTINUE.  */
1726   gsi = gsi_last_bb (loop->latch);
1727   stmt = gimple_build_omp_continue (cvar_next, cvar);
1728   gimple_set_location (stmt, loc);
1729   gsi_insert_after (&gsi, stmt, GSI_NEW_STMT);
1730   SSA_NAME_DEF_STMT (cvar_next) = stmt;
1731 
1732   /* Emit GIMPLE_OMP_RETURN for GIMPLE_OMP_FOR.  */
1733   gsi = gsi_last_bb (ex_bb);
1734   stmt = gimple_build_omp_return (true);
1735   gimple_set_location (stmt, loc);
1736   gsi_insert_after (&gsi, stmt, GSI_NEW_STMT);
1737 
1738   /* After the above dom info is hosed.  Re-compute it.  */
1739   free_dominance_info (CDI_DOMINATORS);
1740   calculate_dominance_info (CDI_DOMINATORS);
1741 
1742   return paral_bb;
1743 }
1744 
1745 /* Generates code to execute the iterations of LOOP in N_THREADS
1746    threads in parallel.
1747 
1748    NITER describes number of iterations of LOOP.
1749    REDUCTION_LIST describes the reductions existent in the LOOP.  */
1750 
1751 static void
gen_parallel_loop(struct loop * loop,reduction_info_table_type reduction_list,unsigned n_threads,struct tree_niter_desc * niter)1752 gen_parallel_loop (struct loop *loop, reduction_info_table_type reduction_list,
1753 		   unsigned n_threads, struct tree_niter_desc *niter)
1754 {
1755   tree many_iterations_cond, type, nit;
1756   tree arg_struct, new_arg_struct;
1757   gimple_seq stmts;
1758   basic_block parallel_head;
1759   edge entry, exit;
1760   struct clsn_data clsn_data;
1761   unsigned prob;
1762   location_t loc;
1763   gimple cond_stmt;
1764   unsigned int m_p_thread=2;
1765 
1766   /* From
1767 
1768      ---------------------------------------------------------------------
1769      loop
1770        {
1771 	 IV = phi (INIT, IV + STEP)
1772 	 BODY1;
1773 	 if (COND)
1774 	   break;
1775 	 BODY2;
1776        }
1777      ---------------------------------------------------------------------
1778 
1779      with # of iterations NITER (possibly with MAY_BE_ZERO assumption),
1780      we generate the following code:
1781 
1782      ---------------------------------------------------------------------
1783 
1784      if (MAY_BE_ZERO
1785      || NITER < MIN_PER_THREAD * N_THREADS)
1786      goto original;
1787 
1788      BODY1;
1789      store all local loop-invariant variables used in body of the loop to DATA.
1790      GIMPLE_OMP_PARALLEL (OMP_CLAUSE_NUM_THREADS (N_THREADS), LOOPFN, DATA);
1791      load the variables from DATA.
1792      GIMPLE_OMP_FOR (IV = INIT; COND; IV += STEP) (OMP_CLAUSE_SCHEDULE (static))
1793      BODY2;
1794      BODY1;
1795      GIMPLE_OMP_CONTINUE;
1796      GIMPLE_OMP_RETURN         -- GIMPLE_OMP_FOR
1797      GIMPLE_OMP_RETURN         -- GIMPLE_OMP_PARALLEL
1798      goto end;
1799 
1800      original:
1801      loop
1802        {
1803 	 IV = phi (INIT, IV + STEP)
1804 	 BODY1;
1805 	 if (COND)
1806 	   break;
1807 	 BODY2;
1808        }
1809 
1810      end:
1811 
1812    */
1813 
1814   /* Create two versions of the loop -- in the old one, we know that the
1815      number of iterations is large enough, and we will transform it into the
1816      loop that will be split to loop_fn, the new one will be used for the
1817      remaining iterations.  */
1818 
1819   /* We should compute a better number-of-iterations value for outer loops.
1820      That is, if we have
1821 
1822     for (i = 0; i < n; ++i)
1823       for (j = 0; j < m; ++j)
1824         ...
1825 
1826     we should compute nit = n * m, not nit = n.
1827     Also may_be_zero handling would need to be adjusted.  */
1828 
1829   type = TREE_TYPE (niter->niter);
1830   nit = force_gimple_operand (unshare_expr (niter->niter), &stmts, true,
1831 			      NULL_TREE);
1832   if (stmts)
1833     gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), stmts);
1834 
1835   if (loop->inner)
1836     m_p_thread=2;
1837   else
1838     m_p_thread=MIN_PER_THREAD;
1839 
1840    many_iterations_cond =
1841      fold_build2 (GE_EXPR, boolean_type_node,
1842                 nit, build_int_cst (type, m_p_thread * n_threads));
1843 
1844   many_iterations_cond
1845     = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
1846 		   invert_truthvalue (unshare_expr (niter->may_be_zero)),
1847 		   many_iterations_cond);
1848   many_iterations_cond
1849     = force_gimple_operand (many_iterations_cond, &stmts, false, NULL_TREE);
1850   if (stmts)
1851     gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), stmts);
1852   if (!is_gimple_condexpr (many_iterations_cond))
1853     {
1854       many_iterations_cond
1855 	= force_gimple_operand (many_iterations_cond, &stmts,
1856 				true, NULL_TREE);
1857       if (stmts)
1858 	gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), stmts);
1859     }
1860 
1861   initialize_original_copy_tables ();
1862 
1863   /* We assume that the loop usually iterates a lot.  */
1864   prob = 4 * REG_BR_PROB_BASE / 5;
1865   loop_version (loop, many_iterations_cond, NULL,
1866 		prob, prob, REG_BR_PROB_BASE - prob, true);
1867   update_ssa (TODO_update_ssa);
1868   free_original_copy_tables ();
1869 
1870   /* Base all the induction variables in LOOP on a single control one.  */
1871   canonicalize_loop_ivs (loop, &nit, true);
1872 
1873   /* Ensure that the exit condition is the first statement in the loop.  */
1874   transform_to_exit_first_loop (loop, reduction_list, nit);
1875 
1876   /* Generate initializations for reductions.  */
1877   if (reduction_list.elements () > 0)
1878     reduction_list.traverse <struct loop *, initialize_reductions> (loop);
1879 
1880   /* Eliminate the references to local variables from the loop.  */
1881   gcc_assert (single_exit (loop));
1882   entry = loop_preheader_edge (loop);
1883   exit = single_dom_exit (loop);
1884 
1885   eliminate_local_variables (entry, exit);
1886   /* In the old loop, move all variables non-local to the loop to a structure
1887      and back, and create separate decls for the variables used in loop.  */
1888   separate_decls_in_region (entry, exit, reduction_list, &arg_struct,
1889 			    &new_arg_struct, &clsn_data);
1890 
1891   /* Create the parallel constructs.  */
1892   loc = UNKNOWN_LOCATION;
1893   cond_stmt = last_stmt (loop->header);
1894   if (cond_stmt)
1895     loc = gimple_location (cond_stmt);
1896   parallel_head = create_parallel_loop (loop, create_loop_fn (loc), arg_struct,
1897 					new_arg_struct, n_threads, loc);
1898   if (reduction_list.elements () > 0)
1899     create_call_for_reduction (loop, reduction_list, &clsn_data);
1900 
1901   scev_reset ();
1902 
1903   /* Cancel the loop (it is simpler to do it here rather than to teach the
1904      expander to do it).  */
1905   cancel_loop_tree (loop);
1906 
1907   /* Free loop bound estimations that could contain references to
1908      removed statements.  */
1909   FOR_EACH_LOOP (loop, 0)
1910     free_numbers_of_iterations_estimates_loop (loop);
1911 
1912   /* Expand the parallel constructs.  We do it directly here instead of running
1913      a separate expand_omp pass, since it is more efficient, and less likely to
1914      cause troubles with further analyses not being able to deal with the
1915      OMP trees.  */
1916 
1917   omp_expand_local (parallel_head);
1918 }
1919 
1920 /* Returns true when LOOP contains vector phi nodes.  */
1921 
1922 static bool
loop_has_vector_phi_nodes(struct loop * loop ATTRIBUTE_UNUSED)1923 loop_has_vector_phi_nodes (struct loop *loop ATTRIBUTE_UNUSED)
1924 {
1925   unsigned i;
1926   basic_block *bbs = get_loop_body_in_dom_order (loop);
1927   gimple_stmt_iterator gsi;
1928   bool res = true;
1929 
1930   for (i = 0; i < loop->num_nodes; i++)
1931     for (gsi = gsi_start_phis (bbs[i]); !gsi_end_p (gsi); gsi_next (&gsi))
1932       if (TREE_CODE (TREE_TYPE (PHI_RESULT (gsi_stmt (gsi)))) == VECTOR_TYPE)
1933 	goto end;
1934 
1935   res = false;
1936  end:
1937   free (bbs);
1938   return res;
1939 }
1940 
1941 /* Create a reduction_info struct, initialize it with REDUC_STMT
1942    and PHI, insert it to the REDUCTION_LIST.  */
1943 
1944 static void
build_new_reduction(reduction_info_table_type reduction_list,gimple reduc_stmt,gimple phi)1945 build_new_reduction (reduction_info_table_type reduction_list,
1946 		     gimple reduc_stmt, gimple phi)
1947 {
1948   reduction_info **slot;
1949   struct reduction_info *new_reduction;
1950 
1951   gcc_assert (reduc_stmt);
1952 
1953   if (dump_file && (dump_flags & TDF_DETAILS))
1954     {
1955       fprintf (dump_file,
1956 	       "Detected reduction. reduction stmt is: \n");
1957       print_gimple_stmt (dump_file, reduc_stmt, 0, 0);
1958       fprintf (dump_file, "\n");
1959     }
1960 
1961   new_reduction = XCNEW (struct reduction_info);
1962 
1963   new_reduction->reduc_stmt = reduc_stmt;
1964   new_reduction->reduc_phi = phi;
1965   new_reduction->reduc_version = SSA_NAME_VERSION (gimple_phi_result (phi));
1966   new_reduction->reduction_code = gimple_assign_rhs_code (reduc_stmt);
1967   slot = reduction_list.find_slot (new_reduction, INSERT);
1968   *slot = new_reduction;
1969 }
1970 
1971 /* Callback for htab_traverse.  Sets gimple_uid of reduc_phi stmts.  */
1972 
1973 int
set_reduc_phi_uids(reduction_info ** slot,void * data ATTRIBUTE_UNUSED)1974 set_reduc_phi_uids (reduction_info **slot, void *data ATTRIBUTE_UNUSED)
1975 {
1976   struct reduction_info *const red = *slot;
1977   gimple_set_uid (red->reduc_phi, red->reduc_version);
1978   return 1;
1979 }
1980 
1981 /* Detect all reductions in the LOOP, insert them into REDUCTION_LIST.  */
1982 
1983 static void
gather_scalar_reductions(loop_p loop,reduction_info_table_type reduction_list)1984 gather_scalar_reductions (loop_p loop, reduction_info_table_type reduction_list)
1985 {
1986   gimple_stmt_iterator gsi;
1987   loop_vec_info simple_loop_info;
1988 
1989   simple_loop_info = vect_analyze_loop_form (loop);
1990 
1991   for (gsi = gsi_start_phis (loop->header); !gsi_end_p (gsi); gsi_next (&gsi))
1992     {
1993       gimple phi = gsi_stmt (gsi);
1994       affine_iv iv;
1995       tree res = PHI_RESULT (phi);
1996       bool double_reduc;
1997 
1998       if (virtual_operand_p (res))
1999 	continue;
2000 
2001       if (!simple_iv (loop, loop, res, &iv, true)
2002 	&& simple_loop_info)
2003 	{
2004            gimple reduc_stmt = vect_force_simple_reduction (simple_loop_info,
2005 							    phi, true,
2006 							    &double_reduc);
2007 	   if (reduc_stmt && !double_reduc)
2008               build_new_reduction (reduction_list, reduc_stmt, phi);
2009         }
2010     }
2011   destroy_loop_vec_info (simple_loop_info, true);
2012 
2013   /* As gimple_uid is used by the vectorizer in between vect_analyze_loop_form
2014      and destroy_loop_vec_info, we can set gimple_uid of reduc_phi stmts
2015      only now.  */
2016   reduction_list.traverse <void *, set_reduc_phi_uids> (NULL);
2017 }
2018 
2019 /* Try to initialize NITER for code generation part.  */
2020 
2021 static bool
try_get_loop_niter(loop_p loop,struct tree_niter_desc * niter)2022 try_get_loop_niter (loop_p loop, struct tree_niter_desc *niter)
2023 {
2024   edge exit = single_dom_exit (loop);
2025 
2026   gcc_assert (exit);
2027 
2028   /* We need to know # of iterations, and there should be no uses of values
2029      defined inside loop outside of it, unless the values are invariants of
2030      the loop.  */
2031   if (!number_of_iterations_exit (loop, exit, niter, false))
2032     {
2033       if (dump_file && (dump_flags & TDF_DETAILS))
2034 	fprintf (dump_file, "  FAILED: number of iterations not known\n");
2035       return false;
2036     }
2037 
2038   return true;
2039 }
2040 
2041 /* Try to initialize REDUCTION_LIST for code generation part.
2042    REDUCTION_LIST describes the reductions.  */
2043 
2044 static bool
try_create_reduction_list(loop_p loop,reduction_info_table_type reduction_list)2045 try_create_reduction_list (loop_p loop,
2046 			   reduction_info_table_type reduction_list)
2047 {
2048   edge exit = single_dom_exit (loop);
2049   gimple_stmt_iterator gsi;
2050 
2051   gcc_assert (exit);
2052 
2053   gather_scalar_reductions (loop, reduction_list);
2054 
2055 
2056   for (gsi = gsi_start_phis (exit->dest); !gsi_end_p (gsi); gsi_next (&gsi))
2057     {
2058       gimple phi = gsi_stmt (gsi);
2059       struct reduction_info *red;
2060       imm_use_iterator imm_iter;
2061       use_operand_p use_p;
2062       gimple reduc_phi;
2063       tree val = PHI_ARG_DEF_FROM_EDGE (phi, exit);
2064 
2065       if (!virtual_operand_p (val))
2066 	{
2067 	  if (dump_file && (dump_flags & TDF_DETAILS))
2068 	    {
2069 	      fprintf (dump_file, "phi is ");
2070 	      print_gimple_stmt (dump_file, phi, 0, 0);
2071 	      fprintf (dump_file, "arg of phi to exit:   value ");
2072 	      print_generic_expr (dump_file, val, 0);
2073 	      fprintf (dump_file, " used outside loop\n");
2074 	      fprintf (dump_file,
2075 		       "  checking if it a part of reduction pattern:  \n");
2076 	    }
2077 	  if (reduction_list.elements () == 0)
2078 	    {
2079 	      if (dump_file && (dump_flags & TDF_DETAILS))
2080 		fprintf (dump_file,
2081 			 "  FAILED: it is not a part of reduction.\n");
2082 	      return false;
2083 	    }
2084 	  reduc_phi = NULL;
2085 	  FOR_EACH_IMM_USE_FAST (use_p, imm_iter, val)
2086 	    {
2087 	      if (!gimple_debug_bind_p (USE_STMT (use_p))
2088 		  && flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
2089 		{
2090 		  reduc_phi = USE_STMT (use_p);
2091 		  break;
2092 		}
2093 	    }
2094 	  red = reduction_phi (reduction_list, reduc_phi);
2095 	  if (red == NULL)
2096 	    {
2097 	      if (dump_file && (dump_flags & TDF_DETAILS))
2098 		fprintf (dump_file,
2099 			 "  FAILED: it is not a part of reduction.\n");
2100 	      return false;
2101 	    }
2102 	  if (dump_file && (dump_flags & TDF_DETAILS))
2103 	    {
2104 	      fprintf (dump_file, "reduction phi is  ");
2105 	      print_gimple_stmt (dump_file, red->reduc_phi, 0, 0);
2106 	      fprintf (dump_file, "reduction stmt is  ");
2107 	      print_gimple_stmt (dump_file, red->reduc_stmt, 0, 0);
2108 	    }
2109 	}
2110     }
2111 
2112   /* The iterations of the loop may communicate only through bivs whose
2113      iteration space can be distributed efficiently.  */
2114   for (gsi = gsi_start_phis (loop->header); !gsi_end_p (gsi); gsi_next (&gsi))
2115     {
2116       gimple phi = gsi_stmt (gsi);
2117       tree def = PHI_RESULT (phi);
2118       affine_iv iv;
2119 
2120       if (!virtual_operand_p (def) && !simple_iv (loop, loop, def, &iv, true))
2121 	{
2122 	  struct reduction_info *red;
2123 
2124 	  red = reduction_phi (reduction_list, phi);
2125 	  if (red == NULL)
2126 	    {
2127 	      if (dump_file && (dump_flags & TDF_DETAILS))
2128 		fprintf (dump_file,
2129 			 "  FAILED: scalar dependency between iterations\n");
2130 	      return false;
2131 	    }
2132 	}
2133     }
2134 
2135 
2136   return true;
2137 }
2138 
2139 /* Detect parallel loops and generate parallel code using libgomp
2140    primitives.  Returns true if some loop was parallelized, false
2141    otherwise.  */
2142 
2143 bool
parallelize_loops(void)2144 parallelize_loops (void)
2145 {
2146   unsigned n_threads = flag_tree_parallelize_loops;
2147   bool changed = false;
2148   struct loop *loop;
2149   struct tree_niter_desc niter_desc;
2150   reduction_info_table_type reduction_list;
2151   struct obstack parloop_obstack;
2152   HOST_WIDE_INT estimated;
2153   source_location loop_loc;
2154 
2155   /* Do not parallelize loops in the functions created by parallelization.  */
2156   if (parallelized_function_p (cfun->decl))
2157     return false;
2158   if (cfun->has_nonlocal_label)
2159     return false;
2160 
2161   gcc_obstack_init (&parloop_obstack);
2162   reduction_list.create (10);
2163   init_stmt_vec_info_vec ();
2164 
2165   FOR_EACH_LOOP (loop, 0)
2166     {
2167       reduction_list.empty ();
2168       if (dump_file && (dump_flags & TDF_DETAILS))
2169       {
2170         fprintf (dump_file, "Trying loop %d as candidate\n",loop->num);
2171 	if (loop->inner)
2172 	  fprintf (dump_file, "loop %d is not innermost\n",loop->num);
2173 	else
2174 	  fprintf (dump_file, "loop %d is innermost\n",loop->num);
2175       }
2176 
2177       /* If we use autopar in graphite pass, we use its marked dependency
2178       checking results.  */
2179       if (flag_loop_parallelize_all && !loop->can_be_parallel)
2180       {
2181         if (dump_file && (dump_flags & TDF_DETAILS))
2182 	   fprintf (dump_file, "loop is not parallel according to graphite\n");
2183 	continue;
2184       }
2185 
2186       if (!single_dom_exit (loop))
2187       {
2188 
2189         if (dump_file && (dump_flags & TDF_DETAILS))
2190 	  fprintf (dump_file, "loop is !single_dom_exit\n");
2191 
2192 	continue;
2193       }
2194 
2195       if (/* And of course, the loop must be parallelizable.  */
2196 	  !can_duplicate_loop_p (loop)
2197 	  || loop_has_blocks_with_irreducible_flag (loop)
2198 	  || (loop_preheader_edge (loop)->src->flags & BB_IRREDUCIBLE_LOOP)
2199 	  /* FIXME: the check for vector phi nodes could be removed.  */
2200 	  || loop_has_vector_phi_nodes (loop))
2201 	continue;
2202 
2203       estimated = estimated_stmt_executions_int (loop);
2204       if (estimated == -1)
2205 	estimated = max_stmt_executions_int (loop);
2206       /* FIXME: Bypass this check as graphite doesn't update the
2207 	 count and frequency correctly now.  */
2208       if (!flag_loop_parallelize_all
2209 	  && ((estimated != -1
2210 	       && estimated <= (HOST_WIDE_INT) n_threads * MIN_PER_THREAD)
2211 	      /* Do not bother with loops in cold areas.  */
2212 	      || optimize_loop_nest_for_size_p (loop)))
2213 	continue;
2214 
2215       if (!try_get_loop_niter (loop, &niter_desc))
2216 	continue;
2217 
2218       if (!try_create_reduction_list (loop, reduction_list))
2219 	continue;
2220 
2221       if (!flag_loop_parallelize_all
2222 	  && !loop_parallel_p (loop, &parloop_obstack))
2223 	continue;
2224 
2225       changed = true;
2226       if (dump_file && (dump_flags & TDF_DETAILS))
2227       {
2228 	if (loop->inner)
2229 	  fprintf (dump_file, "parallelizing outer loop %d\n",loop->header->index);
2230 	else
2231 	  fprintf (dump_file, "parallelizing inner loop %d\n",loop->header->index);
2232 	loop_loc = find_loop_location (loop);
2233 	if (loop_loc != UNKNOWN_LOCATION)
2234 	  fprintf (dump_file, "\nloop at %s:%d: ",
2235 		   LOCATION_FILE (loop_loc), LOCATION_LINE (loop_loc));
2236       }
2237       gen_parallel_loop (loop, reduction_list,
2238 			 n_threads, &niter_desc);
2239     }
2240 
2241   free_stmt_vec_info_vec ();
2242   reduction_list.dispose ();
2243   obstack_free (&parloop_obstack, NULL);
2244 
2245   /* Parallelization will cause new function calls to be inserted through
2246      which local variables will escape.  Reset the points-to solution
2247      for ESCAPED.  */
2248   if (changed)
2249     pt_solution_reset (&cfun->gimple_df->escaped);
2250 
2251   return changed;
2252 }
2253 
2254 /* Parallelization.  */
2255 
2256 static bool
gate_tree_parallelize_loops(void)2257 gate_tree_parallelize_loops (void)
2258 {
2259   return flag_tree_parallelize_loops > 1;
2260 }
2261 
2262 static unsigned
tree_parallelize_loops(void)2263 tree_parallelize_loops (void)
2264 {
2265   if (number_of_loops (cfun) <= 1)
2266     return 0;
2267 
2268   if (parallelize_loops ())
2269     return TODO_cleanup_cfg | TODO_rebuild_alias;
2270   return 0;
2271 }
2272 
2273 namespace {
2274 
2275 const pass_data pass_data_parallelize_loops =
2276 {
2277   GIMPLE_PASS, /* type */
2278   "parloops", /* name */
2279   OPTGROUP_LOOP, /* optinfo_flags */
2280   true, /* has_gate */
2281   true, /* has_execute */
2282   TV_TREE_PARALLELIZE_LOOPS, /* tv_id */
2283   ( PROP_cfg | PROP_ssa ), /* properties_required */
2284   0, /* properties_provided */
2285   0, /* properties_destroyed */
2286   0, /* todo_flags_start */
2287   TODO_verify_flow, /* todo_flags_finish */
2288 };
2289 
2290 class pass_parallelize_loops : public gimple_opt_pass
2291 {
2292 public:
pass_parallelize_loops(gcc::context * ctxt)2293   pass_parallelize_loops (gcc::context *ctxt)
2294     : gimple_opt_pass (pass_data_parallelize_loops, ctxt)
2295   {}
2296 
2297   /* opt_pass methods: */
gate()2298   bool gate () { return gate_tree_parallelize_loops (); }
execute()2299   unsigned int execute () { return tree_parallelize_loops (); }
2300 
2301 }; // class pass_parallelize_loops
2302 
2303 } // anon namespace
2304 
2305 gimple_opt_pass *
make_pass_parallelize_loops(gcc::context * ctxt)2306 make_pass_parallelize_loops (gcc::context *ctxt)
2307 {
2308   return new pass_parallelize_loops (ctxt);
2309 }
2310 
2311 
2312 #include "gt-tree-parloops.h"
2313