1 /* Loop distribution.
2    Copyright (C) 2006-2018 Free Software Foundation, Inc.
3    Contributed by Georges-Andre Silber <Georges-Andre.Silber@ensmp.fr>
4    and Sebastian Pop <sebastian.pop@amd.com>.
5 
6 This file is part of GCC.
7 
8 GCC is free software; you can redistribute it and/or modify it
9 under the terms of the GNU General Public License as published by the
10 Free Software Foundation; either version 3, or (at your option) any
11 later version.
12 
13 GCC is distributed in the hope that it will be useful, but WITHOUT
14 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
16 for more details.
17 
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3.  If not see
20 <http://www.gnu.org/licenses/>.  */
21 
22 /* This pass performs loop distribution: for example, the loop
23 
24    |DO I = 2, N
25    |    A(I) = B(I) + C
26    |    D(I) = A(I-1)*E
27    |ENDDO
28 
29    is transformed to
30 
31    |DOALL I = 2, N
32    |   A(I) = B(I) + C
33    |ENDDO
34    |
35    |DOALL I = 2, N
36    |   D(I) = A(I-1)*E
37    |ENDDO
38 
39    Loop distribution is the dual of loop fusion.  It separates statements
40    of a loop (or loop nest) into multiple loops (or loop nests) with the
41    same loop header.  The major goal is to separate statements which may
42    be vectorized from those that can't.  This pass implements distribution
43    in the following steps:
44 
     1) Seed partitions with specific types of statements.  For now we
	support two types of seed statements: statements defining variables
	used outside of the loop; statements storing to memory.
48      2) Build reduced dependence graph (RDG) for loop to be distributed.
49 	The vertices (RDG:V) model all statements in the loop and the edges
50 	(RDG:E) model flow and control dependencies between statements.
51      3) Apart from RDG, compute data dependencies between memory references.
     4) Starting from a seed statement, build up a partition by adding
	statements it depends on according to the RDG's dependence
	information.  A partition is classified as parallel if it can be
	executed in parallel, or as sequential if it can't.  A parallel
	partition is further classified into different builtin kinds if it
	can be implemented as builtin function calls.
     5) Build partition dependence graph (PG) based on data dependencies.
	The vertices (PG:V) model all partitions and the edges (PG:E) model
	all data dependencies between every pair of partitions.  In general,
	a data dependence is either known or unknown at compilation time.
	In C family languages there are quite a few compilation time unknown
	dependencies because of possible aliasing between data references.
	We categorize PG's edges into two types: "true" edges representing
	compilation time known data dependencies, and "alias" edges for all
	other data dependencies.
     6) Traverse the subgraph of PG as if all "alias" edges didn't exist.
	Merge the partitions in each strongly connected component (SCC)
	accordingly.  Build a new PG for the merged partitions.
     7) Traverse PG again, this time with both "true" and "alias" edges
	included.  We try to break SCCs by removing some edges.  Because
	SCCs formed by "true" edges were all fused in step 6), we can break
	SCCs by removing only "alias" edges.  Choosing the optimal edge set
	is NP-hard; fortunately a simple approximation is good enough for us
	given the small problem scale.
76      8) Collect all data dependencies of the removed "alias" edges.  Create
77 	runtime alias checks for collected data dependencies.
     9) Version the loop under the condition of the runtime alias checks.
	Since loop distribution generally introduces additional overhead,
	it is only useful if vectorization is achieved in a distributed
	loop.  We version the loop with the internal function call
	IFN_LOOP_DIST_ALIAS (see the sketch below).  If no distributed loop
	can be vectorized, we simply remove the distributed loops and
	recover the original one.
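
   For example, the versioned form produced in step 9) looks roughly like
   the following sketch (schematic only; the actual condition folds the
   runtime alias checks collected in step 8)):

   |IF (IFN_LOOP_DIST_ALIAS (<alias checks>, <original loop number>))
   |  <distributed loops>
   |ELSE
   |  <original loop>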
84 
85    TODO:
     1) We only distribute innermost two-level loop nests now.  We should
	extend it to arbitrary loop nests in the future.
     2) We only fuse partitions within an SCC now.  A better fusion
	algorithm is desired to minimize loop overhead, maximize parallelism
	and maximize data reuse.  */
91 
92 #include "config.h"
93 #define INCLUDE_ALGORITHM /* stable_sort */
94 #include "system.h"
95 #include "coretypes.h"
96 #include "backend.h"
97 #include "tree.h"
98 #include "gimple.h"
99 #include "cfghooks.h"
100 #include "tree-pass.h"
101 #include "ssa.h"
102 #include "gimple-pretty-print.h"
103 #include "fold-const.h"
104 #include "cfganal.h"
105 #include "gimple-iterator.h"
106 #include "gimplify-me.h"
107 #include "stor-layout.h"
108 #include "tree-cfg.h"
109 #include "tree-ssa-loop-manip.h"
110 #include "tree-ssa-loop-ivopts.h"
111 #include "tree-ssa-loop.h"
112 #include "tree-into-ssa.h"
113 #include "tree-ssa.h"
114 #include "cfgloop.h"
115 #include "tree-scalar-evolution.h"
116 #include "params.h"
117 #include "tree-vectorizer.h"
118 
119 
120 #define MAX_DATAREFS_NUM \
121 	((unsigned) PARAM_VALUE (PARAM_LOOP_MAX_DATAREFS_FOR_DATADEPS))
122 
/* Threshold controlling the number of distributed partitions.  Since it
   may become unnecessary once a memory stream cost model is invented in
   the future, we define it as a temporary macro rather than a parameter.  */
126 #define NUM_PARTITION_THRESHOLD (4)
127 
128 /* Hashtable helpers.  */
129 
130 struct ddr_hasher : nofree_ptr_hash <struct data_dependence_relation>
131 {
132   static inline hashval_t hash (const data_dependence_relation *);
133   static inline bool equal (const data_dependence_relation *,
134 			    const data_dependence_relation *);
135 };
136 
137 /* Hash function for data dependence.  */
138 
139 inline hashval_t
140 ddr_hasher::hash (const data_dependence_relation *ddr)
141 {
142   inchash::hash h;
143   h.add_ptr (DDR_A (ddr));
144   h.add_ptr (DDR_B (ddr));
145   return h.end ();
146 }
147 
148 /* Hash table equality function for data dependence.  */
149 
150 inline bool
151 ddr_hasher::equal (const data_dependence_relation *ddr1,
152 		   const data_dependence_relation *ddr2)
153 {
154   return (DDR_A (ddr1) == DDR_A (ddr2) && DDR_B (ddr1) == DDR_B (ddr2));
155 }
156 
157 /* The loop (nest) to be distributed.  */
158 static vec<loop_p> loop_nest;
159 
160 /* Vector of data references in the loop to be distributed.  */
161 static vec<data_reference_p> datarefs_vec;
162 
163 /* Store index of data reference in aux field.  */
164 #define DR_INDEX(dr)      ((uintptr_t) (dr)->aux)
165 
166 /* Hash table for data dependence relation in the loop to be distributed.  */
167 static hash_table<ddr_hasher> *ddrs_table;
168 
169 /* A Reduced Dependence Graph (RDG) vertex representing a statement.  */
170 struct rdg_vertex
171 {
172   /* The statement represented by this vertex.  */
173   gimple *stmt;
174 
175   /* Vector of data-references in this statement.  */
176   vec<data_reference_p> datarefs;
177 
178   /* True when the statement contains a write to memory.  */
179   bool has_mem_write;
180 
181   /* True when the statement contains a read from memory.  */
182   bool has_mem_reads;
183 };
184 
185 #define RDGV_STMT(V)     ((struct rdg_vertex *) ((V)->data))->stmt
186 #define RDGV_DATAREFS(V) ((struct rdg_vertex *) ((V)->data))->datarefs
187 #define RDGV_HAS_MEM_WRITE(V) ((struct rdg_vertex *) ((V)->data))->has_mem_write
188 #define RDGV_HAS_MEM_READS(V) ((struct rdg_vertex *) ((V)->data))->has_mem_reads
189 #define RDG_STMT(RDG, I) RDGV_STMT (&(RDG->vertices[I]))
190 #define RDG_DATAREFS(RDG, I) RDGV_DATAREFS (&(RDG->vertices[I]))
191 #define RDG_MEM_WRITE_STMT(RDG, I) RDGV_HAS_MEM_WRITE (&(RDG->vertices[I]))
192 #define RDG_MEM_READS_STMT(RDG, I) RDGV_HAS_MEM_READS (&(RDG->vertices[I]))
193 
194 /* Data dependence type.  */
195 
196 enum rdg_dep_type
197 {
198   /* Read After Write (RAW).  */
199   flow_dd = 'f',
200 
201   /* Control dependence (execute conditional on).  */
202   control_dd = 'c'
203 };
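
/* For instance (a schematic example), given

     x_1 = a[i];		<- s1
     if (x_1 > 0)		<- s2
       b[i] = x_1;		<- s3

   the RDG contains flow_dd edges s1 -> s2 and s1 -> s3 for the uses of
   x_1, and a control_dd edge s2 -> s3 because s3 executes conditionally
   on s2.  */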
204 
205 /* Dependence information attached to an edge of the RDG.  */
206 
207 struct rdg_edge
208 {
209   /* Type of the dependence.  */
210   enum rdg_dep_type type;
211 };
212 
213 #define RDGE_TYPE(E)        ((struct rdg_edge *) ((E)->data))->type
214 
215 /* Dump vertex I in RDG to FILE.  */
216 
217 static void
218 dump_rdg_vertex (FILE *file, struct graph *rdg, int i)
219 {
220   struct vertex *v = &(rdg->vertices[i]);
221   struct graph_edge *e;
222 
223   fprintf (file, "(vertex %d: (%s%s) (in:", i,
224 	   RDG_MEM_WRITE_STMT (rdg, i) ? "w" : "",
225 	   RDG_MEM_READS_STMT (rdg, i) ? "r" : "");
226 
227   if (v->pred)
228     for (e = v->pred; e; e = e->pred_next)
229       fprintf (file, " %d", e->src);
230 
231   fprintf (file, ") (out:");
232 
233   if (v->succ)
234     for (e = v->succ; e; e = e->succ_next)
235       fprintf (file, " %d", e->dest);
236 
237   fprintf (file, ")\n");
238   print_gimple_stmt (file, RDGV_STMT (v), 0, TDF_VOPS|TDF_MEMSYMS);
239   fprintf (file, ")\n");
240 }
241 
242 /* Call dump_rdg_vertex on stderr.  */
243 
244 DEBUG_FUNCTION void
245 debug_rdg_vertex (struct graph *rdg, int i)
246 {
247   dump_rdg_vertex (stderr, rdg, i);
248 }
249 
250 /* Dump the reduced dependence graph RDG to FILE.  */
251 
252 static void
253 dump_rdg (FILE *file, struct graph *rdg)
254 {
255   fprintf (file, "(rdg\n");
256   for (int i = 0; i < rdg->n_vertices; i++)
257     dump_rdg_vertex (file, rdg, i);
258   fprintf (file, ")\n");
259 }
260 
261 /* Call dump_rdg on stderr.  */
262 
263 DEBUG_FUNCTION void
264 debug_rdg (struct graph *rdg)
265 {
266   dump_rdg (stderr, rdg);
267 }
268 
269 static void
270 dot_rdg_1 (FILE *file, struct graph *rdg)
271 {
272   int i;
273   pretty_printer buffer;
274   pp_needs_newline (&buffer) = false;
275   buffer.buffer->stream = file;
276 
277   fprintf (file, "digraph RDG {\n");
278 
279   for (i = 0; i < rdg->n_vertices; i++)
280     {
281       struct vertex *v = &(rdg->vertices[i]);
282       struct graph_edge *e;
283 
284       fprintf (file, "%d [label=\"[%d] ", i, i);
285       pp_gimple_stmt_1 (&buffer, RDGV_STMT (v), 0, TDF_SLIM);
286       pp_flush (&buffer);
287       fprintf (file, "\"]\n");
288 
289       /* Highlight reads from memory.  */
290       if (RDG_MEM_READS_STMT (rdg, i))
291        fprintf (file, "%d [style=filled, fillcolor=green]\n", i);
292 
293       /* Highlight stores to memory.  */
294       if (RDG_MEM_WRITE_STMT (rdg, i))
295        fprintf (file, "%d [style=filled, fillcolor=red]\n", i);
296 
297       if (v->succ)
298        for (e = v->succ; e; e = e->succ_next)
299          switch (RDGE_TYPE (e))
300            {
301            case flow_dd:
             /* These are the most common dependences: don't print a label
                for them.  */
303              fprintf (file, "%d -> %d \n", i, e->dest);
304              break;
305 
306 	   case control_dd:
307              fprintf (file, "%d -> %d [label=control] \n", i, e->dest);
308              break;
309 
310            default:
311              gcc_unreachable ();
312            }
313     }
314 
315   fprintf (file, "}\n\n");
316 }
317 
318 /* Display the Reduced Dependence Graph using dotty.  */
319 
320 DEBUG_FUNCTION void
321 dot_rdg (struct graph *rdg)
322 {
323   /* When debugging, you may want to enable the following code.  */
324 #ifdef HAVE_POPEN
325   FILE *file = popen ("dot -Tx11", "w");
326   if (!file)
327     return;
328   dot_rdg_1 (file, rdg);
329   fflush (file);
330   close (fileno (file));
331   pclose (file);
332 #else
333   dot_rdg_1 (stderr, rdg);
334 #endif
335 }
336 
337 /* Returns the index of STMT in RDG.  */
338 
339 static int
340 rdg_vertex_for_stmt (struct graph *rdg ATTRIBUTE_UNUSED, gimple *stmt)
341 {
342   int index = gimple_uid (stmt);
343   gcc_checking_assert (index == -1 || RDG_STMT (rdg, index) == stmt);
344   return index;
345 }
346 
347 /* Creates dependence edges in RDG for all the uses of DEF.  IDEF is
348    the index of DEF in RDG.  */
349 
350 static void
351 create_rdg_edges_for_scalar (struct graph *rdg, tree def, int idef)
352 {
353   use_operand_p imm_use_p;
354   imm_use_iterator iterator;
355 
356   FOR_EACH_IMM_USE_FAST (imm_use_p, iterator, def)
357     {
358       struct graph_edge *e;
359       int use = rdg_vertex_for_stmt (rdg, USE_STMT (imm_use_p));
360 
361       if (use < 0)
362 	continue;
363 
364       e = add_edge (rdg, idef, use);
365       e->data = XNEW (struct rdg_edge);
366       RDGE_TYPE (e) = flow_dd;
367     }
368 }
369 
370 /* Creates an edge for the control dependences of BB to the vertex V.  */
371 
372 static void
373 create_edge_for_control_dependence (struct graph *rdg, basic_block bb,
374 				    int v, control_dependences *cd)
375 {
376   bitmap_iterator bi;
377   unsigned edge_n;
378   EXECUTE_IF_SET_IN_BITMAP (cd->get_edges_dependent_on (bb->index),
379 			    0, edge_n, bi)
380     {
381       basic_block cond_bb = cd->get_edge_src (edge_n);
382       gimple *stmt = last_stmt (cond_bb);
383       if (stmt && is_ctrl_stmt (stmt))
384 	{
385 	  struct graph_edge *e;
386 	  int c = rdg_vertex_for_stmt (rdg, stmt);
387 	  if (c < 0)
388 	    continue;
389 
390 	  e = add_edge (rdg, c, v);
391 	  e->data = XNEW (struct rdg_edge);
392 	  RDGE_TYPE (e) = control_dd;
393 	}
394     }
395 }
396 
397 /* Creates the edges of the reduced dependence graph RDG.  */
398 
399 static void
400 create_rdg_flow_edges (struct graph *rdg)
401 {
402   int i;
403   def_operand_p def_p;
404   ssa_op_iter iter;
405 
406   for (i = 0; i < rdg->n_vertices; i++)
407     FOR_EACH_PHI_OR_STMT_DEF (def_p, RDG_STMT (rdg, i),
408 			      iter, SSA_OP_DEF)
409       create_rdg_edges_for_scalar (rdg, DEF_FROM_PTR (def_p), i);
410 }
411 
/* Creates the control dependence edges of the reduced dependence graph
   RDG.  */
413 
414 static void
415 create_rdg_cd_edges (struct graph *rdg, control_dependences *cd, loop_p loop)
416 {
417   int i;
418 
419   for (i = 0; i < rdg->n_vertices; i++)
420     {
421       gimple *stmt = RDG_STMT (rdg, i);
422       if (gimple_code (stmt) == GIMPLE_PHI)
423 	{
424 	  edge_iterator ei;
425 	  edge e;
426 	  FOR_EACH_EDGE (e, ei, gimple_bb (stmt)->preds)
427 	    if (flow_bb_inside_loop_p (loop, e->src))
428 	      create_edge_for_control_dependence (rdg, e->src, i, cd);
429 	}
430       else
431 	create_edge_for_control_dependence (rdg, gimple_bb (stmt), i, cd);
432     }
433 }
434 
435 /* Build the vertices of the reduced dependence graph RDG.  Return false
436    if that failed.  */
437 
438 static bool
439 create_rdg_vertices (struct graph *rdg, vec<gimple *> stmts, loop_p loop)
440 {
441   int i;
442   gimple *stmt;
443 
444   FOR_EACH_VEC_ELT (stmts, i, stmt)
445     {
446       struct vertex *v = &(rdg->vertices[i]);
447 
448       /* Record statement to vertex mapping.  */
449       gimple_set_uid (stmt, i);
450 
451       v->data = XNEW (struct rdg_vertex);
452       RDGV_STMT (v) = stmt;
453       RDGV_DATAREFS (v).create (0);
454       RDGV_HAS_MEM_WRITE (v) = false;
455       RDGV_HAS_MEM_READS (v) = false;
456       if (gimple_code (stmt) == GIMPLE_PHI)
457 	continue;
458 
459       unsigned drp = datarefs_vec.length ();
460       if (!find_data_references_in_stmt (loop, stmt, &datarefs_vec))
461 	return false;
462       for (unsigned j = drp; j < datarefs_vec.length (); ++j)
463 	{
464 	  data_reference_p dr = datarefs_vec[j];
465 	  if (DR_IS_READ (dr))
466 	    RDGV_HAS_MEM_READS (v) = true;
467 	  else
468 	    RDGV_HAS_MEM_WRITE (v) = true;
469 	  RDGV_DATAREFS (v).safe_push (dr);
470 	}
471     }
472   return true;
473 }
474 
475 /* Array mapping basic block's index to its topological order.  */
476 static int *bb_top_order_index;
477 /* And size of the array.  */
478 static int bb_top_order_index_size;
479 
480 /* If X has a smaller topological sort number than Y, returns -1;
481    if greater, returns 1.  */
482 
483 static int
484 bb_top_order_cmp (const void *x, const void *y)
485 {
486   basic_block bb1 = *(const basic_block *) x;
487   basic_block bb2 = *(const basic_block *) y;
488 
489   gcc_assert (bb1->index < bb_top_order_index_size
490 	      && bb2->index < bb_top_order_index_size);
491   gcc_assert (bb1 == bb2
492 	      || bb_top_order_index[bb1->index]
493 		 != bb_top_order_index[bb2->index]);
494 
495   return (bb_top_order_index[bb1->index] - bb_top_order_index[bb2->index]);
496 }
497 
498 /* Initialize STMTS with all the statements of LOOP.  We use topological
499    order to discover all statements.  The order is important because
500    generate_loops_for_partition is using the same traversal for identifying
501    statements in loop copies.  */
502 
503 static void
504 stmts_from_loop (struct loop *loop, vec<gimple *> *stmts)
505 {
506   unsigned int i;
507   basic_block *bbs = get_loop_body_in_custom_order (loop, bb_top_order_cmp);
508 
509   for (i = 0; i < loop->num_nodes; i++)
510     {
511       basic_block bb = bbs[i];
512 
513       for (gphi_iterator bsi = gsi_start_phis (bb); !gsi_end_p (bsi);
514 	   gsi_next (&bsi))
515 	if (!virtual_operand_p (gimple_phi_result (bsi.phi ())))
516 	  stmts->safe_push (bsi.phi ());
517 
518       for (gimple_stmt_iterator bsi = gsi_start_bb (bb); !gsi_end_p (bsi);
519 	   gsi_next (&bsi))
520 	{
521 	  gimple *stmt = gsi_stmt (bsi);
522 	  if (gimple_code (stmt) != GIMPLE_LABEL && !is_gimple_debug (stmt))
523 	    stmts->safe_push (stmt);
524 	}
525     }
526 
527   free (bbs);
528 }
529 
530 /* Free the reduced dependence graph RDG.  */
531 
532 static void
533 free_rdg (struct graph *rdg)
534 {
535   int i;
536 
537   for (i = 0; i < rdg->n_vertices; i++)
538     {
539       struct vertex *v = &(rdg->vertices[i]);
540       struct graph_edge *e;
541 
542       for (e = v->succ; e; e = e->succ_next)
543 	free (e->data);
544 
545       if (v->data)
546 	{
547 	  gimple_set_uid (RDGV_STMT (v), -1);
548 	  (RDGV_DATAREFS (v)).release ();
549 	  free (v->data);
550 	}
551     }
552 
553   free_graph (rdg);
554 }
555 
/* Build the Reduced Dependence Graph (RDG) with one vertex per statement of
   LOOP, and one edge per flow dependence or control dependence from control
   dependence CD.  While visiting each statement, data references are also
   collected and recorded in the global vector DATAREFS_VEC.  */
560 
561 static struct graph *
562 build_rdg (struct loop *loop, control_dependences *cd)
563 {
564   struct graph *rdg;
565 
566   /* Create the RDG vertices from the stmts of the loop nest.  */
567   auto_vec<gimple *, 10> stmts;
568   stmts_from_loop (loop, &stmts);
569   rdg = new_graph (stmts.length ());
570   if (!create_rdg_vertices (rdg, stmts, loop))
571     {
572       free_rdg (rdg);
573       return NULL;
574     }
575   stmts.release ();
576 
577   create_rdg_flow_edges (rdg);
578   if (cd)
579     create_rdg_cd_edges (rdg, cd, loop);
580 
581   return rdg;
582 }
583 
584 
585 /* Kind of distributed loop.  */
586 enum partition_kind {
587     PKIND_NORMAL,
    /* Partial memset describes a partition that can be distributed into a
       loop of memset calls, rather than a single memset call.  It is
       handled just like a normal partition, i.e., distributed as a separate
       loop; no memset call is generated.

       Note: This is a hackish fix trying to distribute a ZERO-ing stmt in a
       loop nest as deeply as possible.  As a result, parloop achieves
       better parallelization by parallelizing a deeper loop nest.  This
       hack should become unnecessary and be removed once a distributed
       memset can be understood and analyzed in data reference analysis.
       See PR82604 for more.  */
598     PKIND_PARTIAL_MEMSET,
599     PKIND_MEMSET, PKIND_MEMCPY, PKIND_MEMMOVE
600 };
601 
602 /* Type of distributed loop.  */
603 enum partition_type {
    /* The distributed loop can be executed in parallel.  */
605     PTYPE_PARALLEL = 0,
606     /* The distributed loop has to be executed sequentially.  */
607     PTYPE_SEQUENTIAL
608 };
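
/* For instance, in the example at the top of the file the partition for
   "A(I) = B(I) + C" can run its iterations independently and is
   PTYPE_PARALLEL, whereas a partition containing a reduction such as
   "S = S + A(I)" carries a dependence between iterations and would be
   PTYPE_SEQUENTIAL.  */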
609 
610 /* Builtin info for loop distribution.  */
611 struct builtin_info
612 {
  /* Data references that a kind != PKIND_NORMAL partition operates on.  */
614   data_reference_p dst_dr;
615   data_reference_p src_dr;
  /* Base addresses and size of the memory objects operated on by the
     builtin.  Note both destination and source memory objects must have
     the same size.  */
618   tree dst_base;
619   tree src_base;
620   tree size;
621   /* Base and offset part of dst_base after stripping constant offset.  This
622      is only used in memset builtin distribution for now.  */
623   tree dst_base_base;
624   unsigned HOST_WIDE_INT dst_base_offset;
625 };
626 
627 /* Partition for loop distribution.  */
628 struct partition
629 {
630   /* Statements of the partition.  */
631   bitmap stmts;
  /* True if the partition defines a variable which is used outside of the
     loop.  */
633   bool reduction_p;
634   enum partition_kind kind;
635   enum partition_type type;
636   /* Data references in the partition.  */
637   bitmap datarefs;
  /* Information about a builtin partition.  */
639   struct builtin_info *builtin;
640 };
641 
642 
/* Allocate and initialize a partition.  */
644 
645 static partition *
646 partition_alloc (void)
647 {
648   partition *partition = XCNEW (struct partition);
649   partition->stmts = BITMAP_ALLOC (NULL);
650   partition->reduction_p = false;
651   partition->kind = PKIND_NORMAL;
652   partition->datarefs = BITMAP_ALLOC (NULL);
653   return partition;
654 }
655 
656 /* Free PARTITION.  */
657 
658 static void
659 partition_free (partition *partition)
660 {
661   BITMAP_FREE (partition->stmts);
662   BITMAP_FREE (partition->datarefs);
663   if (partition->builtin)
664     free (partition->builtin);
665 
666   free (partition);
667 }
668 
669 /* Returns true if the partition can be generated as a builtin.  */
670 
671 static bool
672 partition_builtin_p (partition *partition)
673 {
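  /* This relies on the enum order: only the memset/memcpy/memmove kinds
     follow PKIND_PARTIAL_MEMSET.  */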
674   return partition->kind > PKIND_PARTIAL_MEMSET;
675 }
676 
677 /* Returns true if the partition contains a reduction.  */
678 
679 static bool
680 partition_reduction_p (partition *partition)
681 {
682   return partition->reduction_p;
683 }
684 
685 /* Partitions are fused because of different reasons.  */
686 enum fuse_type
687 {
688   FUSE_NON_BUILTIN = 0,
689   FUSE_REDUCTION = 1,
690   FUSE_SHARE_REF = 2,
691   FUSE_SAME_SCC = 3,
692   FUSE_FINALIZE = 4
693 };
694 
/* Descriptions of the different fusing reasons.  */
696 static const char *fuse_message[] = {
697   "they are non-builtins",
698   "they have reductions",
699   "they have shared memory refs",
700   "they are in the same dependence scc",
701   "there is no point to distribute loop"};
702 
703 static void
704 update_type_for_merge (struct graph *, partition *, partition *);
705 
/* Merge PARTITION into the partition DEST.  RDG is the reduced dependence
   graph; if it is non-NULL we update the type of the resulting partition.  */
708 
709 static void
710 partition_merge_into (struct graph *rdg, partition *dest,
711 		      partition *partition, enum fuse_type ft)
712 {
713   if (dump_file && (dump_flags & TDF_DETAILS))
714     {
715       fprintf (dump_file, "Fuse partitions because %s:\n", fuse_message[ft]);
716       fprintf (dump_file, "  Part 1: ");
717       dump_bitmap (dump_file, dest->stmts);
718       fprintf (dump_file, "  Part 2: ");
719       dump_bitmap (dump_file, partition->stmts);
720     }
721 
722   dest->kind = PKIND_NORMAL;
723   if (dest->type == PTYPE_PARALLEL)
724     dest->type = partition->type;
725 
726   bitmap_ior_into (dest->stmts, partition->stmts);
727   if (partition_reduction_p (partition))
728     dest->reduction_p = true;
729 
  /* Further check if any data dependence prevents us from executing the
     new partition in parallel.  */
732   if (dest->type == PTYPE_PARALLEL && rdg != NULL)
733     update_type_for_merge (rdg, dest, partition);
734 
735   bitmap_ior_into (dest->datarefs, partition->datarefs);
736 }
737 
738 
739 /* Returns true when DEF is an SSA_NAME defined in LOOP and used after
740    the LOOP.  */
741 
742 static bool
743 ssa_name_has_uses_outside_loop_p (tree def, loop_p loop)
744 {
745   imm_use_iterator imm_iter;
746   use_operand_p use_p;
747 
748   FOR_EACH_IMM_USE_FAST (use_p, imm_iter, def)
749     {
750       if (is_gimple_debug (USE_STMT (use_p)))
751 	continue;
752 
753       basic_block use_bb = gimple_bb (USE_STMT (use_p));
754       if (!flow_bb_inside_loop_p (loop, use_bb))
755 	return true;
756     }
757 
758   return false;
759 }
760 
761 /* Returns true when STMT defines a scalar variable used after the
762    loop LOOP.  */
763 
764 static bool
765 stmt_has_scalar_dependences_outside_loop (loop_p loop, gimple *stmt)
766 {
767   def_operand_p def_p;
768   ssa_op_iter op_iter;
769 
770   if (gimple_code (stmt) == GIMPLE_PHI)
771     return ssa_name_has_uses_outside_loop_p (gimple_phi_result (stmt), loop);
772 
773   FOR_EACH_SSA_DEF_OPERAND (def_p, stmt, op_iter, SSA_OP_DEF)
774     if (ssa_name_has_uses_outside_loop_p (DEF_FROM_PTR (def_p), loop))
775       return true;
776 
777   return false;
778 }
779 
780 /* Return a copy of LOOP placed before LOOP.  */
781 
782 static struct loop *
783 copy_loop_before (struct loop *loop)
784 {
785   struct loop *res;
786   edge preheader = loop_preheader_edge (loop);
787 
788   initialize_original_copy_tables ();
789   res = slpeel_tree_duplicate_loop_to_edge_cfg (loop, NULL, preheader);
790   gcc_assert (res != NULL);
791   free_original_copy_tables ();
792   delete_update_ssa ();
793 
794   return res;
795 }
796 
797 /* Creates an empty basic block after LOOP.  */
798 
799 static void
800 create_bb_after_loop (struct loop *loop)
801 {
802   edge exit = single_exit (loop);
803 
804   if (!exit)
805     return;
806 
807   split_edge (exit);
808 }
809 
810 /* Generate code for PARTITION from the code in LOOP.  The loop is
811    copied when COPY_P is true.  All the statements not flagged in the
812    PARTITION bitmap are removed from the loop or from its copy.  The
813    statements are indexed in sequence inside a basic block, and the
814    basic blocks of a loop are taken in dom order.  */
815 
816 static void
817 generate_loops_for_partition (struct loop *loop, partition *partition,
818 			      bool copy_p)
819 {
820   unsigned i;
821   basic_block *bbs;
822 
823   if (copy_p)
824     {
825       int orig_loop_num = loop->orig_loop_num;
826       loop = copy_loop_before (loop);
827       gcc_assert (loop != NULL);
828       loop->orig_loop_num = orig_loop_num;
829       create_preheader (loop, CP_SIMPLE_PREHEADERS);
830       create_bb_after_loop (loop);
831     }
832   else
833     {
834       /* Origin number is set to the new versioned loop's num.  */
835       gcc_assert (loop->orig_loop_num != loop->num);
836     }
837 
838   /* Remove stmts not in the PARTITION bitmap.  */
839   bbs = get_loop_body_in_dom_order (loop);
840 
841   if (MAY_HAVE_DEBUG_BIND_STMTS)
842     for (i = 0; i < loop->num_nodes; i++)
843       {
844 	basic_block bb = bbs[i];
845 
846 	for (gphi_iterator bsi = gsi_start_phis (bb); !gsi_end_p (bsi);
847 	     gsi_next (&bsi))
848 	  {
849 	    gphi *phi = bsi.phi ();
850 	    if (!virtual_operand_p (gimple_phi_result (phi))
851 		&& !bitmap_bit_p (partition->stmts, gimple_uid (phi)))
852 	      reset_debug_uses (phi);
853 	  }
854 
855 	for (gimple_stmt_iterator bsi = gsi_start_bb (bb); !gsi_end_p (bsi); gsi_next (&bsi))
856 	  {
857 	    gimple *stmt = gsi_stmt (bsi);
858 	    if (gimple_code (stmt) != GIMPLE_LABEL
859 		&& !is_gimple_debug (stmt)
860 		&& !bitmap_bit_p (partition->stmts, gimple_uid (stmt)))
861 	      reset_debug_uses (stmt);
862 	  }
863       }
864 
865   for (i = 0; i < loop->num_nodes; i++)
866     {
867       basic_block bb = bbs[i];
868       edge inner_exit = NULL;
869 
870       if (loop != bb->loop_father)
871 	inner_exit = single_exit (bb->loop_father);
872 
873       for (gphi_iterator bsi = gsi_start_phis (bb); !gsi_end_p (bsi);)
874 	{
875 	  gphi *phi = bsi.phi ();
876 	  if (!virtual_operand_p (gimple_phi_result (phi))
877 	      && !bitmap_bit_p (partition->stmts, gimple_uid (phi)))
878 	    remove_phi_node (&bsi, true);
879 	  else
880 	    gsi_next (&bsi);
881 	}
882 
883       for (gimple_stmt_iterator bsi = gsi_start_bb (bb); !gsi_end_p (bsi);)
884 	{
885 	  gimple *stmt = gsi_stmt (bsi);
886 	  if (gimple_code (stmt) != GIMPLE_LABEL
887 	      && !is_gimple_debug (stmt)
888 	      && !bitmap_bit_p (partition->stmts, gimple_uid (stmt)))
889 	    {
890 	      /* In distribution of loop nest, if bb is inner loop's exit_bb,
891 		 we choose its exit edge/path in order to avoid generating
892 		 infinite loop.  For all other cases, we choose an arbitrary
893 		 path through the empty CFG part that this unnecessary
894 		 control stmt controls.  */
895 	      if (gcond *cond_stmt = dyn_cast <gcond *> (stmt))
896 		{
897 		  if (inner_exit && inner_exit->flags & EDGE_TRUE_VALUE)
898 		    gimple_cond_make_true (cond_stmt);
899 		  else
900 		    gimple_cond_make_false (cond_stmt);
901 		  update_stmt (stmt);
902 		}
903 	      else if (gimple_code (stmt) == GIMPLE_SWITCH)
904 		{
905 		  gswitch *switch_stmt = as_a <gswitch *> (stmt);
906 		  gimple_switch_set_index
907 		      (switch_stmt, CASE_LOW (gimple_switch_label (switch_stmt, 1)));
908 		  update_stmt (stmt);
909 		}
910 	      else
911 		{
912 		  unlink_stmt_vdef (stmt);
913 		  gsi_remove (&bsi, true);
914 		  release_defs (stmt);
915 		  continue;
916 		}
917 	    }
918 	  gsi_next (&bsi);
919 	}
920     }
921 
922   free (bbs);
923 }
924 
/* If VAL's memory representation contains the same value in all bytes,
   return that value, otherwise return -1.
927    E.g. for 0x24242424 return 0x24, for IEEE double
928    747708026454360457216.0 return 0x44, etc.  */
929 
930 static int
931 const_with_all_bytes_same (tree val)
932 {
933   unsigned char buf[64];
934   int i, len;
935 
936   if (integer_zerop (val)
937       || (TREE_CODE (val) == CONSTRUCTOR
938           && !TREE_CLOBBER_P (val)
939           && CONSTRUCTOR_NELTS (val) == 0))
940     return 0;
941 
942   if (real_zerop (val))
943     {
944       /* Only return 0 for +0.0, not for -0.0, which doesn't have
945 	 an all bytes same memory representation.  Don't transform
946 	 -0.0 stores into +0.0 even for !HONOR_SIGNED_ZEROS.  */
947       switch (TREE_CODE (val))
948 	{
949 	case REAL_CST:
950 	  if (!real_isneg (TREE_REAL_CST_PTR (val)))
951 	    return 0;
952 	  break;
953 	case COMPLEX_CST:
954 	  if (!const_with_all_bytes_same (TREE_REALPART (val))
955 	      && !const_with_all_bytes_same (TREE_IMAGPART (val)))
956 	    return 0;
957 	  break;
958 	case VECTOR_CST:
959 	  {
960 	    unsigned int count = vector_cst_encoded_nelts (val);
961 	    unsigned int j;
962 	    for (j = 0; j < count; ++j)
963 	      if (const_with_all_bytes_same (VECTOR_CST_ENCODED_ELT (val, j)))
964 		break;
965 	    if (j == count)
966 	      return 0;
967 	    break;
968 	  }
969 	default:
970 	  break;
971 	}
972     }
973 
974   if (CHAR_BIT != 8 || BITS_PER_UNIT != 8)
975     return -1;
976 
977   len = native_encode_expr (val, buf, sizeof (buf));
978   if (len == 0)
979     return -1;
980   for (i = 1; i < len; i++)
981     if (buf[i] != buf[0])
982       return -1;
983   return buf[0];
984 }
985 
986 /* Generate a call to memset for PARTITION in LOOP.  */
987 
988 static void
989 generate_memset_builtin (struct loop *loop, partition *partition)
990 {
991   gimple_stmt_iterator gsi;
992   tree mem, fn, nb_bytes;
993   tree val;
994   struct builtin_info *builtin = partition->builtin;
995   gimple *fn_call;
996 
997   /* The new statements will be placed before LOOP.  */
998   gsi = gsi_last_bb (loop_preheader_edge (loop)->src);
999 
1000   nb_bytes = builtin->size;
1001   nb_bytes = force_gimple_operand_gsi (&gsi, nb_bytes, true, NULL_TREE,
1002 				       false, GSI_CONTINUE_LINKING);
1003   mem = builtin->dst_base;
1004   mem = force_gimple_operand_gsi (&gsi, mem, true, NULL_TREE,
1005 				  false, GSI_CONTINUE_LINKING);
1006 
1007   /* This exactly matches the pattern recognition in classify_partition.  */
1008   val = gimple_assign_rhs1 (DR_STMT (builtin->dst_dr));
1009   /* Handle constants like 0x15151515 and similarly
1010      floating point constants etc. where all bytes are the same.  */
1011   int bytev = const_with_all_bytes_same (val);
1012   if (bytev != -1)
1013     val = build_int_cst (integer_type_node, bytev);
1014   else if (TREE_CODE (val) == INTEGER_CST)
1015     val = fold_convert (integer_type_node, val);
1016   else if (!useless_type_conversion_p (integer_type_node, TREE_TYPE (val)))
1017     {
1018       tree tem = make_ssa_name (integer_type_node);
1019       gimple *cstmt = gimple_build_assign (tem, NOP_EXPR, val);
1020       gsi_insert_after (&gsi, cstmt, GSI_CONTINUE_LINKING);
1021       val = tem;
1022     }
1023 
1024   fn = build_fold_addr_expr (builtin_decl_implicit (BUILT_IN_MEMSET));
1025   fn_call = gimple_build_call (fn, 3, mem, val, nb_bytes);
1026   gsi_insert_after (&gsi, fn_call, GSI_CONTINUE_LINKING);
1027 
1028   if (dump_file && (dump_flags & TDF_DETAILS))
1029     {
1030       fprintf (dump_file, "generated memset");
1031       if (bytev == 0)
1032 	fprintf (dump_file, " zero\n");
1033       else
1034 	fprintf (dump_file, "\n");
1035     }
1036 }
1037 
1038 /* Generate a call to memcpy for PARTITION in LOOP.  */
1039 
1040 static void
1041 generate_memcpy_builtin (struct loop *loop, partition *partition)
1042 {
1043   gimple_stmt_iterator gsi;
1044   gimple *fn_call;
1045   tree dest, src, fn, nb_bytes;
1046   enum built_in_function kind;
1047   struct builtin_info *builtin = partition->builtin;
1048 
1049   /* The new statements will be placed before LOOP.  */
1050   gsi = gsi_last_bb (loop_preheader_edge (loop)->src);
1051 
1052   nb_bytes = builtin->size;
1053   nb_bytes = force_gimple_operand_gsi (&gsi, nb_bytes, true, NULL_TREE,
1054 				       false, GSI_CONTINUE_LINKING);
1055   dest = builtin->dst_base;
1056   src = builtin->src_base;
1057   if (partition->kind == PKIND_MEMCPY
1058       || ! ptr_derefs_may_alias_p (dest, src))
1059     kind = BUILT_IN_MEMCPY;
1060   else
1061     kind = BUILT_IN_MEMMOVE;
1062 
1063   dest = force_gimple_operand_gsi (&gsi, dest, true, NULL_TREE,
1064 				   false, GSI_CONTINUE_LINKING);
1065   src = force_gimple_operand_gsi (&gsi, src, true, NULL_TREE,
1066 				  false, GSI_CONTINUE_LINKING);
1067   fn = build_fold_addr_expr (builtin_decl_implicit (kind));
1068   fn_call = gimple_build_call (fn, 3, dest, src, nb_bytes);
1069   gsi_insert_after (&gsi, fn_call, GSI_CONTINUE_LINKING);
1070 
1071   if (dump_file && (dump_flags & TDF_DETAILS))
1072     {
1073       if (kind == BUILT_IN_MEMCPY)
1074 	fprintf (dump_file, "generated memcpy\n");
1075       else
1076 	fprintf (dump_file, "generated memmove\n");
1077     }
1078 }
1079 
1080 /* Remove and destroy the loop LOOP.  */
1081 
1082 static void
1083 destroy_loop (struct loop *loop)
1084 {
1085   unsigned nbbs = loop->num_nodes;
1086   edge exit = single_exit (loop);
1087   basic_block src = loop_preheader_edge (loop)->src, dest = exit->dest;
1088   basic_block *bbs;
1089   unsigned i;
1090 
1091   bbs = get_loop_body_in_dom_order (loop);
1092 
1093   redirect_edge_pred (exit, src);
1094   exit->flags &= ~(EDGE_TRUE_VALUE|EDGE_FALSE_VALUE);
1095   exit->flags |= EDGE_FALLTHRU;
1096   cancel_loop_tree (loop);
1097   rescan_loop_exit (exit, false, true);
1098 
1099   i = nbbs;
1100   do
1101     {
1102       /* We have made sure to not leave any dangling uses of SSA
1103          names defined in the loop.  With the exception of virtuals.
1104 	 Make sure we replace all uses of virtual defs that will remain
1105 	 outside of the loop with the bare symbol as delete_basic_block
1106 	 will release them.  */
1107       --i;
1108       for (gphi_iterator gsi = gsi_start_phis (bbs[i]); !gsi_end_p (gsi);
1109 	   gsi_next (&gsi))
1110 	{
1111 	  gphi *phi = gsi.phi ();
1112 	  if (virtual_operand_p (gimple_phi_result (phi)))
1113 	    mark_virtual_phi_result_for_renaming (phi);
1114 	}
1115       for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]); !gsi_end_p (gsi);
1116 	   gsi_next (&gsi))
1117 	{
1118 	  gimple *stmt = gsi_stmt (gsi);
1119 	  tree vdef = gimple_vdef (stmt);
1120 	  if (vdef && TREE_CODE (vdef) == SSA_NAME)
1121 	    mark_virtual_operand_for_renaming (vdef);
1122 	}
1123       delete_basic_block (bbs[i]);
1124     }
1125   while (i != 0);
1126 
1127   free (bbs);
1128 
1129   set_immediate_dominator (CDI_DOMINATORS, dest,
1130 			   recompute_dominator (CDI_DOMINATORS, dest));
1131 }
1132 
1133 /* Generates code for PARTITION.  Return whether LOOP needs to be destroyed.  */
1134 
1135 static bool
1136 generate_code_for_partition (struct loop *loop,
1137 			     partition *partition, bool copy_p)
1138 {
1139   switch (partition->kind)
1140     {
1141     case PKIND_NORMAL:
1142     case PKIND_PARTIAL_MEMSET:
1143       /* Reductions all have to be in the last partition.  */
1144       gcc_assert (!partition_reduction_p (partition)
1145 		  || !copy_p);
1146       generate_loops_for_partition (loop, partition, copy_p);
1147       return false;
1148 
1149     case PKIND_MEMSET:
1150       generate_memset_builtin (loop, partition);
1151       break;
1152 
1153     case PKIND_MEMCPY:
1154     case PKIND_MEMMOVE:
1155       generate_memcpy_builtin (loop, partition);
1156       break;
1157 
1158     default:
1159       gcc_unreachable ();
1160     }
1161 
1162   /* Common tail for partitions we turn into a call.  If this was the last
1163      partition for which we generate code, we have to destroy the loop.  */
1164   if (!copy_p)
1165     return true;
1166   return false;
1167 }
1168 
/* Return the data dependence relation for data references A and B.  The
   two data references must be in lexicographic order with respect to the
   reduced dependence graph RDG, because cached ddrs are keyed on the
   ordered pair (A, B).  We first try to find the ddr in the global ddr
   hash table; if it doesn't exist, compute the ddr and cache it.  */
1173 
1174 static data_dependence_relation *
1175 get_data_dependence (struct graph *rdg, data_reference_p a, data_reference_p b)
1176 {
1177   struct data_dependence_relation ent, **slot;
1178   struct data_dependence_relation *ddr;
1179 
1180   gcc_assert (DR_IS_WRITE (a) || DR_IS_WRITE (b));
1181   gcc_assert (rdg_vertex_for_stmt (rdg, DR_STMT (a))
1182 	      <= rdg_vertex_for_stmt (rdg, DR_STMT (b)));
1183   ent.a = a;
1184   ent.b = b;
1185   slot = ddrs_table->find_slot (&ent, INSERT);
1186   if (*slot == NULL)
1187     {
1188       ddr = initialize_data_dependence_relation (a, b, loop_nest);
1189       compute_affine_dependence (ddr, loop_nest[0]);
1190       *slot = ddr;
1191     }
1192 
1193   return *slot;
1194 }
1195 
/* In the reduced dependence graph RDG for loop distribution, return true
   if the dependence between references DR1 and DR2 leads to a dependence
   cycle and such a cycle can't be resolved by a runtime alias check.  */
1199 
1200 static bool
1201 data_dep_in_cycle_p (struct graph *rdg,
1202 		     data_reference_p dr1, data_reference_p dr2)
1203 {
1204   struct data_dependence_relation *ddr;
1205 
1206   /* Re-shuffle data-refs to be in topological order.  */
1207   if (rdg_vertex_for_stmt (rdg, DR_STMT (dr1))
1208       > rdg_vertex_for_stmt (rdg, DR_STMT (dr2)))
1209     std::swap (dr1, dr2);
1210 
1211   ddr = get_data_dependence (rdg, dr1, dr2);
1212 
1213   /* In case of no data dependence.  */
1214   if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
1215     return false;
  /* For an unknown data dependence or a known data dependence which can't
     be expressed as a classic distance vector, we check if it can be
     resolved by a runtime alias check.  If yes, we still consider the data
     dependence as not introducing a data dependence cycle.  */
1220   else if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know
1221 	   || DDR_NUM_DIST_VECTS (ddr) == 0)
1222     return !runtime_alias_check_p (ddr, NULL, true);
1223   else if (DDR_NUM_DIST_VECTS (ddr) > 1)
1224     return true;
1225   else if (DDR_REVERSED_P (ddr)
1226 	   || lambda_vector_zerop (DDR_DIST_VECT (ddr, 0), 1))
1227     return false;
1228 
1229   return true;
1230 }
1231 
1232 /* Given reduced dependence graph RDG, PARTITION1 and PARTITION2, update
1233    PARTITION1's type after merging PARTITION2 into PARTITION1.  */
1234 
1235 static void
1236 update_type_for_merge (struct graph *rdg,
1237 		       partition *partition1, partition *partition2)
1238 {
1239   unsigned i, j;
1240   bitmap_iterator bi, bj;
1241   data_reference_p dr1, dr2;
1242 
1243   EXECUTE_IF_SET_IN_BITMAP (partition1->datarefs, 0, i, bi)
1244     {
1245       unsigned start = (partition1 == partition2) ? i + 1 : 0;
1246 
1247       dr1 = datarefs_vec[i];
1248       EXECUTE_IF_SET_IN_BITMAP (partition2->datarefs, start, j, bj)
1249 	{
1250 	  dr2 = datarefs_vec[j];
1251 	  if (DR_IS_READ (dr1) && DR_IS_READ (dr2))
1252 	    continue;
1253 
1254 	  /* Partition can only be executed sequentially if there is any
1255 	     data dependence cycle.  */
1256 	  if (data_dep_in_cycle_p (rdg, dr1, dr2))
1257 	    {
1258 	      partition1->type = PTYPE_SEQUENTIAL;
1259 	      return;
1260 	    }
1261 	}
1262     }
1263 }
1264 
1265 /* Returns a partition with all the statements needed for computing
1266    the vertex V of the RDG, also including the loop exit conditions.  */
1267 
1268 static partition *
1269 build_rdg_partition_for_vertex (struct graph *rdg, int v)
1270 {
1271   partition *partition = partition_alloc ();
1272   auto_vec<int, 3> nodes;
1273   unsigned i, j;
1274   int x;
1275   data_reference_p dr;
1276 
1277   graphds_dfs (rdg, &v, 1, &nodes, false, NULL);
1278 
1279   FOR_EACH_VEC_ELT (nodes, i, x)
1280     {
1281       bitmap_set_bit (partition->stmts, x);
1282 
1283       for (j = 0; RDG_DATAREFS (rdg, x).iterate (j, &dr); ++j)
1284 	{
1285 	  unsigned idx = (unsigned) DR_INDEX (dr);
1286 	  gcc_assert (idx < datarefs_vec.length ());
1287 
1288 	  /* Partition can only be executed sequentially if there is any
1289 	     unknown data reference.  */
1290 	  if (!DR_BASE_ADDRESS (dr) || !DR_OFFSET (dr)
1291 	      || !DR_INIT (dr) || !DR_STEP (dr))
1292 	    partition->type = PTYPE_SEQUENTIAL;
1293 
1294 	  bitmap_set_bit (partition->datarefs, idx);
1295 	}
1296     }
1297 
1298   if (partition->type == PTYPE_SEQUENTIAL)
1299     return partition;
1300 
1301   /* Further check if any data dependence prevents us from executing the
1302      partition parallelly.  */
1303   update_type_for_merge (rdg, partition, partition);
1304 
1305   return partition;
1306 }
1307 
/* Given PARTITION of LOOP and RDG, record the single load/store data
   references for a builtin partition in SRC_DR/DST_DR; return false if
   there are no such data references.  */
1311 
1312 static bool
1313 find_single_drs (struct loop *loop, struct graph *rdg, partition *partition,
1314 		 data_reference_p *dst_dr, data_reference_p *src_dr)
1315 {
1316   unsigned i;
1317   data_reference_p single_ld = NULL, single_st = NULL;
1318   bitmap_iterator bi;
1319 
1320   EXECUTE_IF_SET_IN_BITMAP (partition->stmts, 0, i, bi)
1321     {
1322       gimple *stmt = RDG_STMT (rdg, i);
1323       data_reference_p dr;
1324 
1325       if (gimple_code (stmt) == GIMPLE_PHI)
1326 	continue;
1327 
1328       /* Any scalar stmts are ok.  */
1329       if (!gimple_vuse (stmt))
1330 	continue;
1331 
1332       /* Otherwise just regular loads/stores.  */
1333       if (!gimple_assign_single_p (stmt))
1334 	return false;
1335 
1336       /* But exactly one store and/or load.  */
1337       for (unsigned j = 0; RDG_DATAREFS (rdg, i).iterate (j, &dr); ++j)
1338 	{
1339 	  tree type = TREE_TYPE (DR_REF (dr));
1340 
1341 	  /* The memset, memcpy and memmove library calls are only
1342 	     able to deal with generic address space.  */
1343 	  if (!ADDR_SPACE_GENERIC_P (TYPE_ADDR_SPACE (type)))
1344 	    return false;
1345 
1346 	  if (DR_IS_READ (dr))
1347 	    {
1348 	      if (single_ld != NULL)
1349 		return false;
1350 	      single_ld = dr;
1351 	    }
1352 	  else
1353 	    {
1354 	      if (single_st != NULL)
1355 		return false;
1356 	      single_st = dr;
1357 	    }
1358 	}
1359     }
1360 
1361   if (!single_st)
1362     return false;
1363 
1364   /* Bail out if this is a bitfield memory reference.  */
1365   if (TREE_CODE (DR_REF (single_st)) == COMPONENT_REF
1366       && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (single_st), 1)))
1367     return false;
1368 
  /* The data reference must be executed exactly once per iteration of each
     loop in the loop nest.  We only need to check dominance information
     against the outermost loop in a perfect loop nest because a bb can't
     dominate the outermost loop's latch without dominating the inner
     loop's.  */
1373   basic_block bb_st = gimple_bb (DR_STMT (single_st));
1374   if (!dominated_by_p (CDI_DOMINATORS, loop->latch, bb_st))
1375     return false;
1376 
1377   if (single_ld)
1378     {
1379       gimple *store = DR_STMT (single_st), *load = DR_STMT (single_ld);
1380       /* Direct aggregate copy or via an SSA name temporary.  */
1381       if (load != store
1382 	  && gimple_assign_lhs (load) != gimple_assign_rhs1 (store))
1383 	return false;
1384 
1385       /* Bail out if this is a bitfield memory reference.  */
1386       if (TREE_CODE (DR_REF (single_ld)) == COMPONENT_REF
1387 	  && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (single_ld), 1)))
1388 	return false;
1389 
1390       /* Load and store must be in the same loop nest.  */
1391       basic_block bb_ld = gimple_bb (DR_STMT (single_ld));
1392       if (bb_st->loop_father != bb_ld->loop_father)
1393 	return false;
1394 
1395       /* Data reference must be executed exactly once per iteration.
1396 	 Same as single_st, we only need to check against the outermost
1397 	 loop.  */
1398       if (!dominated_by_p (CDI_DOMINATORS, loop->latch, bb_ld))
1399 	return false;
1400 
1401       edge e = single_exit (bb_st->loop_father);
1402       bool dom_ld = dominated_by_p (CDI_DOMINATORS, e->src, bb_ld);
1403       bool dom_st = dominated_by_p (CDI_DOMINATORS, e->src, bb_st);
1404       if (dom_ld != dom_st)
1405 	return false;
1406     }
1407 
1408   *src_dr = single_ld;
1409   *dst_dr = single_st;
1410   return true;
1411 }
1412 
/* Given data reference DR in LOOP_NEST, this function checks the enclosing
   loops from inner to outer to see if the loop's step equals the access
   size at each level of the loop nest.  Return 2 if we can prove this for
   all loop levels; record the access base and size in BASE and SIZE; save
   the loop's step at each level in STEPS if it is not null.  For example:
1418 
1419      int arr[100][100][100];
1420      for (i = 0; i < 100; i++)       ;steps[2] = 40000
1421        for (j = 100; j > 0; j--)     ;steps[1] = -400
1422 	 for (k = 0; k < 100; k++)   ;steps[0] = 4
1423 	   arr[i][j - 1][k] = 0;     ;base = &arr, size = 4000000
1424 
   Return 1 if we can prove the equality for the innermost loop, but not
   for all loop levels.  In this case, no information is recorded.

   Return 0 if the equality can't be proven at any loop level.  */
1429 
1430 static int
1431 compute_access_range (loop_p loop_nest, data_reference_p dr, tree *base,
1432 		      tree *size, vec<tree> *steps = NULL)
1433 {
1434   location_t loc = gimple_location (DR_STMT (dr));
1435   basic_block bb = gimple_bb (DR_STMT (dr));
1436   struct loop *loop = bb->loop_father;
1437   tree ref = DR_REF (dr);
1438   tree access_base = build_fold_addr_expr (ref);
1439   tree access_size = TYPE_SIZE_UNIT (TREE_TYPE (ref));
1440   int res = 0;
1441 
1442   do {
1443       tree scev_fn = analyze_scalar_evolution (loop, access_base);
1444       if (TREE_CODE (scev_fn) != POLYNOMIAL_CHREC)
1445 	return res;
1446 
1447       access_base = CHREC_LEFT (scev_fn);
1448       if (tree_contains_chrecs (access_base, NULL))
1449 	return res;
1450 
1451       tree scev_step = CHREC_RIGHT (scev_fn);
1452       /* Only support constant steps.  */
1453       if (TREE_CODE (scev_step) != INTEGER_CST)
1454 	return res;
1455 
1456       enum ev_direction access_dir = scev_direction (scev_fn);
1457       if (access_dir == EV_DIR_UNKNOWN)
1458 	return res;
1459 
1460       if (steps != NULL)
1461 	steps->safe_push (scev_step);
1462 
1463       scev_step = fold_convert_loc (loc, sizetype, scev_step);
1464       /* Compute absolute value of scev step.  */
1465       if (access_dir == EV_DIR_DECREASES)
1466 	scev_step = fold_build1_loc (loc, NEGATE_EXPR, sizetype, scev_step);
1467 
      /* At each loop level, the scev step must equal the access size.  In
	 other words, DR must access consecutive memory between loop
	 iterations.  */
1470       if (!operand_equal_p (scev_step, access_size, 0))
1471 	return res;
1472 
1473       /* Access stride can be computed for data reference at least for the
1474 	 innermost loop.  */
1475       res = 1;
1476 
1477       /* Compute DR's execution times in loop.  */
1478       tree niters = number_of_latch_executions (loop);
1479       niters = fold_convert_loc (loc, sizetype, niters);
1480       if (dominated_by_p (CDI_DOMINATORS, single_exit (loop)->src, bb))
1481 	niters = size_binop_loc (loc, PLUS_EXPR, niters, size_one_node);
1482 
1483       /* Compute DR's overall access size in loop.  */
1484       access_size = fold_build2_loc (loc, MULT_EXPR, sizetype,
1485 				     niters, scev_step);
1486       /* Adjust base address in case of negative step.  */
1487       if (access_dir == EV_DIR_DECREASES)
1488 	{
1489 	  tree adj = fold_build2_loc (loc, MINUS_EXPR, sizetype,
1490 				      scev_step, access_size);
1491 	  access_base = fold_build_pointer_plus_loc (loc, access_base, adj);
1492 	}
1493   } while (loop != loop_nest && (loop = loop_outer (loop)) != NULL);
1494 
1495   *base = access_base;
1496   *size = access_size;
  /* The access stride can be computed for the data reference at each loop
     level.  */
1498   return 2;
1499 }
1500 
1501 /* Allocate and return builtin struct.  Record information like DST_DR,
1502    SRC_DR, DST_BASE, SRC_BASE and SIZE in the allocated struct.  */
1503 
1504 static struct builtin_info *
1505 alloc_builtin (data_reference_p dst_dr, data_reference_p src_dr,
1506 	       tree dst_base, tree src_base, tree size)
1507 {
1508   struct builtin_info *builtin = XNEW (struct builtin_info);
1509   builtin->dst_dr = dst_dr;
1510   builtin->src_dr = src_dr;
1511   builtin->dst_base = dst_base;
1512   builtin->src_base = src_base;
1513   builtin->size = size;
1514   return builtin;
1515 }
1516 
/* Given data reference DR in loop nest LOOP, classify whether it forms a
   builtin memset call.  */
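
/* A schematic example of a candidate:

     for (i = 0; i < n; i++)
       a[i] = 0;

   The stored value has all bytes equal and the store walks consecutive
   memory, so the partition can be classified as PKIND_MEMSET, or as
   PKIND_PARTIAL_MEMSET if the access range can only be proven for the
   innermost loop.  */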
1519 
1520 static void
1521 classify_builtin_st (loop_p loop, partition *partition, data_reference_p dr)
1522 {
1523   gimple *stmt = DR_STMT (dr);
1524   tree base, size, rhs = gimple_assign_rhs1 (stmt);
1525 
1526   if (const_with_all_bytes_same (rhs) == -1
1527       && (!INTEGRAL_TYPE_P (TREE_TYPE (rhs))
1528 	  || (TYPE_MODE (TREE_TYPE (rhs))
1529 	      != TYPE_MODE (unsigned_char_type_node))))
1530     return;
1531 
1532   if (TREE_CODE (rhs) == SSA_NAME
1533       && !SSA_NAME_IS_DEFAULT_DEF (rhs)
1534       && flow_bb_inside_loop_p (loop, gimple_bb (SSA_NAME_DEF_STMT (rhs))))
1535     return;
1536 
1537   int res = compute_access_range (loop, dr, &base, &size);
1538   if (res == 0)
1539     return;
1540   if (res == 1)
1541     {
1542       partition->kind = PKIND_PARTIAL_MEMSET;
1543       return;
1544     }
1545 
1546   poly_uint64 base_offset;
1547   unsigned HOST_WIDE_INT const_base_offset;
1548   tree base_base = strip_offset (base, &base_offset);
1549   if (!base_offset.is_constant (&const_base_offset))
1550     return;
1551 
1552   struct builtin_info *builtin;
1553   builtin = alloc_builtin (dr, NULL, base, NULL_TREE, size);
1554   builtin->dst_base_base = base_base;
1555   builtin->dst_base_offset = const_base_offset;
1556   partition->builtin = builtin;
1557   partition->kind = PKIND_MEMSET;
1558 }
1559 
/* Given data references DST_DR and SRC_DR in loop nest LOOP and RDG,
   classify whether they form a builtin memcpy or memmove call.  */
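
/* A schematic example of the two outcomes:

     for (i = 0; i < n; i++)
       a[i] = b[i];

   This is classified as PKIND_MEMCPY if there is no dependence between
   the load and the store.  If the references may overlap but the distance
   vectors show the load never reads a value written by the store in an
   earlier iteration, it is classified as PKIND_MEMMOVE instead.  */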
1562 
1563 static void
1564 classify_builtin_ldst (loop_p loop, struct graph *rdg, partition *partition,
1565 		       data_reference_p dst_dr, data_reference_p src_dr)
1566 {
1567   tree base, size, src_base, src_size;
1568   auto_vec<tree> dst_steps, src_steps;
1569 
1570   /* Compute access range of both load and store.  */
1571   int res = compute_access_range (loop, dst_dr, &base, &size, &dst_steps);
1572   if (res != 2)
1573     return;
1574   res = compute_access_range (loop, src_dr, &src_base, &src_size, &src_steps);
1575   if (res != 2)
1576     return;
1577 
  /* They must have the same access size.  */
1579   if (!operand_equal_p (size, src_size, 0))
1580     return;
1581 
  /* Load and store in the loop nest must access memory in the same way,
     i.e., they must have the same steps in each loop of the nest.  */
1584   if (dst_steps.length () != src_steps.length ())
1585     return;
1586   for (unsigned i = 0; i < dst_steps.length (); ++i)
1587     if (!operand_equal_p (dst_steps[i], src_steps[i], 0))
1588       return;
1589 
  /* Now check whether there is a dependence.  */
1591   ddr_p ddr = get_data_dependence (rdg, src_dr, dst_dr);
1592 
1593   /* Classify as memcpy if no dependence between load and store.  */
1594   if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
1595     {
1596       partition->builtin = alloc_builtin (dst_dr, src_dr, base, src_base, size);
1597       partition->kind = PKIND_MEMCPY;
1598       return;
1599     }
1600 
1601   /* Can't do memmove in case of unknown dependence or dependence without
1602      classical distance vector.  */
1603   if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know
1604       || DDR_NUM_DIST_VECTS (ddr) == 0)
1605     return;
1606 
1607   unsigned i;
1608   lambda_vector dist_v;
1609   int num_lev = (DDR_LOOP_NEST (ddr)).length ();
1610   FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
1611     {
1612       unsigned dep_lev = dependence_level (dist_v, num_lev);
1613       /* Can't do memmove if load depends on store.  */
1614       if (dep_lev > 0 && dist_v[dep_lev - 1] > 0 && !DDR_REVERSED_P (ddr))
1615 	return;
1616     }
1617 
1618   partition->builtin = alloc_builtin (dst_dr, src_dr, base, src_base, size);
1619   partition->kind = PKIND_MEMMOVE;
1620   return;
1621 }
1622 
1623 /* Classifies the builtin kind we can generate for PARTITION of RDG and LOOP.
1624    For the moment we detect memset, memcpy and memmove patterns.  Bitmap
1625    STMT_IN_ALL_PARTITIONS contains statements belonging to all partitions.  */
1626 
1627 static void
1628 classify_partition (loop_p loop, struct graph *rdg, partition *partition,
1629 		    bitmap stmt_in_all_partitions)
1630 {
1631   bitmap_iterator bi;
1632   unsigned i;
1633   data_reference_p single_ld = NULL, single_st = NULL;
1634   bool volatiles_p = false, has_reduction = false;
1635 
1636   EXECUTE_IF_SET_IN_BITMAP (partition->stmts, 0, i, bi)
1637     {
1638       gimple *stmt = RDG_STMT (rdg, i);
1639 
1640       if (gimple_has_volatile_ops (stmt))
1641 	volatiles_p = true;
1642 
      /* If the stmt is not included in all partitions and has uses outside
	 of the loop, then mark the partition as a reduction.  */
1645       if (stmt_has_scalar_dependences_outside_loop (loop, stmt))
1646 	{
	  /* Due to a limitation in the transform phase we have to fuse all
	     reduction partitions.  As a result, this could cancel valid
	     loop distribution, especially for loops whose induction
	     variable is used outside of the loop.  To work around this
	     issue, we skip marking the partition as a reduction if the
	     reduction stmt belongs to all partitions.  In that case, the
	     reduction will be computed correctly no matter how partitions
	     are fused/distributed.  */
1654 	  if (!bitmap_bit_p (stmt_in_all_partitions, i))
1655 	    {
1656 	      partition->reduction_p = true;
1657 	      return;
1658 	    }
1659 	  has_reduction = true;
1660 	}
1661     }
1662 
1663   /* Perform general partition disqualification for builtins.  */
1664   if (volatiles_p
      /* Simple workaround to prevent classifying the partition as a builtin
	 if it contains any uses outside of the loop.  */
1667       || has_reduction
1668       || !flag_tree_loop_distribute_patterns)
1669     return;
1670 
1671   /* Find single load/store data references for builtin partition.  */
1672   if (!find_single_drs (loop, rdg, partition, &single_st, &single_ld))
1673     return;
1674 
1675   /* Classify the builtin kind.  */
1676   if (single_ld == NULL)
1677     classify_builtin_st (loop, partition, single_st);
1678   else
1679     classify_builtin_ldst (loop, rdg, partition, single_st, single_ld);
1680 }
1681 
1682 /* Returns true when PARTITION1 and PARTITION2 access the same memory
1683    object in RDG.  */
1684 
1685 static bool
1686 share_memory_accesses (struct graph *rdg,
1687 		       partition *partition1, partition *partition2)
1688 {
1689   unsigned i, j;
1690   bitmap_iterator bi, bj;
1691   data_reference_p dr1, dr2;
1692 
1693   /* First check whether there are any loads or stores in the
1694      intersection of the two partitions.  Common loads are the most
1695      frequent situation.  */
1696   EXECUTE_IF_AND_IN_BITMAP (partition1->stmts, partition2->stmts, 0, i, bi)
1697     if (RDG_MEM_WRITE_STMT (rdg, i)
1698 	|| RDG_MEM_READS_STMT (rdg, i))
1699       return true;
1700 
1701   /* Then check whether the two partitions access the same memory object.  */
1702   EXECUTE_IF_SET_IN_BITMAP (partition1->datarefs, 0, i, bi)
1703     {
1704       dr1 = datarefs_vec[i];
1705 
1706       if (!DR_BASE_ADDRESS (dr1)
1707 	  || !DR_OFFSET (dr1) || !DR_INIT (dr1) || !DR_STEP (dr1))
1708 	continue;
1709 
1710       EXECUTE_IF_SET_IN_BITMAP (partition2->datarefs, 0, j, bj)
1711 	{
1712 	  dr2 = datarefs_vec[j];
1713 
1714 	  if (!DR_BASE_ADDRESS (dr2)
1715 	      || !DR_OFFSET (dr2) || !DR_INIT (dr2) || !DR_STEP (dr2))
1716 	    continue;
1717 
1718 	  if (operand_equal_p (DR_BASE_ADDRESS (dr1), DR_BASE_ADDRESS (dr2), 0)
1719 	      && operand_equal_p (DR_OFFSET (dr1), DR_OFFSET (dr2), 0)
1720 	      && operand_equal_p (DR_INIT (dr1), DR_INIT (dr2), 0)
1721 	      && operand_equal_p (DR_STEP (dr1), DR_STEP (dr2), 0))
1722 	    return true;
1723 	}
1724     }
1725 
1726   return false;
1727 }
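
/* For instance (an illustrative sketch), two partitions seeded from

     a[i] = b[i] + 1;
     c[i] = a[i] * 2;

   both contain a data reference to "a[i]" with identical base address,
   offset, init and step, so they are reported as sharing a memory access
   and become fusion candidates for the simple locality cost model used
   later in distribute_loop.  */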
1728 
1729 /* For each seed statement in STARTING_STMTS, this function builds a
1730    partition for it by adding dependent statements according to the RDG.
1731    All partitions are recorded in PARTITIONS.  */
1732 
1733 static void
1734 rdg_build_partitions (struct graph *rdg,
1735 		      vec<gimple *> starting_stmts,
1736 		      vec<partition *> *partitions)
1737 {
1738   auto_bitmap processed;
1739   int i;
1740   gimple *stmt;
1741 
1742   FOR_EACH_VEC_ELT (starting_stmts, i, stmt)
1743     {
1744       int v = rdg_vertex_for_stmt (rdg, stmt);
1745 
1746       if (dump_file && (dump_flags & TDF_DETAILS))
1747 	fprintf (dump_file,
1748 		 "ldist asked to generate code for vertex %d\n", v);
1749 
1750       /* If the vertex is already contained in another partition, so
1751          is the partition rooted at it.  */
1752       if (bitmap_bit_p (processed, v))
1753 	continue;
1754 
1755       partition *partition = build_rdg_partition_for_vertex (rdg, v);
1756       bitmap_ior_into (processed, partition->stmts);
1757 
1758       if (dump_file && (dump_flags & TDF_DETAILS))
1759 	{
1760 	  fprintf (dump_file, "ldist creates useful %s partition:\n",
1761 		   partition->type == PTYPE_PARALLEL ? "parallel" : "sequent");
1762 	  bitmap_print (dump_file, partition->stmts, "  ", "\n");
1763 	}
1764 
1765       partitions->safe_push (partition);
1766     }
1767 
1768   /* All vertices should have been assigned to at least one partition now,
1769      other than vertices belonging to dead code.  */
1770 }
1771 
1772 /* Dump to FILE the PARTITIONS.  */
1773 
1774 static void
1775 dump_rdg_partitions (FILE *file, vec<partition *> partitions)
1776 {
1777   int i;
1778   partition *partition;
1779 
1780   FOR_EACH_VEC_ELT (partitions, i, partition)
1781     debug_bitmap_file (file, partition->stmts);
1782 }
1783 
1784 /* Debug PARTITIONS.  */
1785 extern void debug_rdg_partitions (vec<partition *> );
1786 
1787 DEBUG_FUNCTION void
1788 debug_rdg_partitions (vec<partition *> partitions)
1789 {
1790   dump_rdg_partitions (stderr, partitions);
1791 }
1792 
1793 /* Returns the number of read and write operations in the RDG.  */
1794 
1795 static int
1796 number_of_rw_in_rdg (struct graph *rdg)
1797 {
1798   int i, res = 0;
1799 
1800   for (i = 0; i < rdg->n_vertices; i++)
1801     {
1802       if (RDG_MEM_WRITE_STMT (rdg, i))
1803 	++res;
1804 
1805       if (RDG_MEM_READS_STMT (rdg, i))
1806 	++res;
1807     }
1808 
1809   return res;
1810 }
1811 
1812 /* Returns the number of read and write operations in a PARTITION of
1813    the RDG.  */
1814 
1815 static int
1816 number_of_rw_in_partition (struct graph *rdg, partition *partition)
1817 {
1818   int res = 0;
1819   unsigned i;
1820   bitmap_iterator ii;
1821 
1822   EXECUTE_IF_SET_IN_BITMAP (partition->stmts, 0, i, ii)
1823     {
1824       if (RDG_MEM_WRITE_STMT (rdg, i))
1825 	++res;
1826 
1827       if (RDG_MEM_READS_STMT (rdg, i))
1828 	++res;
1829     }
1830 
1831   return res;
1832 }
1833 
1834 /* Returns true when one of the PARTITIONS contains all the read or
1835    write operations of RDG.  */
1836 
1837 static bool
1838 partition_contains_all_rw (struct graph *rdg,
1839 			   vec<partition *> partitions)
1840 {
1841   int i;
1842   partition *partition;
1843   int nrw = number_of_rw_in_rdg (rdg);
1844 
1845   FOR_EACH_VEC_ELT (partitions, i, partition)
1846     if (nrw == number_of_rw_in_partition (rdg, partition))
1847       return true;
1848 
1849   return false;
1850 }
1851 
1852 /* Compute the partition dependence created by the data references in DRS1
1853    and DRS2, modify and return DIR according to that.  If ALIAS_DDRS is
1854    not NULL, we record dependences introduced by possible aliasing between
1855    two data references in ALIAS_DDRS; otherwise, we simply ignore such
1856    dependences as if they didn't exist at all.  */
1857 
1858 static int
1859 pg_add_dependence_edges (struct graph *rdg, int dir,
1860 			 bitmap drs1, bitmap drs2, vec<ddr_p> *alias_ddrs)
1861 {
1862   unsigned i, j;
1863   bitmap_iterator bi, bj;
1864   data_reference_p dr1, dr2, saved_dr1;
1865 
1866   /* dependence direction - 0 is no dependence, -1 is back,
1867      1 is forth, 2 is both (we can stop then, merging will occur).  */
1868   EXECUTE_IF_SET_IN_BITMAP (drs1, 0, i, bi)
1869     {
1870       dr1 = datarefs_vec[i];
1871 
1872       EXECUTE_IF_SET_IN_BITMAP (drs2, 0, j, bj)
1873 	{
1874 	  int res, this_dir = 1;
1875 	  ddr_p ddr;
1876 
1877 	  dr2 = datarefs_vec[j];
1878 
1879 	  /* Skip all <read, read> data dependences.  */
1880 	  if (DR_IS_READ (dr1) && DR_IS_READ (dr2))
1881 	    continue;
1882 
1883 	  saved_dr1 = dr1;
1884 	  /* Re-shuffle data-refs to be in topological order.  */
1885 	  if (rdg_vertex_for_stmt (rdg, DR_STMT (dr1))
1886 	      > rdg_vertex_for_stmt (rdg, DR_STMT (dr2)))
1887 	    {
1888 	      std::swap (dr1, dr2);
1889 	      this_dir = -this_dir;
1890 	    }
1891 	  ddr = get_data_dependence (rdg, dr1, dr2);
1892 	  if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
1893 	    {
1894 	      this_dir = 0;
1895 	      res = data_ref_compare_tree (DR_BASE_ADDRESS (dr1),
1896 					   DR_BASE_ADDRESS (dr2));
1897 	      /* Be conservative.  If the data references are not well analyzed,
1898 		 or the two data references have the same base address and
1899 		 offset, add the dependence and consider them to alias each other.
1900 		 In other words, the dependence cannot be resolved by a
1901 		 runtime alias check.  */
1902 	      if (!DR_BASE_ADDRESS (dr1) || !DR_BASE_ADDRESS (dr2)
1903 		  || !DR_OFFSET (dr1) || !DR_OFFSET (dr2)
1904 		  || !DR_INIT (dr1) || !DR_INIT (dr2)
1905 		  || !DR_STEP (dr1) || !tree_fits_uhwi_p (DR_STEP (dr1))
1906 		  || !DR_STEP (dr2) || !tree_fits_uhwi_p (DR_STEP (dr2))
1907 		  || res == 0)
1908 		this_dir = 2;
1909 	      /* Data dependence could be resolved by runtime alias check,
1910 		 record it in ALIAS_DDRS.  */
1911 	      else if (alias_ddrs != NULL)
1912 		alias_ddrs->safe_push (ddr);
1913 	      /* Or simply ignore it.  */
1914 	    }
1915 	  else if (DDR_ARE_DEPENDENT (ddr) == NULL_TREE)
1916 	    {
1917 	      if (DDR_REVERSED_P (ddr))
1918 		this_dir = -this_dir;
1919 
1920 	      /* Known dependences can still be unordered throughout the
1921 		 iteration space, see gcc.dg/tree-ssa/ldist-16.c.  */
1922 	      if (DDR_NUM_DIST_VECTS (ddr) != 1)
1923 		this_dir = 2;
1924 	      /* If the overlap is exact preserve stmt order.  */
1925 	      else if (lambda_vector_zerop (DDR_DIST_VECT (ddr, 0), 1))
1926 		;
1927 	      /* Else, as the distance vector is lexicographically positive,
1928 		 swap the dependence direction.  */
1929 	      else
1930 		this_dir = -this_dir;
1931 	    }
1932 	  else
1933 	    this_dir = 0;
1934 	  if (this_dir == 2)
1935 	    return 2;
1936 	  else if (dir == 0)
1937 	    dir = this_dir;
1938 	  else if (this_dir != 0 && dir != this_dir)
1939 	    return 2;
1940 	  /* Shuffle "back" dr1.  */
1941 	  dr1 = saved_dr1;
1942 	}
1943     }
1944   return dir;
1945 }
1946 
1947 /* Compare postorder number of the partition graph vertices V1 and V2.  */
1948 
1949 static int
1950 pgcmp (const void *v1_, const void *v2_)
1951 {
1952   const vertex *v1 = (const vertex *)v1_;
1953   const vertex *v2 = (const vertex *)v2_;
1954   return v2->post - v1->post;
1955 }
1956 
1957 /* Data attached to vertices of partition dependence graph.  */
1958 struct pg_vdata
1959 {
1960   /* ID of the corresponding partition.  */
1961   int id;
1962   /* The partition.  */
1963   struct partition *partition;
1964 };
1965 
1966 /* Data attached to edges of partition dependence graph.  */
1967 struct pg_edata
1968 {
1969   /* If the dependence edge can be resolved by runtime alias check,
1970      this vector contains data dependence relations for runtime alias
1971      check.  On the other hand, if the dependence edge is introduced
1972      because of compilation time known data dependence, this vector
1973      contains nothing.  */
1974   vec<ddr_p> alias_ddrs;
1975 };
1976 
1977 /* Callback data for traversing edges in graph.  */
1978 struct pg_edge_callback_data
1979 {
1980   /* Bitmap of strongly connected components that should be merged.  */
1981   bitmap sccs_to_merge;
1982   /* Array containing component information for all vertices.  */
1983   int *vertices_component;
1984   /* Vector to record all data dependence relations which are needed
1985      to break strongly connected components by runtime alias checks.  */
1986   vec<ddr_p> *alias_ddrs;
1987 };
1988 
1989 /* Initialize vertex data for partition dependence graph PG with
1990    PARTITIONS.  */
1991 
1992 static void
1993 init_partition_graph_vertices (struct graph *pg,
1994 			       vec<struct partition *> *partitions)
1995 {
1996   int i;
1997   partition *partition;
1998   struct pg_vdata *data;
1999 
2000   for (i = 0; partitions->iterate (i, &partition); ++i)
2001     {
2002       data = new pg_vdata;
2003       pg->vertices[i].data = data;
2004       data->id = i;
2005       data->partition = partition;
2006     }
2007 }
2008 
2009 /* Add edge <I, J> to partition dependence graph PG.  Attach vector of data
2010    dependence relations to the EDGE if DDRS isn't NULL.  */
2011 
2012 static void
2013 add_partition_graph_edge (struct graph *pg, int i, int j, vec<ddr_p> *ddrs)
2014 {
2015   struct graph_edge *e = add_edge (pg, i, j);
2016 
2017   /* If data dependence relations are attached to the edge, it means this
2018      dependence edge can be resolved by runtime alias checks.  */
2019   if (ddrs != NULL)
2020     {
2021       struct pg_edata *data = new pg_edata;
2022 
2023       gcc_assert (ddrs->length () > 0);
2024       e->data = data;
2025       data->alias_ddrs = vNULL;
2026       data->alias_ddrs.safe_splice (*ddrs);
2027     }
2028 }
2029 
2030 /* Callback function for the graph traversal algorithm.  It returns true
2031    if edge E should be skipped when traversing the graph.  */
2032 
2033 static bool
2034 pg_skip_alias_edge (struct graph_edge *e)
2035 {
2036   struct pg_edata *data = (struct pg_edata *)e->data;
2037   return (data != NULL && data->alias_ddrs.length () > 0);
2038 }
2039 
2040 /* Callback function freeing data attached to edge E of graph.  */
2041 
2042 static void
2043 free_partition_graph_edata_cb (struct graph *, struct graph_edge *e, void *)
2044 {
2045   if (e->data != NULL)
2046     {
2047       struct pg_edata *data = (struct pg_edata *)e->data;
2048       data->alias_ddrs.release ();
2049       delete data;
2050     }
2051 }
2052 
2053 /* Free data attached to vertices of partition dependence graph PG.  */
2054 
2055 static void
2056 free_partition_graph_vdata (struct graph *pg)
2057 {
2058   int i;
2059   struct pg_vdata *data;
2060 
2061   for (i = 0; i < pg->n_vertices; ++i)
2062     {
2063       data = (struct pg_vdata *)pg->vertices[i].data;
2064       delete data;
2065     }
2066 }
2067 
2068 /* Build and return the partition dependence graph for PARTITIONS.  RDG is
2069    the reduced dependence graph for the loop to be distributed.  If
2070    IGNORE_ALIAS_P is true, data dependences caused by possible aliasing
2071    between references are ignored, as if they didn't exist at all;
2072    otherwise all dependences are considered.  */
2073 
2074 static struct graph *
2075 build_partition_graph (struct graph *rdg,
2076 		       vec<struct partition *> *partitions,
2077 		       bool ignore_alias_p)
2078 {
2079   int i, j;
2080   struct partition *partition1, *partition2;
2081   graph *pg = new_graph (partitions->length ());
2082   auto_vec<ddr_p> alias_ddrs, *alias_ddrs_p;
2083 
2084   alias_ddrs_p = ignore_alias_p ? NULL : &alias_ddrs;
2085 
2086   init_partition_graph_vertices (pg, partitions);
2087 
2088   for (i = 0; partitions->iterate (i, &partition1); ++i)
2089     {
2090       for (j = i + 1; partitions->iterate (j, &partition2); ++j)
2091 	{
2092 	  /* dependence direction - 0 is no dependence, -1 is back,
2093 	     1 is forth, 2 is both (we can stop then, merging will occur).  */
2094 	  int dir = 0;
2095 
2096 	  /* If the first partition has a reduction, add a back edge; if the
2097 	     second partition has a reduction, add a forth edge.  This makes
2098 	     sure that the reduction partition is sorted as the last one.  */
2099 	  if (partition_reduction_p (partition1))
2100 	    dir = -1;
2101 	  else if (partition_reduction_p (partition2))
2102 	    dir = 1;
2103 
2104 	  /* Cleanup the temporary vector.  */
2105 	  alias_ddrs.truncate (0);
2106 
2107 	  dir = pg_add_dependence_edges (rdg, dir, partition1->datarefs,
2108 					 partition2->datarefs, alias_ddrs_p);
2109 
2110 	  /* Add an edge to the partition graph if there exists a dependence.
2111 	     There are two types of edges.  One type is caused by compilation
2112 	     time known dependences and cannot be resolved by a runtime
2113 	     alias check.  The other type can be resolved by a runtime alias
2114 	     check.  */
2115 	  if (dir == 1 || dir == 2
2116 	      || alias_ddrs.length () > 0)
2117 	    {
2118 	      /* Attach data dependence relations to edge that can be resolved
2119 		 by runtime alias check.  */
2120 	      bool alias_edge_p = (dir != 1 && dir != 2);
2121 	      add_partition_graph_edge (pg, i, j,
2122 					(alias_edge_p) ? &alias_ddrs : NULL);
2123 	    }
2124 	  if (dir == -1 || dir == 2
2125 	      || alias_ddrs.length () > 0)
2126 	    {
2127 	      /* Attach data dependence relations to edge that can be resolved
2128 		 by runtime alias check.  */
2129 	      bool alias_edge_p = (dir != -1 && dir != 2);
2130 	      add_partition_graph_edge (pg, j, i,
2131 					(alias_edge_p) ? &alias_ddrs : NULL);
2132 	    }
2133 	}
2134     }
2135   return pg;
2136 }
2137 
2138 /* Sort partitions in PG in descending post order and store them in
2139    PARTITIONS.  */
2140 
2141 static void
2142 sort_partitions_by_post_order (struct graph *pg,
2143 			       vec<struct partition *> *partitions)
2144 {
2145   int i;
2146   struct pg_vdata *data;
2147 
2148   /* Now order the remaining nodes in descending postorder.  */
2149   qsort (pg->vertices, pg->n_vertices, sizeof (vertex), pgcmp);
2150   partitions->truncate (0);
2151   for (i = 0; i < pg->n_vertices; ++i)
2152     {
2153       data = (struct pg_vdata *)pg->vertices[i].data;
2154       if (data->partition)
2155 	partitions->safe_push (data->partition);
2156     }
2157 }
2158 
2159 /* Given reduced dependence graph RDG, merge strongly connected components
2160    of PARTITIONS.  If IGNORE_ALIAS_P is true, data dependences caused by
2161    possible aliasing between references are ignored, as if they didn't
2162    exist at all; otherwise all dependences are considered.  */
2163 
2164 static void
2165 merge_dep_scc_partitions (struct graph *rdg,
2166 			  vec<struct partition *> *partitions,
2167 			  bool ignore_alias_p)
2168 {
2169   struct partition *partition1, *partition2;
2170   struct pg_vdata *data;
2171   graph *pg = build_partition_graph (rdg, partitions, ignore_alias_p);
2172   int i, j, num_sccs = graphds_scc (pg, NULL);
2173 
2174   /* A strongly connected component means a dependence cycle, which we
2175      cannot distribute.  So fuse its partitions together.  */
2176   if ((unsigned) num_sccs < partitions->length ())
2177     {
2178       for (i = 0; i < num_sccs; ++i)
2179 	{
2180 	  for (j = 0; partitions->iterate (j, &partition1); ++j)
2181 	    if (pg->vertices[j].component == i)
2182 	      break;
2183 	  for (j = j + 1; partitions->iterate (j, &partition2); ++j)
2184 	    if (pg->vertices[j].component == i)
2185 	      {
2186 		partition_merge_into (NULL, partition1,
2187 				      partition2, FUSE_SAME_SCC);
2188 		partition1->type = PTYPE_SEQUENTIAL;
2189 		(*partitions)[j] = NULL;
2190 		partition_free (partition2);
2191 		data = (struct pg_vdata *)pg->vertices[j].data;
2192 		data->partition = NULL;
2193 	      }
2194 	}
2195     }
2196 
2197   sort_partitions_by_post_order (pg, partitions);
2198   gcc_assert (partitions->length () == (unsigned)num_sccs);
2199   free_partition_graph_vdata (pg);
2200   free_graph (pg);
2201 }
2202 
2203 /* Callback function for traversing edge E in graph G.  DATA is private
2204    callback data.  */
2205 
2206 static void
2207 pg_collect_alias_ddrs (struct graph *g, struct graph_edge *e, void *data)
2208 {
2209   int i, j, component;
2210   struct pg_edge_callback_data *cbdata;
2211   struct pg_edata *edata = (struct pg_edata *) e->data;
2212 
2213   /* If the edge doesn't have attached data dependences, it represents
2214      compilation time known dependences.  This type of dependence cannot
2215      be resolved by a runtime alias check.  */
2216   if (edata == NULL || edata->alias_ddrs.length () == 0)
2217     return;
2218 
2219   cbdata = (struct pg_edge_callback_data *) data;
2220   i = e->src;
2221   j = e->dest;
2222   component = cbdata->vertices_component[i];
2223   /* Vertices are topologically sorted according to compilation time
2224      known dependences, so we can break strongly connected components
2225      by removing edges of the opposite direction, i.e., edges pointing
2226      from a vertex with smaller post number to a vertex with bigger post
2227      number.  */
2228   if (g->vertices[i].post < g->vertices[j].post
2229       /* We only need to remove edges connecting vertices in the same
2230 	 strong connected component to break it.  */
2231       && component == cbdata->vertices_component[j]
2232       /* Check if we want to break the strong connected component or not.  */
2233       && !bitmap_bit_p (cbdata->sccs_to_merge, component))
2234     cbdata->alias_ddrs->safe_splice (edata->alias_ddrs);
2235 }
2236 
2237 /* This is the main function breaking strongly connected components in
2238    PARTITIONS given reduced dependence graph RDG.  Store data dependence
2239    relations for runtime alias checks in ALIAS_DDRS.  */
2240 
2241 static void
2242 break_alias_scc_partitions (struct graph *rdg,
2243 			    vec<struct partition *> *partitions,
2244 			    vec<ddr_p> *alias_ddrs)
2245 {
2246   int i, j, k, num_sccs, num_sccs_no_alias;
2247   /* Build partition dependence graph.  */
2248   graph *pg = build_partition_graph (rdg, partitions, false);
2249 
2250   alias_ddrs->truncate (0);
2251   /* Find strong connected components in the graph, with all dependence edges
2252      considered.  */
2253   num_sccs = graphds_scc (pg, NULL);
2254   /* All SCCs now can be broken by runtime alias checks because SCCs caused by
2255      compilation time known dependences are merged before this function.  */
2256   if ((unsigned) num_sccs < partitions->length ())
2257     {
2258       struct pg_edge_callback_data cbdata;
2259       auto_bitmap sccs_to_merge;
2260       auto_vec<enum partition_type> scc_types;
2261       struct partition *partition, *first;
2262 
2263       /* If all partitions in a SCC have the same type, we can simply merge
2264 	 the SCC.  This loop finds such SCCs and records them in the bitmap.  */
2265       bitmap_set_range (sccs_to_merge, 0, (unsigned) num_sccs);
2266       for (i = 0; i < num_sccs; ++i)
2267 	{
2268 	  for (j = 0; partitions->iterate (j, &first); ++j)
2269 	    if (pg->vertices[j].component == i)
2270 	      break;
2271 	  for (++j; partitions->iterate (j, &partition); ++j)
2272 	    {
2273 	      if (pg->vertices[j].component != i)
2274 		continue;
2275 
2276 	      /* Note we merge partitions of parallel type on purpose, even
2277 		 though the resulting partition is sequential.  The reason is
2278 		 that the vectorizer can do a more accurate runtime alias check
2279 		 in this case.  It also results in a more conservative distribution.  */
2280 	      if (first->type != partition->type)
2281 		{
2282 		  bitmap_clear_bit (sccs_to_merge, i);
2283 		  break;
2284 		}
2285 	    }
2286 	}
2287 
2288       /* Initialize callback data for traversing.  */
2289       cbdata.sccs_to_merge = sccs_to_merge;
2290       cbdata.alias_ddrs = alias_ddrs;
2291       cbdata.vertices_component = XNEWVEC (int, pg->n_vertices);
2292       /* Record the component information which will be corrupted by the
2293 	 next graph SCC finding call.  */
2294       for (i = 0; i < pg->n_vertices; ++i)
2295 	cbdata.vertices_component[i] = pg->vertices[i].component;
2296 
2297       /* Collect data dependences for runtime alias checks to break SCCs.  */
2298       if (bitmap_count_bits (sccs_to_merge) != (unsigned) num_sccs)
2299 	{
2300 	  /* Run SCC finding algorithm again, with alias dependence edges
2301 	     skipped.  This is to topologically sort partitions according to
2302 	     compilation time known dependence.  Note the topological order
2303 	     is stored in the form of pg's post order number.  */
2304 	  num_sccs_no_alias = graphds_scc (pg, NULL, pg_skip_alias_edge);
2305 	  gcc_assert (partitions->length () == (unsigned) num_sccs_no_alias);
2306 	  /* With the topological order, we can construct two subgraphs L and R.
2307 	     L contains edge <x, y> where x < y in terms of post order, while
2308 	     R contains edge <x, y> where x > y.  Edges for compilation time
2309 	     known dependences all fall in R, so we break SCCs by removing all
2310 	     (alias) edges in subgraph L.  */
2311 	  for_each_edge (pg, pg_collect_alias_ddrs, &cbdata);
2312 	}
2313 
2314       /* For each SCC that doesn't need to be broken, merge it.  */
2315       for (i = 0; i < num_sccs; ++i)
2316 	{
2317 	  if (!bitmap_bit_p (sccs_to_merge, i))
2318 	    continue;
2319 
2320 	  for (j = 0; partitions->iterate (j, &first); ++j)
2321 	    if (cbdata.vertices_component[j] == i)
2322 	      break;
2323 	  for (k = j + 1; partitions->iterate (k, &partition); ++k)
2324 	    {
2325 	      struct pg_vdata *data;
2326 
2327 	      if (cbdata.vertices_component[k] != i)
2328 		continue;
2329 
2330 	      /* Update postorder number so that merged reduction partition is
2331 		 sorted after other partitions.  */
2332 	      if (!partition_reduction_p (first)
2333 		  && partition_reduction_p (partition))
2334 		{
2335 		  gcc_assert (pg->vertices[k].post < pg->vertices[j].post);
2336 		  pg->vertices[j].post = pg->vertices[k].post;
2337 		}
2338 	      partition_merge_into (NULL, first, partition, FUSE_SAME_SCC);
2339 	      (*partitions)[k] = NULL;
2340 	      partition_free (partition);
2341 	      data = (struct pg_vdata *)pg->vertices[k].data;
2342 	      gcc_assert (data->id == k);
2343 	      data->partition = NULL;
2344 	      /* The result partition of merged SCC must be sequential.  */
2345 	      first->type = PTYPE_SEQUENTIAL;
2346 	    }
2347 	}
2348     }
2349 
2350   sort_partitions_by_post_order (pg, partitions);
2351   free_partition_graph_vdata (pg);
2352   for_each_edge (pg, free_partition_graph_edata_cb, NULL);
2353   free_graph (pg);
2354 
2355   if (dump_file && (dump_flags & TDF_DETAILS))
2356     {
2357       fprintf (dump_file, "Possible alias data dependence to break:\n");
2358       dump_data_dependence_relations (dump_file, *alias_ddrs);
2359     }
2360 }
2361 
2362 /* Compute and return an expression whose value is the segment length which
2363    will be accessed by DR in NITERS iterations.  */
2364 
2365 static tree
2366 data_ref_segment_size (struct data_reference *dr, tree niters)
2367 {
2368   niters = size_binop (MINUS_EXPR,
2369 		       fold_convert (sizetype, niters),
2370 		       size_one_node);
2371   return size_binop (MULT_EXPR,
2372 		     fold_convert (sizetype, DR_STEP (dr)),
2373 		     fold_convert (sizetype, niters));
2374 }
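
/* For example (illustrative), for an "int" reference with DR_STEP 4 and
   NITERS 100 the segment length is (100 - 1) * 4 = 396 bytes; the size of
   the access in the last iteration is accounted for separately by the
   access size passed to the runtime alias check machinery below.  */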
2375 
2376 /* Return true if LOOP's latch is dominated by statement for data reference
2377    DR.  */
2378 
2379 static inline bool
2380 latch_dominated_by_data_ref (struct loop *loop, data_reference *dr)
2381 {
2382   return dominated_by_p (CDI_DOMINATORS, single_exit (loop)->src,
2383 			 gimple_bb (DR_STMT (dr)));
2384 }
2385 
2386 /* Compute alias check pairs and store them in COMP_ALIAS_PAIRS for LOOP's
2387    data dependence relations ALIAS_DDRS.  */
2388 
2389 static void
2390 compute_alias_check_pairs (struct loop *loop, vec<ddr_p> *alias_ddrs,
2391 			   vec<dr_with_seg_len_pair_t> *comp_alias_pairs)
2392 {
2393   unsigned int i;
2394   unsigned HOST_WIDE_INT factor = 1;
2395   tree niters_plus_one, niters = number_of_latch_executions (loop);
2396 
2397   gcc_assert (niters != NULL_TREE && niters != chrec_dont_know);
2398   niters = fold_convert (sizetype, niters);
2399   niters_plus_one = size_binop (PLUS_EXPR, niters, size_one_node);
2400 
2401   if (dump_file && (dump_flags & TDF_DETAILS))
2402     fprintf (dump_file, "Creating alias check pairs:\n");
2403 
2404   /* Iterate all data dependence relations and compute alias check pairs.  */
2405   for (i = 0; i < alias_ddrs->length (); i++)
2406     {
2407       ddr_p ddr = (*alias_ddrs)[i];
2408       struct data_reference *dr_a = DDR_A (ddr);
2409       struct data_reference *dr_b = DDR_B (ddr);
2410       tree seg_length_a, seg_length_b;
2411       int comp_res = data_ref_compare_tree (DR_BASE_ADDRESS (dr_a),
2412 					    DR_BASE_ADDRESS (dr_b));
2413 
2414       if (comp_res == 0)
2415 	comp_res = data_ref_compare_tree (DR_OFFSET (dr_a), DR_OFFSET (dr_b));
2416       gcc_assert (comp_res != 0);
2417 
2418       if (latch_dominated_by_data_ref (loop, dr_a))
2419 	seg_length_a = data_ref_segment_size (dr_a, niters_plus_one);
2420       else
2421 	seg_length_a = data_ref_segment_size (dr_a, niters);
2422 
2423       if (latch_dominated_by_data_ref (loop, dr_b))
2424 	seg_length_b = data_ref_segment_size (dr_b, niters_plus_one);
2425       else
2426 	seg_length_b = data_ref_segment_size (dr_b, niters);
2427 
2428       unsigned HOST_WIDE_INT access_size_a
2429 	= tree_to_uhwi (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr_a))));
2430       unsigned HOST_WIDE_INT access_size_b
2431 	= tree_to_uhwi (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr_b))));
2432       unsigned int align_a = TYPE_ALIGN_UNIT (TREE_TYPE (DR_REF (dr_a)));
2433       unsigned int align_b = TYPE_ALIGN_UNIT (TREE_TYPE (DR_REF (dr_b)));
2434 
2435       dr_with_seg_len_pair_t dr_with_seg_len_pair
2436 	(dr_with_seg_len (dr_a, seg_length_a, access_size_a, align_a),
2437 	 dr_with_seg_len (dr_b, seg_length_b, access_size_b, align_b));
2438 
2439       /* Canonicalize pairs by sorting the two DR members.  */
2440       if (comp_res > 0)
2441 	std::swap (dr_with_seg_len_pair.first, dr_with_seg_len_pair.second);
2442 
2443       comp_alias_pairs->safe_push (dr_with_seg_len_pair);
2444     }
2445 
2446   if (tree_fits_uhwi_p (niters))
2447     factor = tree_to_uhwi (niters);
2448 
2449   /* Prune alias check pairs.  */
2450   prune_runtime_alias_test_list (comp_alias_pairs, factor);
2451   if (dump_file && (dump_flags & TDF_DETAILS))
2452     fprintf (dump_file,
2453 	     "Improved number of alias checks from %d to %d\n",
2454 	     alias_ddrs->length (), comp_alias_pairs->length ());
2455 }
2456 
2457 /* Given data dependence relations in ALIAS_DDRS, generate runtime alias
2458    checks and version LOOP under condition of these runtime alias checks.  */
2459 
2460 static void
2461 version_loop_by_alias_check (struct loop *loop, vec<ddr_p> *alias_ddrs)
2462 {
2463   profile_probability prob;
2464   basic_block cond_bb;
2465   struct loop *nloop;
2466   tree lhs, arg0, cond_expr = NULL_TREE;
2467   gimple_seq cond_stmts = NULL;
2468   gimple *call_stmt = NULL;
2469   auto_vec<dr_with_seg_len_pair_t> comp_alias_pairs;
2470 
2471   /* Generate code for runtime alias checks if necessary.  */
2472   gcc_assert (alias_ddrs->length () > 0);
2473 
2474   if (dump_file && (dump_flags & TDF_DETAILS))
2475     fprintf (dump_file,
2476 	     "Version loop <%d> with runtime alias check\n", loop->num);
2477 
2478   compute_alias_check_pairs (loop, alias_ddrs, &comp_alias_pairs);
2479   create_runtime_alias_checks (loop, &comp_alias_pairs, &cond_expr);
2480   cond_expr = force_gimple_operand_1 (cond_expr, &cond_stmts,
2481 				      is_gimple_val, NULL_TREE);
2482 
2483   /* Depend on vectorizer to fold IFN_LOOP_DIST_ALIAS.  */
2484   if (flag_tree_loop_vectorize)
2485     {
2486       /* Generate internal function call for loop distribution alias check.  */
2487       call_stmt = gimple_build_call_internal (IFN_LOOP_DIST_ALIAS,
2488 					      2, NULL_TREE, cond_expr);
2489       lhs = make_ssa_name (boolean_type_node);
2490       gimple_call_set_lhs (call_stmt, lhs);
2491     }
2492   else
2493     lhs = cond_expr;
2494 
2495   prob = profile_probability::guessed_always ().apply_scale (9, 10);
2496   initialize_original_copy_tables ();
2497   nloop = loop_version (loop, lhs, &cond_bb, prob, prob.invert (),
2498 			prob, prob.invert (), true);
2499   free_original_copy_tables ();
2500   /* Record the original loop number in newly generated loops.  In case of
2501      distribution, the original loop will be distributed and the new loop
2502      is kept.  */
2503   loop->orig_loop_num = nloop->num;
2504   nloop->orig_loop_num = nloop->num;
2505   nloop->dont_vectorize = true;
2506   nloop->force_vectorize = false;
2507 
2508   if (call_stmt)
2509     {
2510       /* Record new loop's num in IFN_LOOP_DIST_ALIAS because the original
2511 	 loop could be destroyed.  */
2512       arg0 = build_int_cst (integer_type_node, loop->orig_loop_num);
2513       gimple_call_set_arg (call_stmt, 0, arg0);
2514       gimple_seq_add_stmt_without_update (&cond_stmts, call_stmt);
2515     }
2516 
2517   if (cond_stmts)
2518     {
2519       gimple_stmt_iterator cond_gsi = gsi_last_bb (cond_bb);
2520       gsi_insert_seq_before (&cond_gsi, cond_stmts, GSI_SAME_STMT);
2521     }
2522   update_ssa (TODO_update_ssa);
2523 }
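
/* The generated control flow is roughly (an illustrative sketch, with the
   vectorizer enabled):

     _cond = .LOOP_DIST_ALIAS (<new loop num>, <runtime alias checks>);
     if (_cond)
       <LOOP>     <- this version is distributed below
     else
       <NLOOP>    <- copy of the original loop, never vectorized

   where the internal function call is folded by the vectorizer later on.  */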
2524 
2525 /* Return true if loop versioning is needed to distribute PARTITIONS.
2526    ALIAS_DDRS are data dependence relations for runtime alias checks.  */
2527 
2528 static inline bool
2529 version_for_distribution_p (vec<struct partition *> *partitions,
2530 			    vec<ddr_p> *alias_ddrs)
2531 {
2532   /* No need to version loop if we have only one partition.  */
2533   if (partitions->length () == 1)
2534     return false;
2535 
2536   /* Need to version loop if runtime alias check is necessary.  */
2537   return (alias_ddrs->length () > 0);
2538 }
2539 
2540 /* Compare base offset of builtin mem* partitions P1 and P2.  */
2541 
2542 static bool
2543 offset_cmp (struct partition *p1, struct partition *p2)
2544 {
2545   gcc_assert (p1 != NULL && p1->builtin != NULL);
2546   gcc_assert (p2 != NULL && p2->builtin != NULL);
2547   return p1->builtin->dst_base_offset < p2->builtin->dst_base_offset;
2548 }
2549 
2550 /* Fuse adjacent memset builtin PARTITIONS if possible.  This is a special
2551    case optimization transforming below code:
2552 
2553      __builtin_memset (&obj, 0, 100);
2554      _1 = &obj + 100;
2555      __builtin_memset (_1, 0, 200);
2556      _2 = &obj + 300;
2557      __builtin_memset (_2, 0, 100);
2558 
2559    into:
2560 
2561      __builtin_memset (&obj, 0, 400);
2562 
2563    Note we don't have dependence information between different partitions
2564    at this point; as a result, we can't handle nonadjacent memset builtin
2565    partitions since dependences might be broken.  */
2566 
2567 static void
2568 fuse_memset_builtins (vec<struct partition *> *partitions)
2569 {
2570   unsigned i, j;
2571   struct partition *part1, *part2;
2572   tree rhs1, rhs2;
2573 
2574   for (i = 0; partitions->iterate (i, &part1);)
2575     {
2576       if (part1->kind != PKIND_MEMSET)
2577 	{
2578 	  i++;
2579 	  continue;
2580 	}
2581 
2582       /* Find sub-array of memset builtins of the same base.  Index range
2583 	 of the sub-array is [i, j) with "j > i".  */
2584       for (j = i + 1; partitions->iterate (j, &part2); ++j)
2585 	{
2586 	  if (part2->kind != PKIND_MEMSET
2587 	      || !operand_equal_p (part1->builtin->dst_base_base,
2588 				   part2->builtin->dst_base_base, 0))
2589 	    break;
2590 
2591 	  /* Memset calls setting different values can't be merged.  */
2592 	  rhs1 = gimple_assign_rhs1 (DR_STMT (part1->builtin->dst_dr));
2593 	  rhs2 = gimple_assign_rhs1 (DR_STMT (part2->builtin->dst_dr));
2594 	  if (!operand_equal_p (rhs1, rhs2, 0))
2595 	    break;
2596 	}
2597 
2598       /* Stable sort is required in order to avoid breaking dependence.  */
2599       std::stable_sort (&(*partitions)[i],
2600 			&(*partitions)[i] + j - i, offset_cmp);
2601       /* Continue with next partition.  */
2602       i = j;
2603     }
2604 
2605   /* Merge all consecutive memset builtin partitions.  */
2606   for (i = 0; i < partitions->length () - 1;)
2607     {
2608       part1 = (*partitions)[i];
2609       if (part1->kind != PKIND_MEMSET)
2610 	{
2611 	  i++;
2612 	  continue;
2613 	}
2614 
2615       part2 = (*partitions)[i + 1];
2616       /* Only merge memset partitions of the same base and with constant
2617 	 access sizes.  */
2618       if (part2->kind != PKIND_MEMSET
2619 	  || TREE_CODE (part1->builtin->size) != INTEGER_CST
2620 	  || TREE_CODE (part2->builtin->size) != INTEGER_CST
2621 	  || !operand_equal_p (part1->builtin->dst_base_base,
2622 			       part2->builtin->dst_base_base, 0))
2623 	{
2624 	  i++;
2625 	  continue;
2626 	}
2627       rhs1 = gimple_assign_rhs1 (DR_STMT (part1->builtin->dst_dr));
2628       rhs2 = gimple_assign_rhs1 (DR_STMT (part2->builtin->dst_dr));
2629       int bytev1 = const_with_all_bytes_same (rhs1);
2630       int bytev2 = const_with_all_bytes_same (rhs2);
2631       /* Only merge memset partitions of the same value.  */
2632       if (bytev1 != bytev2 || bytev1 == -1)
2633 	{
2634 	  i++;
2635 	  continue;
2636 	}
2637       wide_int end1 = wi::add (part1->builtin->dst_base_offset,
2638 			       wi::to_wide (part1->builtin->size));
2639       /* Only merge adjacent memset partitions.  */
2640       if (wi::ne_p (end1, part2->builtin->dst_base_offset))
2641 	{
2642 	  i++;
2643 	  continue;
2644 	}
2645       /* Merge partitions[i] and partitions[i+1].  */
2646       part1->builtin->size = fold_build2 (PLUS_EXPR, sizetype,
2647 					  part1->builtin->size,
2648 					  part2->builtin->size);
2649       partition_free (part2);
2650       partitions->ordered_remove (i + 1);
2651     }
2652 }
2653 
2654 /* Fuse PARTITIONS of LOOP if necessary before finalizing distribution.
2655    ALIAS_DDRS contains ddrs which need runtime alias check.  */
2656 
2657 static void
2658 finalize_partitions (struct loop *loop, vec<struct partition *> *partitions,
2659 		     vec<ddr_p> *alias_ddrs)
2660 {
2661   unsigned i;
2662   struct partition *partition, *a;
2663 
2664   if (partitions->length () == 1
2665       || alias_ddrs->length () > 0)
2666     return;
2667 
2668   unsigned num_builtin = 0, num_normal = 0, num_partial_memset = 0;
2669   bool same_type_p = true;
2670   enum partition_type type = ((*partitions)[0])->type;
2671   for (i = 0; partitions->iterate (i, &partition); ++i)
2672     {
2673       same_type_p &= (type == partition->type);
2674       if (partition_builtin_p (partition))
2675 	{
2676 	  num_builtin++;
2677 	  continue;
2678 	}
2679       num_normal++;
2680       if (partition->kind == PKIND_PARTIAL_MEMSET)
2681 	num_partial_memset++;
2682     }
2683 
2684   /* Don't distribute the current loop into too many loops given we don't
2685      have a memory stream cost model.  Be even more conservative in the
2686      case of loop nest distribution.  */
2687   if ((same_type_p && num_builtin == 0
2688        && (loop->inner == NULL || num_normal != 2 || num_partial_memset != 1))
2689       || (loop->inner != NULL
2690 	  && i >= NUM_PARTITION_THRESHOLD && num_normal > 1)
2691       || (loop->inner == NULL
2692 	  && i >= NUM_PARTITION_THRESHOLD && num_normal > num_builtin))
2693     {
2694       a = (*partitions)[0];
2695       for (i = 1; partitions->iterate (i, &partition); ++i)
2696 	{
2697 	  partition_merge_into (NULL, a, partition, FUSE_FINALIZE);
2698 	  partition_free (partition);
2699 	}
2700       partitions->truncate (1);
2701     }
2702 
2703   /* Fuse memset builtins if possible.  */
2704   if (partitions->length () > 1)
2705     fuse_memset_builtins (partitions);
2706 }
2707 
2708 /* Distributes the code from LOOP in such a way that producer statements
2709    are placed before consumer statements.  Tries to separate only the
2710    statements from STMTS into separate loops.  Returns the number of
2711    distributed loops.  Set NB_CALLS to number of generated builtin calls.
2712    Set *DESTROY_P to whether LOOP needs to be destroyed.  */
2713 
2714 static int
2715 distribute_loop (struct loop *loop, vec<gimple *> stmts,
2716 		 control_dependences *cd, int *nb_calls, bool *destroy_p)
2717 {
2718   ddrs_table = new hash_table<ddr_hasher> (389);
2719   struct graph *rdg;
2720   partition *partition;
2721   bool any_builtin;
2722   int i, nbp;
2723 
2724   *destroy_p = false;
2725   *nb_calls = 0;
2726   loop_nest.create (0);
2727   if (!find_loop_nest (loop, &loop_nest))
2728     {
2729       loop_nest.release ();
2730       delete ddrs_table;
2731       return 0;
2732     }
2733 
2734   datarefs_vec.create (20);
2735   rdg = build_rdg (loop, cd);
2736   if (!rdg)
2737     {
2738       if (dump_file && (dump_flags & TDF_DETAILS))
2739 	fprintf (dump_file,
2740 		 "Loop %d not distributed: failed to build the RDG.\n",
2741 		 loop->num);
2742 
2743       loop_nest.release ();
2744       free_data_refs (datarefs_vec);
2745       delete ddrs_table;
2746       return 0;
2747     }
2748 
2749   if (datarefs_vec.length () > MAX_DATAREFS_NUM)
2750     {
2751       if (dump_file && (dump_flags & TDF_DETAILS))
2752 	fprintf (dump_file,
2753 		 "Loop %d not distributed: too many memory references.\n",
2754 		 loop->num);
2755 
2756       free_rdg (rdg);
2757       loop_nest.release ();
2758       free_data_refs (datarefs_vec);
2759       delete ddrs_table;
2760       return 0;
2761     }
2762 
2763   data_reference_p dref;
2764   for (i = 0; datarefs_vec.iterate (i, &dref); ++i)
2765     dref->aux = (void *) (uintptr_t) i;
2766 
2767   if (dump_file && (dump_flags & TDF_DETAILS))
2768     dump_rdg (dump_file, rdg);
2769 
2770   auto_vec<struct partition *, 3> partitions;
2771   rdg_build_partitions (rdg, stmts, &partitions);
2772 
2773   auto_vec<ddr_p> alias_ddrs;
2774 
2775   auto_bitmap stmt_in_all_partitions;
2776   bitmap_copy (stmt_in_all_partitions, partitions[0]->stmts);
2777   for (i = 1; partitions.iterate (i, &partition); ++i)
2778     bitmap_and_into (stmt_in_all_partitions, partitions[i]->stmts);
2779 
2780   any_builtin = false;
2781   FOR_EACH_VEC_ELT (partitions, i, partition)
2782     {
2783       classify_partition (loop, rdg, partition, stmt_in_all_partitions);
2784       any_builtin |= partition_builtin_p (partition);
2785     }
2786 
2787   /* If we are only distributing patterns but did not detect any,
2788      simply bail out.  */
2789   if (!flag_tree_loop_distribution
2790       && !any_builtin)
2791     {
2792       nbp = 0;
2793       goto ldist_done;
2794     }
2795 
2796   /* If we are only distributing patterns, fuse all partitions that
2797      were not classified as builtins.  This also avoids chopping
2798      a loop into pieces, separated by builtin calls.  That is, we
2799      only want no or a single loop body remaining.  */
2800   struct partition *into;
2801   if (!flag_tree_loop_distribution)
2802     {
2803       for (i = 0; partitions.iterate (i, &into); ++i)
2804 	if (!partition_builtin_p (into))
2805 	  break;
2806       for (++i; partitions.iterate (i, &partition); ++i)
2807 	if (!partition_builtin_p (partition))
2808 	  {
2809 	    partition_merge_into (NULL, into, partition, FUSE_NON_BUILTIN);
2810 	    partitions.unordered_remove (i);
2811 	    partition_free (partition);
2812 	    i--;
2813 	  }
2814     }
2815 
2816   /* Due to limitations in the transform phase we have to fuse all
2817      reduction partitions into the last partition so the existing
2818      loop will contain all loop-closed PHI nodes.  */
2819   for (i = 0; partitions.iterate (i, &into); ++i)
2820     if (partition_reduction_p (into))
2821       break;
2822   for (i = i + 1; partitions.iterate (i, &partition); ++i)
2823     if (partition_reduction_p (partition))
2824       {
2825 	partition_merge_into (rdg, into, partition, FUSE_REDUCTION);
2826 	partitions.unordered_remove (i);
2827 	partition_free (partition);
2828 	i--;
2829       }
2830 
2831   /* Apply our simple cost model - fuse partitions with similar
2832      memory accesses.  */
2833   for (i = 0; partitions.iterate (i, &into); ++i)
2834     {
2835       bool changed = false;
2836       if (partition_builtin_p (into) || into->kind == PKIND_PARTIAL_MEMSET)
2837 	continue;
2838       for (int j = i + 1;
2839 	   partitions.iterate (j, &partition); ++j)
2840 	{
2841 	  if (share_memory_accesses (rdg, into, partition))
2842 	    {
2843 	      partition_merge_into (rdg, into, partition, FUSE_SHARE_REF);
2844 	      partitions.unordered_remove (j);
2845 	      partition_free (partition);
2846 	      j--;
2847 	      changed = true;
2848 	    }
2849 	}
2850       /* Suppose partitions 0 and 2 have similar accesses, 1 and 2 have
2851          similar accesses, but 0 and 1 do not.  If the first pass fused
2852 	 0 1 2 into 0,2 1, then without retrying we would fail to consider
2853 	 merging 1 into 0,2.  So try again if we did any merging into 0.  */
2854       if (changed)
2855 	i--;
2856     }
2857 
2858   /* Build the partition dependency graph and fuse partitions in strong
2859      connected component.  */
2860   if (partitions.length () > 1)
2861     {
2862       /* Don't support loop nest distribution under runtime alias check
2863 	 since it's not likely to enable many vectorization opportunities.  */
2864       if (loop->inner)
2865 	merge_dep_scc_partitions (rdg, &partitions, false);
2866       else
2867 	{
2868 	  merge_dep_scc_partitions (rdg, &partitions, true);
2869 	  if (partitions.length () > 1)
2870 	    break_alias_scc_partitions (rdg, &partitions, &alias_ddrs);
2871 	}
2872     }
2873 
2874   finalize_partitions (loop, &partitions, &alias_ddrs);
2875 
2876   nbp = partitions.length ();
2877   if (nbp == 0
2878       || (nbp == 1 && !partition_builtin_p (partitions[0]))
2879       || (nbp > 1 && partition_contains_all_rw (rdg, partitions)))
2880     {
2881       nbp = 0;
2882       goto ldist_done;
2883     }
2884 
2885   if (version_for_distribution_p (&partitions, &alias_ddrs))
2886     version_loop_by_alias_check (loop, &alias_ddrs);
2887 
2888   if (dump_file && (dump_flags & TDF_DETAILS))
2889     {
2890       fprintf (dump_file,
2891 	       "distribute loop <%d> into partitions:\n", loop->num);
2892       dump_rdg_partitions (dump_file, partitions);
2893     }
2894 
2895   FOR_EACH_VEC_ELT (partitions, i, partition)
2896     {
2897       if (partition_builtin_p (partition))
2898 	(*nb_calls)++;
2899       *destroy_p |= generate_code_for_partition (loop, partition, i < nbp - 1);
2900     }
2901 
2902  ldist_done:
2903   loop_nest.release ();
2904   free_data_refs (datarefs_vec);
2905   for (hash_table<ddr_hasher>::iterator iter = ddrs_table->begin ();
2906        iter != ddrs_table->end (); ++iter)
2907     {
2908       free_dependence_relation (*iter);
2909       *iter = NULL;
2910     }
2911   delete ddrs_table;
2912 
2913   FOR_EACH_VEC_ELT (partitions, i, partition)
2914     partition_free (partition);
2915 
2916   free_rdg (rdg);
2917   return nbp - *nb_calls;
2918 }
2919 
2920 /* Distribute all loops in the current function.  */
2921 
2922 namespace {
2923 
2924 const pass_data pass_data_loop_distribution =
2925 {
2926   GIMPLE_PASS, /* type */
2927   "ldist", /* name */
2928   OPTGROUP_LOOP, /* optinfo_flags */
2929   TV_TREE_LOOP_DISTRIBUTION, /* tv_id */
2930   ( PROP_cfg | PROP_ssa ), /* properties_required */
2931   0, /* properties_provided */
2932   0, /* properties_destroyed */
2933   0, /* todo_flags_start */
2934   0, /* todo_flags_finish */
2935 };
2936 
2937 class pass_loop_distribution : public gimple_opt_pass
2938 {
2939 public:
2940   pass_loop_distribution (gcc::context *ctxt)
2941     : gimple_opt_pass (pass_data_loop_distribution, ctxt)
2942   {}
2943 
2944   /* opt_pass methods: */
2945   virtual bool gate (function *)
2946     {
2947       return flag_tree_loop_distribution
2948 	|| flag_tree_loop_distribute_patterns;
2949     }
2950 
2951   virtual unsigned int execute (function *);
2952 
2953 }; // class pass_loop_distribution
2954 
2955 
2956 /* Given LOOP, this function records seed statements for distribution in
2957    WORK_LIST.  Return false if there is nothing for distribution.  */
2958 
2959 static bool
2960 find_seed_stmts_for_distribution (struct loop *loop, vec<gimple *> *work_list)
2961 {
2962   basic_block *bbs = get_loop_body_in_dom_order (loop);
2963 
2964   /* Initialize the worklist with stmts we seed the partitions with.  */
2965   for (unsigned i = 0; i < loop->num_nodes; ++i)
2966     {
2967       for (gphi_iterator gsi = gsi_start_phis (bbs[i]);
2968 	   !gsi_end_p (gsi); gsi_next (&gsi))
2969 	{
2970 	  gphi *phi = gsi.phi ();
2971 	  if (virtual_operand_p (gimple_phi_result (phi)))
2972 	    continue;
2973 	  /* Distribute stmts which have defs that are used outside of
2974 	     the loop.  */
2975 	  if (!stmt_has_scalar_dependences_outside_loop (loop, phi))
2976 	    continue;
2977 	  work_list->safe_push (phi);
2978 	}
2979       for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
2980 	   !gsi_end_p (gsi); gsi_next (&gsi))
2981 	{
2982 	  gimple *stmt = gsi_stmt (gsi);
2983 
2984 	  /* If there is a stmt with side-effects bail out - we
2985 	     cannot and should not distribute this loop.  */
2986 	  if (gimple_has_side_effects (stmt))
2987 	    {
2988 	      free (bbs);
2989 	      return false;
2990 	    }
2991 
2992 	  /* Distribute stmts which have defs that are used outside of
2993 	     the loop.  */
2994 	  if (stmt_has_scalar_dependences_outside_loop (loop, stmt))
2995 	    ;
2996 	  /* Otherwise only distribute stores for now.  */
2997 	  else if (!gimple_vdef (stmt))
2998 	    continue;
2999 
3000 	  work_list->safe_push (stmt);
3001 	}
3002     }
3003   free (bbs);
3004   return work_list->length () > 0;
3005 }
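
/* As an illustrative sketch, in a loop like

     for (i = 0; i < n; ++i)
       {
	 a[i] = x;
	 sum = sum + b[i];
       }

   where "sum" is used after the loop, both the store to "a[i]" and the
   statement computing "sum" are pushed into WORK_LIST as seeds.  */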
3006 
3007 /* Given innermost LOOP, return the outermost enclosing loop that forms a
3008    perfect loop nest.  */
3009 
3010 static struct loop *
3011 prepare_perfect_loop_nest (struct loop *loop)
3012 {
3013   struct loop *outer = loop_outer (loop);
3014   tree niters = number_of_latch_executions (loop);
3015 
3016   /* TODO: We only support the innermost 3-level loop nest distribution
3017      because of compilation time issues for now.  This should be relaxed
3018      in the future.  Note we only allow 3-level loop nest distribution
3019      when parallelizing loops.  */
3020   while ((loop->inner == NULL
3021 	  || (loop->inner->inner == NULL && flag_tree_parallelize_loops > 1))
3022 	 && loop_outer (outer)
3023 	 && outer->inner == loop && loop->next == NULL
3024 	 && single_exit (outer)
3025 	 && optimize_loop_for_speed_p (outer)
3026 	 && !chrec_contains_symbols_defined_in_loop (niters, outer->num)
3027 	 && (niters = number_of_latch_executions (outer)) != NULL_TREE
3028 	 && niters != chrec_dont_know)
3029     {
3030       loop = outer;
3031       outer = loop_outer (loop);
3032     }
3033 
3034   return loop;
3035 }
3036 
3037 unsigned int
3038 pass_loop_distribution::execute (function *fun)
3039 {
3040   struct loop *loop;
3041   bool changed = false;
3042   basic_block bb;
3043   control_dependences *cd = NULL;
3044   auto_vec<loop_p> loops_to_be_destroyed;
3045 
3046   if (number_of_loops (fun) <= 1)
3047     return 0;
3048 
3049   /* Compute topological order for basic blocks.  Topological order is
3050      needed because data dependence is computed for data references in
3051      lexicographical order.  */
3052   if (bb_top_order_index == NULL)
3053     {
3054       int rpo_num;
3055       int *rpo = XNEWVEC (int, last_basic_block_for_fn (cfun));
3056 
3057       bb_top_order_index = XNEWVEC (int, last_basic_block_for_fn (cfun));
3058       bb_top_order_index_size = last_basic_block_for_fn (cfun);
3059       rpo_num = pre_and_rev_post_order_compute_fn (cfun, NULL, rpo, true);
3060       for (int i = 0; i < rpo_num; i++)
3061 	bb_top_order_index[rpo[i]] = i;
3062 
3063       free (rpo);
3064     }
3065 
3066   FOR_ALL_BB_FN (bb, fun)
3067     {
3068       gimple_stmt_iterator gsi;
3069       for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
3070 	gimple_set_uid (gsi_stmt (gsi), -1);
3071       for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
3072 	gimple_set_uid (gsi_stmt (gsi), -1);
3073     }
3074 
3075   /* We can at the moment only distribute non-nested loops, thus restrict
3076      walking to innermost loops.  */
3077   FOR_EACH_LOOP (loop, LI_ONLY_INNERMOST)
3078     {
3079       /* Don't distribute loops with multiple exit edges, or cold loops.  */
3080       if (!single_exit (loop)
3081 	  || !optimize_loop_for_speed_p (loop))
3082 	continue;
3083 
3084       /* Don't distribute loop if niters is unknown.  */
3085       tree niters = number_of_latch_executions (loop);
3086       if (niters == NULL_TREE || niters == chrec_dont_know)
3087 	continue;
3088 
3089       /* Get the perfect loop nest for distribution.  */
3090       loop = prepare_perfect_loop_nest (loop);
3091       for (; loop; loop = loop->inner)
3092 	{
3093 	  auto_vec<gimple *> work_list;
3094 	  if (!find_seed_stmts_for_distribution (loop, &work_list))
3095 	    break;
3096 
3097 	  const char *str = loop->inner ? " nest" : "";
3098 	  location_t loc = find_loop_location (loop);
3099 	  if (!cd)
3100 	    {
3101 	      calculate_dominance_info (CDI_DOMINATORS);
3102 	      calculate_dominance_info (CDI_POST_DOMINATORS);
3103 	      cd = new control_dependences ();
3104 	      free_dominance_info (CDI_POST_DOMINATORS);
3105 	    }
3106 
3107 	  bool destroy_p;
3108 	  int nb_generated_loops, nb_generated_calls;
3109 	  nb_generated_loops = distribute_loop (loop, work_list, cd,
3110 						&nb_generated_calls,
3111 						&destroy_p);
3112 	  if (destroy_p)
3113 	    loops_to_be_destroyed.safe_push (loop);
3114 
3115 	  if (nb_generated_loops + nb_generated_calls > 0)
3116 	    {
3117 	      changed = true;
3118 	      dump_printf_loc (MSG_OPTIMIZED_LOCATIONS,
3119 			       loc, "Loop%s %d distributed: split to %d loops "
3120 			       "and %d library calls.\n", str, loop->num,
3121 			       nb_generated_loops, nb_generated_calls);
3122 
3123 	      break;
3124 	    }
3125 
3126 	  if (dump_file && (dump_flags & TDF_DETAILS))
3127 	    fprintf (dump_file, "Loop%s %d not distributed.\n", str, loop->num);
3128 	}
3129     }
3130 
3131   if (cd)
3132     delete cd;
3133 
3134   if (bb_top_order_index != NULL)
3135     {
3136       free (bb_top_order_index);
3137       bb_top_order_index = NULL;
3138       bb_top_order_index_size = 0;
3139     }
3140 
3141   if (changed)
3142     {
3143       /* Destroy loop bodies that could not be reused.  Do this late as we
3144 	 otherwise can end up referring to stale data in control dependences.  */
3145       unsigned i;
3146       FOR_EACH_VEC_ELT (loops_to_be_destroyed, i, loop)
3147 	destroy_loop (loop);
3148 
3149       /* Cached scalar evolutions now may refer to wrong or non-existing
3150 	 loops.  */
3151       scev_reset_htab ();
3152       mark_virtual_operands_for_renaming (fun);
3153       rewrite_into_loop_closed_ssa (NULL, TODO_update_ssa);
3154     }
3155 
3156   checking_verify_loop_structure ();
3157 
3158   return changed ? TODO_cleanup_cfg : 0;
3159 }
3160 
3161 } // anon namespace
3162 
3163 gimple_opt_pass *
3164 make_pass_loop_distribution (gcc::context *ctxt)
3165 {
3166   return new pass_loop_distribution (ctxt);
3167 }
3168