1 /* Loop distribution.
2    Copyright (C) 2006-2018 Free Software Foundation, Inc.
3    Contributed by Georges-Andre Silber <Georges-Andre.Silber@ensmp.fr>
4    and Sebastian Pop <sebastian.pop@amd.com>.
5 
6 This file is part of GCC.
7 
8 GCC is free software; you can redistribute it and/or modify it
9 under the terms of the GNU General Public License as published by the
10 Free Software Foundation; either version 3, or (at your option) any
11 later version.
12 
13 GCC is distributed in the hope that it will be useful, but WITHOUT
14 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
16 for more details.
17 
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3.  If not see
20 <http://www.gnu.org/licenses/>.  */
21 
22 /* This pass performs loop distribution: for example, the loop
23 
24    |DO I = 2, N
25    |    A(I) = B(I) + C
26    |    D(I) = A(I-1)*E
27    |ENDDO
28 
29    is transformed to
30 
31    |DOALL I = 2, N
32    |   A(I) = B(I) + C
33    |ENDDO
34    |
35    |DOALL I = 2, N
36    |   D(I) = A(I-1)*E
37    |ENDDO
38 
39    Loop distribution is the dual of loop fusion.  It separates statements
40    of a loop (or loop nest) into multiple loops (or loop nests) with the
41    same loop header.  The major goal is to separate statements which may
42    be vectorized from those that can't.  This pass implements distribution
43    in the following steps:
44 
45      1) Seed partitions with statements of specific types.  For now we
46 	support two types of seed statements: statements defining a variable
47 	used outside of the loop, and statements storing to memory.
48      2) Build reduced dependence graph (RDG) for loop to be distributed.
49 	The vertices (RDG:V) model all statements in the loop and the edges
50 	(RDG:E) model flow and control dependencies between statements.
51      3) Apart from RDG, compute data dependencies between memory references.
52      4) Starting from a seed statement, build up a partition by adding
53 	dependent statements according to the RDG's dependence information.
54 	A partition is classified as parallel if it can be executed in
55 	parallel, or as sequential if it can't.  A parallel partition is
56 	further classified into different builtin kinds if it can be
57 	implemented as builtin function calls.
58      5) Build partition dependence graph (PG) based on data dependencies.
59 	The vertices (PG:V) model all partitions and the edges (PG:E) model
60 	all data dependencies between every pair of partitions.  In general,
61 	a data dependence is either known or unknown at compilation time.  In
62 	C family languages, quite a few dependencies are unknown at
63 	compilation time because of possible aliasing between data references.
64 	We categorize PG's edges into two types: "true" edges that represent
65 	data dependencies known at compilation time; "alias" edges for all
66 	other data dependencies.
67      6) Traverse the subgraph of PG as if all "alias" edges didn't exist.
68 	Merge partitions in each strongly connected component (SCC)
69 	accordingly.  Build a new PG for the merged partitions.
70      7) Traverse PG again, this time with both "true" and "alias" edges
71 	included.  We try to break SCCs by removing some edges.  Because
72 	SCCs formed by "true" edges were all fused in step 6), we can break
73 	SCCs by removing some "alias" edges.  It's NP-hard to choose the
74 	optimal edge set; fortunately a simple approximation is good enough
75 	for us given the small problem scale.
76      8) Collect all data dependencies of the removed "alias" edges.  Create
77 	runtime alias checks for collected data dependencies.
78      9) Version the loop under the condition of the runtime alias checks.
79 	Given that loop distribution generally introduces additional overhead,
80 	it is only useful if vectorization is achieved in a distributed loop.
81 	We version the loop with the internal function call IFN_LOOP_DIST_ALIAS.
82 	If no distributed loop can be vectorized, we simply remove the
83 	distributed loops and recover the original one.
84 
85    TODO:
86      1) We only distribute innermost two-level loop nests now.  We should
87 	extend it to arbitrary loop nests in the future.
88      2) We only fuse partitions in SCC now.  A better fusion algorithm is
89 	desired to minimize loop overhead, maximize parallelism and maximize
90 	data reuse.  */
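
/* As a rough C-level illustration of the steps above (hypothetical pointers
   a and b and bound n, not taken from any particular testcase), a loop like

   |for (i = 0; i < n; i++)
   |  {
   |    a[i] = 0;
   |    b[i] = b[i] + a[i];
   |  }

   may be split into a builtin partition implementable as a single
   memset (a, 0, n * sizeof (*a)) call and a remaining loop computing b,
   with the distributed version guarded by an IFN_LOOP_DIST_ALIAS runtime
   check when the compiler cannot prove that a and b do not overlap.  */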
91 
92 #include "config.h"
93 #define INCLUDE_ALGORITHM /* stable_sort */
94 #include "system.h"
95 #include "coretypes.h"
96 #include "backend.h"
97 #include "tree.h"
98 #include "gimple.h"
99 #include "cfghooks.h"
100 #include "tree-pass.h"
101 #include "ssa.h"
102 #include "gimple-pretty-print.h"
103 #include "fold-const.h"
104 #include "cfganal.h"
105 #include "gimple-iterator.h"
106 #include "gimplify-me.h"
107 #include "stor-layout.h"
108 #include "tree-cfg.h"
109 #include "tree-ssa-loop-manip.h"
110 #include "tree-ssa-loop-ivopts.h"
111 #include "tree-ssa-loop.h"
112 #include "tree-into-ssa.h"
113 #include "tree-ssa.h"
114 #include "cfgloop.h"
115 #include "tree-scalar-evolution.h"
116 #include "params.h"
117 #include "tree-vectorizer.h"
118 #include "tree-eh.h"
119 
120 
121 #define MAX_DATAREFS_NUM \
122 	((unsigned) PARAM_VALUE (PARAM_LOOP_MAX_DATAREFS_FOR_DATADEPS))
123 
124 /* Threshold controlling number of distributed partitions.  Given it may
125    be unnecessary if a memory stream cost model is invented in the future,
126    we define it as a temporary macro, rather than a parameter.  */
127 #define NUM_PARTITION_THRESHOLD (4)
128 
129 /* Hashtable helpers.  */
130 
131 struct ddr_hasher : nofree_ptr_hash <struct data_dependence_relation>
132 {
133   static inline hashval_t hash (const data_dependence_relation *);
134   static inline bool equal (const data_dependence_relation *,
135 			    const data_dependence_relation *);
136 };
137 
138 /* Hash function for data dependence.  */
139 
140 inline hashval_t
141 ddr_hasher::hash (const data_dependence_relation *ddr)
142 {
143   inchash::hash h;
144   h.add_ptr (DDR_A (ddr));
145   h.add_ptr (DDR_B (ddr));
146   return h.end ();
147 }
148 
149 /* Hash table equality function for data dependence.  */
150 
151 inline bool
152 ddr_hasher::equal (const data_dependence_relation *ddr1,
153 		   const data_dependence_relation *ddr2)
154 {
155   return (DDR_A (ddr1) == DDR_A (ddr2) && DDR_B (ddr1) == DDR_B (ddr2));
156 }
157 
158 /* The loop (nest) to be distributed.  */
159 static vec<loop_p> loop_nest;
160 
161 /* Vector of data references in the loop to be distributed.  */
162 static vec<data_reference_p> datarefs_vec;
163 
164 /* Store index of data reference in aux field.  */
165 #define DR_INDEX(dr)      ((uintptr_t) (dr)->aux)
166 
167 /* Hash table for data dependence relation in the loop to be distributed.  */
168 static hash_table<ddr_hasher> *ddrs_table;
169 
170 /* A Reduced Dependence Graph (RDG) vertex representing a statement.  */
171 struct rdg_vertex
172 {
173   /* The statement represented by this vertex.  */
174   gimple *stmt;
175 
176   /* Vector of data-references in this statement.  */
177   vec<data_reference_p> datarefs;
178 
179   /* True when the statement contains a write to memory.  */
180   bool has_mem_write;
181 
182   /* True when the statement contains a read from memory.  */
183   bool has_mem_reads;
184 };
185 
186 #define RDGV_STMT(V)     ((struct rdg_vertex *) ((V)->data))->stmt
187 #define RDGV_DATAREFS(V) ((struct rdg_vertex *) ((V)->data))->datarefs
188 #define RDGV_HAS_MEM_WRITE(V) ((struct rdg_vertex *) ((V)->data))->has_mem_write
189 #define RDGV_HAS_MEM_READS(V) ((struct rdg_vertex *) ((V)->data))->has_mem_reads
190 #define RDG_STMT(RDG, I) RDGV_STMT (&(RDG->vertices[I]))
191 #define RDG_DATAREFS(RDG, I) RDGV_DATAREFS (&(RDG->vertices[I]))
192 #define RDG_MEM_WRITE_STMT(RDG, I) RDGV_HAS_MEM_WRITE (&(RDG->vertices[I]))
193 #define RDG_MEM_READS_STMT(RDG, I) RDGV_HAS_MEM_READS (&(RDG->vertices[I]))
194 
195 /* Data dependence type.  */
196 
197 enum rdg_dep_type
198 {
199   /* Read After Write (RAW).  */
200   flow_dd = 'f',
201 
202   /* Control dependence (execute conditional on).  */
203   control_dd = 'c'
204 };
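
/* For illustration (hypothetical GIMPLE, not from this file): in

   |x_1 = a[i];
   |if (x_1 != 0)
   |  b[i] = 1;

   the use of x_1 gives a flow_dd edge from the load to the condition,
   while the guarded store gets a control_dd edge from that condition,
   since whether it executes depends on the branch taken.  */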
205 
206 /* Dependence information attached to an edge of the RDG.  */
207 
208 struct rdg_edge
209 {
210   /* Type of the dependence.  */
211   enum rdg_dep_type type;
212 };
213 
214 #define RDGE_TYPE(E)        ((struct rdg_edge *) ((E)->data))->type
215 
216 /* Dump vertex I in RDG to FILE.  */
217 
218 static void
219 dump_rdg_vertex (FILE *file, struct graph *rdg, int i)
220 {
221   struct vertex *v = &(rdg->vertices[i]);
222   struct graph_edge *e;
223 
224   fprintf (file, "(vertex %d: (%s%s) (in:", i,
225 	   RDG_MEM_WRITE_STMT (rdg, i) ? "w" : "",
226 	   RDG_MEM_READS_STMT (rdg, i) ? "r" : "");
227 
228   if (v->pred)
229     for (e = v->pred; e; e = e->pred_next)
230       fprintf (file, " %d", e->src);
231 
232   fprintf (file, ") (out:");
233 
234   if (v->succ)
235     for (e = v->succ; e; e = e->succ_next)
236       fprintf (file, " %d", e->dest);
237 
238   fprintf (file, ")\n");
239   print_gimple_stmt (file, RDGV_STMT (v), 0, TDF_VOPS|TDF_MEMSYMS);
240   fprintf (file, ")\n");
241 }
242 
243 /* Call dump_rdg_vertex on stderr.  */
244 
245 DEBUG_FUNCTION void
246 debug_rdg_vertex (struct graph *rdg, int i)
247 {
248   dump_rdg_vertex (stderr, rdg, i);
249 }
250 
251 /* Dump the reduced dependence graph RDG to FILE.  */
252 
253 static void
254 dump_rdg (FILE *file, struct graph *rdg)
255 {
256   fprintf (file, "(rdg\n");
257   for (int i = 0; i < rdg->n_vertices; i++)
258     dump_rdg_vertex (file, rdg, i);
259   fprintf (file, ")\n");
260 }
261 
262 /* Call dump_rdg on stderr.  */
263 
264 DEBUG_FUNCTION void
265 debug_rdg (struct graph *rdg)
266 {
267   dump_rdg (stderr, rdg);
268 }
269 
270 static void
271 dot_rdg_1 (FILE *file, struct graph *rdg)
272 {
273   int i;
274   pretty_printer buffer;
275   pp_needs_newline (&buffer) = false;
276   buffer.buffer->stream = file;
277 
278   fprintf (file, "digraph RDG {\n");
279 
280   for (i = 0; i < rdg->n_vertices; i++)
281     {
282       struct vertex *v = &(rdg->vertices[i]);
283       struct graph_edge *e;
284 
285       fprintf (file, "%d [label=\"[%d] ", i, i);
286       pp_gimple_stmt_1 (&buffer, RDGV_STMT (v), 0, TDF_SLIM);
287       pp_flush (&buffer);
288       fprintf (file, "\"]\n");
289 
290       /* Highlight reads from memory.  */
291       if (RDG_MEM_READS_STMT (rdg, i))
292        fprintf (file, "%d [style=filled, fillcolor=green]\n", i);
293 
294       /* Highlight stores to memory.  */
295       if (RDG_MEM_WRITE_STMT (rdg, i))
296        fprintf (file, "%d [style=filled, fillcolor=red]\n", i);
297 
298       if (v->succ)
299        for (e = v->succ; e; e = e->succ_next)
300          switch (RDGE_TYPE (e))
301            {
302            case flow_dd:
303              /* These are the most common dependences: don't print a label for them.  */
304              fprintf (file, "%d -> %d \n", i, e->dest);
305              break;
306 
307 	   case control_dd:
308              fprintf (file, "%d -> %d [label=control] \n", i, e->dest);
309              break;
310 
311            default:
312              gcc_unreachable ();
313            }
314     }
315 
316   fprintf (file, "}\n\n");
317 }
318 
319 /* Display the Reduced Dependence Graph using dotty.  */
320 
321 DEBUG_FUNCTION void
322 dot_rdg (struct graph *rdg)
323 {
324   /* When debugging, you may want to enable the following code.  */
325 #ifdef HAVE_POPEN
326   FILE *file = popen ("dot -Tx11", "w");
327   if (!file)
328     return;
329   dot_rdg_1 (file, rdg);
330   fflush (file);
331   close (fileno (file));
332   pclose (file);
333 #else
334   dot_rdg_1 (stderr, rdg);
335 #endif
336 }
337 
338 /* Returns the index of STMT in RDG.  */
339 
340 static int
341 rdg_vertex_for_stmt (struct graph *rdg ATTRIBUTE_UNUSED, gimple *stmt)
342 {
343   int index = gimple_uid (stmt);
344   gcc_checking_assert (index == -1 || RDG_STMT (rdg, index) == stmt);
345   return index;
346 }
347 
348 /* Creates dependence edges in RDG for all the uses of DEF.  IDEF is
349    the index of DEF in RDG.  */
350 
351 static void
352 create_rdg_edges_for_scalar (struct graph *rdg, tree def, int idef)
353 {
354   use_operand_p imm_use_p;
355   imm_use_iterator iterator;
356 
357   FOR_EACH_IMM_USE_FAST (imm_use_p, iterator, def)
358     {
359       struct graph_edge *e;
360       int use = rdg_vertex_for_stmt (rdg, USE_STMT (imm_use_p));
361 
362       if (use < 0)
363 	continue;
364 
365       e = add_edge (rdg, idef, use);
366       e->data = XNEW (struct rdg_edge);
367       RDGE_TYPE (e) = flow_dd;
368     }
369 }
370 
371 /* Creates an edge for the control dependences of BB to the vertex V.  */
372 
373 static void
374 create_edge_for_control_dependence (struct graph *rdg, basic_block bb,
375 				    int v, control_dependences *cd)
376 {
377   bitmap_iterator bi;
378   unsigned edge_n;
379   EXECUTE_IF_SET_IN_BITMAP (cd->get_edges_dependent_on (bb->index),
380 			    0, edge_n, bi)
381     {
382       basic_block cond_bb = cd->get_edge_src (edge_n);
383       gimple *stmt = last_stmt (cond_bb);
384       if (stmt && is_ctrl_stmt (stmt))
385 	{
386 	  struct graph_edge *e;
387 	  int c = rdg_vertex_for_stmt (rdg, stmt);
388 	  if (c < 0)
389 	    continue;
390 
391 	  e = add_edge (rdg, c, v);
392 	  e->data = XNEW (struct rdg_edge);
393 	  RDGE_TYPE (e) = control_dd;
394 	}
395     }
396 }
397 
398 /* Creates the edges of the reduced dependence graph RDG.  */
399 
400 static void
401 create_rdg_flow_edges (struct graph *rdg)
402 {
403   int i;
404   def_operand_p def_p;
405   ssa_op_iter iter;
406 
407   for (i = 0; i < rdg->n_vertices; i++)
408     FOR_EACH_PHI_OR_STMT_DEF (def_p, RDG_STMT (rdg, i),
409 			      iter, SSA_OP_DEF)
410       create_rdg_edges_for_scalar (rdg, DEF_FROM_PTR (def_p), i);
411 }
412 
413 /* Creates control dependence edges in the reduced dependence graph RDG.  */
414 
415 static void
416 create_rdg_cd_edges (struct graph *rdg, control_dependences *cd, loop_p loop)
417 {
418   int i;
419 
420   for (i = 0; i < rdg->n_vertices; i++)
421     {
422       gimple *stmt = RDG_STMT (rdg, i);
423       if (gimple_code (stmt) == GIMPLE_PHI)
424 	{
425 	  edge_iterator ei;
426 	  edge e;
427 	  FOR_EACH_EDGE (e, ei, gimple_bb (stmt)->preds)
428 	    if (flow_bb_inside_loop_p (loop, e->src))
429 	      create_edge_for_control_dependence (rdg, e->src, i, cd);
430 	}
431       else
432 	create_edge_for_control_dependence (rdg, gimple_bb (stmt), i, cd);
433     }
434 }
435 
436 /* Build the vertices of the reduced dependence graph RDG.  Return false
437    if that failed.  */
438 
439 static bool
440 create_rdg_vertices (struct graph *rdg, vec<gimple *> stmts, loop_p loop)
441 {
442   int i;
443   gimple *stmt;
444 
445   FOR_EACH_VEC_ELT (stmts, i, stmt)
446     {
447       struct vertex *v = &(rdg->vertices[i]);
448 
449       /* Record statement to vertex mapping.  */
450       gimple_set_uid (stmt, i);
451 
452       v->data = XNEW (struct rdg_vertex);
453       RDGV_STMT (v) = stmt;
454       RDGV_DATAREFS (v).create (0);
455       RDGV_HAS_MEM_WRITE (v) = false;
456       RDGV_HAS_MEM_READS (v) = false;
457       if (gimple_code (stmt) == GIMPLE_PHI)
458 	continue;
459 
460       unsigned drp = datarefs_vec.length ();
461       if (!find_data_references_in_stmt (loop, stmt, &datarefs_vec))
462 	return false;
463       for (unsigned j = drp; j < datarefs_vec.length (); ++j)
464 	{
465 	  data_reference_p dr = datarefs_vec[j];
466 	  if (DR_IS_READ (dr))
467 	    RDGV_HAS_MEM_READS (v) = true;
468 	  else
469 	    RDGV_HAS_MEM_WRITE (v) = true;
470 	  RDGV_DATAREFS (v).safe_push (dr);
471 	}
472     }
473   return true;
474 }
475 
476 /* Array mapping basic block's index to its topological order.  */
477 static int *bb_top_order_index;
478 /* And size of the array.  */
479 static int bb_top_order_index_size;
480 
481 /* If X has a smaller topological sort number than Y, returns -1;
482    if greater, returns 1.  */
483 
484 static int
485 bb_top_order_cmp (const void *x, const void *y)
486 {
487   basic_block bb1 = *(const basic_block *) x;
488   basic_block bb2 = *(const basic_block *) y;
489 
490   gcc_assert (bb1->index < bb_top_order_index_size
491 	      && bb2->index < bb_top_order_index_size);
492   gcc_assert (bb1 == bb2
493 	      || bb_top_order_index[bb1->index]
494 		 != bb_top_order_index[bb2->index]);
495 
496   return (bb_top_order_index[bb1->index] - bb_top_order_index[bb2->index]);
497 }
498 
499 /* Initialize STMTS with all the statements of LOOP.  We use topological
500    order to discover all statements.  The order is important because
501    generate_loops_for_partition is using the same traversal for identifying
502    statements in loop copies.  */
503 
504 static void
505 stmts_from_loop (struct loop *loop, vec<gimple *> *stmts)
506 {
507   unsigned int i;
508   basic_block *bbs = get_loop_body_in_custom_order (loop, bb_top_order_cmp);
509 
510   for (i = 0; i < loop->num_nodes; i++)
511     {
512       basic_block bb = bbs[i];
513 
514       for (gphi_iterator bsi = gsi_start_phis (bb); !gsi_end_p (bsi);
515 	   gsi_next (&bsi))
516 	if (!virtual_operand_p (gimple_phi_result (bsi.phi ())))
517 	  stmts->safe_push (bsi.phi ());
518 
519       for (gimple_stmt_iterator bsi = gsi_start_bb (bb); !gsi_end_p (bsi);
520 	   gsi_next (&bsi))
521 	{
522 	  gimple *stmt = gsi_stmt (bsi);
523 	  if (gimple_code (stmt) != GIMPLE_LABEL && !is_gimple_debug (stmt))
524 	    stmts->safe_push (stmt);
525 	}
526     }
527 
528   free (bbs);
529 }
530 
531 /* Free the reduced dependence graph RDG.  */
532 
533 static void
534 free_rdg (struct graph *rdg)
535 {
536   int i;
537 
538   for (i = 0; i < rdg->n_vertices; i++)
539     {
540       struct vertex *v = &(rdg->vertices[i]);
541       struct graph_edge *e;
542 
543       for (e = v->succ; e; e = e->succ_next)
544 	free (e->data);
545 
546       if (v->data)
547 	{
548 	  gimple_set_uid (RDGV_STMT (v), -1);
549 	  (RDGV_DATAREFS (v)).release ();
550 	  free (v->data);
551 	}
552     }
553 
554   free_graph (rdg);
555 }
556 
557 /* Build the Reduced Dependence Graph (RDG) with one vertex per statement of
558    LOOP, and one edge per flow dependence or control dependence from control
559    dependence CD.  While visiting each statement, data references are also
560    collected and recorded in the global vector DATAREFS_VEC.  */
561 
562 static struct graph *
563 build_rdg (struct loop *loop, control_dependences *cd)
564 {
565   struct graph *rdg;
566 
567   /* Create the RDG vertices from the stmts of the loop nest.  */
568   auto_vec<gimple *, 10> stmts;
569   stmts_from_loop (loop, &stmts);
570   rdg = new_graph (stmts.length ());
571   if (!create_rdg_vertices (rdg, stmts, loop))
572     {
573       free_rdg (rdg);
574       return NULL;
575     }
576   stmts.release ();
577 
578   create_rdg_flow_edges (rdg);
579   if (cd)
580     create_rdg_cd_edges (rdg, cd, loop);
581 
582   return rdg;
583 }
584 
585 
586 /* Kind of distributed loop.  */
587 enum partition_kind {
588     PKIND_NORMAL,
589     /* Partial memset describes a partition that can be distributed into a
590        loop of memset calls, rather than a single memset call.  It's handled
591        just like a normal partition, i.e., distributed as a separate loop; no
592        memset call is generated.
593 
594        Note: This is a hackish fix trying to distribute a ZERO-ing stmt in a
595        loop nest as deep as possible.  As a result, parloop achieves better
596        parallelization by parallelizing a deeper loop nest.  This hack should
597        become unnecessary and be removed once distributed memset can be
598        understood and analyzed in data reference analysis.  See PR82604 for more.  */
599     PKIND_PARTIAL_MEMSET,
600     PKIND_MEMSET, PKIND_MEMCPY, PKIND_MEMMOVE
601 };
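
/* A minimal sketch of the distinction above (hypothetical array a[M][N]):

   |for (i = 0; i < M; i++)
   |  for (j = 0; j < N; j++)
   |    a[i][j] = 0;

   can be classified as PKIND_MEMSET and emitted as one memset covering the
   whole array when the zeroed memory is provably consecutive across both
   loop levels; if that only holds for the innermost loop (say the inner
   bound were N - 1, leaving a gap per row), the partition is kept as
   PKIND_PARTIAL_MEMSET and merely distributed as a separate loop.  */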
602 
603 /* Type of distributed loop.  */
604 enum partition_type {
605     /* The distributed loop can be executed in parallel.  */
606     PTYPE_PARALLEL = 0,
607     /* The distributed loop has to be executed sequentially.  */
608     PTYPE_SEQUENTIAL
609 };
610 
611 /* Builtin info for loop distribution.  */
612 struct builtin_info
613 {
614   /* data-references a kind != PKIND_NORMAL partition is about.  */
615   data_reference_p dst_dr;
616   data_reference_p src_dr;
617   /* Base address and size of memory objects operated by the builtin.  Note
618      both dest and source memory objects must have the same size.  */
619   tree dst_base;
620   tree src_base;
621   tree size;
622   /* Base and offset part of dst_base after stripping constant offset.  This
623      is only used in memset builtin distribution for now.  */
624   tree dst_base_base;
625   unsigned HOST_WIDE_INT dst_base_offset;
626 };
627 
628 /* Partition for loop distribution.  */
629 struct partition
630 {
631   /* Statements of the partition.  */
632   bitmap stmts;
633   /* True if the partition defines variable which is used outside of loop.  */
634   bool reduction_p;
635   enum partition_kind kind;
636   enum partition_type type;
637   /* Data references in the partition.  */
638   bitmap datarefs;
639   /* Information for a builtin partition.  */
640   struct builtin_info *builtin;
641 };
642 
643 
644 /* Allocate and initialize a partition.  */
645 
646 static partition *
647 partition_alloc (void)
648 {
649   partition *partition = XCNEW (struct partition);
650   partition->stmts = BITMAP_ALLOC (NULL);
651   partition->reduction_p = false;
652   partition->kind = PKIND_NORMAL;
653   partition->datarefs = BITMAP_ALLOC (NULL);
654   return partition;
655 }
656 
657 /* Free PARTITION.  */
658 
659 static void
660 partition_free (partition *partition)
661 {
662   BITMAP_FREE (partition->stmts);
663   BITMAP_FREE (partition->datarefs);
664   if (partition->builtin)
665     free (partition->builtin);
666 
667   free (partition);
668 }
669 
670 /* Returns true if the partition can be generated as a builtin.  */
671 
672 static bool
673 partition_builtin_p (partition *partition)
674 {
675   return partition->kind > PKIND_PARTIAL_MEMSET;
676 }
677 
678 /* Returns true if the partition contains a reduction.  */
679 
680 static bool
681 partition_reduction_p (partition *partition)
682 {
683   return partition->reduction_p;
684 }
685 
686 /* Partitions are fused for different reasons.  */
687 enum fuse_type
688 {
689   FUSE_NON_BUILTIN = 0,
690   FUSE_REDUCTION = 1,
691   FUSE_SHARE_REF = 2,
692   FUSE_SAME_SCC = 3,
693   FUSE_FINALIZE = 4
694 };
695 
696 /* Descriptions of the different fusing reasons.  */
697 static const char *fuse_message[] = {
698   "they are non-builtins",
699   "they have reductions",
700   "they have shared memory refs",
701   "they are in the same dependence scc",
702   "there is no point to distribute loop"};
703 
704 static void
705 update_type_for_merge (struct graph *, partition *, partition *);
706 
707 /* Merge PARTITION into the partition DEST.  RDG is the reduced dependence
708    graph; if it is non-NULL we also update the type of the result partition.  */
709 
710 static void
711 partition_merge_into (struct graph *rdg, partition *dest,
712 		      partition *partition, enum fuse_type ft)
713 {
714   if (dump_file && (dump_flags & TDF_DETAILS))
715     {
716       fprintf (dump_file, "Fuse partitions because %s:\n", fuse_message[ft]);
717       fprintf (dump_file, "  Part 1: ");
718       dump_bitmap (dump_file, dest->stmts);
719       fprintf (dump_file, "  Part 2: ");
720       dump_bitmap (dump_file, partition->stmts);
721     }
722 
723   dest->kind = PKIND_NORMAL;
724   if (dest->type == PTYPE_PARALLEL)
725     dest->type = partition->type;
726 
727   bitmap_ior_into (dest->stmts, partition->stmts);
728   if (partition_reduction_p (partition))
729     dest->reduction_p = true;
730 
731   /* Further check if any data dependence prevents us from executing the
732      new partition in parallel.  */
733   if (dest->type == PTYPE_PARALLEL && rdg != NULL)
734     update_type_for_merge (rdg, dest, partition);
735 
736   bitmap_ior_into (dest->datarefs, partition->datarefs);
737 }
738 
739 
740 /* Returns true when DEF is an SSA_NAME defined in LOOP and used after
741    the LOOP.  */
742 
743 static bool
744 ssa_name_has_uses_outside_loop_p (tree def, loop_p loop)
745 {
746   imm_use_iterator imm_iter;
747   use_operand_p use_p;
748 
749   FOR_EACH_IMM_USE_FAST (use_p, imm_iter, def)
750     {
751       if (is_gimple_debug (USE_STMT (use_p)))
752 	continue;
753 
754       basic_block use_bb = gimple_bb (USE_STMT (use_p));
755       if (!flow_bb_inside_loop_p (loop, use_bb))
756 	return true;
757     }
758 
759   return false;
760 }
761 
762 /* Returns true when STMT defines a scalar variable used after the
763    loop LOOP.  */
764 
765 static bool
766 stmt_has_scalar_dependences_outside_loop (loop_p loop, gimple *stmt)
767 {
768   def_operand_p def_p;
769   ssa_op_iter op_iter;
770 
771   if (gimple_code (stmt) == GIMPLE_PHI)
772     return ssa_name_has_uses_outside_loop_p (gimple_phi_result (stmt), loop);
773 
774   FOR_EACH_SSA_DEF_OPERAND (def_p, stmt, op_iter, SSA_OP_DEF)
775     if (ssa_name_has_uses_outside_loop_p (DEF_FROM_PTR (def_p), loop))
776       return true;
777 
778   return false;
779 }
780 
781 /* Return a copy of LOOP placed before LOOP.  */
782 
783 static struct loop *
784 copy_loop_before (struct loop *loop)
785 {
786   struct loop *res;
787   edge preheader = loop_preheader_edge (loop);
788 
789   initialize_original_copy_tables ();
790   res = slpeel_tree_duplicate_loop_to_edge_cfg (loop, NULL, preheader);
791   gcc_assert (res != NULL);
792   free_original_copy_tables ();
793   delete_update_ssa ();
794 
795   return res;
796 }
797 
798 /* Creates an empty basic block after LOOP.  */
799 
800 static void
801 create_bb_after_loop (struct loop *loop)
802 {
803   edge exit = single_exit (loop);
804 
805   if (!exit)
806     return;
807 
808   split_edge (exit);
809 }
810 
811 /* Generate code for PARTITION from the code in LOOP.  The loop is
812    copied when COPY_P is true.  All the statements not flagged in the
813    PARTITION bitmap are removed from the loop or from its copy.  The
814    statements are indexed in sequence inside a basic block, and the
815    basic blocks of a loop are taken in dom order.  */
816 
817 static void
818 generate_loops_for_partition (struct loop *loop, partition *partition,
819 			      bool copy_p)
820 {
821   unsigned i;
822   basic_block *bbs;
823 
824   if (copy_p)
825     {
826       int orig_loop_num = loop->orig_loop_num;
827       loop = copy_loop_before (loop);
828       gcc_assert (loop != NULL);
829       loop->orig_loop_num = orig_loop_num;
830       create_preheader (loop, CP_SIMPLE_PREHEADERS);
831       create_bb_after_loop (loop);
832     }
833   else
834     {
835       /* Origin number is set to the new versioned loop's num.  */
836       gcc_assert (loop->orig_loop_num != loop->num);
837     }
838 
839   /* Remove stmts not in the PARTITION bitmap.  */
840   bbs = get_loop_body_in_dom_order (loop);
841 
842   if (MAY_HAVE_DEBUG_BIND_STMTS)
843     for (i = 0; i < loop->num_nodes; i++)
844       {
845 	basic_block bb = bbs[i];
846 
847 	for (gphi_iterator bsi = gsi_start_phis (bb); !gsi_end_p (bsi);
848 	     gsi_next (&bsi))
849 	  {
850 	    gphi *phi = bsi.phi ();
851 	    if (!virtual_operand_p (gimple_phi_result (phi))
852 		&& !bitmap_bit_p (partition->stmts, gimple_uid (phi)))
853 	      reset_debug_uses (phi);
854 	  }
855 
856 	for (gimple_stmt_iterator bsi = gsi_start_bb (bb); !gsi_end_p (bsi); gsi_next (&bsi))
857 	  {
858 	    gimple *stmt = gsi_stmt (bsi);
859 	    if (gimple_code (stmt) != GIMPLE_LABEL
860 		&& !is_gimple_debug (stmt)
861 		&& !bitmap_bit_p (partition->stmts, gimple_uid (stmt)))
862 	      reset_debug_uses (stmt);
863 	  }
864       }
865 
866   for (i = 0; i < loop->num_nodes; i++)
867     {
868       basic_block bb = bbs[i];
869       edge inner_exit = NULL;
870 
871       if (loop != bb->loop_father)
872 	inner_exit = single_exit (bb->loop_father);
873 
874       for (gphi_iterator bsi = gsi_start_phis (bb); !gsi_end_p (bsi);)
875 	{
876 	  gphi *phi = bsi.phi ();
877 	  if (!virtual_operand_p (gimple_phi_result (phi))
878 	      && !bitmap_bit_p (partition->stmts, gimple_uid (phi)))
879 	    remove_phi_node (&bsi, true);
880 	  else
881 	    gsi_next (&bsi);
882 	}
883 
884       for (gimple_stmt_iterator bsi = gsi_start_bb (bb); !gsi_end_p (bsi);)
885 	{
886 	  gimple *stmt = gsi_stmt (bsi);
887 	  if (gimple_code (stmt) != GIMPLE_LABEL
888 	      && !is_gimple_debug (stmt)
889 	      && !bitmap_bit_p (partition->stmts, gimple_uid (stmt)))
890 	    {
891 	      /* In distribution of loop nest, if bb is inner loop's exit_bb,
892 		 we choose its exit edge/path in order to avoid generating
893 		 infinite loop.  For all other cases, we choose an arbitrary
894 		 path through the empty CFG part that this unnecessary
895 		 control stmt controls.  */
896 	      if (gcond *cond_stmt = dyn_cast <gcond *> (stmt))
897 		{
898 		  if (inner_exit && inner_exit->flags & EDGE_TRUE_VALUE)
899 		    gimple_cond_make_true (cond_stmt);
900 		  else
901 		    gimple_cond_make_false (cond_stmt);
902 		  update_stmt (stmt);
903 		}
904 	      else if (gimple_code (stmt) == GIMPLE_SWITCH)
905 		{
906 		  gswitch *switch_stmt = as_a <gswitch *> (stmt);
907 		  gimple_switch_set_index
908 		      (switch_stmt, CASE_LOW (gimple_switch_label (switch_stmt, 1)));
909 		  update_stmt (stmt);
910 		}
911 	      else
912 		{
913 		  unlink_stmt_vdef (stmt);
914 		  gsi_remove (&bsi, true);
915 		  release_defs (stmt);
916 		  continue;
917 		}
918 	    }
919 	  gsi_next (&bsi);
920 	}
921     }
922 
923   free (bbs);
924 }
925 
926 /* If VAL memory representation contains the same value in all bytes,
927    return that value, otherwise return -1.
928    E.g. for 0x24242424 return 0x24, for IEEE double
929    747708026454360457216.0 return 0x44, etc.  */
930 
931 static int
932 const_with_all_bytes_same (tree val)
933 {
934   unsigned char buf[64];
935   int i, len;
936 
937   if (integer_zerop (val)
938       || (TREE_CODE (val) == CONSTRUCTOR
939           && !TREE_CLOBBER_P (val)
940           && CONSTRUCTOR_NELTS (val) == 0))
941     return 0;
942 
943   if (real_zerop (val))
944     {
945       /* Only return 0 for +0.0, not for -0.0, which doesn't have
946 	 an all bytes same memory representation.  Don't transform
947 	 -0.0 stores into +0.0 even for !HONOR_SIGNED_ZEROS.  */
948       switch (TREE_CODE (val))
949 	{
950 	case REAL_CST:
951 	  if (!real_isneg (TREE_REAL_CST_PTR (val)))
952 	    return 0;
953 	  break;
954 	case COMPLEX_CST:
955 	  if (!const_with_all_bytes_same (TREE_REALPART (val))
956 	      && !const_with_all_bytes_same (TREE_IMAGPART (val)))
957 	    return 0;
958 	  break;
959 	case VECTOR_CST:
960 	  {
961 	    unsigned int count = vector_cst_encoded_nelts (val);
962 	    unsigned int j;
963 	    for (j = 0; j < count; ++j)
964 	      if (const_with_all_bytes_same (VECTOR_CST_ENCODED_ELT (val, j)))
965 		break;
966 	    if (j == count)
967 	      return 0;
968 	    break;
969 	  }
970 	default:
971 	  break;
972 	}
973     }
974 
975   if (CHAR_BIT != 8 || BITS_PER_UNIT != 8)
976     return -1;
977 
978   len = native_encode_expr (val, buf, sizeof (buf));
979   if (len == 0)
980     return -1;
981   for (i = 1; i < len; i++)
982     if (buf[i] != buf[0])
983       return -1;
984   return buf[0];
985 }
986 
987 /* Generate a call to memset for PARTITION in LOOP.  */
988 
989 static void
990 generate_memset_builtin (struct loop *loop, partition *partition)
991 {
992   gimple_stmt_iterator gsi;
993   tree mem, fn, nb_bytes;
994   tree val;
995   struct builtin_info *builtin = partition->builtin;
996   gimple *fn_call;
997 
998   /* The new statements will be placed before LOOP.  */
999   gsi = gsi_last_bb (loop_preheader_edge (loop)->src);
1000 
1001   nb_bytes = rewrite_to_non_trapping_overflow (builtin->size);
1002   nb_bytes = force_gimple_operand_gsi (&gsi, nb_bytes, true, NULL_TREE,
1003 				       false, GSI_CONTINUE_LINKING);
1004   mem = builtin->dst_base;
1005   mem = force_gimple_operand_gsi (&gsi, mem, true, NULL_TREE,
1006 				  false, GSI_CONTINUE_LINKING);
1007 
1008   /* This exactly matches the pattern recognition in classify_partition.  */
1009   val = gimple_assign_rhs1 (DR_STMT (builtin->dst_dr));
1010   /* Handle constants like 0x15151515 and similarly
1011      floating point constants etc. where all bytes are the same.  */
1012   int bytev = const_with_all_bytes_same (val);
1013   if (bytev != -1)
1014     val = build_int_cst (integer_type_node, bytev);
1015   else if (TREE_CODE (val) == INTEGER_CST)
1016     val = fold_convert (integer_type_node, val);
1017   else if (!useless_type_conversion_p (integer_type_node, TREE_TYPE (val)))
1018     {
1019       tree tem = make_ssa_name (integer_type_node);
1020       gimple *cstmt = gimple_build_assign (tem, NOP_EXPR, val);
1021       gsi_insert_after (&gsi, cstmt, GSI_CONTINUE_LINKING);
1022       val = tem;
1023     }
1024 
1025   fn = build_fold_addr_expr (builtin_decl_implicit (BUILT_IN_MEMSET));
1026   fn_call = gimple_build_call (fn, 3, mem, val, nb_bytes);
1027   gsi_insert_after (&gsi, fn_call, GSI_CONTINUE_LINKING);
1028 
1029   if (dump_file && (dump_flags & TDF_DETAILS))
1030     {
1031       fprintf (dump_file, "generated memset");
1032       if (bytev == 0)
1033 	fprintf (dump_file, " zero\n");
1034       else
1035 	fprintf (dump_file, "\n");
1036     }
1037 }
1038 
1039 /* Generate a call to memcpy for PARTITION in LOOP.  */
1040 
1041 static void
1042 generate_memcpy_builtin (struct loop *loop, partition *partition)
1043 {
1044   gimple_stmt_iterator gsi;
1045   gimple *fn_call;
1046   tree dest, src, fn, nb_bytes;
1047   enum built_in_function kind;
1048   struct builtin_info *builtin = partition->builtin;
1049 
1050   /* The new statements will be placed before LOOP.  */
1051   gsi = gsi_last_bb (loop_preheader_edge (loop)->src);
1052 
1053   nb_bytes = rewrite_to_non_trapping_overflow (builtin->size);
1054   nb_bytes = force_gimple_operand_gsi (&gsi, nb_bytes, true, NULL_TREE,
1055 				       false, GSI_CONTINUE_LINKING);
1056   dest = builtin->dst_base;
1057   src = builtin->src_base;
1058   if (partition->kind == PKIND_MEMCPY
1059       || ! ptr_derefs_may_alias_p (dest, src))
1060     kind = BUILT_IN_MEMCPY;
1061   else
1062     kind = BUILT_IN_MEMMOVE;
1063 
1064   dest = force_gimple_operand_gsi (&gsi, dest, true, NULL_TREE,
1065 				   false, GSI_CONTINUE_LINKING);
1066   src = force_gimple_operand_gsi (&gsi, src, true, NULL_TREE,
1067 				  false, GSI_CONTINUE_LINKING);
1068   fn = build_fold_addr_expr (builtin_decl_implicit (kind));
1069   fn_call = gimple_build_call (fn, 3, dest, src, nb_bytes);
1070   gsi_insert_after (&gsi, fn_call, GSI_CONTINUE_LINKING);
1071 
1072   if (dump_file && (dump_flags & TDF_DETAILS))
1073     {
1074       if (kind == BUILT_IN_MEMCPY)
1075 	fprintf (dump_file, "generated memcpy\n");
1076       else
1077 	fprintf (dump_file, "generated memmove\n");
1078     }
1079 }
1080 
1081 /* Remove and destroy the loop LOOP.  */
1082 
1083 static void
1084 destroy_loop (struct loop *loop)
1085 {
1086   unsigned nbbs = loop->num_nodes;
1087   edge exit = single_exit (loop);
1088   basic_block src = loop_preheader_edge (loop)->src, dest = exit->dest;
1089   basic_block *bbs;
1090   unsigned i;
1091 
1092   bbs = get_loop_body_in_dom_order (loop);
1093 
1094   redirect_edge_pred (exit, src);
1095   exit->flags &= ~(EDGE_TRUE_VALUE|EDGE_FALSE_VALUE);
1096   exit->flags |= EDGE_FALLTHRU;
1097   cancel_loop_tree (loop);
1098   rescan_loop_exit (exit, false, true);
1099 
1100   i = nbbs;
1101   do
1102     {
1103       /* We have made sure not to leave any dangling uses of SSA
1104          names defined in the loop, with the exception of virtuals.
1105 	 Make sure we replace all uses of virtual defs that will remain
1106 	 outside of the loop with the bare symbol as delete_basic_block
1107 	 will release them.  */
1108       --i;
1109       for (gphi_iterator gsi = gsi_start_phis (bbs[i]); !gsi_end_p (gsi);
1110 	   gsi_next (&gsi))
1111 	{
1112 	  gphi *phi = gsi.phi ();
1113 	  if (virtual_operand_p (gimple_phi_result (phi)))
1114 	    mark_virtual_phi_result_for_renaming (phi);
1115 	}
1116       for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]); !gsi_end_p (gsi);
1117 	   gsi_next (&gsi))
1118 	{
1119 	  gimple *stmt = gsi_stmt (gsi);
1120 	  tree vdef = gimple_vdef (stmt);
1121 	  if (vdef && TREE_CODE (vdef) == SSA_NAME)
1122 	    mark_virtual_operand_for_renaming (vdef);
1123 	}
1124       delete_basic_block (bbs[i]);
1125     }
1126   while (i != 0);
1127 
1128   free (bbs);
1129 
1130   set_immediate_dominator (CDI_DOMINATORS, dest,
1131 			   recompute_dominator (CDI_DOMINATORS, dest));
1132 }
1133 
1134 /* Generates code for PARTITION.  Return whether LOOP needs to be destroyed.  */
1135 
1136 static bool
1137 generate_code_for_partition (struct loop *loop,
1138 			     partition *partition, bool copy_p)
1139 {
1140   switch (partition->kind)
1141     {
1142     case PKIND_NORMAL:
1143     case PKIND_PARTIAL_MEMSET:
1144       /* Reductions all have to be in the last partition.  */
1145       gcc_assert (!partition_reduction_p (partition)
1146 		  || !copy_p);
1147       generate_loops_for_partition (loop, partition, copy_p);
1148       return false;
1149 
1150     case PKIND_MEMSET:
1151       generate_memset_builtin (loop, partition);
1152       break;
1153 
1154     case PKIND_MEMCPY:
1155     case PKIND_MEMMOVE:
1156       generate_memcpy_builtin (loop, partition);
1157       break;
1158 
1159     default:
1160       gcc_unreachable ();
1161     }
1162 
1163   /* Common tail for partitions we turn into a call.  If this was the last
1164      partition for which we generate code, we have to destroy the loop.  */
1165   if (!copy_p)
1166     return true;
1167   return false;
1168 }
1169 
1170 /* Return the data dependence relation for data references A and B.  The two
1171    data references must be in lexicographic order with respect to the reduced
1172    dependence graph RDG.  We first try to find the ddr in the global ddr hash
1173    table.  If it doesn't exist, compute the ddr and cache it.  */
1174 
1175 static data_dependence_relation *
1176 get_data_dependence (struct graph *rdg, data_reference_p a, data_reference_p b)
1177 {
1178   struct data_dependence_relation ent, **slot;
1179   struct data_dependence_relation *ddr;
1180 
1181   gcc_assert (DR_IS_WRITE (a) || DR_IS_WRITE (b));
1182   gcc_assert (rdg_vertex_for_stmt (rdg, DR_STMT (a))
1183 	      <= rdg_vertex_for_stmt (rdg, DR_STMT (b)));
1184   ent.a = a;
1185   ent.b = b;
1186   slot = ddrs_table->find_slot (&ent, INSERT);
1187   if (*slot == NULL)
1188     {
1189       ddr = initialize_data_dependence_relation (a, b, loop_nest);
1190       compute_affine_dependence (ddr, loop_nest[0]);
1191       *slot = ddr;
1192     }
1193 
1194   return *slot;
1195 }
1196 
1197 /* In reduced dependence graph RDG for loop distribution, return true if
1198    dependence between references DR1 and DR2 leads to a dependence cycle
1199    and such dependence cycle can't be resolved by runtime alias check.  */
1200 
1201 static bool
1202 data_dep_in_cycle_p (struct graph *rdg,
1203 		     data_reference_p dr1, data_reference_p dr2)
1204 {
1205   struct data_dependence_relation *ddr;
1206 
1207   /* Re-shuffle data-refs to be in topological order.  */
1208   if (rdg_vertex_for_stmt (rdg, DR_STMT (dr1))
1209       > rdg_vertex_for_stmt (rdg, DR_STMT (dr2)))
1210     std::swap (dr1, dr2);
1211 
1212   ddr = get_data_dependence (rdg, dr1, dr2);
1213 
1214   /* In case of no data dependence.  */
1215   if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
1216     return false;
1217   /* For an unknown data dependence or a known data dependence which can't
1218      be expressed as a classic distance vector, we check if it can be
1219      resolved by a runtime alias check.  If yes, we still consider the data
1220      dependence as one that won't introduce a data dependence cycle.  */
1221   else if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know
1222 	   || DDR_NUM_DIST_VECTS (ddr) == 0)
1223     return !runtime_alias_check_p (ddr, NULL, true);
1224   else if (DDR_NUM_DIST_VECTS (ddr) > 1)
1225     return true;
1226   else if (DDR_REVERSED_P (ddr)
1227 	   || lambda_vector_zerop (DDR_DIST_VECT (ddr, 0), 1))
1228     return false;
1229 
1230   return true;
1231 }
1232 
1233 /* Given reduced dependence graph RDG, PARTITION1 and PARTITION2, update
1234    PARTITION1's type after merging PARTITION2 into PARTITION1.  */
1235 
1236 static void
1237 update_type_for_merge (struct graph *rdg,
1238 		       partition *partition1, partition *partition2)
1239 {
1240   unsigned i, j;
1241   bitmap_iterator bi, bj;
1242   data_reference_p dr1, dr2;
1243 
1244   EXECUTE_IF_SET_IN_BITMAP (partition1->datarefs, 0, i, bi)
1245     {
1246       unsigned start = (partition1 == partition2) ? i + 1 : 0;
1247 
1248       dr1 = datarefs_vec[i];
1249       EXECUTE_IF_SET_IN_BITMAP (partition2->datarefs, start, j, bj)
1250 	{
1251 	  dr2 = datarefs_vec[j];
1252 	  if (DR_IS_READ (dr1) && DR_IS_READ (dr2))
1253 	    continue;
1254 
1255 	  /* Partition can only be executed sequentially if there is any
1256 	     data dependence cycle.  */
1257 	  if (data_dep_in_cycle_p (rdg, dr1, dr2))
1258 	    {
1259 	      partition1->type = PTYPE_SEQUENTIAL;
1260 	      return;
1261 	    }
1262 	}
1263     }
1264 }
1265 
1266 /* Returns a partition with all the statements needed for computing
1267    the vertex V of the RDG, also including the loop exit conditions.  */
1268 
1269 static partition *
1270 build_rdg_partition_for_vertex (struct graph *rdg, int v)
1271 {
1272   partition *partition = partition_alloc ();
1273   auto_vec<int, 3> nodes;
1274   unsigned i, j;
1275   int x;
1276   data_reference_p dr;
1277 
1278   graphds_dfs (rdg, &v, 1, &nodes, false, NULL);
1279 
1280   FOR_EACH_VEC_ELT (nodes, i, x)
1281     {
1282       bitmap_set_bit (partition->stmts, x);
1283 
1284       for (j = 0; RDG_DATAREFS (rdg, x).iterate (j, &dr); ++j)
1285 	{
1286 	  unsigned idx = (unsigned) DR_INDEX (dr);
1287 	  gcc_assert (idx < datarefs_vec.length ());
1288 
1289 	  /* Partition can only be executed sequentially if there is any
1290 	     unknown data reference.  */
1291 	  if (!DR_BASE_ADDRESS (dr) || !DR_OFFSET (dr)
1292 	      || !DR_INIT (dr) || !DR_STEP (dr))
1293 	    partition->type = PTYPE_SEQUENTIAL;
1294 
1295 	  bitmap_set_bit (partition->datarefs, idx);
1296 	}
1297     }
1298 
1299   if (partition->type == PTYPE_SEQUENTIAL)
1300     return partition;
1301 
1302   /* Further check if any data dependence prevents us from executing the
1303      partition parallelly.  */
1304   update_type_for_merge (rdg, partition, partition);
1305 
1306   return partition;
1307 }
1308 
1309 /* Given PARTITION of LOOP and RDG, record the single load/store data
1310    references for a builtin partition in SRC_DR/DST_DR; return false if there
1311    are no such data references.  */
1312 
1313 static bool
1314 find_single_drs (struct loop *loop, struct graph *rdg, partition *partition,
1315 		 data_reference_p *dst_dr, data_reference_p *src_dr)
1316 {
1317   unsigned i;
1318   data_reference_p single_ld = NULL, single_st = NULL;
1319   bitmap_iterator bi;
1320 
1321   EXECUTE_IF_SET_IN_BITMAP (partition->stmts, 0, i, bi)
1322     {
1323       gimple *stmt = RDG_STMT (rdg, i);
1324       data_reference_p dr;
1325 
1326       if (gimple_code (stmt) == GIMPLE_PHI)
1327 	continue;
1328 
1329       /* Any scalar stmts are ok.  */
1330       if (!gimple_vuse (stmt))
1331 	continue;
1332 
1333       /* Otherwise just regular loads/stores.  */
1334       if (!gimple_assign_single_p (stmt))
1335 	return false;
1336 
1337       /* But exactly one store and/or load.  */
1338       for (unsigned j = 0; RDG_DATAREFS (rdg, i).iterate (j, &dr); ++j)
1339 	{
1340 	  tree type = TREE_TYPE (DR_REF (dr));
1341 
1342 	  /* The memset, memcpy and memmove library calls are only
1343 	     able to deal with generic address space.  */
1344 	  if (!ADDR_SPACE_GENERIC_P (TYPE_ADDR_SPACE (type)))
1345 	    return false;
1346 
1347 	  if (DR_IS_READ (dr))
1348 	    {
1349 	      if (single_ld != NULL)
1350 		return false;
1351 	      single_ld = dr;
1352 	    }
1353 	  else
1354 	    {
1355 	      if (single_st != NULL)
1356 		return false;
1357 	      single_st = dr;
1358 	    }
1359 	}
1360     }
1361 
1362   if (!single_st)
1363     return false;
1364 
1365   /* Bail out if this is a bitfield memory reference.  */
1366   if (TREE_CODE (DR_REF (single_st)) == COMPONENT_REF
1367       && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (single_st), 1)))
1368     return false;
1369 
1370   /* Data reference must be executed exactly once per iteration of each
1371      loop in the loop nest.  We only need to check dominance information
1372      against the outermost one in a perfect loop nest because a bb can't
1373      dominate outermost loop's latch without dominating inner loop's.  */
1374   basic_block bb_st = gimple_bb (DR_STMT (single_st));
1375   if (!dominated_by_p (CDI_DOMINATORS, loop->latch, bb_st))
1376     return false;
1377 
1378   if (single_ld)
1379     {
1380       gimple *store = DR_STMT (single_st), *load = DR_STMT (single_ld);
1381       /* Direct aggregate copy or via an SSA name temporary.  */
1382       if (load != store
1383 	  && gimple_assign_lhs (load) != gimple_assign_rhs1 (store))
1384 	return false;
1385 
1386       /* Bail out if this is a bitfield memory reference.  */
1387       if (TREE_CODE (DR_REF (single_ld)) == COMPONENT_REF
1388 	  && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (single_ld), 1)))
1389 	return false;
1390 
1391       /* Load and store must be in the same loop nest.  */
1392       basic_block bb_ld = gimple_bb (DR_STMT (single_ld));
1393       if (bb_st->loop_father != bb_ld->loop_father)
1394 	return false;
1395 
1396       /* Data reference must be executed exactly once per iteration.
1397 	 Same as single_st, we only need to check against the outermost
1398 	 loop.  */
1399       if (!dominated_by_p (CDI_DOMINATORS, loop->latch, bb_ld))
1400 	return false;
1401 
1402       edge e = single_exit (bb_st->loop_father);
1403       bool dom_ld = dominated_by_p (CDI_DOMINATORS, e->src, bb_ld);
1404       bool dom_st = dominated_by_p (CDI_DOMINATORS, e->src, bb_st);
1405       if (dom_ld != dom_st)
1406 	return false;
1407     }
1408 
1409   *src_dr = single_ld;
1410   *dst_dr = single_st;
1411   return true;
1412 }
1413 
1414 /* Given data reference DR in LOOP_NEST, this function checks the enclosing
1415    loops from inner to outer to see if the loop's step equals the access size
1416    at each level of the loop nest.  Return 2 if we can prove this at all loop
1417    levels; record the access base and size in BASE and SIZE; save the loop's
1418    step at each level in STEPS if it is not null.  For example:
1419 
1420      int arr[100][100][100];
1421      for (i = 0; i < 100; i++)       ;steps[2] = 40000
1422        for (j = 100; j > 0; j--)     ;steps[1] = -400
1423 	 for (k = 0; k < 100; k++)   ;steps[0] = 4
1424 	   arr[i][j - 1][k] = 0;     ;base = &arr, size = 4000000
1425 
1426    Return 1 if we can prove the equality for the innermost loop, but not for
1427    all loop levels.  In this case, no information is recorded.
1428 
1429    Return 0 if the equality can't be proven at any loop level.  */
1430 
1431 static int
1432 compute_access_range (loop_p loop_nest, data_reference_p dr, tree *base,
1433 		      tree *size, vec<tree> *steps = NULL)
1434 {
1435   location_t loc = gimple_location (DR_STMT (dr));
1436   basic_block bb = gimple_bb (DR_STMT (dr));
1437   struct loop *loop = bb->loop_father;
1438   tree ref = DR_REF (dr);
1439   tree access_base = build_fold_addr_expr (ref);
1440   tree access_size = TYPE_SIZE_UNIT (TREE_TYPE (ref));
1441   int res = 0;
1442 
1443   do {
1444       tree scev_fn = analyze_scalar_evolution (loop, access_base);
1445       if (TREE_CODE (scev_fn) != POLYNOMIAL_CHREC)
1446 	return res;
1447 
1448       access_base = CHREC_LEFT (scev_fn);
1449       if (tree_contains_chrecs (access_base, NULL))
1450 	return res;
1451 
1452       tree scev_step = CHREC_RIGHT (scev_fn);
1453       /* Only support constant steps.  */
1454       if (TREE_CODE (scev_step) != INTEGER_CST)
1455 	return res;
1456 
1457       enum ev_direction access_dir = scev_direction (scev_fn);
1458       if (access_dir == EV_DIR_UNKNOWN)
1459 	return res;
1460 
1461       if (steps != NULL)
1462 	steps->safe_push (scev_step);
1463 
1464       scev_step = fold_convert_loc (loc, sizetype, scev_step);
1465       /* Compute absolute value of scev step.  */
1466       if (access_dir == EV_DIR_DECREASES)
1467 	scev_step = fold_build1_loc (loc, NEGATE_EXPR, sizetype, scev_step);
1468 
1469       /* At each loop level, the scev step must equal the access size.  In other
1470 	 words, DR must access consecutive memory between loop iterations.  */
1471       if (!operand_equal_p (scev_step, access_size, 0))
1472 	return res;
1473 
1474       /* Access stride can be computed for data reference at least for the
1475 	 innermost loop.  */
1476       res = 1;
1477 
1478       /* Compute DR's execution times in loop.  */
1479       tree niters = number_of_latch_executions (loop);
1480       niters = fold_convert_loc (loc, sizetype, niters);
1481       if (dominated_by_p (CDI_DOMINATORS, single_exit (loop)->src, bb))
1482 	niters = size_binop_loc (loc, PLUS_EXPR, niters, size_one_node);
1483 
1484       /* Compute DR's overall access size in loop.  */
1485       access_size = fold_build2_loc (loc, MULT_EXPR, sizetype,
1486 				     niters, scev_step);
1487       /* Adjust base address in case of negative step.  */
1488       if (access_dir == EV_DIR_DECREASES)
1489 	{
1490 	  tree adj = fold_build2_loc (loc, MINUS_EXPR, sizetype,
1491 				      scev_step, access_size);
1492 	  access_base = fold_build_pointer_plus_loc (loc, access_base, adj);
1493 	}
1494   } while (loop != loop_nest && (loop = loop_outer (loop)) != NULL);
1495 
1496   *base = access_base;
1497   *size = access_size;
1498   /* The access stride can be computed for the data reference at each loop level.  */
1499   return 2;
1500 }
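
/* To illustrate the difference between the return values, reusing the
   hypothetical arr[100][100][100] example above: if the innermost loop
   instead ran "for (k = 0; k < 99; k++)", the level-0 access range would
   be 396 bytes while the level-1 step stays 400 bytes, so the equality
   only holds for the innermost loop and the function would return 1.  */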
1501 
1502 /* Allocate and return builtin struct.  Record information like DST_DR,
1503    SRC_DR, DST_BASE, SRC_BASE and SIZE in the allocated struct.  */
1504 
1505 static struct builtin_info *
1506 alloc_builtin (data_reference_p dst_dr, data_reference_p src_dr,
1507 	       tree dst_base, tree src_base, tree size)
1508 {
1509   struct builtin_info *builtin = XNEW (struct builtin_info);
1510   builtin->dst_dr = dst_dr;
1511   builtin->src_dr = src_dr;
1512   builtin->dst_base = dst_base;
1513   builtin->src_base = src_base;
1514   builtin->size = size;
1515   return builtin;
1516 }
1517 
1518 /* Given data reference DR in loop nest LOOP, classify if it forms builtin
1519    memset call.  */
1520 
1521 static void
1522 classify_builtin_st (loop_p loop, partition *partition, data_reference_p dr)
1523 {
1524   gimple *stmt = DR_STMT (dr);
1525   tree base, size, rhs = gimple_assign_rhs1 (stmt);
1526 
1527   if (const_with_all_bytes_same (rhs) == -1
1528       && (!INTEGRAL_TYPE_P (TREE_TYPE (rhs))
1529 	  || (TYPE_MODE (TREE_TYPE (rhs))
1530 	      != TYPE_MODE (unsigned_char_type_node))))
1531     return;
1532 
1533   if (TREE_CODE (rhs) == SSA_NAME
1534       && !SSA_NAME_IS_DEFAULT_DEF (rhs)
1535       && flow_bb_inside_loop_p (loop, gimple_bb (SSA_NAME_DEF_STMT (rhs))))
1536     return;
1537 
1538   int res = compute_access_range (loop, dr, &base, &size);
1539   if (res == 0)
1540     return;
1541   if (res == 1)
1542     {
1543       partition->kind = PKIND_PARTIAL_MEMSET;
1544       return;
1545     }
1546 
1547   poly_uint64 base_offset;
1548   unsigned HOST_WIDE_INT const_base_offset;
1549   tree base_base = strip_offset (base, &base_offset);
1550   if (!base_offset.is_constant (&const_base_offset))
1551     return;
1552 
1553   struct builtin_info *builtin;
1554   builtin = alloc_builtin (dr, NULL, base, NULL_TREE, size);
1555   builtin->dst_base_base = base_base;
1556   builtin->dst_base_offset = const_base_offset;
1557   partition->builtin = builtin;
1558   partition->kind = PKIND_MEMSET;
1559 }
1560 
1561 /* Given data references DST_DR and SRC_DR in loop nest LOOP and RDG, classify
1562    if it forms builtin memcpy or memmove call.  */
1563 
1564 static void
1565 classify_builtin_ldst (loop_p loop, struct graph *rdg, partition *partition,
1566 		       data_reference_p dst_dr, data_reference_p src_dr)
1567 {
1568   tree base, size, src_base, src_size;
1569   auto_vec<tree> dst_steps, src_steps;
1570 
1571   /* Compute access range of both load and store.  */
1572   int res = compute_access_range (loop, dst_dr, &base, &size, &dst_steps);
1573   if (res != 2)
1574     return;
1575   res = compute_access_range (loop, src_dr, &src_base, &src_size, &src_steps);
1576   if (res != 2)
1577     return;
1578 
1579   /* They must have the same access size.  */
1580   if (!operand_equal_p (size, src_size, 0))
1581     return;
1582 
1583   /* Load and store in the loop nest must access memory in the same way, i.e.,
1584      they must have the same steps in each loop of the nest.  */
1585   if (dst_steps.length () != src_steps.length ())
1586     return;
1587   for (unsigned i = 0; i < dst_steps.length (); ++i)
1588     if (!operand_equal_p (dst_steps[i], src_steps[i], 0))
1589       return;
1590 
1591   /* Now check if there is a dependence.  */
1592   ddr_p ddr = get_data_dependence (rdg, src_dr, dst_dr);
1593 
1594   /* Classify as memcpy if no dependence between load and store.  */
1595   if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
1596     {
1597       partition->builtin = alloc_builtin (dst_dr, src_dr, base, src_base, size);
1598       partition->kind = PKIND_MEMCPY;
1599       return;
1600     }
1601 
1602   /* Can't do memmove in case of unknown dependence or dependence without
1603      classical distance vector.  */
1604   if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know
1605       || DDR_NUM_DIST_VECTS (ddr) == 0)
1606     return;
1607 
1608   unsigned i;
1609   lambda_vector dist_v;
1610   int num_lev = (DDR_LOOP_NEST (ddr)).length ();
1611   FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
1612     {
1613       unsigned dep_lev = dependence_level (dist_v, num_lev);
1614       /* Can't do memmove if load depends on store.  */
1615       if (dep_lev > 0 && dist_v[dep_lev - 1] > 0 && !DDR_REVERSED_P (ddr))
1616 	return;
1617     }
1618 
1619   partition->builtin = alloc_builtin (dst_dr, src_dr, base, src_base, size);
1620   partition->kind = PKIND_MEMMOVE;
1621   return;
1622 }
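
/* For illustration (hypothetical source loops, not taken from this pass):

     for (i = 0; i < n; i++)
       a[i] = b[i];

   has no dependence between the load and the store and is classified as
   PKIND_MEMCPY, while

     for (i = 0; i < n; i++)
       a[i] = a[i + 1];

   overlaps but the load does not depend on the store, so it is classified
   as PKIND_MEMMOVE.  A loop like "a[i + 1] = a[i]" carries a flow
   dependence from the store to the load; copying element by element is
   then not equivalent to a single block move and no builtin is used.  */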
1623 
1624 /* Classifies the builtin kind we can generate for PARTITION of RDG and LOOP.
1625    For the moment we detect memset, memcpy and memmove patterns.  Bitmap
1626    STMT_IN_ALL_PARTITIONS contains statements belonging to all partitions.  */
1627 
1628 static void
1629 classify_partition (loop_p loop, struct graph *rdg, partition *partition,
1630 		    bitmap stmt_in_all_partitions)
1631 {
1632   bitmap_iterator bi;
1633   unsigned i;
1634   data_reference_p single_ld = NULL, single_st = NULL;
1635   bool volatiles_p = false, has_reduction = false;
1636 
1637   EXECUTE_IF_SET_IN_BITMAP (partition->stmts, 0, i, bi)
1638     {
1639       gimple *stmt = RDG_STMT (rdg, i);
1640 
1641       if (gimple_has_volatile_ops (stmt))
1642 	volatiles_p = true;
1643 
1644       /* If the stmt is not included in all partitions and has uses
1645 	 outside of the loop, then mark the partition as a reduction.  */
1646       if (stmt_has_scalar_dependences_outside_loop (loop, stmt))
1647 	{
1648 	  /* Due to a limitation in the transform phase we have to fuse all
1649 	     reduction partitions.  As a result, this could cancel valid
1650 	     loop distribution, especially for loops whose induction variable
1651 	     is used outside of the loop.  To work around this issue, we skip
1652 	     marking the partition as a reduction if the reduction stmt belongs
1653 	     to all partitions.  In that case, the reduction will be computed
1654 	     correctly no matter how partitions are fused/distributed.  */
1655 	  if (!bitmap_bit_p (stmt_in_all_partitions, i))
1656 	    {
1657 	      partition->reduction_p = true;
1658 	      return;
1659 	    }
1660 	  has_reduction = true;
1661 	}
1662     }
1663 
1664   /* Perform general partition disqualification for builtins.  */
1665   if (volatiles_p
1666       /* Simple workaround to prevent classifying the partition as builtin
1667 	 if it contains any use outside of loop.  */
1668       || has_reduction
1669       || !flag_tree_loop_distribute_patterns)
1670     return;
1671 
1672   /* Find single load/store data references for builtin partition.  */
1673   if (!find_single_drs (loop, rdg, partition, &single_st, &single_ld))
1674     return;
1675 
1676   /* Classify the builtin kind.  */
1677   if (single_ld == NULL)
1678     classify_builtin_st (loop, partition, single_st);
1679   else
1680     classify_builtin_ldst (loop, rdg, partition, single_st, single_ld);
1681 }
1682 
1683 /* Returns true when PARTITION1 and PARTITION2 access the same memory
1684    object in RDG.  */
1685 
1686 static bool
1687 share_memory_accesses (struct graph *rdg,
1688 		       partition *partition1, partition *partition2)
1689 {
1690   unsigned i, j;
1691   bitmap_iterator bi, bj;
1692   data_reference_p dr1, dr2;
1693 
1694   /* First check whether the intersection of the two partitions
1695      contains any loads or stores.  Common loads are the most
1696      frequent case.  */
1697   EXECUTE_IF_AND_IN_BITMAP (partition1->stmts, partition2->stmts, 0, i, bi)
1698     if (RDG_MEM_WRITE_STMT (rdg, i)
1699 	|| RDG_MEM_READS_STMT (rdg, i))
1700       return true;
1701 
1702   /* Then check whether the two partitions access the same memory object.  */
1703   EXECUTE_IF_SET_IN_BITMAP (partition1->datarefs, 0, i, bi)
1704     {
1705       dr1 = datarefs_vec[i];
1706 
1707       if (!DR_BASE_ADDRESS (dr1)
1708 	  || !DR_OFFSET (dr1) || !DR_INIT (dr1) || !DR_STEP (dr1))
1709 	continue;
1710 
1711       EXECUTE_IF_SET_IN_BITMAP (partition2->datarefs, 0, j, bj)
1712 	{
1713 	  dr2 = datarefs_vec[j];
1714 
1715 	  if (!DR_BASE_ADDRESS (dr2)
1716 	      || !DR_OFFSET (dr2) || !DR_INIT (dr2) || !DR_STEP (dr2))
1717 	    continue;
1718 
1719 	  if (operand_equal_p (DR_BASE_ADDRESS (dr1), DR_BASE_ADDRESS (dr2), 0)
1720 	      && operand_equal_p (DR_OFFSET (dr1), DR_OFFSET (dr2), 0)
1721 	      && operand_equal_p (DR_INIT (dr1), DR_INIT (dr2), 0)
1722 	      && operand_equal_p (DR_STEP (dr1), DR_STEP (dr2), 0))
1723 	    return true;
1724 	}
1725     }
1726 
1727   return false;
1728 }
1729 
1730 /* For each seed statement in STARTING_STMTS, this function builds
1731    partition for it by adding depended statements according to RDG.
1732    All partitions are recorded in PARTITIONS.  */
1733 
1734 static void
1735 rdg_build_partitions (struct graph *rdg,
1736 		      vec<gimple *> starting_stmts,
1737 		      vec<partition *> *partitions)
1738 {
1739   auto_bitmap processed;
1740   int i;
1741   gimple *stmt;
1742 
1743   FOR_EACH_VEC_ELT (starting_stmts, i, stmt)
1744     {
1745       int v = rdg_vertex_for_stmt (rdg, stmt);
1746 
1747       if (dump_file && (dump_flags & TDF_DETAILS))
1748 	fprintf (dump_file,
1749 		 "ldist asked to generate code for vertex %d\n", v);
1750 
1751       /* If the vertex is already contained in another partition, so
1752          is the partition rooted at it.  */
1753       if (bitmap_bit_p (processed, v))
1754 	continue;
1755 
1756       partition *partition = build_rdg_partition_for_vertex (rdg, v);
1757       bitmap_ior_into (processed, partition->stmts);
1758 
1759       if (dump_file && (dump_flags & TDF_DETAILS))
1760 	{
1761 	  fprintf (dump_file, "ldist creates useful %s partition:\n",
1762 		   partition->type == PTYPE_PARALLEL ? "parallel" : "sequent");
1763 	  bitmap_print (dump_file, partition->stmts, "  ", "\n");
1764 	}
1765 
1766       partitions->safe_push (partition);
1767     }
1768 
1769   /* All vertices should have been assigned to at least one partition now,
1770      other than vertices belonging to dead code.  */
1771 }
1772 
1773 /* Dump to FILE the PARTITIONS.  */
1774 
1775 static void
1776 dump_rdg_partitions (FILE *file, vec<partition *> partitions)
1777 {
1778   int i;
1779   partition *partition;
1780 
1781   FOR_EACH_VEC_ELT (partitions, i, partition)
1782     debug_bitmap_file (file, partition->stmts);
1783 }
1784 
1785 /* Debug PARTITIONS.  */
1786 extern void debug_rdg_partitions (vec<partition *> );
1787 
1788 DEBUG_FUNCTION void
1789 debug_rdg_partitions (vec<partition *> partitions)
1790 {
1791   dump_rdg_partitions (stderr, partitions);
1792 }
1793 
1794 /* Returns the number of read and write operations in the RDG.  */
1795 
1796 static int
1797 number_of_rw_in_rdg (struct graph *rdg)
1798 {
1799   int i, res = 0;
1800 
1801   for (i = 0; i < rdg->n_vertices; i++)
1802     {
1803       if (RDG_MEM_WRITE_STMT (rdg, i))
1804 	++res;
1805 
1806       if (RDG_MEM_READS_STMT (rdg, i))
1807 	++res;
1808     }
1809 
1810   return res;
1811 }
1812 
1813 /* Returns the number of read and write operations in a PARTITION of
1814    the RDG.  */
1815 
1816 static int
1817 number_of_rw_in_partition (struct graph *rdg, partition *partition)
1818 {
1819   int res = 0;
1820   unsigned i;
1821   bitmap_iterator ii;
1822 
1823   EXECUTE_IF_SET_IN_BITMAP (partition->stmts, 0, i, ii)
1824     {
1825       if (RDG_MEM_WRITE_STMT (rdg, i))
1826 	++res;
1827 
1828       if (RDG_MEM_READS_STMT (rdg, i))
1829 	++res;
1830     }
1831 
1832   return res;
1833 }
1834 
1835 /* Returns true when one of the PARTITIONS contains all the read or
1836    write operations of RDG.  */
1837 
1838 static bool
1839 partition_contains_all_rw (struct graph *rdg,
1840 			   vec<partition *> partitions)
1841 {
1842   int i;
1843   partition *partition;
1844   int nrw = number_of_rw_in_rdg (rdg);
1845 
1846   FOR_EACH_VEC_ELT (partitions, i, partition)
1847     if (nrw == number_of_rw_in_partition (rdg, partition))
1848       return true;
1849 
1850   return false;
1851 }
1852 
1853 /* Compute partition dependence created by the data references in DRS1
1854    and DRS2, modify and return DIR according to that.  If ALIAS_DDRS is
1855    not NULL, we record dependence introduced by possible alias between
1856    two data references in ALIAS_DDRS; otherwise, we simply ignore such
1857    dependence as if it doesn't exist at all.  */
1858 
1859 static int
1860 pg_add_dependence_edges (struct graph *rdg, int dir,
1861 			 bitmap drs1, bitmap drs2, vec<ddr_p> *alias_ddrs)
1862 {
1863   unsigned i, j;
1864   bitmap_iterator bi, bj;
1865   data_reference_p dr1, dr2, saved_dr1;
1866 
1867   /* dependence direction - 0 is no dependence, -1 is back,
1868      1 is forth, 2 is both (we can stop then, merging will occur).  */
1869   EXECUTE_IF_SET_IN_BITMAP (drs1, 0, i, bi)
1870     {
1871       dr1 = datarefs_vec[i];
1872 
1873       EXECUTE_IF_SET_IN_BITMAP (drs2, 0, j, bj)
1874 	{
1875 	  int res, this_dir = 1;
1876 	  ddr_p ddr;
1877 
1878 	  dr2 = datarefs_vec[j];
1879 
1880 	  /* Skip all <read, read> data dependence.  */
1881 	  if (DR_IS_READ (dr1) && DR_IS_READ (dr2))
1882 	    continue;
1883 
1884 	  saved_dr1 = dr1;
1885 	  /* Re-shuffle data-refs to be in topological order.  */
1886 	  if (rdg_vertex_for_stmt (rdg, DR_STMT (dr1))
1887 	      > rdg_vertex_for_stmt (rdg, DR_STMT (dr2)))
1888 	    {
1889 	      std::swap (dr1, dr2);
1890 	      this_dir = -this_dir;
1891 	    }
1892 	  ddr = get_data_dependence (rdg, dr1, dr2);
1893 	  if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
1894 	    {
1895 	      this_dir = 0;
1896 	      res = data_ref_compare_tree (DR_BASE_ADDRESS (dr1),
1897 					   DR_BASE_ADDRESS (dr2));
1898 	      /* Be conservative.  If data references are not well analyzed,
1899 		 or the two data references have the same base address and
1900 		 offset, add the dependence and consider them as aliasing
1901 		 each other.  In other words, the dependence cannot be
1902 		 resolved by runtime alias check.  */
1903 	      if (!DR_BASE_ADDRESS (dr1) || !DR_BASE_ADDRESS (dr2)
1904 		  || !DR_OFFSET (dr1) || !DR_OFFSET (dr2)
1905 		  || !DR_INIT (dr1) || !DR_INIT (dr2)
1906 		  || !DR_STEP (dr1) || !tree_fits_uhwi_p (DR_STEP (dr1))
1907 		  || !DR_STEP (dr2) || !tree_fits_uhwi_p (DR_STEP (dr2))
1908 		  || res == 0)
1909 		this_dir = 2;
1910 	      /* Data dependence could be resolved by runtime alias check,
1911 		 record it in ALIAS_DDRS.  */
1912 	      else if (alias_ddrs != NULL)
1913 		alias_ddrs->safe_push (ddr);
1914 	      /* Or simply ignore it.  */
1915 	    }
1916 	  else if (DDR_ARE_DEPENDENT (ddr) == NULL_TREE)
1917 	    {
1918 	      if (DDR_REVERSED_P (ddr))
1919 		this_dir = -this_dir;
1920 
1921 	      /* Known dependences can still be unordered throughout the
1922 		 iteration space, see gcc.dg/tree-ssa/ldist-16.c.  */
1923 	      if (DDR_NUM_DIST_VECTS (ddr) != 1)
1924 		this_dir = 2;
1925 	      /* If the overlap is exact preserve stmt order.  */
1926 	      else if (lambda_vector_zerop (DDR_DIST_VECT (ddr, 0),
1927 					    DDR_NB_LOOPS (ddr)))
1928 		;
1929 	      /* Else as the distance vector is lexicographic positive swap
1930 		 the dependence direction.  */
1931 	      else
1932 		this_dir = -this_dir;
1933 	    }
1934 	  else
1935 	    this_dir = 0;
1936 	  if (this_dir == 2)
1937 	    return 2;
1938 	  else if (dir == 0)
1939 	    dir = this_dir;
1940 	  else if (this_dir != 0 && dir != this_dir)
1941 	    return 2;
1942 	  /* Shuffle "back" dr1.  */
1943 	  dr1 = saved_dr1;
1944 	}
1945     }
1946   return dir;
1947 }
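
/* A sketch of the return value (hypothetical references): if the only
   dependence between the two sets is between a store and a load of a[i]
   with exact overlap (zero distance vector), the textual order of the two
   statements decides the direction and the function returns 1 or -1.  If
   different reference pairs yield conflicting directions, or a pair has to
   be treated as dependent in both directions because analysis failed, the
   function returns 2 and the caller will merge the two partitions.  */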
1948 
1949 /* Compare postorder number of the partition graph vertices V1 and V2.  */
1950 
1951 static int
1952 pgcmp (const void *v1_, const void *v2_)
1953 {
1954   const vertex *v1 = (const vertex *)v1_;
1955   const vertex *v2 = (const vertex *)v2_;
1956   return v2->post - v1->post;
1957 }
1958 
1959 /* Data attached to vertices of partition dependence graph.  */
1960 struct pg_vdata
1961 {
1962   /* ID of the corresponding partition.  */
1963   int id;
1964   /* The partition.  */
1965   struct partition *partition;
1966 };
1967 
1968 /* Data attached to edges of partition dependence graph.  */
1969 struct pg_edata
1970 {
1971   /* If the dependence edge can be resolved by runtime alias check,
1972      this vector contains data dependence relations for runtime alias
1973      check.  On the other hand, if the dependence edge is introduced
1974      because of compilation time known data dependence, this vector
1975      contains nothing.  */
1976   vec<ddr_p> alias_ddrs;
1977 };
1978 
1979 /* Callback data for traversing edges in graph.  */
1980 struct pg_edge_callback_data
1981 {
1982   /* Bitmap of strongly connected components that should be merged.  */
1983   bitmap sccs_to_merge;
1984   /* Array containing component information for all vertices.  */
1985   int *vertices_component;
1986   /* Vector to record all data dependence relations which are needed
1987      to break strong connected components by runtime alias checks.  */
1988   vec<ddr_p> *alias_ddrs;
1989 };
1990 
1991 /* Initialize vertices' data for partition dependence graph PG with
1992    PARTITIONS.  */
1993 
1994 static void
1995 init_partition_graph_vertices (struct graph *pg,
1996 			       vec<struct partition *> *partitions)
1997 {
1998   int i;
1999   partition *partition;
2000   struct pg_vdata *data;
2001 
2002   for (i = 0; partitions->iterate (i, &partition); ++i)
2003     {
2004       data = new pg_vdata;
2005       pg->vertices[i].data = data;
2006       data->id = i;
2007       data->partition = partition;
2008     }
2009 }
2010 
2011 /* Add edge <I, J> to partition dependence graph PG.  Attach vector of data
2012    dependence relations to the EDGE if DDRS isn't NULL.  */
2013 
2014 static void
2015 add_partition_graph_edge (struct graph *pg, int i, int j, vec<ddr_p> *ddrs)
2016 {
2017   struct graph_edge *e = add_edge (pg, i, j);
2018 
2019   /* If the edge is attached with data dependence relations, it means this
2020      dependence edge can be resolved by runtime alias checks.  */
2021   if (ddrs != NULL)
2022     {
2023       struct pg_edata *data = new pg_edata;
2024 
2025       gcc_assert (ddrs->length () > 0);
2026       e->data = data;
2027       data->alias_ddrs = vNULL;
2028       data->alias_ddrs.safe_splice (*ddrs);
2029     }
2030 }
2031 
2032 /* Callback function for graph traversal algorithm.  It returns true
2033    if edge E should be skipped when traversing the graph.  */
2034 
2035 static bool
2036 pg_skip_alias_edge (struct graph_edge *e)
2037 {
2038   struct pg_edata *data = (struct pg_edata *)e->data;
2039   return (data != NULL && data->alias_ddrs.length () > 0);
2040 }
2041 
2042 /* Callback function freeing data attached to edge E of graph.  */
2043 
2044 static void
2045 free_partition_graph_edata_cb (struct graph *, struct graph_edge *e, void *)
2046 {
2047   if (e->data != NULL)
2048     {
2049       struct pg_edata *data = (struct pg_edata *)e->data;
2050       data->alias_ddrs.release ();
2051       delete data;
2052     }
2053 }
2054 
2055 /* Free data attached to vertices of partition dependence graph PG.  */
2056 
2057 static void
2058 free_partition_graph_vdata (struct graph *pg)
2059 {
2060   int i;
2061   struct pg_vdata *data;
2062 
2063   for (i = 0; i < pg->n_vertices; ++i)
2064     {
2065       data = (struct pg_vdata *)pg->vertices[i].data;
2066       delete data;
2067     }
2068 }
2069 
2070 /* Build and return partition dependence graph for PARTITIONS.  RDG is
2071    reduced dependence graph for the loop to be distributed.  If IGNORE_ALIAS_P
2072    is true, data dependence caused by possible alias between references
2073    is ignored, as if it doesn't exist at all; otherwise all dependences
2074    are considered.  */
2075 
2076 static struct graph *
2077 build_partition_graph (struct graph *rdg,
2078 		       vec<struct partition *> *partitions,
2079 		       bool ignore_alias_p)
2080 {
2081   int i, j;
2082   struct partition *partition1, *partition2;
2083   graph *pg = new_graph (partitions->length ());
2084   auto_vec<ddr_p> alias_ddrs, *alias_ddrs_p;
2085 
2086   alias_ddrs_p = ignore_alias_p ? NULL : &alias_ddrs;
2087 
2088   init_partition_graph_vertices (pg, partitions);
2089 
2090   for (i = 0; partitions->iterate (i, &partition1); ++i)
2091     {
2092       for (j = i + 1; partitions->iterate (j, &partition2); ++j)
2093 	{
2094 	  /* dependence direction - 0 is no dependence, -1 is back,
2095 	     1 is forth, 2 is both (we can stop then, merging will occur).  */
2096 	  int dir = 0;
2097 
2098 	  /* If the first partition has reduction, add back edge; if the
2099 	     second partition has reduction, add forth edge.  This makes
2100 	     sure that reduction partition will be sorted as the last one.  */
2101 	  if (partition_reduction_p (partition1))
2102 	    dir = -1;
2103 	  else if (partition_reduction_p (partition2))
2104 	    dir = 1;
2105 
2106 	  /* Cleanup the temporary vector.  */
2107 	  alias_ddrs.truncate (0);
2108 
2109 	  dir = pg_add_dependence_edges (rdg, dir, partition1->datarefs,
2110 					 partition2->datarefs, alias_ddrs_p);
2111 
2112 	  /* Add an edge to the partition graph if there exists a dependence.
2113 	     There are two types of edges.  One type is caused by a compilation
2114 	     time known dependence and cannot be resolved by runtime alias
2115 	     check.  The other type can be resolved by runtime alias
2116 	     check.  */
2117 	  if (dir == 1 || dir == 2
2118 	      || alias_ddrs.length () > 0)
2119 	    {
2120 	      /* Attach data dependence relations to edge that can be resolved
2121 		 by runtime alias check.  */
2122 	      bool alias_edge_p = (dir != 1 && dir != 2);
2123 	      add_partition_graph_edge (pg, i, j,
2124 					(alias_edge_p) ? &alias_ddrs : NULL);
2125 	    }
2126 	  if (dir == -1 || dir == 2
2127 	      || alias_ddrs.length () > 0)
2128 	    {
2129 	      /* Attach data dependence relations to edge that can be resolved
2130 		 by runtime alias check.  */
2131 	      bool alias_edge_p = (dir != -1 && dir != 2);
2132 	      add_partition_graph_edge (pg, j, i,
2133 					(alias_edge_p) ? &alias_ddrs : NULL);
2134 	    }
2135 	}
2136     }
2137   return pg;
2138 }
2139 
2140 /* Sort partitions in PG in descending post order and store them in
2141    PARTITIONS.  */
2142 
2143 static void
2144 sort_partitions_by_post_order (struct graph *pg,
2145 			       vec<struct partition *> *partitions)
2146 {
2147   int i;
2148   struct pg_vdata *data;
2149 
2150   /* Now order the remaining nodes in descending postorder.  */
2151   qsort (pg->vertices, pg->n_vertices, sizeof (vertex), pgcmp);
2152   partitions->truncate (0);
2153   for (i = 0; i < pg->n_vertices; ++i)
2154     {
2155       data = (struct pg_vdata *)pg->vertices[i].data;
2156       if (data->partition)
2157 	partitions->safe_push (data->partition);
2158     }
2159 }
2160 
2161 /* Given reduced dependence graph RDG, merge strongly connected components
2162    of PARTITIONS.  If IGNORE_ALIAS_P is true, data dependence caused by
2163    possible alias between references is ignored, as if it doesn't exist
2164    at all; otherwise all dependences are considered.  */
2165 
2166 static void
2167 merge_dep_scc_partitions (struct graph *rdg,
2168 			  vec<struct partition *> *partitions,
2169 			  bool ignore_alias_p)
2170 {
2171   struct partition *partition1, *partition2;
2172   struct pg_vdata *data;
2173   graph *pg = build_partition_graph (rdg, partitions, ignore_alias_p);
2174   int i, j, num_sccs = graphds_scc (pg, NULL);
2175 
2176   /* Strongly connected components represent dependence cycles which we
2177      cannot distribute, so fuse them together.  */
2178   if ((unsigned) num_sccs < partitions->length ())
2179     {
2180       for (i = 0; i < num_sccs; ++i)
2181 	{
2182 	  for (j = 0; partitions->iterate (j, &partition1); ++j)
2183 	    if (pg->vertices[j].component == i)
2184 	      break;
2185 	  for (j = j + 1; partitions->iterate (j, &partition2); ++j)
2186 	    if (pg->vertices[j].component == i)
2187 	      {
2188 		partition_merge_into (NULL, partition1,
2189 				      partition2, FUSE_SAME_SCC);
2190 		partition1->type = PTYPE_SEQUENTIAL;
2191 		(*partitions)[j] = NULL;
2192 		partition_free (partition2);
2193 		data = (struct pg_vdata *)pg->vertices[j].data;
2194 		data->partition = NULL;
2195 	      }
2196 	}
2197     }
2198 
2199   sort_partitions_by_post_order (pg, partitions);
2200   gcc_assert (partitions->length () == (unsigned)num_sccs);
2201   free_partition_graph_vdata (pg);
2202   free_graph (pg);
2203 }
2204 
2205 /* Callback function for traversing edge E in graph G.  DATA is private
2206    callback data.  */
2207 
2208 static void
2209 pg_collect_alias_ddrs (struct graph *g, struct graph_edge *e, void *data)
2210 {
2211   int i, j, component;
2212   struct pg_edge_callback_data *cbdata;
2213   struct pg_edata *edata = (struct pg_edata *) e->data;
2214 
2215   /* If the edge doesn't have attached data dependence, it represents
2216      compilation time known dependences.  This type of dependence cannot
2217      be resolved by runtime alias check.  */
2218   if (edata == NULL || edata->alias_ddrs.length () == 0)
2219     return;
2220 
2221   cbdata = (struct pg_edge_callback_data *) data;
2222   i = e->src;
2223   j = e->dest;
2224   component = cbdata->vertices_component[i];
2225   /* Vertices are topologically sorted according to compilation time
2226      known dependences, so we can break strongly connected components
2227      by removing edges of the opposite direction, i.e., edges pointing
2228      from a vertex with smaller post number to a vertex with bigger post
2229      number.  */
2230   if (g->vertices[i].post < g->vertices[j].post
2231       /* We only need to remove edges connecting vertices in the same
2232 	 strong connected component to break it.  */
2233       && component == cbdata->vertices_component[j]
2234       /* Check if we want to break the strong connected component or not.  */
2235       && !bitmap_bit_p (cbdata->sccs_to_merge, component))
2236     cbdata->alias_ddrs->safe_splice (edata->alias_ddrs);
2237 }
2238 
2239 /* This is the main function breaking strongly connected components in
2240    PARTITIONS, given reduced dependence graph RDG.  Store data dependence
2241    relations for runtime alias check in ALIAS_DDRS.  */
2242 
2243 static void
2244 break_alias_scc_partitions (struct graph *rdg,
2245 			    vec<struct partition *> *partitions,
2246 			    vec<ddr_p> *alias_ddrs)
2247 {
2248   int i, j, k, num_sccs, num_sccs_no_alias;
2249   /* Build partition dependence graph.  */
2250   graph *pg = build_partition_graph (rdg, partitions, false);
2251 
2252   alias_ddrs->truncate (0);
2253   /* Find strong connected components in the graph, with all dependence edges
2254      considered.  */
2255   num_sccs = graphds_scc (pg, NULL);
2256   /* All SCCs now can be broken by runtime alias checks because SCCs caused by
2257      compilation time known dependences are merged before this function.  */
2258   if ((unsigned) num_sccs < partitions->length ())
2259     {
2260       struct pg_edge_callback_data cbdata;
2261       auto_bitmap sccs_to_merge;
2262       auto_vec<enum partition_type> scc_types;
2263       struct partition *partition, *first;
2264 
2265       /* If all partitions in a SCC have the same type, we can simply merge
2266 	 the SCC.  This loop finds such SCCs and records them in a bitmap.  */
2267       bitmap_set_range (sccs_to_merge, 0, (unsigned) num_sccs);
2268       for (i = 0; i < num_sccs; ++i)
2269 	{
2270 	  for (j = 0; partitions->iterate (j, &first); ++j)
2271 	    if (pg->vertices[j].component == i)
2272 	      break;
2273 	  for (++j; partitions->iterate (j, &partition); ++j)
2274 	    {
2275 	      if (pg->vertices[j].component != i)
2276 		continue;
2277 
2278 	      /* Note we merge partitions of parallel type on purpose, even
2279 		 though the resulting partition is sequential.  The reason is
2280 		 the vectorizer can do a more accurate runtime alias check in
2281 		 this case.  Also it results in more conservative distribution.  */
2282 	      if (first->type != partition->type)
2283 		{
2284 		  bitmap_clear_bit (sccs_to_merge, i);
2285 		  break;
2286 		}
2287 	    }
2288 	}
2289 
2290       /* Initialize callback data for traversing.  */
2291       cbdata.sccs_to_merge = sccs_to_merge;
2292       cbdata.alias_ddrs = alias_ddrs;
2293       cbdata.vertices_component = XNEWVEC (int, pg->n_vertices);
2294       /* Record the component information which will be corrupted by the
2295 	 next graph SCC finding call.  */
2296       for (i = 0; i < pg->n_vertices; ++i)
2297 	cbdata.vertices_component[i] = pg->vertices[i].component;
2298 
2299       /* Collect data dependences for runtime alias checks to break SCCs.  */
2300       if (bitmap_count_bits (sccs_to_merge) != (unsigned) num_sccs)
2301 	{
2302 	  /* Run SCC finding algorithm again, with alias dependence edges
2303 	     skipped.  This is to topologically sort partitions according to
2304 	     compilation time known dependence.  Note the topological order
2305 	     is stored in the form of pg's post order number.  */
2306 	  num_sccs_no_alias = graphds_scc (pg, NULL, pg_skip_alias_edge);
2307 	  gcc_assert (partitions->length () == (unsigned) num_sccs_no_alias);
2308 	  /* With topological order, we can construct two subgraphs L and R.
2309 	     L contains edge <x, y> where x < y in terms of post order, while
2310 	     R contains edge <x, y> where x > y.  Edges for compilation time
2311 	     known dependence all fall in R, so we break SCCs by removing all
2312      (alias) edges in subgraph L.  */
2313 	  for_each_edge (pg, pg_collect_alias_ddrs, &cbdata);
2314 	}
2315 
2316       /* For SCC that doesn't need to be broken, merge it.  */
2317       for (i = 0; i < num_sccs; ++i)
2318 	{
2319 	  if (!bitmap_bit_p (sccs_to_merge, i))
2320 	    continue;
2321 
2322 	  for (j = 0; partitions->iterate (j, &first); ++j)
2323 	    if (cbdata.vertices_component[j] == i)
2324 	      break;
2325 	  for (k = j + 1; partitions->iterate (k, &partition); ++k)
2326 	    {
2327 	      struct pg_vdata *data;
2328 
2329 	      if (cbdata.vertices_component[k] != i)
2330 		continue;
2331 
2332 	      /* Update postorder number so that merged reduction partition is
2333 		 sorted after other partitions.  */
2334 	      if (!partition_reduction_p (first)
2335 		  && partition_reduction_p (partition))
2336 		{
2337 		  gcc_assert (pg->vertices[k].post < pg->vertices[j].post);
2338 		  pg->vertices[j].post = pg->vertices[k].post;
2339 		}
2340 	      partition_merge_into (NULL, first, partition, FUSE_SAME_SCC);
2341 	      (*partitions)[k] = NULL;
2342 	      partition_free (partition);
2343 	      data = (struct pg_vdata *)pg->vertices[k].data;
2344 	      gcc_assert (data->id == k);
2345 	      data->partition = NULL;
2346 	      /* The result partition of merged SCC must be sequential.  */
2347 	      first->type = PTYPE_SEQUENTIAL;
2348 	    }
2349 	}
2350     }
2351 
2352   sort_partitions_by_post_order (pg, partitions);
2353   free_partition_graph_vdata (pg);
2354   for_each_edge (pg, free_partition_graph_edata_cb, NULL);
2355   free_graph (pg);
2356 
2357   if (dump_file && (dump_flags & TDF_DETAILS))
2358     {
2359       fprintf (dump_file, "Possible alias data dependence to break:\n");
2360       dump_data_dependence_relations (dump_file, *alias_ddrs);
2361     }
2362 }
2363 
2364 /* Compute and return an expression whose value is the segment length which
2365    will be accessed by DR in NITERS iterations.  */
2366 
2367 static tree
2368 data_ref_segment_size (struct data_reference *dr, tree niters)
2369 {
2370   niters = size_binop (MINUS_EXPR,
2371 		       fold_convert (sizetype, niters),
2372 		       size_one_node);
2373   return size_binop (MULT_EXPR,
2374 		     fold_convert (sizetype, DR_STEP (dr)),
2375 		     fold_convert (sizetype, niters));
2376 }
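
/* For example (hypothetical numbers): a data reference with DR_STEP of 4
   bytes executed for NITERS = 100 iterations gets a segment length of
   (100 - 1) * 4 = 396 bytes; the size of the access itself is added
   separately via the access size passed to the alias check machinery
   below.  */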
2377 
2378 /* Return true if LOOP's latch is dominated by statement for data reference
2379    DR.  */
2380 
2381 static inline bool
2382 latch_dominated_by_data_ref (struct loop *loop, data_reference *dr)
2383 {
2384   return dominated_by_p (CDI_DOMINATORS, single_exit (loop)->src,
2385 			 gimple_bb (DR_STMT (dr)));
2386 }
2387 
2388 /* Compute alias check pairs and store them in COMP_ALIAS_PAIRS for LOOP's
2389    data dependence relations ALIAS_DDRS.  */
2390 
2391 static void
2392 compute_alias_check_pairs (struct loop *loop, vec<ddr_p> *alias_ddrs,
2393 			   vec<dr_with_seg_len_pair_t> *comp_alias_pairs)
2394 {
2395   unsigned int i;
2396   unsigned HOST_WIDE_INT factor = 1;
2397   tree niters_plus_one, niters = number_of_latch_executions (loop);
2398 
2399   gcc_assert (niters != NULL_TREE && niters != chrec_dont_know);
2400   niters = fold_convert (sizetype, niters);
2401   niters_plus_one = size_binop (PLUS_EXPR, niters, size_one_node);
2402 
2403   if (dump_file && (dump_flags & TDF_DETAILS))
2404     fprintf (dump_file, "Creating alias check pairs:\n");
2405 
2406   /* Iterate all data dependence relations and compute alias check pairs.  */
2407   for (i = 0; i < alias_ddrs->length (); i++)
2408     {
2409       ddr_p ddr = (*alias_ddrs)[i];
2410       struct data_reference *dr_a = DDR_A (ddr);
2411       struct data_reference *dr_b = DDR_B (ddr);
2412       tree seg_length_a, seg_length_b;
2413       int comp_res = data_ref_compare_tree (DR_BASE_ADDRESS (dr_a),
2414 					    DR_BASE_ADDRESS (dr_b));
2415 
2416       if (comp_res == 0)
2417 	comp_res = data_ref_compare_tree (DR_OFFSET (dr_a), DR_OFFSET (dr_b));
2418       gcc_assert (comp_res != 0);
2419 
2420       if (latch_dominated_by_data_ref (loop, dr_a))
2421 	seg_length_a = data_ref_segment_size (dr_a, niters_plus_one);
2422       else
2423 	seg_length_a = data_ref_segment_size (dr_a, niters);
2424 
2425       if (latch_dominated_by_data_ref (loop, dr_b))
2426 	seg_length_b = data_ref_segment_size (dr_b, niters_plus_one);
2427       else
2428 	seg_length_b = data_ref_segment_size (dr_b, niters);
2429 
2430       unsigned HOST_WIDE_INT access_size_a
2431 	= tree_to_uhwi (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr_a))));
2432       unsigned HOST_WIDE_INT access_size_b
2433 	= tree_to_uhwi (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr_b))));
2434       unsigned int align_a = TYPE_ALIGN_UNIT (TREE_TYPE (DR_REF (dr_a)));
2435       unsigned int align_b = TYPE_ALIGN_UNIT (TREE_TYPE (DR_REF (dr_b)));
2436 
2437       dr_with_seg_len_pair_t dr_with_seg_len_pair
2438 	(dr_with_seg_len (dr_a, seg_length_a, access_size_a, align_a),
2439 	 dr_with_seg_len (dr_b, seg_length_b, access_size_b, align_b));
2440 
2441       /* Canonicalize pairs by sorting the two DR members.  */
2442       if (comp_res > 0)
2443 	std::swap (dr_with_seg_len_pair.first, dr_with_seg_len_pair.second);
2444 
2445       comp_alias_pairs->safe_push (dr_with_seg_len_pair);
2446     }
2447 
2448   if (tree_fits_uhwi_p (niters))
2449     factor = tree_to_uhwi (niters);
2450 
2451   /* Prune alias check pairs.  */
2452   prune_runtime_alias_test_list (comp_alias_pairs, factor);
2453   if (dump_file && (dump_flags & TDF_DETAILS))
2454     fprintf (dump_file,
2455 	     "Improved number of alias checks from %d to %d\n",
2456 	     alias_ddrs->length (), comp_alias_pairs->length ());
2457 }
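
/* Conceptually, each pair recorded above asks the runtime check to prove
   that the two accessed segments do not overlap, roughly

     addr_a + segment_length_a <= addr_b
     || addr_b + segment_length_b <= addr_a

   This is only a sketch; the exact expression, including access sizes and
   alignment, is built later by create_runtime_alias_checks.  */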
2458 
2459 /* Given data dependence relations in ALIAS_DDRS, generate runtime alias
2460    checks and version LOOP under condition of these runtime alias checks.  */
2461 
2462 static void
2463 version_loop_by_alias_check (struct loop *loop, vec<ddr_p> *alias_ddrs)
2464 {
2465   profile_probability prob;
2466   basic_block cond_bb;
2467   struct loop *nloop;
2468   tree lhs, arg0, cond_expr = NULL_TREE;
2469   gimple_seq cond_stmts = NULL;
2470   gimple *call_stmt = NULL;
2471   auto_vec<dr_with_seg_len_pair_t> comp_alias_pairs;
2472 
2473   /* Generate code for runtime alias checks if necessary.  */
2474   gcc_assert (alias_ddrs->length () > 0);
2475 
2476   if (dump_file && (dump_flags & TDF_DETAILS))
2477     fprintf (dump_file,
2478 	     "Version loop <%d> with runtime alias check\n", loop->num);
2479 
2480   compute_alias_check_pairs (loop, alias_ddrs, &comp_alias_pairs);
2481   create_runtime_alias_checks (loop, &comp_alias_pairs, &cond_expr);
2482   cond_expr = force_gimple_operand_1 (cond_expr, &cond_stmts,
2483 				      is_gimple_val, NULL_TREE);
2484 
2485   /* Depend on vectorizer to fold IFN_LOOP_DIST_ALIAS.  */
2486   if (flag_tree_loop_vectorize)
2487     {
2488       /* Generate internal function call for loop distribution alias check.  */
2489       call_stmt = gimple_build_call_internal (IFN_LOOP_DIST_ALIAS,
2490 					      2, NULL_TREE, cond_expr);
2491       lhs = make_ssa_name (boolean_type_node);
2492       gimple_call_set_lhs (call_stmt, lhs);
2493     }
2494   else
2495     lhs = cond_expr;
2496 
2497   prob = profile_probability::guessed_always ().apply_scale (9, 10);
2498   initialize_original_copy_tables ();
2499   nloop = loop_version (loop, lhs, &cond_bb, prob, prob.invert (),
2500 			prob, prob.invert (), true);
2501   free_original_copy_tables ();
2502   /* Record the original loop number in newly generated loops.  In case of
2503      distribution, the original loop will be distributed and the new loop
2504      is kept.  */
2505   loop->orig_loop_num = nloop->num;
2506   nloop->orig_loop_num = nloop->num;
2507   nloop->dont_vectorize = true;
2508   nloop->force_vectorize = false;
2509 
2510   if (call_stmt)
2511     {
2512       /* Record new loop's num in IFN_LOOP_DIST_ALIAS because the original
2513 	 loop could be destroyed.  */
2514       arg0 = build_int_cst (integer_type_node, loop->orig_loop_num);
2515       gimple_call_set_arg (call_stmt, 0, arg0);
2516       gimple_seq_add_stmt_without_update (&cond_stmts, call_stmt);
2517     }
2518 
2519   if (cond_stmts)
2520     {
2521       gimple_stmt_iterator cond_gsi = gsi_last_bb (cond_bb);
2522       gsi_insert_seq_before (&cond_gsi, cond_stmts, GSI_SAME_STMT);
2523     }
2524   update_ssa (TODO_update_ssa);
2525 }
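
/* After versioning, the CFG looks roughly as follows (a sketch in pseudo
   GIMPLE; the names are made up):

     _cond = <runtime alias check>;
     _ifn = .LOOP_DIST_ALIAS (orig_loop_num, _cond);
     if (_ifn != 0)
       <original loop>   -- this copy is subsequently distributed
     else
       <versioned copy>  -- kept undistributed, dont_vectorize set

   The vectorizer is expected to fold the .LOOP_DIST_ALIAS call later,
   choosing between the two copies depending on whether vectorizing the
   distributed loops actually succeeded.  */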
2526 
2527 /* Return true if loop versioning is needed to distribute PARTITIONS.
2528    ALIAS_DDRS are data dependence relations for runtime alias check.  */
2529 
2530 static inline bool
2531 version_for_distribution_p (vec<struct partition *> *partitions,
2532 			    vec<ddr_p> *alias_ddrs)
2533 {
2534   /* No need to version loop if we have only one partition.  */
2535   if (partitions->length () == 1)
2536     return false;
2537 
2538   /* Need to version loop if runtime alias check is necessary.  */
2539   return (alias_ddrs->length () > 0);
2540 }
2541 
2542 /* Compare base offset of builtin mem* partitions P1 and P2.  */
2543 
2544 static bool
2545 offset_cmp (struct partition *p1, struct partition *p2)
2546 {
2547   gcc_assert (p1 != NULL && p1->builtin != NULL);
2548   gcc_assert (p2 != NULL && p2->builtin != NULL);
2549   return p1->builtin->dst_base_offset < p2->builtin->dst_base_offset;
2550 }
2551 
2552 /* Fuse adjacent memset builtin PARTITIONS if possible.  This is a special
2553    case optimization transforming below code:
2554 
2555      __builtin_memset (&obj, 0, 100);
2556      _1 = &obj + 100;
2557      __builtin_memset (_1, 0, 200);
2558      _2 = &obj + 300;
2559      __builtin_memset (_2, 0, 100);
2560 
2561    into:
2562 
2563      __builtin_memset (&obj, 0, 400);
2564 
2565    Note we don't have dependence information between different partitions
2566    at this point, as a result, we can't handle nonadjacent memset builtin
2567    partitions since dependence might be broken.  */
2568 
2569 static void
2570 fuse_memset_builtins (vec<struct partition *> *partitions)
2571 {
2572   unsigned i, j;
2573   struct partition *part1, *part2;
2574   tree rhs1, rhs2;
2575 
2576   for (i = 0; partitions->iterate (i, &part1);)
2577     {
2578       if (part1->kind != PKIND_MEMSET)
2579 	{
2580 	  i++;
2581 	  continue;
2582 	}
2583 
2584       /* Find sub-array of memset builtins of the same base.  Index range
2585 	 of the sub-array is [i, j) with "j > i".  */
2586       for (j = i + 1; partitions->iterate (j, &part2); ++j)
2587 	{
2588 	  if (part2->kind != PKIND_MEMSET
2589 	      || !operand_equal_p (part1->builtin->dst_base_base,
2590 				   part2->builtin->dst_base_base, 0))
2591 	    break;
2592 
2593 	  /* Memset calls setting different values can't be merged.  */
2594 	  rhs1 = gimple_assign_rhs1 (DR_STMT (part1->builtin->dst_dr));
2595 	  rhs2 = gimple_assign_rhs1 (DR_STMT (part2->builtin->dst_dr));
2596 	  if (!operand_equal_p (rhs1, rhs2, 0))
2597 	    break;
2598 	}
2599 
2600       /* Stable sort is required in order to avoid breaking dependence.  */
2601       std::stable_sort (&(*partitions)[i],
2602 			&(*partitions)[i] + j - i, offset_cmp);
2603       /* Continue with next partition.  */
2604       i = j;
2605     }
2606 
2607   /* Merge all consecutive memset builtin partitions.  */
2608   for (i = 0; i < partitions->length () - 1;)
2609     {
2610       part1 = (*partitions)[i];
2611       if (part1->kind != PKIND_MEMSET)
2612 	{
2613 	  i++;
2614 	  continue;
2615 	}
2616 
2617       part2 = (*partitions)[i + 1];
2618       /* Only merge memset partitions of the same base and with constant
2619 	 access sizes.  */
2620       if (part2->kind != PKIND_MEMSET
2621 	  || TREE_CODE (part1->builtin->size) != INTEGER_CST
2622 	  || TREE_CODE (part2->builtin->size) != INTEGER_CST
2623 	  || !operand_equal_p (part1->builtin->dst_base_base,
2624 			       part2->builtin->dst_base_base, 0))
2625 	{
2626 	  i++;
2627 	  continue;
2628 	}
2629       rhs1 = gimple_assign_rhs1 (DR_STMT (part1->builtin->dst_dr));
2630       rhs2 = gimple_assign_rhs1 (DR_STMT (part2->builtin->dst_dr));
2631       int bytev1 = const_with_all_bytes_same (rhs1);
2632       int bytev2 = const_with_all_bytes_same (rhs2);
2633       /* Only merge memset partitions of the same value.  */
2634       if (bytev1 != bytev2 || bytev1 == -1)
2635 	{
2636 	  i++;
2637 	  continue;
2638 	}
2639       wide_int end1 = wi::add (part1->builtin->dst_base_offset,
2640 			       wi::to_wide (part1->builtin->size));
2641       /* Only merge adjacent memset partitions.  */
2642       if (wi::ne_p (end1, part2->builtin->dst_base_offset))
2643 	{
2644 	  i++;
2645 	  continue;
2646 	}
2647       /* Merge partitions[i] and partitions[i+1].  */
2648       part1->builtin->size = fold_build2 (PLUS_EXPR, sizetype,
2649 					  part1->builtin->size,
2650 					  part2->builtin->size);
2651       partition_free (part2);
2652       partitions->ordered_remove (i + 1);
2653     }
2654 }
2655 
2656 /* Fuse PARTITIONS of LOOP if necessary before finalizing distribution.
2657    ALIAS_DDRS contains ddrs which need runtime alias check.  */
2658 
2659 static void
2660 finalize_partitions (struct loop *loop, vec<struct partition *> *partitions,
2661 		     vec<ddr_p> *alias_ddrs)
2662 {
2663   unsigned i;
2664   struct partition *partition, *a;
2665 
2666   if (partitions->length () == 1
2667       || alias_ddrs->length () > 0)
2668     return;
2669 
2670   unsigned num_builtin = 0, num_normal = 0, num_partial_memset = 0;
2671   bool same_type_p = true;
2672   enum partition_type type = ((*partitions)[0])->type;
2673   for (i = 0; partitions->iterate (i, &partition); ++i)
2674     {
2675       same_type_p &= (type == partition->type);
2676       if (partition_builtin_p (partition))
2677 	{
2678 	  num_builtin++;
2679 	  continue;
2680 	}
2681       num_normal++;
2682       if (partition->kind == PKIND_PARTIAL_MEMSET)
2683 	num_partial_memset++;
2684     }
2685 
2686   /* Don't distribute current loop into too many loops given we don't have
2687      memory stream cost model.  Be even more conservative in case of loop
2688      nest distribution.  */
2689   if ((same_type_p && num_builtin == 0
2690        && (loop->inner == NULL || num_normal != 2 || num_partial_memset != 1))
2691       || (loop->inner != NULL
2692 	  && i >= NUM_PARTITION_THRESHOLD && num_normal > 1)
2693       || (loop->inner == NULL
2694 	  && i >= NUM_PARTITION_THRESHOLD && num_normal > num_builtin))
2695     {
2696       a = (*partitions)[0];
2697       for (i = 1; partitions->iterate (i, &partition); ++i)
2698 	{
2699 	  partition_merge_into (NULL, a, partition, FUSE_FINALIZE);
2700 	  partition_free (partition);
2701 	}
2702       partitions->truncate (1);
2703     }
2704 
2705   /* Fuse memset builtins if possible.  */
2706   if (partitions->length () > 1)
2707     fuse_memset_builtins (partitions);
2708 }
2709 
2710 /* Distributes the code from LOOP in such a way that producer statements
2711    are placed before consumer statements.  Tries to separate only the
2712    statements from STMTS into separate loops.  Returns the number of
2713    distributed loops.  Set NB_CALLS to number of generated builtin calls.
2714    Set *DESTROY_P to whether LOOP needs to be destroyed.  */
2715 
2716 static int
2717 distribute_loop (struct loop *loop, vec<gimple *> stmts,
2718 		 control_dependences *cd, int *nb_calls, bool *destroy_p)
2719 {
2720   ddrs_table = new hash_table<ddr_hasher> (389);
2721   struct graph *rdg;
2722   partition *partition;
2723   bool any_builtin;
2724   int i, nbp;
2725 
2726   *destroy_p = false;
2727   *nb_calls = 0;
2728   loop_nest.create (0);
2729   if (!find_loop_nest (loop, &loop_nest))
2730     {
2731       loop_nest.release ();
2732       delete ddrs_table;
2733       return 0;
2734     }
2735 
2736   datarefs_vec.create (20);
2737   rdg = build_rdg (loop, cd);
2738   if (!rdg)
2739     {
2740       if (dump_file && (dump_flags & TDF_DETAILS))
2741 	fprintf (dump_file,
2742 		 "Loop %d not distributed: failed to build the RDG.\n",
2743 		 loop->num);
2744 
2745       loop_nest.release ();
2746       free_data_refs (datarefs_vec);
2747       delete ddrs_table;
2748       return 0;
2749     }
2750 
2751   if (datarefs_vec.length () > MAX_DATAREFS_NUM)
2752     {
2753       if (dump_file && (dump_flags & TDF_DETAILS))
2754 	fprintf (dump_file,
2755 		 "Loop %d not distributed: too many memory references.\n",
2756 		 loop->num);
2757 
2758       free_rdg (rdg);
2759       loop_nest.release ();
2760       free_data_refs (datarefs_vec);
2761       delete ddrs_table;
2762       return 0;
2763     }
2764 
2765   data_reference_p dref;
2766   for (i = 0; datarefs_vec.iterate (i, &dref); ++i)
2767     dref->aux = (void *) (uintptr_t) i;
2768 
2769   if (dump_file && (dump_flags & TDF_DETAILS))
2770     dump_rdg (dump_file, rdg);
2771 
2772   auto_vec<struct partition *, 3> partitions;
2773   rdg_build_partitions (rdg, stmts, &partitions);
2774 
2775   auto_vec<ddr_p> alias_ddrs;
2776 
2777   auto_bitmap stmt_in_all_partitions;
2778   bitmap_copy (stmt_in_all_partitions, partitions[0]->stmts);
2779   for (i = 1; partitions.iterate (i, &partition); ++i)
2780     bitmap_and_into (stmt_in_all_partitions, partitions[i]->stmts);
2781 
2782   any_builtin = false;
2783   FOR_EACH_VEC_ELT (partitions, i, partition)
2784     {
2785       classify_partition (loop, rdg, partition, stmt_in_all_partitions);
2786       any_builtin |= partition_builtin_p (partition);
2787     }
2788 
2789   /* If we are only distributing patterns but did not detect any,
2790      simply bail out.  */
2791   if (!flag_tree_loop_distribution
2792       && !any_builtin)
2793     {
2794       nbp = 0;
2795       goto ldist_done;
2796     }
2797 
2798   /* If we are only distributing patterns fuse all partitions that
2799      were not classified as builtins.  This also avoids chopping
2800      a loop into pieces, separated by builtin calls.  That is, we
2801      only want no or a single loop body remaining.  */
2802   struct partition *into;
2803   if (!flag_tree_loop_distribution)
2804     {
2805       for (i = 0; partitions.iterate (i, &into); ++i)
2806 	if (!partition_builtin_p (into))
2807 	  break;
2808       for (++i; partitions.iterate (i, &partition); ++i)
2809 	if (!partition_builtin_p (partition))
2810 	  {
2811 	    partition_merge_into (NULL, into, partition, FUSE_NON_BUILTIN);
2812 	    partitions.unordered_remove (i);
2813 	    partition_free (partition);
2814 	    i--;
2815 	  }
2816     }
2817 
2818   /* Due to limitations in the transform phase we have to fuse all
2819      reduction partitions into the last partition so the existing
2820      loop will contain all loop-closed PHI nodes.  */
2821   for (i = 0; partitions.iterate (i, &into); ++i)
2822     if (partition_reduction_p (into))
2823       break;
2824   for (i = i + 1; partitions.iterate (i, &partition); ++i)
2825     if (partition_reduction_p (partition))
2826       {
2827 	partition_merge_into (rdg, into, partition, FUSE_REDUCTION);
2828 	partitions.unordered_remove (i);
2829 	partition_free (partition);
2830 	i--;
2831       }
2832 
2833   /* Apply our simple cost model - fuse partitions with similar
2834      memory accesses.  */
2835   for (i = 0; partitions.iterate (i, &into); ++i)
2836     {
2837       bool changed = false;
2838       if (partition_builtin_p (into) || into->kind == PKIND_PARTIAL_MEMSET)
2839 	continue;
2840       for (int j = i + 1;
2841 	   partitions.iterate (j, &partition); ++j)
2842 	{
2843 	  if (share_memory_accesses (rdg, into, partition))
2844 	    {
2845 	      partition_merge_into (rdg, into, partition, FUSE_SHARE_REF);
2846 	      partitions.unordered_remove (j);
2847 	      partition_free (partition);
2848 	      j--;
2849 	      changed = true;
2850 	    }
2851 	}
2852       /* If in step 1 we fused 0 1 2 into "0,2 1" because 0 and 2 have
2853          similar accesses, while 1 and 2 have similar accesses but 0 and 1
2854 	 do not, then in the next iteration we will fail to consider merging
2855 	 1 into 0,2.  So try again if we did any merging into 0.  */
2856       if (changed)
2857 	i--;
2858     }
2859 
2860   /* Build the partition dependency graph and fuse partitions in strong
2861      connected component.  */
2862   if (partitions.length () > 1)
2863     {
2864       /* Don't support loop nest distribution under runtime alias check
2865 	 since it's not likely to enable many vectorization opportunities.  */
2866       if (loop->inner)
2867 	merge_dep_scc_partitions (rdg, &partitions, false);
2868       else
2869 	{
2870 	  merge_dep_scc_partitions (rdg, &partitions, true);
2871 	  if (partitions.length () > 1)
2872 	    break_alias_scc_partitions (rdg, &partitions, &alias_ddrs);
2873 	}
2874     }
2875 
2876   finalize_partitions (loop, &partitions, &alias_ddrs);
2877 
2878   nbp = partitions.length ();
2879   if (nbp == 0
2880       || (nbp == 1 && !partition_builtin_p (partitions[0]))
2881       || (nbp > 1 && partition_contains_all_rw (rdg, partitions)))
2882     {
2883       nbp = 0;
2884       goto ldist_done;
2885     }
2886 
2887   if (version_for_distribution_p (&partitions, &alias_ddrs))
2888     version_loop_by_alias_check (loop, &alias_ddrs);
2889 
2890   if (dump_file && (dump_flags & TDF_DETAILS))
2891     {
2892       fprintf (dump_file,
2893 	       "distribute loop <%d> into partitions:\n", loop->num);
2894       dump_rdg_partitions (dump_file, partitions);
2895     }
2896 
2897   FOR_EACH_VEC_ELT (partitions, i, partition)
2898     {
2899       if (partition_builtin_p (partition))
2900 	(*nb_calls)++;
2901       *destroy_p |= generate_code_for_partition (loop, partition, i < nbp - 1);
2902     }
2903 
2904  ldist_done:
2905   loop_nest.release ();
2906   free_data_refs (datarefs_vec);
2907   for (hash_table<ddr_hasher>::iterator iter = ddrs_table->begin ();
2908        iter != ddrs_table->end (); ++iter)
2909     {
2910       free_dependence_relation (*iter);
2911       *iter = NULL;
2912     }
2913   delete ddrs_table;
2914 
2915   FOR_EACH_VEC_ELT (partitions, i, partition)
2916     partition_free (partition);
2917 
2918   free_rdg (rdg);
2919   return nbp - *nb_calls;
2920 }
2921 
2922 /* Distribute all loops in the current function.  */
2923 
2924 namespace {
2925 
2926 const pass_data pass_data_loop_distribution =
2927 {
2928   GIMPLE_PASS, /* type */
2929   "ldist", /* name */
2930   OPTGROUP_LOOP, /* optinfo_flags */
2931   TV_TREE_LOOP_DISTRIBUTION, /* tv_id */
2932   ( PROP_cfg | PROP_ssa ), /* properties_required */
2933   0, /* properties_provided */
2934   0, /* properties_destroyed */
2935   0, /* todo_flags_start */
2936   0, /* todo_flags_finish */
2937 };
2938 
2939 class pass_loop_distribution : public gimple_opt_pass
2940 {
2941 public:
2942   pass_loop_distribution (gcc::context *ctxt)
2943     : gimple_opt_pass (pass_data_loop_distribution, ctxt)
2944   {}
2945 
2946   /* opt_pass methods: */
2947   virtual bool gate (function *)
2948     {
2949       return flag_tree_loop_distribution
2950 	|| flag_tree_loop_distribute_patterns;
2951     }
2952 
2953   virtual unsigned int execute (function *);
2954 
2955 }; // class pass_loop_distribution
2956 
2957 
2958 /* Given LOOP, this function records seed statements for distribution in
2959    WORK_LIST.  Return false if there is nothing for distribution.  */
2960 
2961 static bool
2962 find_seed_stmts_for_distribution (struct loop *loop, vec<gimple *> *work_list)
2963 {
2964   basic_block *bbs = get_loop_body_in_dom_order (loop);
2965 
2966   /* Initialize the worklist with stmts we seed the partitions with.  */
2967   for (unsigned i = 0; i < loop->num_nodes; ++i)
2968     {
2969       for (gphi_iterator gsi = gsi_start_phis (bbs[i]);
2970 	   !gsi_end_p (gsi); gsi_next (&gsi))
2971 	{
2972 	  gphi *phi = gsi.phi ();
2973 	  if (virtual_operand_p (gimple_phi_result (phi)))
2974 	    continue;
2975 	  /* Distribute stmts which have defs that are used outside of
2976 	     the loop.  */
2977 	  if (!stmt_has_scalar_dependences_outside_loop (loop, phi))
2978 	    continue;
2979 	  work_list->safe_push (phi);
2980 	}
2981       for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
2982 	   !gsi_end_p (gsi); gsi_next (&gsi))
2983 	{
2984 	  gimple *stmt = gsi_stmt (gsi);
2985 
2986 	  /* If there is a stmt with side-effects bail out - we
2987 	     cannot and should not distribute this loop.  */
2988 	  if (gimple_has_side_effects (stmt))
2989 	    {
2990 	      free (bbs);
2991 	      return false;
2992 	    }
2993 
2994 	  /* Distribute stmts which have defs that are used outside of
2995 	     the loop.  */
2996 	  if (stmt_has_scalar_dependences_outside_loop (loop, stmt))
2997 	    ;
2998 	  /* Otherwise only distribute stores for now.  */
2999 	  else if (!gimple_vdef (stmt))
3000 	    continue;
3001 
3002 	  work_list->safe_push (stmt);
3003 	}
3004     }
3005   free (bbs);
3006   return work_list->length () > 0;
3007 }
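
/* For illustration (a hypothetical loop, not taken from this pass): in

     for (i = 0; i < n; i++)
       {
	 a[i] = b[i] + c;
	 sum = sum + b[i];
       }

   the store to a[i] is a seed because it has a virtual definition, and the
   statement updating sum is a seed if sum is used after the loop.  Other
   statements, e.g. address computations, are not seeded here; they are
   pulled into partitions later via the RDG.  */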
3008 
3009 /* Given innermost LOOP, return the outermost enclosing loop that forms a
3010    perfect loop nest.  */
3011 
3012 static struct loop *
3013 prepare_perfect_loop_nest (struct loop *loop)
3014 {
3015   struct loop *outer = loop_outer (loop);
3016   tree niters = number_of_latch_executions (loop);
3017 
3018   /* TODO: We only support the innermost 3-level loop nest distribution
3019      because of compilation time issue for now.  This should be relaxed
3020      in the future.  Note we only allow 3-level loop nest distribution
3021      when parallelizing loops.  */
3022   while ((loop->inner == NULL
3023 	  || (loop->inner->inner == NULL && flag_tree_parallelize_loops > 1))
3024 	 && loop_outer (outer)
3025 	 && outer->inner == loop && loop->next == NULL
3026 	 && single_exit (outer)
3027 	 && optimize_loop_for_speed_p (outer)
3028 	 && !chrec_contains_symbols_defined_in_loop (niters, outer->num)
3029 	 && (niters = number_of_latch_executions (outer)) != NULL_TREE
3030 	 && niters != chrec_dont_know)
3031     {
3032       loop = outer;
3033       outer = loop_outer (loop);
3034     }
3035 
3036   return loop;
3037 }
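
/* For illustration (a hypothetical nest): starting from the innermost loop
   of

     for (i = 0; i < m; i++)
       for (j = 0; j < n; j++)
	 a[i][j] = 0.0;

   the walk above moves outward while each enclosing loop contains nothing
   but the next inner loop, has a single exit and computable niters, and
   returns the i-loop, so that the whole nest (rather than just the j-loop)
   can be considered for distribution, e.g. into a single memset.  */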
3038 
3039 unsigned int
3040 pass_loop_distribution::execute (function *fun)
3041 {
3042   struct loop *loop;
3043   bool changed = false;
3044   basic_block bb;
3045   control_dependences *cd = NULL;
3046   auto_vec<loop_p> loops_to_be_destroyed;
3047 
3048   if (number_of_loops (fun) <= 1)
3049     return 0;
3050 
3051   /* Compute topological order for basic blocks.  Topological order is
3052      needed because data dependence is computed for data references in
3053      lexicographical order.  */
3054   if (bb_top_order_index == NULL)
3055     {
3056       int rpo_num;
3057       int *rpo = XNEWVEC (int, last_basic_block_for_fn (cfun));
3058 
3059       bb_top_order_index = XNEWVEC (int, last_basic_block_for_fn (cfun));
3060       bb_top_order_index_size = last_basic_block_for_fn (cfun);
3061       rpo_num = pre_and_rev_post_order_compute_fn (cfun, NULL, rpo, true);
3062       for (int i = 0; i < rpo_num; i++)
3063 	bb_top_order_index[rpo[i]] = i;
3064 
3065       free (rpo);
3066     }
3067 
3068   FOR_ALL_BB_FN (bb, fun)
3069     {
3070       gimple_stmt_iterator gsi;
3071       for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
3072 	gimple_set_uid (gsi_stmt (gsi), -1);
3073       for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
3074 	gimple_set_uid (gsi_stmt (gsi), -1);
3075     }
3076 
3077   /* We can at the moment only distribute non-nested loops, thus restrict
3078      walking to innermost loops.  */
3079   FOR_EACH_LOOP (loop, LI_ONLY_INNERMOST)
3080     {
3081       /* Don't distribute loops with multiple exit edges, or cold loops.  */
3082       if (!single_exit (loop)
3083 	  || !optimize_loop_for_speed_p (loop))
3084 	continue;
3085 
3086       /* Don't distribute loop if niters is unknown.  */
3087       tree niters = number_of_latch_executions (loop);
3088       if (niters == NULL_TREE || niters == chrec_dont_know)
3089 	continue;
3090 
3091       /* Get the perfect loop nest for distribution.  */
3092       loop = prepare_perfect_loop_nest (loop);
3093       for (; loop; loop = loop->inner)
3094 	{
3095 	  auto_vec<gimple *> work_list;
3096 	  if (!find_seed_stmts_for_distribution (loop, &work_list))
3097 	    break;
3098 
3099 	  const char *str = loop->inner ? " nest" : "";
3100 	  location_t loc = find_loop_location (loop);
3101 	  if (!cd)
3102 	    {
3103 	      calculate_dominance_info (CDI_DOMINATORS);
3104 	      calculate_dominance_info (CDI_POST_DOMINATORS);
3105 	      cd = new control_dependences ();
3106 	      free_dominance_info (CDI_POST_DOMINATORS);
3107 	    }
3108 
3109 	  bool destroy_p;
3110 	  int nb_generated_loops, nb_generated_calls;
3111 	  nb_generated_loops = distribute_loop (loop, work_list, cd,
3112 						&nb_generated_calls,
3113 						&destroy_p);
3114 	  if (destroy_p)
3115 	    loops_to_be_destroyed.safe_push (loop);
3116 
3117 	  if (nb_generated_loops + nb_generated_calls > 0)
3118 	    {
3119 	      changed = true;
3120 	      dump_printf_loc (MSG_OPTIMIZED_LOCATIONS,
3121 			       loc, "Loop%s %d distributed: split to %d loops "
3122 			       "and %d library calls.\n", str, loop->num,
3123 			       nb_generated_loops, nb_generated_calls);
3124 
3125 	      break;
3126 	    }
3127 
3128 	  if (dump_file && (dump_flags & TDF_DETAILS))
3129 	    fprintf (dump_file, "Loop%s %d not distributed.\n", str, loop->num);
3130 	}
3131     }
3132 
3133   if (cd)
3134     delete cd;
3135 
3136   if (bb_top_order_index != NULL)
3137     {
3138       free (bb_top_order_index);
3139       bb_top_order_index = NULL;
3140       bb_top_order_index_size = 0;
3141     }
3142 
3143   if (changed)
3144     {
3145       /* Destroy loop bodies that could not be reused.  Do this late as we
3146 	 otherwise can end up referring to stale data in control dependences.  */
3147       unsigned i;
3148       FOR_EACH_VEC_ELT (loops_to_be_destroyed, i, loop)
3149 	destroy_loop (loop);
3150 
3151       /* Cached scalar evolutions now may refer to wrong or non-existing
3152 	 loops.  */
3153       scev_reset_htab ();
3154       mark_virtual_operands_for_renaming (fun);
3155       rewrite_into_loop_closed_ssa (NULL, TODO_update_ssa);
3156     }
3157 
3158   checking_verify_loop_structure ();
3159 
3160   return changed ? TODO_cleanup_cfg : 0;
3161 }
3162 
3163 } // anon namespace
3164 
3165 gimple_opt_pass *
3166 make_pass_loop_distribution (gcc::context *ctxt)
3167 {
3168   return new pass_loop_distribution (ctxt);
3169 }
3170