1 /* Loop distribution.
2 Copyright (C) 2006-2018 Free Software Foundation, Inc.
3 Contributed by Georges-Andre Silber <Georges-Andre.Silber@ensmp.fr>
4 and Sebastian Pop <sebastian.pop@amd.com>.
5
6 This file is part of GCC.
7
8 GCC is free software; you can redistribute it and/or modify it
9 under the terms of the GNU General Public License as published by the
10 Free Software Foundation; either version 3, or (at your option) any
11 later version.
12
13 GCC is distributed in the hope that it will be useful, but WITHOUT
14 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
21
22 /* This pass performs loop distribution: for example, the loop
23
24 |DO I = 2, N
25 | A(I) = B(I) + C
26 | D(I) = A(I-1)*E
27 |ENDDO
28
29 is transformed to
30
31 |DOALL I = 2, N
32 | A(I) = B(I) + C
33 |ENDDO
34 |
35 |DOALL I = 2, N
36 | D(I) = A(I-1)*E
37 |ENDDO
38
39 Loop distribution is the dual of loop fusion. It separates the
40 statements of a loop (or loop nest) into multiple loops (or loop
41 nests) with the same loop header. The major goal is to separate
42 statements that may be vectorized from those that cannot be. This
43 pass implements distribution in the following steps:
44
45 1) Seed partitions with statements of specific types. For now we support
46 two kinds of seed statements: statements defining a variable used
47 outside of the loop, and statements storing to memory.
48 2) Build the reduced dependence graph (RDG) for the loop to be distributed.
49 The vertices (RDG:V) model all statements in the loop and the edges
50 (RDG:E) model flow and control dependencies between statements.
51 3) Apart from the RDG, compute data dependencies between memory references.
52 4) Starting from a seed statement, build up a partition by adding
53 dependent statements according to the RDG's dependence information.
54 A partition is classified as parallel if it can be executed in
55 parallel, or as sequential if it can't. A parallel partition is
56 further classified into different builtin kinds if it can be
57 implemented as builtin function calls.
58 5) Build partition dependence graph (PG) based on data dependencies.
59 The vertices (PG:V) model all partitions and the edges (PG:E) model
60 all data dependencies between every pair of partitions. In general,
61 a data dependence is either known or unknown at compilation time.
62 In C family languages, there are quite a few compile-time unknown
63 dependencies because of possible aliasing between data references.
64 We categorize PG's edges into two types: "true" edges, representing
65 compile-time known data dependencies, and "alias" edges for all
66 other data dependencies.
67 6) Traverse the subgraph of PG as if all "alias" edges didn't exist.
68 Merge partitions in each strongly connected component (SCC)
69 correspondingly. Build a new PG for the merged partitions.
70 7) Traverse PG again, this time with both "true" and "alias" edges
71 included. We try to break SCCs by removing some edges. Because
72 SCCs formed by "true" edges are all fused in step 6), we can break
73 SCCs by removing some "alias" edges. It's NP-hard to choose the
74 optimal edge set; fortunately, a simple approximation is good
75 enough given the small problem size.
76 8) Collect all data dependencies of the removed "alias" edges. Create
77 runtime alias checks for the collected data dependencies.
78 9) Version the loop under the condition of the runtime alias checks.
79 Since loop distribution generally introduces overhead, it is only
80 useful if vectorization is achieved in a distributed loop. We
81 version the loop with the internal function call IFN_LOOP_DIST_ALIAS.
82 If no distributed loop can be vectorized, we simply remove the
83 distributed loops and fall back to the original one, as sketched below.
84
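As a rough illustration of steps 5) to 9) -- this is only a sketch with
hypothetical arrays A, B and C, not an exact dump of what the pass
emits -- consider a C loop in which A may alias B or C:

|for (i = 0; i < n; i++)
|  {
|    A[i] = 0;
|    B[i] = C[i] + 1;
|  }

The only dependence between the two statements is the possible aliasing,
so it becomes an "alias" edge in PG. After breaking it, the loop is
versioned on the runtime alias checks and distributed roughly as:

|if (IFN_LOOP_DIST_ALIAS (the runtime alias checks))
|  {
|    memset (A, 0, n * sizeof (A[0]));
|    for (i = 0; i < n; i++)
|      B[i] = C[i] + 1;
|  }
|else
|  original loop

If the remaining distributed loop is not vectorized, the distributed
copies are removed again and the original loop is kept.
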
85 TODO:
86 1) We only distribute innermost two-level loop nests now. We should
87 extend this to arbitrary loop nests in the future.
88 2) We only fuse partitions in the same SCC now. A better fusion
89 algorithm is desired to minimize loop overhead, maximize parallelism
90 and maximize data reuse. */
91
92 #include "config.h"
93 #define INCLUDE_ALGORITHM /* stable_sort */
94 #include "system.h"
95 #include "coretypes.h"
96 #include "backend.h"
97 #include "tree.h"
98 #include "gimple.h"
99 #include "cfghooks.h"
100 #include "tree-pass.h"
101 #include "ssa.h"
102 #include "gimple-pretty-print.h"
103 #include "fold-const.h"
104 #include "cfganal.h"
105 #include "gimple-iterator.h"
106 #include "gimplify-me.h"
107 #include "stor-layout.h"
108 #include "tree-cfg.h"
109 #include "tree-ssa-loop-manip.h"
110 #include "tree-ssa-loop-ivopts.h"
111 #include "tree-ssa-loop.h"
112 #include "tree-into-ssa.h"
113 #include "tree-ssa.h"
114 #include "cfgloop.h"
115 #include "tree-scalar-evolution.h"
116 #include "params.h"
117 #include "tree-vectorizer.h"
118 #include "tree-eh.h"
119
120
121 #define MAX_DATAREFS_NUM \
122 ((unsigned) PARAM_VALUE (PARAM_LOOP_MAX_DATAREFS_FOR_DATADEPS))
123
124 /* Threshold controlling the number of distributed partitions. Since it
125 may become unnecessary if a memory stream cost model is invented in the
126 future, we define it as a temporary macro rather than a parameter. */
127 #define NUM_PARTITION_THRESHOLD (4)
128
129 /* Hashtable helpers. */
130
131 struct ddr_hasher : nofree_ptr_hash <struct data_dependence_relation>
132 {
133 static inline hashval_t hash (const data_dependence_relation *);
134 static inline bool equal (const data_dependence_relation *,
135 const data_dependence_relation *);
136 };
137
138 /* Hash function for data dependence. */
139
140 inline hashval_t
141 ddr_hasher::hash (const data_dependence_relation *ddr)
142 {
143 inchash::hash h;
144 h.add_ptr (DDR_A (ddr));
145 h.add_ptr (DDR_B (ddr));
146 return h.end ();
147 }
148
149 /* Hash table equality function for data dependence. */
150
151 inline bool
152 ddr_hasher::equal (const data_dependence_relation *ddr1,
153 const data_dependence_relation *ddr2)
154 {
155 return (DDR_A (ddr1) == DDR_A (ddr2) && DDR_B (ddr1) == DDR_B (ddr2));
156 }
157
158 /* The loop (nest) to be distributed. */
159 static vec<loop_p> loop_nest;
160
161 /* Vector of data references in the loop to be distributed. */
162 static vec<data_reference_p> datarefs_vec;
163
164 /* Store index of data reference in aux field. */
165 #define DR_INDEX(dr) ((uintptr_t) (dr)->aux)
166
167 /* Hash table for data dependence relation in the loop to be distributed. */
168 static hash_table<ddr_hasher> *ddrs_table;
169
170 /* A Reduced Dependence Graph (RDG) vertex representing a statement. */
171 struct rdg_vertex
172 {
173 /* The statement represented by this vertex. */
174 gimple *stmt;
175
176 /* Vector of data-references in this statement. */
177 vec<data_reference_p> datarefs;
178
179 /* True when the statement contains a write to memory. */
180 bool has_mem_write;
181
182 /* True when the statement contains a read from memory. */
183 bool has_mem_reads;
184 };
185
186 #define RDGV_STMT(V) ((struct rdg_vertex *) ((V)->data))->stmt
187 #define RDGV_DATAREFS(V) ((struct rdg_vertex *) ((V)->data))->datarefs
188 #define RDGV_HAS_MEM_WRITE(V) ((struct rdg_vertex *) ((V)->data))->has_mem_write
189 #define RDGV_HAS_MEM_READS(V) ((struct rdg_vertex *) ((V)->data))->has_mem_reads
190 #define RDG_STMT(RDG, I) RDGV_STMT (&(RDG->vertices[I]))
191 #define RDG_DATAREFS(RDG, I) RDGV_DATAREFS (&(RDG->vertices[I]))
192 #define RDG_MEM_WRITE_STMT(RDG, I) RDGV_HAS_MEM_WRITE (&(RDG->vertices[I]))
193 #define RDG_MEM_READS_STMT(RDG, I) RDGV_HAS_MEM_READS (&(RDG->vertices[I]))
194
195 /* Data dependence type. */
196
197 enum rdg_dep_type
198 {
199 /* Read After Write (RAW). */
200 flow_dd = 'f',
201
202 /* Control dependence (execute conditional on). */
203 control_dd = 'c'
204 };
205
206 /* Dependence information attached to an edge of the RDG. */
207
208 struct rdg_edge
209 {
210 /* Type of the dependence. */
211 enum rdg_dep_type type;
212 };
213
214 #define RDGE_TYPE(E) ((struct rdg_edge *) ((E)->data))->type
215
216 /* Dump vertex I in RDG to FILE. */
217
218 static void
219 dump_rdg_vertex (FILE *file, struct graph *rdg, int i)
220 {
221 struct vertex *v = &(rdg->vertices[i]);
222 struct graph_edge *e;
223
224 fprintf (file, "(vertex %d: (%s%s) (in:", i,
225 RDG_MEM_WRITE_STMT (rdg, i) ? "w" : "",
226 RDG_MEM_READS_STMT (rdg, i) ? "r" : "");
227
228 if (v->pred)
229 for (e = v->pred; e; e = e->pred_next)
230 fprintf (file, " %d", e->src);
231
232 fprintf (file, ") (out:");
233
234 if (v->succ)
235 for (e = v->succ; e; e = e->succ_next)
236 fprintf (file, " %d", e->dest);
237
238 fprintf (file, ")\n");
239 print_gimple_stmt (file, RDGV_STMT (v), 0, TDF_VOPS|TDF_MEMSYMS);
240 fprintf (file, ")\n");
241 }
242
243 /* Call dump_rdg_vertex on stderr. */
244
245 DEBUG_FUNCTION void
246 debug_rdg_vertex (struct graph *rdg, int i)
247 {
248 dump_rdg_vertex (stderr, rdg, i);
249 }
250
251 /* Dump the reduced dependence graph RDG to FILE. */
252
253 static void
254 dump_rdg (FILE *file, struct graph *rdg)
255 {
256 fprintf (file, "(rdg\n");
257 for (int i = 0; i < rdg->n_vertices; i++)
258 dump_rdg_vertex (file, rdg, i);
259 fprintf (file, ")\n");
260 }
261
262 /* Call dump_rdg on stderr. */
263
264 DEBUG_FUNCTION void
265 debug_rdg (struct graph *rdg)
266 {
267 dump_rdg (stderr, rdg);
268 }
269
270 static void
271 dot_rdg_1 (FILE *file, struct graph *rdg)
272 {
273 int i;
274 pretty_printer buffer;
275 pp_needs_newline (&buffer) = false;
276 buffer.buffer->stream = file;
277
278 fprintf (file, "digraph RDG {\n");
279
280 for (i = 0; i < rdg->n_vertices; i++)
281 {
282 struct vertex *v = &(rdg->vertices[i]);
283 struct graph_edge *e;
284
285 fprintf (file, "%d [label=\"[%d] ", i, i);
286 pp_gimple_stmt_1 (&buffer, RDGV_STMT (v), 0, TDF_SLIM);
287 pp_flush (&buffer);
288 fprintf (file, "\"]\n");
289
290 /* Highlight reads from memory. */
291 if (RDG_MEM_READS_STMT (rdg, i))
292 fprintf (file, "%d [style=filled, fillcolor=green]\n", i);
293
294 /* Highlight stores to memory. */
295 if (RDG_MEM_WRITE_STMT (rdg, i))
296 fprintf (file, "%d [style=filled, fillcolor=red]\n", i);
297
298 if (v->succ)
299 for (e = v->succ; e; e = e->succ_next)
300 switch (RDGE_TYPE (e))
301 {
302 case flow_dd:
303 /* These are the most common dependences: don't print these. */
304 fprintf (file, "%d -> %d \n", i, e->dest);
305 break;
306
307 case control_dd:
308 fprintf (file, "%d -> %d [label=control] \n", i, e->dest);
309 break;
310
311 default:
312 gcc_unreachable ();
313 }
314 }
315
316 fprintf (file, "}\n\n");
317 }
318
319 /* Display the Reduced Dependence Graph using dotty. */
320
321 DEBUG_FUNCTION void
322 dot_rdg (struct graph *rdg)
323 {
324 /* When debugging, you may want to enable the following code. */
325 #ifdef HAVE_POPEN
326 FILE *file = popen ("dot -Tx11", "w");
327 if (!file)
328 return;
329 dot_rdg_1 (file, rdg);
330 fflush (file);
331 close (fileno (file));
332 pclose (file);
333 #else
334 dot_rdg_1 (stderr, rdg);
335 #endif
336 }
337
338 /* Returns the index of STMT in RDG. */
339
340 static int
341 rdg_vertex_for_stmt (struct graph *rdg ATTRIBUTE_UNUSED, gimple *stmt)
342 {
343 int index = gimple_uid (stmt);
344 gcc_checking_assert (index == -1 || RDG_STMT (rdg, index) == stmt);
345 return index;
346 }
347
348 /* Creates dependence edges in RDG for all the uses of DEF. IDEF is
349 the index of DEF in RDG. */
350
351 static void
352 create_rdg_edges_for_scalar (struct graph *rdg, tree def, int idef)
353 {
354 use_operand_p imm_use_p;
355 imm_use_iterator iterator;
356
357 FOR_EACH_IMM_USE_FAST (imm_use_p, iterator, def)
358 {
359 struct graph_edge *e;
360 int use = rdg_vertex_for_stmt (rdg, USE_STMT (imm_use_p));
361
362 if (use < 0)
363 continue;
364
365 e = add_edge (rdg, idef, use);
366 e->data = XNEW (struct rdg_edge);
367 RDGE_TYPE (e) = flow_dd;
368 }
369 }
370
371 /* Creates an edge for the control dependences of BB to the vertex V. */
372
373 static void
374 create_edge_for_control_dependence (struct graph *rdg, basic_block bb,
375 int v, control_dependences *cd)
376 {
377 bitmap_iterator bi;
378 unsigned edge_n;
379 EXECUTE_IF_SET_IN_BITMAP (cd->get_edges_dependent_on (bb->index),
380 0, edge_n, bi)
381 {
382 basic_block cond_bb = cd->get_edge_src (edge_n);
383 gimple *stmt = last_stmt (cond_bb);
384 if (stmt && is_ctrl_stmt (stmt))
385 {
386 struct graph_edge *e;
387 int c = rdg_vertex_for_stmt (rdg, stmt);
388 if (c < 0)
389 continue;
390
391 e = add_edge (rdg, c, v);
392 e->data = XNEW (struct rdg_edge);
393 RDGE_TYPE (e) = control_dd;
394 }
395 }
396 }
397
398 /* Creates the edges of the reduced dependence graph RDG. */
399
400 static void
401 create_rdg_flow_edges (struct graph *rdg)
402 {
403 int i;
404 def_operand_p def_p;
405 ssa_op_iter iter;
406
407 for (i = 0; i < rdg->n_vertices; i++)
408 FOR_EACH_PHI_OR_STMT_DEF (def_p, RDG_STMT (rdg, i),
409 iter, SSA_OP_DEF)
410 create_rdg_edges_for_scalar (rdg, DEF_FROM_PTR (def_p), i);
411 }
412
413 /* Creates the control dependence edges of the reduced dependence graph RDG. */
414
415 static void
416 create_rdg_cd_edges (struct graph *rdg, control_dependences *cd, loop_p loop)
417 {
418 int i;
419
420 for (i = 0; i < rdg->n_vertices; i++)
421 {
422 gimple *stmt = RDG_STMT (rdg, i);
423 if (gimple_code (stmt) == GIMPLE_PHI)
424 {
425 edge_iterator ei;
426 edge e;
427 FOR_EACH_EDGE (e, ei, gimple_bb (stmt)->preds)
428 if (flow_bb_inside_loop_p (loop, e->src))
429 create_edge_for_control_dependence (rdg, e->src, i, cd);
430 }
431 else
432 create_edge_for_control_dependence (rdg, gimple_bb (stmt), i, cd);
433 }
434 }
435
436 /* Build the vertices of the reduced dependence graph RDG. Return false
437 if that failed. */
438
439 static bool
440 create_rdg_vertices (struct graph *rdg, vec<gimple *> stmts, loop_p loop)
441 {
442 int i;
443 gimple *stmt;
444
445 FOR_EACH_VEC_ELT (stmts, i, stmt)
446 {
447 struct vertex *v = &(rdg->vertices[i]);
448
449 /* Record statement to vertex mapping. */
450 gimple_set_uid (stmt, i);
451
452 v->data = XNEW (struct rdg_vertex);
453 RDGV_STMT (v) = stmt;
454 RDGV_DATAREFS (v).create (0);
455 RDGV_HAS_MEM_WRITE (v) = false;
456 RDGV_HAS_MEM_READS (v) = false;
457 if (gimple_code (stmt) == GIMPLE_PHI)
458 continue;
459
460 unsigned drp = datarefs_vec.length ();
461 if (!find_data_references_in_stmt (loop, stmt, &datarefs_vec))
462 return false;
463 for (unsigned j = drp; j < datarefs_vec.length (); ++j)
464 {
465 data_reference_p dr = datarefs_vec[j];
466 if (DR_IS_READ (dr))
467 RDGV_HAS_MEM_READS (v) = true;
468 else
469 RDGV_HAS_MEM_WRITE (v) = true;
470 RDGV_DATAREFS (v).safe_push (dr);
471 }
472 }
473 return true;
474 }
475
476 /* Array mapping basic block's index to its topological order. */
477 static int *bb_top_order_index;
478 /* And size of the array. */
479 static int bb_top_order_index_size;
480
481 /* If X has a smaller topological sort number than Y, returns -1;
482 if greater, returns 1. */
483
484 static int
485 bb_top_order_cmp (const void *x, const void *y)
486 {
487 basic_block bb1 = *(const basic_block *) x;
488 basic_block bb2 = *(const basic_block *) y;
489
490 gcc_assert (bb1->index < bb_top_order_index_size
491 && bb2->index < bb_top_order_index_size);
492 gcc_assert (bb1 == bb2
493 || bb_top_order_index[bb1->index]
494 != bb_top_order_index[bb2->index]);
495
496 return (bb_top_order_index[bb1->index] - bb_top_order_index[bb2->index]);
497 }
498
499 /* Initialize STMTS with all the statements of LOOP. We use topological
500 order to discover all statements. The order is important because
501 generate_loops_for_partition uses the same traversal to identify
502 statements in loop copies. */
503
504 static void
505 stmts_from_loop (struct loop *loop, vec<gimple *> *stmts)
506 {
507 unsigned int i;
508 basic_block *bbs = get_loop_body_in_custom_order (loop, bb_top_order_cmp);
509
510 for (i = 0; i < loop->num_nodes; i++)
511 {
512 basic_block bb = bbs[i];
513
514 for (gphi_iterator bsi = gsi_start_phis (bb); !gsi_end_p (bsi);
515 gsi_next (&bsi))
516 if (!virtual_operand_p (gimple_phi_result (bsi.phi ())))
517 stmts->safe_push (bsi.phi ());
518
519 for (gimple_stmt_iterator bsi = gsi_start_bb (bb); !gsi_end_p (bsi);
520 gsi_next (&bsi))
521 {
522 gimple *stmt = gsi_stmt (bsi);
523 if (gimple_code (stmt) != GIMPLE_LABEL && !is_gimple_debug (stmt))
524 stmts->safe_push (stmt);
525 }
526 }
527
528 free (bbs);
529 }
530
531 /* Free the reduced dependence graph RDG. */
532
533 static void
534 free_rdg (struct graph *rdg)
535 {
536 int i;
537
538 for (i = 0; i < rdg->n_vertices; i++)
539 {
540 struct vertex *v = &(rdg->vertices[i]);
541 struct graph_edge *e;
542
543 for (e = v->succ; e; e = e->succ_next)
544 free (e->data);
545
546 if (v->data)
547 {
548 gimple_set_uid (RDGV_STMT (v), -1);
549 (RDGV_DATAREFS (v)).release ();
550 free (v->data);
551 }
552 }
553
554 free_graph (rdg);
555 }
556
557 /* Build the Reduced Dependence Graph (RDG) with one vertex per statement of
558 LOOP, and one edge per flow dependence or control dependence from control
559 dependence CD. While visiting each statement, data references are also
560 collected and recorded in the global vector DATAREFS_VEC. */
561
562 static struct graph *
563 build_rdg (struct loop *loop, control_dependences *cd)
564 {
565 struct graph *rdg;
566
567 /* Create the RDG vertices from the stmts of the loop nest. */
568 auto_vec<gimple *, 10> stmts;
569 stmts_from_loop (loop, &stmts);
570 rdg = new_graph (stmts.length ());
571 if (!create_rdg_vertices (rdg, stmts, loop))
572 {
573 free_rdg (rdg);
574 return NULL;
575 }
576 stmts.release ();
577
578 create_rdg_flow_edges (rdg);
579 if (cd)
580 create_rdg_cd_edges (rdg, cd, loop);
581
582 return rdg;
583 }
584
585
586 /* Kind of distributed loop. */
587 enum partition_kind {
588 PKIND_NORMAL,
589 /* Partial memset describes a partition that could be distributed into a
590 loop of memset calls, rather than a single memset call. It's handled
591 just like a normal partition, i.e., distributed as a separate loop; no
592 memset call is generated. See the illustration following this enum.
593
594 Note: This is a hackish fix trying to distribute the ZERO-ing stmt in
595 a loop nest as deep as possible. As a result, parloop achieves better
596 parallelization by parallelizing a deeper loop nest. This hack should
597 be unnecessary and removed once distributed memset can be understood
598 and analyzed in data reference analysis. See PR82604 for more. */
599 PKIND_PARTIAL_MEMSET,
600 PKIND_MEMSET, PKIND_MEMCPY, PKIND_MEMMOVE
601 };
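
/* As a rough illustration of the PKIND_PARTIAL_MEMSET case above (a
hypothetical example, not taken from any testcase): for

|int a[N][M];
|for (i = 0; i < N; i++)
|  for (j = 0; j < M - 1; j++)
|    a[i][j] = 0;

the store is consecutive within the innermost loop, but each row is not
written completely, so the whole nest cannot be turned into a single
memset. The partition is still distributed as its own loop (no memset
call is emitted), which e.g. allows parloops to parallelize the zeroing
nest. */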
602
603 /* Type of distributed loop. */
604 enum partition_type {
605 /* The distributed loop can be executed in parallel. */
606 PTYPE_PARALLEL = 0,
607 /* The distributed loop has to be executed sequentially. */
608 PTYPE_SEQUENTIAL
609 };
610
611 /* Builtin info for loop distribution. */
612 struct builtin_info
613 {
614 /* Data references that a partition of kind != PKIND_NORMAL operates on. */
615 data_reference_p dst_dr;
616 data_reference_p src_dr;
617 /* Base address and size of memory objects operated on by the builtin.
618 Note both dest and source memory objects must have the same size. */
619 tree dst_base;
620 tree src_base;
621 tree size;
622 /* Base and offset part of dst_base after stripping constant offset. This
623 is only used in memset builtin distribution for now. */
624 tree dst_base_base;
625 unsigned HOST_WIDE_INT dst_base_offset;
626 };
627
628 /* Partition for loop distribution. */
629 struct partition
630 {
631 /* Statements of the partition. */
632 bitmap stmts;
633 /* True if the partition defines a variable which is used outside of the loop. */
634 bool reduction_p;
635 enum partition_kind kind;
636 enum partition_type type;
637 /* Data references in the partition. */
638 bitmap datarefs;
639 /* Information about the builtin partition. */
640 struct builtin_info *builtin;
641 };
642
643
644 /* Allocate and initialize a partition. */
645
646 static partition *
647 partition_alloc (void)
648 {
649 partition *partition = XCNEW (struct partition);
650 partition->stmts = BITMAP_ALLOC (NULL);
651 partition->reduction_p = false;
652 partition->kind = PKIND_NORMAL;
653 partition->datarefs = BITMAP_ALLOC (NULL);
654 return partition;
655 }
656
657 /* Free PARTITION. */
658
659 static void
660 partition_free (partition *partition)
661 {
662 BITMAP_FREE (partition->stmts);
663 BITMAP_FREE (partition->datarefs);
664 if (partition->builtin)
665 free (partition->builtin);
666
667 free (partition);
668 }
669
670 /* Returns true if the partition can be generated as a builtin. */
671
672 static bool
673 partition_builtin_p (partition *partition)
674 {
675 return partition->kind > PKIND_PARTIAL_MEMSET;
676 }
677
678 /* Returns true if the partition contains a reduction. */
679
680 static bool
681 partition_reduction_p (partition *partition)
682 {
683 return partition->reduction_p;
684 }
685
686 /* Partitions are fused for different reasons. */
687 enum fuse_type
688 {
689 FUSE_NON_BUILTIN = 0,
690 FUSE_REDUCTION = 1,
691 FUSE_SHARE_REF = 2,
692 FUSE_SAME_SCC = 3,
693 FUSE_FINALIZE = 4
694 };
695
696 /* Descriptions of the different fusing reasons. */
697 static const char *fuse_message[] = {
698 "they are non-builtins",
699 "they have reductions",
700 "they have shared memory refs",
701 "they are in the same dependence scc",
702 "there is no point to distribute loop"};
703
704 static void
705 update_type_for_merge (struct graph *, partition *, partition *);
706
707 /* Merge PARTITION into the partition DEST. RDG is the reduced dependence
708 graph; if it is non-NULL, we update the type of the result partition. */
709
710 static void
711 partition_merge_into (struct graph *rdg, partition *dest,
712 partition *partition, enum fuse_type ft)
713 {
714 if (dump_file && (dump_flags & TDF_DETAILS))
715 {
716 fprintf (dump_file, "Fuse partitions because %s:\n", fuse_message[ft]);
717 fprintf (dump_file, " Part 1: ");
718 dump_bitmap (dump_file, dest->stmts);
719 fprintf (dump_file, " Part 2: ");
720 dump_bitmap (dump_file, partition->stmts);
721 }
722
723 dest->kind = PKIND_NORMAL;
724 if (dest->type == PTYPE_PARALLEL)
725 dest->type = partition->type;
726
727 bitmap_ior_into (dest->stmts, partition->stmts);
728 if (partition_reduction_p (partition))
729 dest->reduction_p = true;
730
731 /* Further check if any data dependence prevents us from executing the
732 new partition in parallel. */
733 if (dest->type == PTYPE_PARALLEL && rdg != NULL)
734 update_type_for_merge (rdg, dest, partition);
735
736 bitmap_ior_into (dest->datarefs, partition->datarefs);
737 }
738
739
740 /* Returns true when DEF is an SSA_NAME defined in LOOP and used after
741 the LOOP. */
742
743 static bool
744 ssa_name_has_uses_outside_loop_p (tree def, loop_p loop)
745 {
746 imm_use_iterator imm_iter;
747 use_operand_p use_p;
748
749 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, def)
750 {
751 if (is_gimple_debug (USE_STMT (use_p)))
752 continue;
753
754 basic_block use_bb = gimple_bb (USE_STMT (use_p));
755 if (!flow_bb_inside_loop_p (loop, use_bb))
756 return true;
757 }
758
759 return false;
760 }
761
762 /* Returns true when STMT defines a scalar variable used after the
763 loop LOOP. */
764
765 static bool
766 stmt_has_scalar_dependences_outside_loop (loop_p loop, gimple *stmt)
767 {
768 def_operand_p def_p;
769 ssa_op_iter op_iter;
770
771 if (gimple_code (stmt) == GIMPLE_PHI)
772 return ssa_name_has_uses_outside_loop_p (gimple_phi_result (stmt), loop);
773
774 FOR_EACH_SSA_DEF_OPERAND (def_p, stmt, op_iter, SSA_OP_DEF)
775 if (ssa_name_has_uses_outside_loop_p (DEF_FROM_PTR (def_p), loop))
776 return true;
777
778 return false;
779 }
780
781 /* Return a copy of LOOP placed before LOOP. */
782
783 static struct loop *
784 copy_loop_before (struct loop *loop)
785 {
786 struct loop *res;
787 edge preheader = loop_preheader_edge (loop);
788
789 initialize_original_copy_tables ();
790 res = slpeel_tree_duplicate_loop_to_edge_cfg (loop, NULL, preheader);
791 gcc_assert (res != NULL);
792 free_original_copy_tables ();
793 delete_update_ssa ();
794
795 return res;
796 }
797
798 /* Creates an empty basic block after LOOP. */
799
800 static void
801 create_bb_after_loop (struct loop *loop)
802 {
803 edge exit = single_exit (loop);
804
805 if (!exit)
806 return;
807
808 split_edge (exit);
809 }
810
811 /* Generate code for PARTITION from the code in LOOP. The loop is
812 copied when COPY_P is true. All the statements not flagged in the
813 PARTITION bitmap are removed from the loop or from its copy. The
814 statements are indexed in sequence inside a basic block, and the
815 basic blocks of a loop are taken in dom order. */
816
817 static void
818 generate_loops_for_partition (struct loop *loop, partition *partition,
819 bool copy_p)
820 {
821 unsigned i;
822 basic_block *bbs;
823
824 if (copy_p)
825 {
826 int orig_loop_num = loop->orig_loop_num;
827 loop = copy_loop_before (loop);
828 gcc_assert (loop != NULL);
829 loop->orig_loop_num = orig_loop_num;
830 create_preheader (loop, CP_SIMPLE_PREHEADERS);
831 create_bb_after_loop (loop);
832 }
833 else
834 {
835 /* Origin number is set to the new versioned loop's num. */
836 gcc_assert (loop->orig_loop_num != loop->num);
837 }
838
839 /* Remove stmts not in the PARTITION bitmap. */
840 bbs = get_loop_body_in_dom_order (loop);
841
842 if (MAY_HAVE_DEBUG_BIND_STMTS)
843 for (i = 0; i < loop->num_nodes; i++)
844 {
845 basic_block bb = bbs[i];
846
847 for (gphi_iterator bsi = gsi_start_phis (bb); !gsi_end_p (bsi);
848 gsi_next (&bsi))
849 {
850 gphi *phi = bsi.phi ();
851 if (!virtual_operand_p (gimple_phi_result (phi))
852 && !bitmap_bit_p (partition->stmts, gimple_uid (phi)))
853 reset_debug_uses (phi);
854 }
855
856 for (gimple_stmt_iterator bsi = gsi_start_bb (bb); !gsi_end_p (bsi); gsi_next (&bsi))
857 {
858 gimple *stmt = gsi_stmt (bsi);
859 if (gimple_code (stmt) != GIMPLE_LABEL
860 && !is_gimple_debug (stmt)
861 && !bitmap_bit_p (partition->stmts, gimple_uid (stmt)))
862 reset_debug_uses (stmt);
863 }
864 }
865
866 for (i = 0; i < loop->num_nodes; i++)
867 {
868 basic_block bb = bbs[i];
869 edge inner_exit = NULL;
870
871 if (loop != bb->loop_father)
872 inner_exit = single_exit (bb->loop_father);
873
874 for (gphi_iterator bsi = gsi_start_phis (bb); !gsi_end_p (bsi);)
875 {
876 gphi *phi = bsi.phi ();
877 if (!virtual_operand_p (gimple_phi_result (phi))
878 && !bitmap_bit_p (partition->stmts, gimple_uid (phi)))
879 remove_phi_node (&bsi, true);
880 else
881 gsi_next (&bsi);
882 }
883
884 for (gimple_stmt_iterator bsi = gsi_start_bb (bb); !gsi_end_p (bsi);)
885 {
886 gimple *stmt = gsi_stmt (bsi);
887 if (gimple_code (stmt) != GIMPLE_LABEL
888 && !is_gimple_debug (stmt)
889 && !bitmap_bit_p (partition->stmts, gimple_uid (stmt)))
890 {
891 /* When distributing a loop nest, if bb is the inner loop's exit_bb,
892 we choose its exit edge/path in order to avoid generating an
893 infinite loop. For all other cases, we choose an arbitrary
894 path through the empty CFG part that this unnecessary
895 control stmt controls. */
896 if (gcond *cond_stmt = dyn_cast <gcond *> (stmt))
897 {
898 if (inner_exit && inner_exit->flags & EDGE_TRUE_VALUE)
899 gimple_cond_make_true (cond_stmt);
900 else
901 gimple_cond_make_false (cond_stmt);
902 update_stmt (stmt);
903 }
904 else if (gimple_code (stmt) == GIMPLE_SWITCH)
905 {
906 gswitch *switch_stmt = as_a <gswitch *> (stmt);
907 gimple_switch_set_index
908 (switch_stmt, CASE_LOW (gimple_switch_label (switch_stmt, 1)));
909 update_stmt (stmt);
910 }
911 else
912 {
913 unlink_stmt_vdef (stmt);
914 gsi_remove (&bsi, true);
915 release_defs (stmt);
916 continue;
917 }
918 }
919 gsi_next (&bsi);
920 }
921 }
922
923 free (bbs);
924 }
925
926 /* If VAL's memory representation contains the same value in all bytes,
927 return that value, otherwise return -1.
928 E.g. for 0x24242424 return 0x24, for IEEE double
929 747708026454360457216.0 return 0x44, etc. */
930
931 static int
932 const_with_all_bytes_same (tree val)
933 {
934 unsigned char buf[64];
935 int i, len;
936
937 if (integer_zerop (val)
938 || (TREE_CODE (val) == CONSTRUCTOR
939 && !TREE_CLOBBER_P (val)
940 && CONSTRUCTOR_NELTS (val) == 0))
941 return 0;
942
943 if (real_zerop (val))
944 {
945 /* Only return 0 for +0.0, not for -0.0, which doesn't have
946 an all bytes same memory representation. Don't transform
947 -0.0 stores into +0.0 even for !HONOR_SIGNED_ZEROS. */
948 switch (TREE_CODE (val))
949 {
950 case REAL_CST:
951 if (!real_isneg (TREE_REAL_CST_PTR (val)))
952 return 0;
953 break;
954 case COMPLEX_CST:
955 if (!const_with_all_bytes_same (TREE_REALPART (val))
956 && !const_with_all_bytes_same (TREE_IMAGPART (val)))
957 return 0;
958 break;
959 case VECTOR_CST:
960 {
961 unsigned int count = vector_cst_encoded_nelts (val);
962 unsigned int j;
963 for (j = 0; j < count; ++j)
964 if (const_with_all_bytes_same (VECTOR_CST_ENCODED_ELT (val, j)))
965 break;
966 if (j == count)
967 return 0;
968 break;
969 }
970 default:
971 break;
972 }
973 }
974
975 if (CHAR_BIT != 8 || BITS_PER_UNIT != 8)
976 return -1;
977
978 len = native_encode_expr (val, buf, sizeof (buf));
979 if (len == 0)
980 return -1;
981 for (i = 1; i < len; i++)
982 if (buf[i] != buf[0])
983 return -1;
984 return buf[0];
985 }
986
987 /* Generate a call to memset for PARTITION in LOOP. */
988
989 static void
990 generate_memset_builtin (struct loop *loop, partition *partition)
991 {
992 gimple_stmt_iterator gsi;
993 tree mem, fn, nb_bytes;
994 tree val;
995 struct builtin_info *builtin = partition->builtin;
996 gimple *fn_call;
997
998 /* The new statements will be placed before LOOP. */
999 gsi = gsi_last_bb (loop_preheader_edge (loop)->src);
1000
1001 nb_bytes = rewrite_to_non_trapping_overflow (builtin->size);
1002 nb_bytes = force_gimple_operand_gsi (&gsi, nb_bytes, true, NULL_TREE,
1003 false, GSI_CONTINUE_LINKING);
1004 mem = builtin->dst_base;
1005 mem = force_gimple_operand_gsi (&gsi, mem, true, NULL_TREE,
1006 false, GSI_CONTINUE_LINKING);
1007
1008 /* This exactly matches the pattern recognition in classify_partition. */
1009 val = gimple_assign_rhs1 (DR_STMT (builtin->dst_dr));
1010 /* Handle constants like 0x15151515 and similarly
1011 floating point constants etc. where all bytes are the same. */
1012 int bytev = const_with_all_bytes_same (val);
1013 if (bytev != -1)
1014 val = build_int_cst (integer_type_node, bytev);
1015 else if (TREE_CODE (val) == INTEGER_CST)
1016 val = fold_convert (integer_type_node, val);
1017 else if (!useless_type_conversion_p (integer_type_node, TREE_TYPE (val)))
1018 {
1019 tree tem = make_ssa_name (integer_type_node);
1020 gimple *cstmt = gimple_build_assign (tem, NOP_EXPR, val);
1021 gsi_insert_after (&gsi, cstmt, GSI_CONTINUE_LINKING);
1022 val = tem;
1023 }
1024
1025 fn = build_fold_addr_expr (builtin_decl_implicit (BUILT_IN_MEMSET));
1026 fn_call = gimple_build_call (fn, 3, mem, val, nb_bytes);
1027 gsi_insert_after (&gsi, fn_call, GSI_CONTINUE_LINKING);
1028
1029 if (dump_file && (dump_flags & TDF_DETAILS))
1030 {
1031 fprintf (dump_file, "generated memset");
1032 if (bytev == 0)
1033 fprintf (dump_file, " zero\n");
1034 else
1035 fprintf (dump_file, "\n");
1036 }
1037 }
1038
1039 /* Generate a call to memcpy for PARTITION in LOOP. */
1040
1041 static void
1042 generate_memcpy_builtin (struct loop *loop, partition *partition)
1043 {
1044 gimple_stmt_iterator gsi;
1045 gimple *fn_call;
1046 tree dest, src, fn, nb_bytes;
1047 enum built_in_function kind;
1048 struct builtin_info *builtin = partition->builtin;
1049
1050 /* The new statements will be placed before LOOP. */
1051 gsi = gsi_last_bb (loop_preheader_edge (loop)->src);
1052
1053 nb_bytes = rewrite_to_non_trapping_overflow (builtin->size);
1054 nb_bytes = force_gimple_operand_gsi (&gsi, nb_bytes, true, NULL_TREE,
1055 false, GSI_CONTINUE_LINKING);
1056 dest = builtin->dst_base;
1057 src = builtin->src_base;
1058 if (partition->kind == PKIND_MEMCPY
1059 || ! ptr_derefs_may_alias_p (dest, src))
1060 kind = BUILT_IN_MEMCPY;
1061 else
1062 kind = BUILT_IN_MEMMOVE;
1063
1064 dest = force_gimple_operand_gsi (&gsi, dest, true, NULL_TREE,
1065 false, GSI_CONTINUE_LINKING);
1066 src = force_gimple_operand_gsi (&gsi, src, true, NULL_TREE,
1067 false, GSI_CONTINUE_LINKING);
1068 fn = build_fold_addr_expr (builtin_decl_implicit (kind));
1069 fn_call = gimple_build_call (fn, 3, dest, src, nb_bytes);
1070 gsi_insert_after (&gsi, fn_call, GSI_CONTINUE_LINKING);
1071
1072 if (dump_file && (dump_flags & TDF_DETAILS))
1073 {
1074 if (kind == BUILT_IN_MEMCPY)
1075 fprintf (dump_file, "generated memcpy\n");
1076 else
1077 fprintf (dump_file, "generated memmove\n");
1078 }
1079 }
1080
1081 /* Remove and destroy the loop LOOP. */
1082
1083 static void
1084 destroy_loop (struct loop *loop)
1085 {
1086 unsigned nbbs = loop->num_nodes;
1087 edge exit = single_exit (loop);
1088 basic_block src = loop_preheader_edge (loop)->src, dest = exit->dest;
1089 basic_block *bbs;
1090 unsigned i;
1091
1092 bbs = get_loop_body_in_dom_order (loop);
1093
1094 redirect_edge_pred (exit, src);
1095 exit->flags &= ~(EDGE_TRUE_VALUE|EDGE_FALSE_VALUE);
1096 exit->flags |= EDGE_FALLTHRU;
1097 cancel_loop_tree (loop);
1098 rescan_loop_exit (exit, false, true);
1099
1100 i = nbbs;
1101 do
1102 {
1103 /* We have made sure not to leave any dangling uses of SSA
1104 names defined in the loop, with the exception of virtuals.
1105 Make sure we replace all uses of virtual defs that will remain
1106 outside of the loop with the bare symbol, as delete_basic_block
1107 will release them. */
1108 --i;
1109 for (gphi_iterator gsi = gsi_start_phis (bbs[i]); !gsi_end_p (gsi);
1110 gsi_next (&gsi))
1111 {
1112 gphi *phi = gsi.phi ();
1113 if (virtual_operand_p (gimple_phi_result (phi)))
1114 mark_virtual_phi_result_for_renaming (phi);
1115 }
1116 for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]); !gsi_end_p (gsi);
1117 gsi_next (&gsi))
1118 {
1119 gimple *stmt = gsi_stmt (gsi);
1120 tree vdef = gimple_vdef (stmt);
1121 if (vdef && TREE_CODE (vdef) == SSA_NAME)
1122 mark_virtual_operand_for_renaming (vdef);
1123 }
1124 delete_basic_block (bbs[i]);
1125 }
1126 while (i != 0);
1127
1128 free (bbs);
1129
1130 set_immediate_dominator (CDI_DOMINATORS, dest,
1131 recompute_dominator (CDI_DOMINATORS, dest));
1132 }
1133
1134 /* Generates code for PARTITION. Return whether LOOP needs to be destroyed. */
1135
1136 static bool
1137 generate_code_for_partition (struct loop *loop,
1138 partition *partition, bool copy_p)
1139 {
1140 switch (partition->kind)
1141 {
1142 case PKIND_NORMAL:
1143 case PKIND_PARTIAL_MEMSET:
1144 /* Reductions all have to be in the last partition. */
1145 gcc_assert (!partition_reduction_p (partition)
1146 || !copy_p);
1147 generate_loops_for_partition (loop, partition, copy_p);
1148 return false;
1149
1150 case PKIND_MEMSET:
1151 generate_memset_builtin (loop, partition);
1152 break;
1153
1154 case PKIND_MEMCPY:
1155 case PKIND_MEMMOVE:
1156 generate_memcpy_builtin (loop, partition);
1157 break;
1158
1159 default:
1160 gcc_unreachable ();
1161 }
1162
1163 /* Common tail for partitions we turn into a call. If this was the last
1164 partition for which we generate code, we have to destroy the loop. */
1165 if (!copy_p)
1166 return true;
1167 return false;
1168 }
1169
1170 /* Return the data dependence relation for data references A and B. The
1171 two data references must be in lexicographic order with respect to the
1172 reduced dependence graph RDG. We first try to find the ddr in the global
1173 ddr hash table; if it doesn't exist, compute the ddr and cache it. */
1174
1175 static data_dependence_relation *
1176 get_data_dependence (struct graph *rdg, data_reference_p a, data_reference_p b)
1177 {
1178 struct data_dependence_relation ent, **slot;
1179 struct data_dependence_relation *ddr;
1180
1181 gcc_assert (DR_IS_WRITE (a) || DR_IS_WRITE (b));
1182 gcc_assert (rdg_vertex_for_stmt (rdg, DR_STMT (a))
1183 <= rdg_vertex_for_stmt (rdg, DR_STMT (b)));
1184 ent.a = a;
1185 ent.b = b;
1186 slot = ddrs_table->find_slot (&ent, INSERT);
1187 if (*slot == NULL)
1188 {
1189 ddr = initialize_data_dependence_relation (a, b, loop_nest);
1190 compute_affine_dependence (ddr, loop_nest[0]);
1191 *slot = ddr;
1192 }
1193
1194 return *slot;
1195 }
1196
1197 /* In the reduced dependence graph RDG for loop distribution, return true
1198 if the dependence between references DR1 and DR2 leads to a dependence
1199 cycle and such a cycle can't be resolved by a runtime alias check. */
1200
1201 static bool
1202 data_dep_in_cycle_p (struct graph *rdg,
1203 data_reference_p dr1, data_reference_p dr2)
1204 {
1205 struct data_dependence_relation *ddr;
1206
1207 /* Re-shuffle data-refs to be in topological order. */
1208 if (rdg_vertex_for_stmt (rdg, DR_STMT (dr1))
1209 > rdg_vertex_for_stmt (rdg, DR_STMT (dr2)))
1210 std::swap (dr1, dr2);
1211
1212 ddr = get_data_dependence (rdg, dr1, dr2);
1213
1214 /* In case of no data dependence. */
1215 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
1216 return false;
1217 /* For an unknown data dependence or a known data dependence which can't
1218 be expressed as a classic distance vector, we check if it can be
1219 resolved by a runtime alias check. If yes, we consider the data
1220 dependence as one that won't introduce a dependence cycle. */
1221 else if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know
1222 || DDR_NUM_DIST_VECTS (ddr) == 0)
1223 return !runtime_alias_check_p (ddr, NULL, true);
1224 else if (DDR_NUM_DIST_VECTS (ddr) > 1)
1225 return true;
1226 else if (DDR_REVERSED_P (ddr)
1227 || lambda_vector_zerop (DDR_DIST_VECT (ddr, 0), 1))
1228 return false;
1229
1230 return true;
1231 }
1232
1233 /* Given reduced dependence graph RDG, PARTITION1 and PARTITION2, update
1234 PARTITION1's type after merging PARTITION2 into PARTITION1. */
1235
1236 static void
1237 update_type_for_merge (struct graph *rdg,
1238 partition *partition1, partition *partition2)
1239 {
1240 unsigned i, j;
1241 bitmap_iterator bi, bj;
1242 data_reference_p dr1, dr2;
1243
1244 EXECUTE_IF_SET_IN_BITMAP (partition1->datarefs, 0, i, bi)
1245 {
1246 unsigned start = (partition1 == partition2) ? i + 1 : 0;
1247
1248 dr1 = datarefs_vec[i];
1249 EXECUTE_IF_SET_IN_BITMAP (partition2->datarefs, start, j, bj)
1250 {
1251 dr2 = datarefs_vec[j];
1252 if (DR_IS_READ (dr1) && DR_IS_READ (dr2))
1253 continue;
1254
1255 /* Partition can only be executed sequentially if there is any
1256 data dependence cycle. */
1257 if (data_dep_in_cycle_p (rdg, dr1, dr2))
1258 {
1259 partition1->type = PTYPE_SEQUENTIAL;
1260 return;
1261 }
1262 }
1263 }
1264 }
1265
1266 /* Returns a partition with all the statements needed for computing
1267 the vertex V of the RDG, also including the loop exit conditions. */
1268
1269 static partition *
1270 build_rdg_partition_for_vertex (struct graph *rdg, int v)
1271 {
1272 partition *partition = partition_alloc ();
1273 auto_vec<int, 3> nodes;
1274 unsigned i, j;
1275 int x;
1276 data_reference_p dr;
1277
1278 graphds_dfs (rdg, &v, 1, &nodes, false, NULL);
1279
1280 FOR_EACH_VEC_ELT (nodes, i, x)
1281 {
1282 bitmap_set_bit (partition->stmts, x);
1283
1284 for (j = 0; RDG_DATAREFS (rdg, x).iterate (j, &dr); ++j)
1285 {
1286 unsigned idx = (unsigned) DR_INDEX (dr);
1287 gcc_assert (idx < datarefs_vec.length ());
1288
1289 /* Partition can only be executed sequentially if there is any
1290 unknown data reference. */
1291 if (!DR_BASE_ADDRESS (dr) || !DR_OFFSET (dr)
1292 || !DR_INIT (dr) || !DR_STEP (dr))
1293 partition->type = PTYPE_SEQUENTIAL;
1294
1295 bitmap_set_bit (partition->datarefs, idx);
1296 }
1297 }
1298
1299 if (partition->type == PTYPE_SEQUENTIAL)
1300 return partition;
1301
1302 /* Further check if any data dependence prevents us from executing the
1303 partition in parallel. */
1304 update_type_for_merge (rdg, partition, partition);
1305
1306 return partition;
1307 }
1308
1309 /* Given PARTITION of LOOP and RDG, record the single load/store data
1310 references for a builtin partition in SRC_DR/DST_DR; return false if
1311 there are no such data references. */
1312
1313 static bool
1314 find_single_drs (struct loop *loop, struct graph *rdg, partition *partition,
1315 data_reference_p *dst_dr, data_reference_p *src_dr)
1316 {
1317 unsigned i;
1318 data_reference_p single_ld = NULL, single_st = NULL;
1319 bitmap_iterator bi;
1320
1321 EXECUTE_IF_SET_IN_BITMAP (partition->stmts, 0, i, bi)
1322 {
1323 gimple *stmt = RDG_STMT (rdg, i);
1324 data_reference_p dr;
1325
1326 if (gimple_code (stmt) == GIMPLE_PHI)
1327 continue;
1328
1329 /* Any scalar stmts are ok. */
1330 if (!gimple_vuse (stmt))
1331 continue;
1332
1333 /* Otherwise just regular loads/stores. */
1334 if (!gimple_assign_single_p (stmt))
1335 return false;
1336
1337 /* But exactly one store and/or load. */
1338 for (unsigned j = 0; RDG_DATAREFS (rdg, i).iterate (j, &dr); ++j)
1339 {
1340 tree type = TREE_TYPE (DR_REF (dr));
1341
1342 /* The memset, memcpy and memmove library calls are only
1343 able to deal with generic address space. */
1344 if (!ADDR_SPACE_GENERIC_P (TYPE_ADDR_SPACE (type)))
1345 return false;
1346
1347 if (DR_IS_READ (dr))
1348 {
1349 if (single_ld != NULL)
1350 return false;
1351 single_ld = dr;
1352 }
1353 else
1354 {
1355 if (single_st != NULL)
1356 return false;
1357 single_st = dr;
1358 }
1359 }
1360 }
1361
1362 if (!single_st)
1363 return false;
1364
1365 /* Bail out if this is a bitfield memory reference. */
1366 if (TREE_CODE (DR_REF (single_st)) == COMPONENT_REF
1367 && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (single_st), 1)))
1368 return false;
1369
1370 /* The data reference must be executed exactly once per iteration of each
1371 loop in the loop nest. We only need to check dominance information
1372 against the outermost one in a perfect loop nest because a bb can't
1373 dominate the outermost loop's latch without dominating the inner loop's. */
1374 basic_block bb_st = gimple_bb (DR_STMT (single_st));
1375 if (!dominated_by_p (CDI_DOMINATORS, loop->latch, bb_st))
1376 return false;
1377
1378 if (single_ld)
1379 {
1380 gimple *store = DR_STMT (single_st), *load = DR_STMT (single_ld);
1381 /* Direct aggregate copy or via an SSA name temporary. */
1382 if (load != store
1383 && gimple_assign_lhs (load) != gimple_assign_rhs1 (store))
1384 return false;
1385
1386 /* Bail out if this is a bitfield memory reference. */
1387 if (TREE_CODE (DR_REF (single_ld)) == COMPONENT_REF
1388 && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (single_ld), 1)))
1389 return false;
1390
1391 /* Load and store must be in the same loop nest. */
1392 basic_block bb_ld = gimple_bb (DR_STMT (single_ld));
1393 if (bb_st->loop_father != bb_ld->loop_father)
1394 return false;
1395
1396 /* Data reference must be executed exactly once per iteration.
1397 Same as single_st, we only need to check against the outermost
1398 loop. */
1399 if (!dominated_by_p (CDI_DOMINATORS, loop->latch, bb_ld))
1400 return false;
1401
1402 edge e = single_exit (bb_st->loop_father);
1403 bool dom_ld = dominated_by_p (CDI_DOMINATORS, e->src, bb_ld);
1404 bool dom_st = dominated_by_p (CDI_DOMINATORS, e->src, bb_st);
1405 if (dom_ld != dom_st)
1406 return false;
1407 }
1408
1409 *src_dr = single_ld;
1410 *dst_dr = single_st;
1411 return true;
1412 }
1413
1414 /* Given data reference DR in LOOP_NEST, this function checks the enclosing
1415 loops from inner to outer to see if the loop's step equals the access size
1416 at each loop level. Return 2 if we can prove this at all loop levels;
1417 record the access base and size in BASE and SIZE; save the loop's step at
1418 each loop level in STEPS if it is not null. For example:
1419
1420 int arr[100][100][100];
1421 for (i = 0; i < 100; i++) ;steps[2] = 40000
1422 for (j = 100; j > 0; j--) ;steps[1] = -400
1423 for (k = 0; k < 100; k++) ;steps[0] = 4
1424 arr[i][j - 1][k] = 0; ;base = &arr, size = 4000000
1425
1426 Return 1 if we can prove the equality at the innermost loop, but not at
1427 all loop levels. In this case, no information is recorded.
1428
1429 Return 0 if the equality can't be proven at any loop level. */
1430
1431 static int
1432 compute_access_range (loop_p loop_nest, data_reference_p dr, tree *base,
1433 tree *size, vec<tree> *steps = NULL)
1434 {
1435 location_t loc = gimple_location (DR_STMT (dr));
1436 basic_block bb = gimple_bb (DR_STMT (dr));
1437 struct loop *loop = bb->loop_father;
1438 tree ref = DR_REF (dr);
1439 tree access_base = build_fold_addr_expr (ref);
1440 tree access_size = TYPE_SIZE_UNIT (TREE_TYPE (ref));
1441 int res = 0;
1442
1443 do {
1444 tree scev_fn = analyze_scalar_evolution (loop, access_base);
1445 if (TREE_CODE (scev_fn) != POLYNOMIAL_CHREC)
1446 return res;
1447
1448 access_base = CHREC_LEFT (scev_fn);
1449 if (tree_contains_chrecs (access_base, NULL))
1450 return res;
1451
1452 tree scev_step = CHREC_RIGHT (scev_fn);
1453 /* Only support constant steps. */
1454 if (TREE_CODE (scev_step) != INTEGER_CST)
1455 return res;
1456
1457 enum ev_direction access_dir = scev_direction (scev_fn);
1458 if (access_dir == EV_DIR_UNKNOWN)
1459 return res;
1460
1461 if (steps != NULL)
1462 steps->safe_push (scev_step);
1463
1464 scev_step = fold_convert_loc (loc, sizetype, scev_step);
1465 /* Compute absolute value of scev step. */
1466 if (access_dir == EV_DIR_DECREASES)
1467 scev_step = fold_build1_loc (loc, NEGATE_EXPR, sizetype, scev_step);
1468
1469 /* At each loop level, the scev step must equal the access size. In other
1470 words, DR must access consecutive memory between loop iterations. */
1471 if (!operand_equal_p (scev_step, access_size, 0))
1472 return res;
1473
1474 /* Access stride can be computed for data reference at least for the
1475 innermost loop. */
1476 res = 1;
1477
1478 /* Compute DR's execution times in loop. */
1479 tree niters = number_of_latch_executions (loop);
1480 niters = fold_convert_loc (loc, sizetype, niters);
1481 if (dominated_by_p (CDI_DOMINATORS, single_exit (loop)->src, bb))
1482 niters = size_binop_loc (loc, PLUS_EXPR, niters, size_one_node);
1483
1484 /* Compute DR's overall access size in loop. */
1485 access_size = fold_build2_loc (loc, MULT_EXPR, sizetype,
1486 niters, scev_step);
1487 /* Adjust base address in case of negative step. */
1488 if (access_dir == EV_DIR_DECREASES)
1489 {
1490 tree adj = fold_build2_loc (loc, MINUS_EXPR, sizetype,
1491 scev_step, access_size);
1492 access_base = fold_build_pointer_plus_loc (loc, access_base, adj);
1493 }
1494 } while (loop != loop_nest && (loop = loop_outer (loop)) != NULL);
1495
1496 *base = access_base;
1497 *size = access_size;
1498 /* The access stride can be computed for the data reference at each loop level. */
1499 return 2;
1500 }
1501
1502 /* Allocate and return builtin struct. Record information like DST_DR,
1503 SRC_DR, DST_BASE, SRC_BASE and SIZE in the allocated struct. */
1504
1505 static struct builtin_info *
1506 alloc_builtin (data_reference_p dst_dr, data_reference_p src_dr,
1507 tree dst_base, tree src_base, tree size)
1508 {
1509 struct builtin_info *builtin = XNEW (struct builtin_info);
1510 builtin->dst_dr = dst_dr;
1511 builtin->src_dr = src_dr;
1512 builtin->dst_base = dst_base;
1513 builtin->src_base = src_base;
1514 builtin->size = size;
1515 return builtin;
1516 }
1517
1518 /* Given data reference DR in loop nest LOOP, classify whether it forms a
1519 builtin memset call. */
1520
1521 static void
1522 classify_builtin_st (loop_p loop, partition *partition, data_reference_p dr)
1523 {
1524 gimple *stmt = DR_STMT (dr);
1525 tree base, size, rhs = gimple_assign_rhs1 (stmt);
1526
1527 if (const_with_all_bytes_same (rhs) == -1
1528 && (!INTEGRAL_TYPE_P (TREE_TYPE (rhs))
1529 || (TYPE_MODE (TREE_TYPE (rhs))
1530 != TYPE_MODE (unsigned_char_type_node))))
1531 return;
1532
1533 if (TREE_CODE (rhs) == SSA_NAME
1534 && !SSA_NAME_IS_DEFAULT_DEF (rhs)
1535 && flow_bb_inside_loop_p (loop, gimple_bb (SSA_NAME_DEF_STMT (rhs))))
1536 return;
1537
1538 int res = compute_access_range (loop, dr, &base, &size);
1539 if (res == 0)
1540 return;
1541 if (res == 1)
1542 {
1543 partition->kind = PKIND_PARTIAL_MEMSET;
1544 return;
1545 }
1546
1547 poly_uint64 base_offset;
1548 unsigned HOST_WIDE_INT const_base_offset;
1549 tree base_base = strip_offset (base, &base_offset);
1550 if (!base_offset.is_constant (&const_base_offset))
1551 return;
1552
1553 struct builtin_info *builtin;
1554 builtin = alloc_builtin (dr, NULL, base, NULL_TREE, size);
1555 builtin->dst_base_base = base_base;
1556 builtin->dst_base_offset = const_base_offset;
1557 partition->builtin = builtin;
1558 partition->kind = PKIND_MEMSET;
1559 }
1560
1561 /* Given data references DST_DR and SRC_DR in loop nest LOOP and RDG,
1562 classify whether they form a builtin memcpy or memmove call. */
1563
1564 static void
1565 classify_builtin_ldst (loop_p loop, struct graph *rdg, partition *partition,
1566 data_reference_p dst_dr, data_reference_p src_dr)
1567 {
1568 tree base, size, src_base, src_size;
1569 auto_vec<tree> dst_steps, src_steps;
1570
1571 /* Compute access range of both load and store. */
1572 int res = compute_access_range (loop, dst_dr, &base, &size, &dst_steps);
1573 if (res != 2)
1574 return;
1575 res = compute_access_range (loop, src_dr, &src_base, &src_size, &src_steps);
1576 if (res != 2)
1577 return;
1578
1579 /* They must have the same access size. */
1580 if (!operand_equal_p (size, src_size, 0))
1581 return;
1582
1583 /* The load and store in the loop nest must access memory in the same way,
1584 i.e., they must have the same steps in each loop of the nest. */
1585 if (dst_steps.length () != src_steps.length ())
1586 return;
1587 for (unsigned i = 0; i < dst_steps.length (); ++i)
1588 if (!operand_equal_p (dst_steps[i], src_steps[i], 0))
1589 return;
1590
1591 /* Now check whether there is a dependence. */
1592 ddr_p ddr = get_data_dependence (rdg, src_dr, dst_dr);
1593
1594 /* Classify as memcpy if no dependence between load and store. */
1595 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
1596 {
1597 partition->builtin = alloc_builtin (dst_dr, src_dr, base, src_base, size);
1598 partition->kind = PKIND_MEMCPY;
1599 return;
1600 }
1601
1602 /* Can't do memmove in case of unknown dependence or dependence without
1603 classical distance vector. */
1604 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know
1605 || DDR_NUM_DIST_VECTS (ddr) == 0)
1606 return;
1607
1608 unsigned i;
1609 lambda_vector dist_v;
1610 int num_lev = (DDR_LOOP_NEST (ddr)).length ();
1611 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
1612 {
1613 unsigned dep_lev = dependence_level (dist_v, num_lev);
1614 /* Can't do memmove if load depends on store. */
1615 if (dep_lev > 0 && dist_v[dep_lev - 1] > 0 && !DDR_REVERSED_P (ddr))
1616 return;
1617 }
1618
1619 partition->builtin = alloc_builtin (dst_dr, src_dr, base, src_base, size);
1620 partition->kind = PKIND_MEMMOVE;
1621 return;
1622 }
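
/* As a rough illustration of the classification above (hypothetical
loops, not taken from any testcase):

|for (i = 0; i < n; i++) a[i] = b[i];

with provably distinct arrays can become a memcpy;

|for (i = 0; i < n; i++) a[i] = a[i + 4];

reads ahead of the writes and behaves like copying through a temporary
buffer, so a memmove is possible; whereas

|for (i = 0; i < n; i++) a[i + 4] = a[i];

has the load depending on an earlier store, so neither library call
preserves the semantics and no builtin is generated. */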
1623
1624 /* Classifies the builtin kind we can generate for PARTITION of RDG and LOOP.
1625 For the moment we detect memset, memcpy and memmove patterns. Bitmap
1626 STMT_IN_ALL_PARTITIONS contains statements belonging to all partitions. */
1627
1628 static void
1629 classify_partition (loop_p loop, struct graph *rdg, partition *partition,
1630 bitmap stmt_in_all_partitions)
1631 {
1632 bitmap_iterator bi;
1633 unsigned i;
1634 data_reference_p single_ld = NULL, single_st = NULL;
1635 bool volatiles_p = false, has_reduction = false;
1636
1637 EXECUTE_IF_SET_IN_BITMAP (partition->stmts, 0, i, bi)
1638 {
1639 gimple *stmt = RDG_STMT (rdg, i);
1640
1641 if (gimple_has_volatile_ops (stmt))
1642 volatiles_p = true;
1643
1644 /* If the stmt is not included by all partitions and there are uses
1645 outside of the loop, then mark the partition as a reduction. */
1646 if (stmt_has_scalar_dependences_outside_loop (loop, stmt))
1647 {
1648 /* Due to a limitation in the transform phase we have to fuse all
1649 reduction partitions. As a result, this could cancel valid loop
1650 distribution, especially for loops whose induction variable is
1651 used outside of the loop. To work around this issue, we skip
1652 marking the partition as a reduction if the reduction stmt belongs
1653 to all partitions. In such a case, the reduction will be computed
1654 correctly no matter how partitions are fused/distributed. */
1655 if (!bitmap_bit_p (stmt_in_all_partitions, i))
1656 {
1657 partition->reduction_p = true;
1658 return;
1659 }
1660 has_reduction = true;
1661 }
1662 }
1663
1664 /* Perform general partition disqualification for builtins. */
1665 if (volatiles_p
1666 /* Simple workaround to prevent classifying the partition as a builtin
1667 if it contains any use outside of the loop. */
1668 || has_reduction
1669 || !flag_tree_loop_distribute_patterns)
1670 return;
1671
1672 /* Find single load/store data references for builtin partition. */
1673 if (!find_single_drs (loop, rdg, partition, &single_st, &single_ld))
1674 return;
1675
1676 /* Classify the builtin kind. */
1677 if (single_ld == NULL)
1678 classify_builtin_st (loop, partition, single_st);
1679 else
1680 classify_builtin_ldst (loop, rdg, partition, single_st, single_ld);
1681 }
1682
1683 /* Returns true when PARTITION1 and PARTITION2 access the same memory
1684 object in RDG. */
1685
1686 static bool
1687 share_memory_accesses (struct graph *rdg,
1688 partition *partition1, partition *partition2)
1689 {
1690 unsigned i, j;
1691 bitmap_iterator bi, bj;
1692 data_reference_p dr1, dr2;
1693
1694 /* First check whether the intersection of the two partitions contains
1695 any loads or stores. Common loads are the situation that happens
1696 most often. */
1697 EXECUTE_IF_AND_IN_BITMAP (partition1->stmts, partition2->stmts, 0, i, bi)
1698 if (RDG_MEM_WRITE_STMT (rdg, i)
1699 || RDG_MEM_READS_STMT (rdg, i))
1700 return true;
1701
1702 /* Then check whether the two partitions access the same memory object. */
1703 EXECUTE_IF_SET_IN_BITMAP (partition1->datarefs, 0, i, bi)
1704 {
1705 dr1 = datarefs_vec[i];
1706
1707 if (!DR_BASE_ADDRESS (dr1)
1708 || !DR_OFFSET (dr1) || !DR_INIT (dr1) || !DR_STEP (dr1))
1709 continue;
1710
1711 EXECUTE_IF_SET_IN_BITMAP (partition2->datarefs, 0, j, bj)
1712 {
1713 dr2 = datarefs_vec[j];
1714
1715 if (!DR_BASE_ADDRESS (dr2)
1716 || !DR_OFFSET (dr2) || !DR_INIT (dr2) || !DR_STEP (dr2))
1717 continue;
1718
1719 if (operand_equal_p (DR_BASE_ADDRESS (dr1), DR_BASE_ADDRESS (dr2), 0)
1720 && operand_equal_p (DR_OFFSET (dr1), DR_OFFSET (dr2), 0)
1721 && operand_equal_p (DR_INIT (dr1), DR_INIT (dr2), 0)
1722 && operand_equal_p (DR_STEP (dr1), DR_STEP (dr2), 0))
1723 return true;
1724 }
1725 }
1726
1727 return false;
1728 }
1729
1730 /* For each seed statement in STARTING_STMTS, this function builds a
1731 partition for it by adding dependent statements according to RDG.
1732 All partitions are recorded in PARTITIONS. */
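/* E.g., a seed store a[i] = t (names illustrative) pulls in the statements
   that compute t, together with anything they depend on through RDG flow
   and control dependence edges, so each resulting partition is
   self-contained with respect to the RDG.  */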
1733
1734 static void
1735 rdg_build_partitions (struct graph *rdg,
1736 vec<gimple *> starting_stmts,
1737 vec<partition *> *partitions)
1738 {
1739 auto_bitmap processed;
1740 int i;
1741 gimple *stmt;
1742
1743 FOR_EACH_VEC_ELT (starting_stmts, i, stmt)
1744 {
1745 int v = rdg_vertex_for_stmt (rdg, stmt);
1746
1747 if (dump_file && (dump_flags & TDF_DETAILS))
1748 fprintf (dump_file,
1749 "ldist asked to generate code for vertex %d\n", v);
1750
1751 /* If the vertex is already contained in another partition, so
1752 is the partition rooted at it. */
1753 if (bitmap_bit_p (processed, v))
1754 continue;
1755
1756 partition *partition = build_rdg_partition_for_vertex (rdg, v);
1757 bitmap_ior_into (processed, partition->stmts);
1758
1759 if (dump_file && (dump_flags & TDF_DETAILS))
1760 {
1761 fprintf (dump_file, "ldist creates useful %s partition:\n",
1762 partition->type == PTYPE_PARALLEL ? "parallel" : "sequent");
1763 bitmap_print (dump_file, partition->stmts, " ", "\n");
1764 }
1765
1766 partitions->safe_push (partition);
1767 }
1768
1769 /* All vertices should have been assigned to at least one partition now,
1770 other than vertices belonging to dead code. */
1771 }
1772
1773 /* Dump to FILE the PARTITIONS. */
1774
1775 static void
1776 dump_rdg_partitions (FILE *file, vec<partition *> partitions)
1777 {
1778 int i;
1779 partition *partition;
1780
1781 FOR_EACH_VEC_ELT (partitions, i, partition)
1782 debug_bitmap_file (file, partition->stmts);
1783 }
1784
1785 /* Debug PARTITIONS. */
1786 extern void debug_rdg_partitions (vec<partition *> );
1787
1788 DEBUG_FUNCTION void
1789 debug_rdg_partitions (vec<partition *> partitions)
1790 {
1791 dump_rdg_partitions (stderr, partitions);
1792 }
1793
1794 /* Returns the number of read and write operations in the RDG. */
1795
1796 static int
1797 number_of_rw_in_rdg (struct graph *rdg)
1798 {
1799 int i, res = 0;
1800
1801 for (i = 0; i < rdg->n_vertices; i++)
1802 {
1803 if (RDG_MEM_WRITE_STMT (rdg, i))
1804 ++res;
1805
1806 if (RDG_MEM_READS_STMT (rdg, i))
1807 ++res;
1808 }
1809
1810 return res;
1811 }
1812
1813 /* Returns the number of read and write operations in a PARTITION of
1814 the RDG. */
1815
1816 static int
1817 number_of_rw_in_partition (struct graph *rdg, partition *partition)
1818 {
1819 int res = 0;
1820 unsigned i;
1821 bitmap_iterator ii;
1822
1823 EXECUTE_IF_SET_IN_BITMAP (partition->stmts, 0, i, ii)
1824 {
1825 if (RDG_MEM_WRITE_STMT (rdg, i))
1826 ++res;
1827
1828 if (RDG_MEM_READS_STMT (rdg, i))
1829 ++res;
1830 }
1831
1832 return res;
1833 }
1834
1835 /* Returns true when one of the PARTITIONS contains all the read or
1836 write operations of RDG. */
1837
1838 static bool
1839 partition_contains_all_rw (struct graph *rdg,
1840 vec<partition *> partitions)
1841 {
1842 int i;
1843 partition *partition;
1844 int nrw = number_of_rw_in_rdg (rdg);
1845
1846 FOR_EACH_VEC_ELT (partitions, i, partition)
1847 if (nrw == number_of_rw_in_partition (rdg, partition))
1848 return true;
1849
1850 return false;
1851 }
1852
1853 /* Compute partition dependence created by the data references in DRS1
1854 and DRS2, modify and return DIR according to that. If ALIAS_DDRS is
1855 not NULL, we record dependence introduced by possible alias between
1856 two data references in ALIAS_DDRS; otherwise, we simply ignore such
1857 dependence as if it doesn't exist at all. */
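/* As a small illustration (array name illustrative): if DRS1 contains a
   store to a[i] and DRS2 contains a load of the same a[i] later in the
   loop body, statement order must be preserved and the computed direction
   is 1; build_partition_graph below then adds an edge from DRS1's
   partition to DRS2's partition.  */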
1858
1859 static int
1860 pg_add_dependence_edges (struct graph *rdg, int dir,
1861 bitmap drs1, bitmap drs2, vec<ddr_p> *alias_ddrs)
1862 {
1863 unsigned i, j;
1864 bitmap_iterator bi, bj;
1865 data_reference_p dr1, dr2, saved_dr1;
1866
1867 /* dependence direction - 0 is no dependence, -1 is back,
1868 1 is forth, 2 is both (we can stop then, merging will occur). */
1869 EXECUTE_IF_SET_IN_BITMAP (drs1, 0, i, bi)
1870 {
1871 dr1 = datarefs_vec[i];
1872
1873 EXECUTE_IF_SET_IN_BITMAP (drs2, 0, j, bj)
1874 {
1875 int res, this_dir = 1;
1876 ddr_p ddr;
1877
1878 dr2 = datarefs_vec[j];
1879
1880 /* Skip all <read, read> data dependence. */
1881 if (DR_IS_READ (dr1) && DR_IS_READ (dr2))
1882 continue;
1883
1884 saved_dr1 = dr1;
1885 /* Re-shuffle data-refs to be in topological order. */
1886 if (rdg_vertex_for_stmt (rdg, DR_STMT (dr1))
1887 > rdg_vertex_for_stmt (rdg, DR_STMT (dr2)))
1888 {
1889 std::swap (dr1, dr2);
1890 this_dir = -this_dir;
1891 }
1892 ddr = get_data_dependence (rdg, dr1, dr2);
1893 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
1894 {
1895 this_dir = 0;
1896 res = data_ref_compare_tree (DR_BASE_ADDRESS (dr1),
1897 DR_BASE_ADDRESS (dr2));
1898 /* Be conservative. If the data references are not well analyzed,
1899 or the two data references have the same base address and
1900 offset, add the dependence and consider them to alias each other.
1901 In other words, the dependence cannot be resolved by a
1902 runtime alias check. */
1903 if (!DR_BASE_ADDRESS (dr1) || !DR_BASE_ADDRESS (dr2)
1904 || !DR_OFFSET (dr1) || !DR_OFFSET (dr2)
1905 || !DR_INIT (dr1) || !DR_INIT (dr2)
1906 || !DR_STEP (dr1) || !tree_fits_uhwi_p (DR_STEP (dr1))
1907 || !DR_STEP (dr2) || !tree_fits_uhwi_p (DR_STEP (dr2))
1908 || res == 0)
1909 this_dir = 2;
1910 /* Data dependence could be resolved by runtime alias check,
1911 record it in ALIAS_DDRS. */
1912 else if (alias_ddrs != NULL)
1913 alias_ddrs->safe_push (ddr);
1914 /* Or simply ignore it. */
1915 }
1916 else if (DDR_ARE_DEPENDENT (ddr) == NULL_TREE)
1917 {
1918 if (DDR_REVERSED_P (ddr))
1919 this_dir = -this_dir;
1920
1921 /* Known dependences can still be unordered throughout the
1922 iteration space, see gcc.dg/tree-ssa/ldist-16.c. */
1923 if (DDR_NUM_DIST_VECTS (ddr) != 1)
1924 this_dir = 2;
1925 /* If the overlap is exact preserve stmt order. */
1926 else if (lambda_vector_zerop (DDR_DIST_VECT (ddr, 0),
1927 DDR_NB_LOOPS (ddr)))
1928 ;
1929 /* Else, as the distance vector is lexicographically positive, swap
1930 the dependence direction. */
1931 else
1932 this_dir = -this_dir;
1933 }
1934 else
1935 this_dir = 0;
1936 if (this_dir == 2)
1937 return 2;
1938 else if (dir == 0)
1939 dir = this_dir;
1940 else if (this_dir != 0 && dir != this_dir)
1941 return 2;
1942 /* Shuffle "back" dr1. */
1943 dr1 = saved_dr1;
1944 }
1945 }
1946 return dir;
1947 }
1948
1949 /* Compare postorder number of the partition graph vertices V1 and V2. */
1950
1951 static int
1952 pgcmp (const void *v1_, const void *v2_)
1953 {
1954 const vertex *v1 = (const vertex *)v1_;
1955 const vertex *v2 = (const vertex *)v2_;
1956 return v2->post - v1->post;
1957 }
1958
1959 /* Data attached to vertices of partition dependence graph. */
1960 struct pg_vdata
1961 {
1962 /* ID of the corresponding partition. */
1963 int id;
1964 /* The partition. */
1965 struct partition *partition;
1966 };
1967
1968 /* Data attached to edges of partition dependence graph. */
1969 struct pg_edata
1970 {
1971 /* If the dependence edge can be resolved by runtime alias check,
1972 this vector contains data dependence relations for runtime alias
1973 check. On the other hand, if the dependence edge is introduced
1974 because of compilation time known data dependence, this vector
1975 contains nothing. */
1976 vec<ddr_p> alias_ddrs;
1977 };
1978
1979 /* Callback data for traversing edges in graph. */
1980 struct pg_edge_callback_data
1981 {
1982 /* Bitmap of strongly connected components that should be merged. */
1983 bitmap sccs_to_merge;
1984 /* Array containing component information for all vertices. */
1985 int *vertices_component;
1986 /* Vector to record all data dependence relations which are needed
1987 to break strong connected components by runtime alias checks. */
1988 vec<ddr_p> *alias_ddrs;
1989 };
1990
1991 /* Initialize vertices' data for partition dependence graph PG with
1992 PARTITIONS. */
1993
1994 static void
1995 init_partition_graph_vertices (struct graph *pg,
1996 vec<struct partition *> *partitions)
1997 {
1998 int i;
1999 partition *partition;
2000 struct pg_vdata *data;
2001
2002 for (i = 0; partitions->iterate (i, &partition); ++i)
2003 {
2004 data = new pg_vdata;
2005 pg->vertices[i].data = data;
2006 data->id = i;
2007 data->partition = partition;
2008 }
2009 }
2010
2011 /* Add edge <I, J> to partition dependence graph PG. Attach vector of data
2012 dependence relations to the EDGE if DDRS isn't NULL. */
2013
2014 static void
2015 add_partition_graph_edge (struct graph *pg, int i, int j, vec<ddr_p> *ddrs)
2016 {
2017 struct graph_edge *e = add_edge (pg, i, j);
2018
2019 /* If data dependence relations are attached to the edge, it means this
2020 dependence edge can be resolved by runtime alias checks. */
2021 if (ddrs != NULL)
2022 {
2023 struct pg_edata *data = new pg_edata;
2024
2025 gcc_assert (ddrs->length () > 0);
2026 e->data = data;
2027 data->alias_ddrs = vNULL;
2028 data->alias_ddrs.safe_splice (*ddrs);
2029 }
2030 }
2031
2032 /* Callback function for the graph traversal algorithm. It returns true
2033 if edge E should be skipped when traversing the graph. */
2034
2035 static bool
2036 pg_skip_alias_edge (struct graph_edge *e)
2037 {
2038 struct pg_edata *data = (struct pg_edata *)e->data;
2039 return (data != NULL && data->alias_ddrs.length () > 0);
2040 }
2041
2042 /* Callback function freeing data attached to edge E of graph. */
2043
2044 static void
2045 free_partition_graph_edata_cb (struct graph *, struct graph_edge *e, void *)
2046 {
2047 if (e->data != NULL)
2048 {
2049 struct pg_edata *data = (struct pg_edata *)e->data;
2050 data->alias_ddrs.release ();
2051 delete data;
2052 }
2053 }
2054
2055 /* Free data attached to vertices of partition dependence graph PG. */
2056
2057 static void
2058 free_partition_graph_vdata (struct graph *pg)
2059 {
2060 int i;
2061 struct pg_vdata *data;
2062
2063 for (i = 0; i < pg->n_vertices; ++i)
2064 {
2065 data = (struct pg_vdata *)pg->vertices[i].data;
2066 delete data;
2067 }
2068 }
2069
2070 /* Build and return partition dependence graph for PARTITIONS. RDG is
2071 reduced dependence graph for the loop to be distributed. If IGNORE_ALIAS_P
2072 is true, data dependence caused by possible alias between references
2073 is ignored, as if it doesn't exist at all; otherwise all dependences
2074 are considered. */
2075
2076 static struct graph *
2077 build_partition_graph (struct graph *rdg,
2078 vec<struct partition *> *partitions,
2079 bool ignore_alias_p)
2080 {
2081 int i, j;
2082 struct partition *partition1, *partition2;
2083 graph *pg = new_graph (partitions->length ());
2084 auto_vec<ddr_p> alias_ddrs, *alias_ddrs_p;
2085
2086 alias_ddrs_p = ignore_alias_p ? NULL : &alias_ddrs;
2087
2088 init_partition_graph_vertices (pg, partitions);
2089
2090 for (i = 0; partitions->iterate (i, &partition1); ++i)
2091 {
2092 for (j = i + 1; partitions->iterate (j, &partition2); ++j)
2093 {
2094 /* dependence direction - 0 is no dependence, -1 is back,
2095 1 is forth, 2 is both (we can stop then, merging will occur). */
2096 int dir = 0;
2097
2098 /* If the first partition has reduction, add back edge; if the
2099 second partition has reduction, add forth edge. This makes
2100 sure that reduction partition will be sorted as the last one. */
2101 if (partition_reduction_p (partition1))
2102 dir = -1;
2103 else if (partition_reduction_p (partition2))
2104 dir = 1;
2105
2106 /* Cleanup the temporary vector. */
2107 alias_ddrs.truncate (0);
2108
2109 dir = pg_add_dependence_edges (rdg, dir, partition1->datarefs,
2110 partition2->datarefs, alias_ddrs_p);
2111
2112 /* Add edge to partition graph if there exists dependence. There
2113 are two types of edges. One type of edge is caused by compilation
2114 time known dependence; this type cannot be resolved by runtime
2115 alias check. The other type can be resolved by runtime alias
2116 check. */
2117 if (dir == 1 || dir == 2
2118 || alias_ddrs.length () > 0)
2119 {
2120 /* Attach data dependence relations to edge that can be resolved
2121 by runtime alias check. */
2122 bool alias_edge_p = (dir != 1 && dir != 2);
2123 add_partition_graph_edge (pg, i, j,
2124 (alias_edge_p) ? &alias_ddrs : NULL);
2125 }
2126 if (dir == -1 || dir == 2
2127 || alias_ddrs.length () > 0)
2128 {
2129 /* Attach data dependence relations to edge that can be resolved
2130 by runtime alias check. */
2131 bool alias_edge_p = (dir != -1 && dir != 2);
2132 add_partition_graph_edge (pg, j, i,
2133 (alias_edge_p) ? &alias_ddrs : NULL);
2134 }
2135 }
2136 }
2137 return pg;
2138 }
2139
2140 /* Sort partitions in PG in descending post order and store them in
2141 PARTITIONS. */
2142
2143 static void
2144 sort_partitions_by_post_order (struct graph *pg,
2145 vec<struct partition *> *partitions)
2146 {
2147 int i;
2148 struct pg_vdata *data;
2149
2150 /* Now order the remaining nodes in descending postorder. */
2151 qsort (pg->vertices, pg->n_vertices, sizeof (vertex), pgcmp);
2152 partitions->truncate (0);
2153 for (i = 0; i < pg->n_vertices; ++i)
2154 {
2155 data = (struct pg_vdata *)pg->vertices[i].data;
2156 if (data->partition)
2157 partitions->safe_push (data->partition);
2158 }
2159 }
2160
2161 /* Given reduced dependence graph RDG, merge strongly connected components
2162 of PARTITIONS. If IGNORE_ALIAS_P is true, data dependence caused by
2163 possible alias between references is ignored, as if it doesn't exist
2164 at all; otherwise all dependences are considered. */
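/* For example, two partitions respectively containing

   |a[i] = b[i-1] + 1;
   |b[i] = a[i-1] * 2;

   (names illustrative) depend on each other across iterations, so they
   form a strongly connected component in the partition graph and have to
   be fused into a single sequential partition.  */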
2165
2166 static void
2167 merge_dep_scc_partitions (struct graph *rdg,
2168 vec<struct partition *> *partitions,
2169 bool ignore_alias_p)
2170 {
2171 struct partition *partition1, *partition2;
2172 struct pg_vdata *data;
2173 graph *pg = build_partition_graph (rdg, partitions, ignore_alias_p);
2174 int i, j, num_sccs = graphds_scc (pg, NULL);
2175
2176 /* A strongly connected component means a dependence cycle, so we cannot
2177 distribute its partitions. Fuse them together instead. */
2178 if ((unsigned) num_sccs < partitions->length ())
2179 {
2180 for (i = 0; i < num_sccs; ++i)
2181 {
2182 for (j = 0; partitions->iterate (j, &partition1); ++j)
2183 if (pg->vertices[j].component == i)
2184 break;
2185 for (j = j + 1; partitions->iterate (j, &partition2); ++j)
2186 if (pg->vertices[j].component == i)
2187 {
2188 partition_merge_into (NULL, partition1,
2189 partition2, FUSE_SAME_SCC);
2190 partition1->type = PTYPE_SEQUENTIAL;
2191 (*partitions)[j] = NULL;
2192 partition_free (partition2);
2193 data = (struct pg_vdata *)pg->vertices[j].data;
2194 data->partition = NULL;
2195 }
2196 }
2197 }
2198
2199 sort_partitions_by_post_order (pg, partitions);
2200 gcc_assert (partitions->length () == (unsigned)num_sccs);
2201 free_partition_graph_vdata (pg);
2202 free_graph (pg);
2203 }
2204
2205 /* Callback function for traversing edge E in graph G. DATA is private
2206 callback data. */
2207
2208 static void
2209 pg_collect_alias_ddrs (struct graph *g, struct graph_edge *e, void *data)
2210 {
2211 int i, j, component;
2212 struct pg_edge_callback_data *cbdata;
2213 struct pg_edata *edata = (struct pg_edata *) e->data;
2214
2215 /* If the edge doesn't have attached data dependence, it represents
2216 compilation time known dependences. This type of dependence cannot
2217 be resolved by runtime alias check. */
2218 if (edata == NULL || edata->alias_ddrs.length () == 0)
2219 return;
2220
2221 cbdata = (struct pg_edge_callback_data *) data;
2222 i = e->src;
2223 j = e->dest;
2224 component = cbdata->vertices_component[i];
2225 /* Vertices are topologically sorted according to compilation time
2226 known dependences, so we can break strongly connected components
2227 by removing edges of the opposite direction, i.e., edges pointing
2228 from the vertex with smaller post number to the vertex with bigger
2229 post number. */
2230 if (g->vertices[i].post < g->vertices[j].post
2231 /* We only need to remove edges connecting vertices in the same
2232 strong connected component to break it. */
2233 && component == cbdata->vertices_component[j]
2234 /* Check if we want to break the strong connected component or not. */
2235 && !bitmap_bit_p (cbdata->sccs_to_merge, component))
2236 cbdata->alias_ddrs->safe_splice (edata->alias_ddrs);
2237 }
2238
2239 /* This is the main function breaking strongly connected components in
2240 PARTITIONS given reduced dependence graph RDG. Store data dependence
2241 relations for runtime alias check in ALIAS_DDRS. */
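/* As a sketch, if the only dependence cycle between two partitions comes
   from possibly aliasing references like *p and *q (names illustrative),
   the corresponding "alias" edge can be removed and its ddrs recorded in
   ALIAS_DDRS; version_loop_by_alias_check later guards the distributed
   loops with a runtime check that the two accessed regions do not
   overlap.  */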
2242
2243 static void
2244 break_alias_scc_partitions (struct graph *rdg,
2245 vec<struct partition *> *partitions,
2246 vec<ddr_p> *alias_ddrs)
2247 {
2248 int i, j, k, num_sccs, num_sccs_no_alias;
2249 /* Build partition dependence graph. */
2250 graph *pg = build_partition_graph (rdg, partitions, false);
2251
2252 alias_ddrs->truncate (0);
2253 /* Find strongly connected components in the graph, with all dependence
2254 edges considered. */
2255 num_sccs = graphds_scc (pg, NULL);
2256 /* All SCCs now can be broken by runtime alias checks because SCCs caused by
2257 compilation time known dependences are merged before this function. */
2258 if ((unsigned) num_sccs < partitions->length ())
2259 {
2260 struct pg_edge_callback_data cbdata;
2261 auto_bitmap sccs_to_merge;
2262 auto_vec<enum partition_type> scc_types;
2263 struct partition *partition, *first;
2264
2265 /* If all partitions in an SCC have the same type, we can simply merge
2266 the SCC. This loop finds such SCCs and records them in the bitmap. */
2267 bitmap_set_range (sccs_to_merge, 0, (unsigned) num_sccs);
2268 for (i = 0; i < num_sccs; ++i)
2269 {
2270 for (j = 0; partitions->iterate (j, &first); ++j)
2271 if (pg->vertices[j].component == i)
2272 break;
2273 for (++j; partitions->iterate (j, &partition); ++j)
2274 {
2275 if (pg->vertices[j].component != i)
2276 continue;
2277
2278 /* Note we merge partitions of parallel type on purpose, though
2279 the resulting partition is sequential. The reason is that the
2280 vectorizer can do a more accurate runtime alias check in this
2281 case. Also it results in more conservative distribution. */
2282 if (first->type != partition->type)
2283 {
2284 bitmap_clear_bit (sccs_to_merge, i);
2285 break;
2286 }
2287 }
2288 }
2289
2290 /* Initialize callback data for traversing. */
2291 cbdata.sccs_to_merge = sccs_to_merge;
2292 cbdata.alias_ddrs = alias_ddrs;
2293 cbdata.vertices_component = XNEWVEC (int, pg->n_vertices);
2294 /* Record the component information which will be corrupted by the
2295 next graph SCC finding call. */
2296 for (i = 0; i < pg->n_vertices; ++i)
2297 cbdata.vertices_component[i] = pg->vertices[i].component;
2298
2299 /* Collect data dependences for runtime alias checks to break SCCs. */
2300 if (bitmap_count_bits (sccs_to_merge) != (unsigned) num_sccs)
2301 {
2302 /* Run SCC finding algorithm again, with alias dependence edges
2303 skipped. This is to topologically sort partitions according to
2304 compilation time known dependence. Note the topological order
2305 is stored in the form of pg's post order number. */
2306 num_sccs_no_alias = graphds_scc (pg, NULL, pg_skip_alias_edge);
2307 gcc_assert (partitions->length () == (unsigned) num_sccs_no_alias);
2308 /* With topological order, we can construct two subgraphs L and R.
2309 L contains edge <x, y> where x < y in terms of post order, while
2310 R contains edge <x, y> where x > y. Edges for compilation time
2311 known dependence all fall in R, so we break SCCs by removing all
2312 (alias) edges in subgraph L. */
2313 for_each_edge (pg, pg_collect_alias_ddrs, &cbdata);
2314 }
2315
2316 /* For SCC that doesn't need to be broken, merge it. */
2317 for (i = 0; i < num_sccs; ++i)
2318 {
2319 if (!bitmap_bit_p (sccs_to_merge, i))
2320 continue;
2321
2322 for (j = 0; partitions->iterate (j, &first); ++j)
2323 if (cbdata.vertices_component[j] == i)
2324 break;
2325 for (k = j + 1; partitions->iterate (k, &partition); ++k)
2326 {
2327 struct pg_vdata *data;
2328
2329 if (cbdata.vertices_component[k] != i)
2330 continue;
2331
2332 /* Update postorder number so that merged reduction partition is
2333 sorted after other partitions. */
2334 if (!partition_reduction_p (first)
2335 && partition_reduction_p (partition))
2336 {
2337 gcc_assert (pg->vertices[k].post < pg->vertices[j].post);
2338 pg->vertices[j].post = pg->vertices[k].post;
2339 }
2340 partition_merge_into (NULL, first, partition, FUSE_SAME_SCC);
2341 (*partitions)[k] = NULL;
2342 partition_free (partition);
2343 data = (struct pg_vdata *)pg->vertices[k].data;
2344 gcc_assert (data->id == k);
2345 data->partition = NULL;
2346 /* The result partition of merged SCC must be sequential. */
2347 first->type = PTYPE_SEQUENTIAL;
2348 }
2349 }
2350 }
2351
2352 sort_partitions_by_post_order (pg, partitions);
2353 free_partition_graph_vdata (pg);
2354 for_each_edge (pg, free_partition_graph_edata_cb, NULL);
2355 free_graph (pg);
2356
2357 if (dump_file && (dump_flags & TDF_DETAILS))
2358 {
2359 fprintf (dump_file, "Possible alias data dependence to break:\n");
2360 dump_data_dependence_relations (dump_file, *alias_ddrs);
2361 }
2362 }
2363
2364 /* Compute and return an expression whose value is the segment length which
2365 will be accessed by DR in NITERS iterations. */
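/* E.g., for a data reference with a DR_STEP of 4 bytes and NITERS == 100,
   the segment length is (100 - 1) * 4 == 396 bytes; the size of the last
   access itself is accounted for separately by the access size recorded
   in the alias check pair.  */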
2366
2367 static tree
2368 data_ref_segment_size (struct data_reference *dr, tree niters)
2369 {
2370 niters = size_binop (MINUS_EXPR,
2371 fold_convert (sizetype, niters),
2372 size_one_node);
2373 return size_binop (MULT_EXPR,
2374 fold_convert (sizetype, DR_STEP (dr)),
2375 fold_convert (sizetype, niters));
2376 }
2377
2378 /* Return true if LOOP's latch is dominated by statement for data reference
2379 DR. */
2380
2381 static inline bool
2382 latch_dominated_by_data_ref (struct loop *loop, data_reference *dr)
2383 {
2384 return dominated_by_p (CDI_DOMINATORS, single_exit (loop)->src,
2385 gimple_bb (DR_STMT (dr)));
2386 }
2387
2388 /* Compute alias check pairs and store them in COMP_ALIAS_PAIRS for LOOP's
2389 data dependence relations ALIAS_DDRS. */
2390
2391 static void
2392 compute_alias_check_pairs (struct loop *loop, vec<ddr_p> *alias_ddrs,
2393 vec<dr_with_seg_len_pair_t> *comp_alias_pairs)
2394 {
2395 unsigned int i;
2396 unsigned HOST_WIDE_INT factor = 1;
2397 tree niters_plus_one, niters = number_of_latch_executions (loop);
2398
2399 gcc_assert (niters != NULL_TREE && niters != chrec_dont_know);
2400 niters = fold_convert (sizetype, niters);
2401 niters_plus_one = size_binop (PLUS_EXPR, niters, size_one_node);
2402
2403 if (dump_file && (dump_flags & TDF_DETAILS))
2404 fprintf (dump_file, "Creating alias check pairs:\n");
2405
2406 /* Iterate all data dependence relations and compute alias check pairs. */
2407 for (i = 0; i < alias_ddrs->length (); i++)
2408 {
2409 ddr_p ddr = (*alias_ddrs)[i];
2410 struct data_reference *dr_a = DDR_A (ddr);
2411 struct data_reference *dr_b = DDR_B (ddr);
2412 tree seg_length_a, seg_length_b;
2413 int comp_res = data_ref_compare_tree (DR_BASE_ADDRESS (dr_a),
2414 DR_BASE_ADDRESS (dr_b));
2415
2416 if (comp_res == 0)
2417 comp_res = data_ref_compare_tree (DR_OFFSET (dr_a), DR_OFFSET (dr_b));
2418 gcc_assert (comp_res != 0);
2419
2420 if (latch_dominated_by_data_ref (loop, dr_a))
2421 seg_length_a = data_ref_segment_size (dr_a, niters_plus_one);
2422 else
2423 seg_length_a = data_ref_segment_size (dr_a, niters);
2424
2425 if (latch_dominated_by_data_ref (loop, dr_b))
2426 seg_length_b = data_ref_segment_size (dr_b, niters_plus_one);
2427 else
2428 seg_length_b = data_ref_segment_size (dr_b, niters);
2429
2430 unsigned HOST_WIDE_INT access_size_a
2431 = tree_to_uhwi (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr_a))));
2432 unsigned HOST_WIDE_INT access_size_b
2433 = tree_to_uhwi (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr_b))));
2434 unsigned int align_a = TYPE_ALIGN_UNIT (TREE_TYPE (DR_REF (dr_a)));
2435 unsigned int align_b = TYPE_ALIGN_UNIT (TREE_TYPE (DR_REF (dr_b)));
2436
2437 dr_with_seg_len_pair_t dr_with_seg_len_pair
2438 (dr_with_seg_len (dr_a, seg_length_a, access_size_a, align_a),
2439 dr_with_seg_len (dr_b, seg_length_b, access_size_b, align_b));
2440
2441 /* Canonicalize pairs by sorting the two DR members. */
2442 if (comp_res > 0)
2443 std::swap (dr_with_seg_len_pair.first, dr_with_seg_len_pair.second);
2444
2445 comp_alias_pairs->safe_push (dr_with_seg_len_pair);
2446 }
2447
2448 if (tree_fits_uhwi_p (niters))
2449 factor = tree_to_uhwi (niters);
2450
2451 /* Prune alias check pairs. */
2452 prune_runtime_alias_test_list (comp_alias_pairs, factor);
2453 if (dump_file && (dump_flags & TDF_DETAILS))
2454 fprintf (dump_file,
2455 "Improved number of alias checks from %d to %d\n",
2456 alias_ddrs->length (), comp_alias_pairs->length ());
2457 }
2458
2459 /* Given data dependence relations in ALIAS_DDRS, generate runtime alias
2460 checks and version LOOP under condition of these runtime alias checks. */
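/* Roughly, the versioned CFG looks like

   |if (.LOOP_DIST_ALIAS (<new loop num>, <runtime alias checks>))
   |  <loop to be distributed>
   |else
   |  <copied loop, kept undistributed and marked dont_vectorize>

   where the IFN_LOOP_DIST_ALIAS call is only emitted when the vectorizer
   is enabled and is folded by the vectorizer later.  */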
2461
2462 static void
2463 version_loop_by_alias_check (struct loop *loop, vec<ddr_p> *alias_ddrs)
2464 {
2465 profile_probability prob;
2466 basic_block cond_bb;
2467 struct loop *nloop;
2468 tree lhs, arg0, cond_expr = NULL_TREE;
2469 gimple_seq cond_stmts = NULL;
2470 gimple *call_stmt = NULL;
2471 auto_vec<dr_with_seg_len_pair_t> comp_alias_pairs;
2472
2473 /* Generate code for runtime alias checks if necessary. */
2474 gcc_assert (alias_ddrs->length () > 0);
2475
2476 if (dump_file && (dump_flags & TDF_DETAILS))
2477 fprintf (dump_file,
2478 "Version loop <%d> with runtime alias check\n", loop->num);
2479
2480 compute_alias_check_pairs (loop, alias_ddrs, &comp_alias_pairs);
2481 create_runtime_alias_checks (loop, &comp_alias_pairs, &cond_expr);
2482 cond_expr = force_gimple_operand_1 (cond_expr, &cond_stmts,
2483 is_gimple_val, NULL_TREE);
2484
2485 /* Depend on vectorizer to fold IFN_LOOP_DIST_ALIAS. */
2486 if (flag_tree_loop_vectorize)
2487 {
2488 /* Generate internal function call for loop distribution alias check. */
2489 call_stmt = gimple_build_call_internal (IFN_LOOP_DIST_ALIAS,
2490 2, NULL_TREE, cond_expr);
2491 lhs = make_ssa_name (boolean_type_node);
2492 gimple_call_set_lhs (call_stmt, lhs);
2493 }
2494 else
2495 lhs = cond_expr;
2496
2497 prob = profile_probability::guessed_always ().apply_scale (9, 10);
2498 initialize_original_copy_tables ();
2499 nloop = loop_version (loop, lhs, &cond_bb, prob, prob.invert (),
2500 prob, prob.invert (), true);
2501 free_original_copy_tables ();
2502 /* Record the original loop number in newly generated loops. In case of
2503 distribution, the original loop will be distributed and the new loop
2504 is kept. */
2505 loop->orig_loop_num = nloop->num;
2506 nloop->orig_loop_num = nloop->num;
2507 nloop->dont_vectorize = true;
2508 nloop->force_vectorize = false;
2509
2510 if (call_stmt)
2511 {
2512 /* Record new loop's num in IFN_LOOP_DIST_ALIAS because the original
2513 loop could be destroyed. */
2514 arg0 = build_int_cst (integer_type_node, loop->orig_loop_num);
2515 gimple_call_set_arg (call_stmt, 0, arg0);
2516 gimple_seq_add_stmt_without_update (&cond_stmts, call_stmt);
2517 }
2518
2519 if (cond_stmts)
2520 {
2521 gimple_stmt_iterator cond_gsi = gsi_last_bb (cond_bb);
2522 gsi_insert_seq_before (&cond_gsi, cond_stmts, GSI_SAME_STMT);
2523 }
2524 update_ssa (TODO_update_ssa);
2525 }
2526
2527 /* Return true if loop versioning is needed to distribute PARTITIONS.
2528 ALIAS_DDRS are data dependence relations for runtime alias check. */
2529
2530 static inline bool
2531 version_for_distribution_p (vec<struct partition *> *partitions,
2532 vec<ddr_p> *alias_ddrs)
2533 {
2534 /* No need to version loop if we have only one partition. */
2535 if (partitions->length () == 1)
2536 return false;
2537
2538 /* Need to version loop if runtime alias check is necessary. */
2539 return (alias_ddrs->length () > 0);
2540 }
2541
2542 /* Compare base offset of builtin mem* partitions P1 and P2. */
2543
2544 static bool
2545 offset_cmp (struct partition *p1, struct partition *p2)
2546 {
2547 gcc_assert (p1 != NULL && p1->builtin != NULL);
2548 gcc_assert (p2 != NULL && p2->builtin != NULL);
2549 return p1->builtin->dst_base_offset < p2->builtin->dst_base_offset;
2550 }
2551
2552 /* Fuse adjacent memset builtin PARTITIONS if possible. This is a special
2553 case optimization transforming the code below:
2554
2555 __builtin_memset (&obj, 0, 100);
2556 _1 = &obj + 100;
2557 __builtin_memset (_1, 0, 200);
2558 _2 = &obj + 300;
2559 __builtin_memset (_2, 0, 100);
2560
2561 into:
2562
2563 __builtin_memset (&obj, 0, 400);
2564
2565 Note we don't have dependence information between different partitions
2566 at this point; as a result, we can't handle non-adjacent memset builtin
2567 partitions since dependences might be broken. */
2568
2569 static void
2570 fuse_memset_builtins (vec<struct partition *> *partitions)
2571 {
2572 unsigned i, j;
2573 struct partition *part1, *part2;
2574 tree rhs1, rhs2;
2575
2576 for (i = 0; partitions->iterate (i, &part1);)
2577 {
2578 if (part1->kind != PKIND_MEMSET)
2579 {
2580 i++;
2581 continue;
2582 }
2583
2584 /* Find sub-array of memset builtins of the same base. Index range
2585 of the sub-array is [i, j) with "j > i". */
2586 for (j = i + 1; partitions->iterate (j, &part2); ++j)
2587 {
2588 if (part2->kind != PKIND_MEMSET
2589 || !operand_equal_p (part1->builtin->dst_base_base,
2590 part2->builtin->dst_base_base, 0))
2591 break;
2592
2593 /* Memset calls setting different values can't be merged. */
2594 rhs1 = gimple_assign_rhs1 (DR_STMT (part1->builtin->dst_dr));
2595 rhs2 = gimple_assign_rhs1 (DR_STMT (part2->builtin->dst_dr));
2596 if (!operand_equal_p (rhs1, rhs2, 0))
2597 break;
2598 }
2599
2600 /* Stable sort is required in order to avoid breaking dependence. */
2601 std::stable_sort (&(*partitions)[i],
2602 &(*partitions)[i] + j - i, offset_cmp);
2603 /* Continue with next partition. */
2604 i = j;
2605 }
2606
2607 /* Merge all consecutive memset builtin partitions. */
2608 for (i = 0; i < partitions->length () - 1;)
2609 {
2610 part1 = (*partitions)[i];
2611 if (part1->kind != PKIND_MEMSET)
2612 {
2613 i++;
2614 continue;
2615 }
2616
2617 part2 = (*partitions)[i + 1];
2618 /* Only merge memset partitions of the same base and with constant
2619 access sizes. */
2620 if (part2->kind != PKIND_MEMSET
2621 || TREE_CODE (part1->builtin->size) != INTEGER_CST
2622 || TREE_CODE (part2->builtin->size) != INTEGER_CST
2623 || !operand_equal_p (part1->builtin->dst_base_base,
2624 part2->builtin->dst_base_base, 0))
2625 {
2626 i++;
2627 continue;
2628 }
2629 rhs1 = gimple_assign_rhs1 (DR_STMT (part1->builtin->dst_dr));
2630 rhs2 = gimple_assign_rhs1 (DR_STMT (part2->builtin->dst_dr));
2631 int bytev1 = const_with_all_bytes_same (rhs1);
2632 int bytev2 = const_with_all_bytes_same (rhs2);
2633 /* Only merge memset partitions of the same value. */
2634 if (bytev1 != bytev2 || bytev1 == -1)
2635 {
2636 i++;
2637 continue;
2638 }
2639 wide_int end1 = wi::add (part1->builtin->dst_base_offset,
2640 wi::to_wide (part1->builtin->size));
2641 /* Only merge adjacent memset partitions. */
2642 if (wi::ne_p (end1, part2->builtin->dst_base_offset))
2643 {
2644 i++;
2645 continue;
2646 }
2647 /* Merge partitions[i] and partitions[i+1]. */
2648 part1->builtin->size = fold_build2 (PLUS_EXPR, sizetype,
2649 part1->builtin->size,
2650 part2->builtin->size);
2651 partition_free (part2);
2652 partitions->ordered_remove (i + 1);
2653 }
2654 }
2655
2656 /* Fuse PARTITIONS of LOOP if necessary before finalizing distribution.
2657 ALIAS_DDRS contains ddrs which need runtime alias check. */
2658
2659 static void
2660 finalize_partitions (struct loop *loop, vec<struct partition *> *partitions,
2661 vec<ddr_p> *alias_ddrs)
2662 {
2663 unsigned i;
2664 struct partition *partition, *a;
2665
2666 if (partitions->length () == 1
2667 || alias_ddrs->length () > 0)
2668 return;
2669
2670 unsigned num_builtin = 0, num_normal = 0, num_partial_memset = 0;
2671 bool same_type_p = true;
2672 enum partition_type type = ((*partitions)[0])->type;
2673 for (i = 0; partitions->iterate (i, &partition); ++i)
2674 {
2675 same_type_p &= (type == partition->type);
2676 if (partition_builtin_p (partition))
2677 {
2678 num_builtin++;
2679 continue;
2680 }
2681 num_normal++;
2682 if (partition->kind == PKIND_PARTIAL_MEMSET)
2683 num_partial_memset++;
2684 }
2685
2686 /* Don't distribute the current loop into too many loops given we don't
2687 have a memory stream cost model. Be even more conservative in case of
2688 loop nest distribution. */
2689 if ((same_type_p && num_builtin == 0
2690 && (loop->inner == NULL || num_normal != 2 || num_partial_memset != 1))
2691 || (loop->inner != NULL
2692 && i >= NUM_PARTITION_THRESHOLD && num_normal > 1)
2693 || (loop->inner == NULL
2694 && i >= NUM_PARTITION_THRESHOLD && num_normal > num_builtin))
2695 {
2696 a = (*partitions)[0];
2697 for (i = 1; partitions->iterate (i, &partition); ++i)
2698 {
2699 partition_merge_into (NULL, a, partition, FUSE_FINALIZE);
2700 partition_free (partition);
2701 }
2702 partitions->truncate (1);
2703 }
2704
2705 /* Fuse memset builtins if possible. */
2706 if (partitions->length () > 1)
2707 fuse_memset_builtins (partitions);
2708 }
2709
2710 /* Distributes the code from LOOP in such a way that producer statements
2711 are placed before consumer statements. Tries to separate only the
2712 statements from STMTS into separate loops. Returns the number of
2713 distributed loops. Set NB_CALLS to number of generated builtin calls.
2714 Set *DESTROY_P to whether LOOP needs to be destroyed. */
2715
2716 static int
2717 distribute_loop (struct loop *loop, vec<gimple *> stmts,
2718 control_dependences *cd, int *nb_calls, bool *destroy_p)
2719 {
2720 ddrs_table = new hash_table<ddr_hasher> (389);
2721 struct graph *rdg;
2722 partition *partition;
2723 bool any_builtin;
2724 int i, nbp;
2725
2726 *destroy_p = false;
2727 *nb_calls = 0;
2728 loop_nest.create (0);
2729 if (!find_loop_nest (loop, &loop_nest))
2730 {
2731 loop_nest.release ();
2732 delete ddrs_table;
2733 return 0;
2734 }
2735
2736 datarefs_vec.create (20);
2737 rdg = build_rdg (loop, cd);
2738 if (!rdg)
2739 {
2740 if (dump_file && (dump_flags & TDF_DETAILS))
2741 fprintf (dump_file,
2742 "Loop %d not distributed: failed to build the RDG.\n",
2743 loop->num);
2744
2745 loop_nest.release ();
2746 free_data_refs (datarefs_vec);
2747 delete ddrs_table;
2748 return 0;
2749 }
2750
2751 if (datarefs_vec.length () > MAX_DATAREFS_NUM)
2752 {
2753 if (dump_file && (dump_flags & TDF_DETAILS))
2754 fprintf (dump_file,
2755 "Loop %d not distributed: too many memory references.\n",
2756 loop->num);
2757
2758 free_rdg (rdg);
2759 loop_nest.release ();
2760 free_data_refs (datarefs_vec);
2761 delete ddrs_table;
2762 return 0;
2763 }
2764
2765 data_reference_p dref;
2766 for (i = 0; datarefs_vec.iterate (i, &dref); ++i)
2767 dref->aux = (void *) (uintptr_t) i;
2768
2769 if (dump_file && (dump_flags & TDF_DETAILS))
2770 dump_rdg (dump_file, rdg);
2771
2772 auto_vec<struct partition *, 3> partitions;
2773 rdg_build_partitions (rdg, stmts, &partitions);
2774
2775 auto_vec<ddr_p> alias_ddrs;
2776
2777 auto_bitmap stmt_in_all_partitions;
2778 bitmap_copy (stmt_in_all_partitions, partitions[0]->stmts);
2779 for (i = 1; partitions.iterate (i, &partition); ++i)
2780 bitmap_and_into (stmt_in_all_partitions, partitions[i]->stmts);
2781
2782 any_builtin = false;
2783 FOR_EACH_VEC_ELT (partitions, i, partition)
2784 {
2785 classify_partition (loop, rdg, partition, stmt_in_all_partitions);
2786 any_builtin |= partition_builtin_p (partition);
2787 }
2788
2789 /* If we are only distributing patterns but did not detect any,
2790 simply bail out. */
2791 if (!flag_tree_loop_distribution
2792 && !any_builtin)
2793 {
2794 nbp = 0;
2795 goto ldist_done;
2796 }
2797
2798 /* If we are only distributing patterns fuse all partitions that
2799 were not classified as builtins. This also avoids chopping
2800 a loop into pieces, separated by builtin calls. That is, we
2801 only want no or a single loop body remaining. */
2802 struct partition *into;
2803 if (!flag_tree_loop_distribution)
2804 {
2805 for (i = 0; partitions.iterate (i, &into); ++i)
2806 if (!partition_builtin_p (into))
2807 break;
2808 for (++i; partitions.iterate (i, &partition); ++i)
2809 if (!partition_builtin_p (partition))
2810 {
2811 partition_merge_into (NULL, into, partition, FUSE_NON_BUILTIN);
2812 partitions.unordered_remove (i);
2813 partition_free (partition);
2814 i--;
2815 }
2816 }
2817
2818 /* Due to limitations in the transform phase we have to fuse all
2819 reduction partitions into the last partition so the existing
2820 loop will contain all loop-closed PHI nodes. */
2821 for (i = 0; partitions.iterate (i, &into); ++i)
2822 if (partition_reduction_p (into))
2823 break;
2824 for (i = i + 1; partitions.iterate (i, &partition); ++i)
2825 if (partition_reduction_p (partition))
2826 {
2827 partition_merge_into (rdg, into, partition, FUSE_REDUCTION);
2828 partitions.unordered_remove (i);
2829 partition_free (partition);
2830 i--;
2831 }
2832
2833 /* Apply our simple cost model - fuse partitions with similar
2834 memory accesses. */
2835 for (i = 0; partitions.iterate (i, &into); ++i)
2836 {
2837 bool changed = false;
2838 if (partition_builtin_p (into) || into->kind == PKIND_PARTIAL_MEMSET)
2839 continue;
2840 for (int j = i + 1;
2841 partitions.iterate (j, &partition); ++j)
2842 {
2843 if (share_memory_accesses (rdg, into, partition))
2844 {
2845 partition_merge_into (rdg, into, partition, FUSE_SHARE_REF);
2846 partitions.unordered_remove (j);
2847 partition_free (partition);
2848 j--;
2849 changed = true;
2850 }
2851 }
2852 /* If in the first pass we fused 0 1 2 into "0,2" and 1 because 0 and 2
2853 have similar accesses, and 1 and 2 have similar accesses but 0 and 1
2854 do not, then in the next iteration we will fail to consider merging
2855 1 into 0,2. So try again if we did any merging into 0. */
2856 if (changed)
2857 i--;
2858 }
2859
2860 /* Build the partition dependency graph and fuse partitions in strongly
2861 connected components. */
2862 if (partitions.length () > 1)
2863 {
2864 /* Don't support loop nest distribution under runtime alias check
2865 since it's not likely to enable many vectorization opportunities. */
2866 if (loop->inner)
2867 merge_dep_scc_partitions (rdg, &partitions, false);
2868 else
2869 {
2870 merge_dep_scc_partitions (rdg, &partitions, true);
2871 if (partitions.length () > 1)
2872 break_alias_scc_partitions (rdg, &partitions, &alias_ddrs);
2873 }
2874 }
2875
2876 finalize_partitions (loop, &partitions, &alias_ddrs);
2877
2878 nbp = partitions.length ();
2879 if (nbp == 0
2880 || (nbp == 1 && !partition_builtin_p (partitions[0]))
2881 || (nbp > 1 && partition_contains_all_rw (rdg, partitions)))
2882 {
2883 nbp = 0;
2884 goto ldist_done;
2885 }
2886
2887 if (version_for_distribution_p (&partitions, &alias_ddrs))
2888 version_loop_by_alias_check (loop, &alias_ddrs);
2889
2890 if (dump_file && (dump_flags & TDF_DETAILS))
2891 {
2892 fprintf (dump_file,
2893 "distribute loop <%d> into partitions:\n", loop->num);
2894 dump_rdg_partitions (dump_file, partitions);
2895 }
2896
2897 FOR_EACH_VEC_ELT (partitions, i, partition)
2898 {
2899 if (partition_builtin_p (partition))
2900 (*nb_calls)++;
2901 *destroy_p |= generate_code_for_partition (loop, partition, i < nbp - 1);
2902 }
2903
2904 ldist_done:
2905 loop_nest.release ();
2906 free_data_refs (datarefs_vec);
2907 for (hash_table<ddr_hasher>::iterator iter = ddrs_table->begin ();
2908 iter != ddrs_table->end (); ++iter)
2909 {
2910 free_dependence_relation (*iter);
2911 *iter = NULL;
2912 }
2913 delete ddrs_table;
2914
2915 FOR_EACH_VEC_ELT (partitions, i, partition)
2916 partition_free (partition);
2917
2918 free_rdg (rdg);
2919 return nbp - *nb_calls;
2920 }
2921
2922 /* Distribute all loops in the current function. */
2923
2924 namespace {
2925
2926 const pass_data pass_data_loop_distribution =
2927 {
2928 GIMPLE_PASS, /* type */
2929 "ldist", /* name */
2930 OPTGROUP_LOOP, /* optinfo_flags */
2931 TV_TREE_LOOP_DISTRIBUTION, /* tv_id */
2932 ( PROP_cfg | PROP_ssa ), /* properties_required */
2933 0, /* properties_provided */
2934 0, /* properties_destroyed */
2935 0, /* todo_flags_start */
2936 0, /* todo_flags_finish */
2937 };
2938
2939 class pass_loop_distribution : public gimple_opt_pass
2940 {
2941 public:
2942 pass_loop_distribution (gcc::context *ctxt)
2943 : gimple_opt_pass (pass_data_loop_distribution, ctxt)
2944 {}
2945
2946 /* opt_pass methods: */
2947 virtual bool gate (function *)
2948 {
2949 return flag_tree_loop_distribution
2950 || flag_tree_loop_distribute_patterns;
2951 }
2952
2953 virtual unsigned int execute (function *);
2954
2955 }; // class pass_loop_distribution
2956
2957
2958 /* Given LOOP, this function records seed statements for distribution in
2959 WORK_LIST. Return false if there is nothing for distribution. */
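/* E.g., in

   |for (i = 0; i < n; ++i)
   |  { a[i] = f (i); s = s + b[i]; }

   (names illustrative) the store to a[i] is a seed statement, and so is
   the statement defining s whenever s is used after the loop.  */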
2960
2961 static bool
2962 find_seed_stmts_for_distribution (struct loop *loop, vec<gimple *> *work_list)
2963 {
2964 basic_block *bbs = get_loop_body_in_dom_order (loop);
2965
2966 /* Initialize the worklist with stmts we seed the partitions with. */
2967 for (unsigned i = 0; i < loop->num_nodes; ++i)
2968 {
2969 for (gphi_iterator gsi = gsi_start_phis (bbs[i]);
2970 !gsi_end_p (gsi); gsi_next (&gsi))
2971 {
2972 gphi *phi = gsi.phi ();
2973 if (virtual_operand_p (gimple_phi_result (phi)))
2974 continue;
2975 /* Distribute stmts which have defs that are used outside of
2976 the loop. */
2977 if (!stmt_has_scalar_dependences_outside_loop (loop, phi))
2978 continue;
2979 work_list->safe_push (phi);
2980 }
2981 for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
2982 !gsi_end_p (gsi); gsi_next (&gsi))
2983 {
2984 gimple *stmt = gsi_stmt (gsi);
2985
2986 /* If there is a stmt with side-effects bail out - we
2987 cannot and should not distribute this loop. */
2988 if (gimple_has_side_effects (stmt))
2989 {
2990 free (bbs);
2991 return false;
2992 }
2993
2994 /* Distribute stmts which have defs that are used outside of
2995 the loop. */
2996 if (stmt_has_scalar_dependences_outside_loop (loop, stmt))
2997 ;
2998 /* Otherwise only distribute stores for now. */
2999 else if (!gimple_vdef (stmt))
3000 continue;
3001
3002 work_list->safe_push (stmt);
3003 }
3004 }
3005 free (bbs);
3006 return work_list->length () > 0;
3007 }
3008
3009 /* Given innermost LOOP, return the outermost enclosing loop that forms a
3010 perfect loop nest. */
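/* E.g., starting from the innermost j-loop of the perfect nest

   |for (i = 0; i < m; ++i)
   |  for (j = 0; j < n; ++j)
   |    a[i][j] = 0;

   (names illustrative) this may return the outer i-loop, provided the
   outer loop has a single exit and a computable number of latch
   executions.  */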
3011
3012 static struct loop *
3013 prepare_perfect_loop_nest (struct loop *loop)
3014 {
3015 struct loop *outer = loop_outer (loop);
3016 tree niters = number_of_latch_executions (loop);
3017
3018 /* TODO: We only support the innermost 3-level loop nest distribution
3019 because of compilation time issue for now. This should be relaxed
3020 in the future. Note we only allow 3-level loop nest distribution
3021 when parallelizing loops. */
3022 while ((loop->inner == NULL
3023 || (loop->inner->inner == NULL && flag_tree_parallelize_loops > 1))
3024 && loop_outer (outer)
3025 && outer->inner == loop && loop->next == NULL
3026 && single_exit (outer)
3027 && optimize_loop_for_speed_p (outer)
3028 && !chrec_contains_symbols_defined_in_loop (niters, outer->num)
3029 && (niters = number_of_latch_executions (outer)) != NULL_TREE
3030 && niters != chrec_dont_know)
3031 {
3032 loop = outer;
3033 outer = loop_outer (loop);
3034 }
3035
3036 return loop;
3037 }
3038
3039 unsigned int
3040 pass_loop_distribution::execute (function *fun)
3041 {
3042 struct loop *loop;
3043 bool changed = false;
3044 basic_block bb;
3045 control_dependences *cd = NULL;
3046 auto_vec<loop_p> loops_to_be_destroyed;
3047
3048 if (number_of_loops (fun) <= 1)
3049 return 0;
3050
3051 /* Compute topological order for basic blocks. Topological order is
3052 needed because data dependence is computed for data references in
3053 lexicographical order. */
3054 if (bb_top_order_index == NULL)
3055 {
3056 int rpo_num;
3057 int *rpo = XNEWVEC (int, last_basic_block_for_fn (cfun));
3058
3059 bb_top_order_index = XNEWVEC (int, last_basic_block_for_fn (cfun));
3060 bb_top_order_index_size = last_basic_block_for_fn (cfun);
3061 rpo_num = pre_and_rev_post_order_compute_fn (cfun, NULL, rpo, true);
3062 for (int i = 0; i < rpo_num; i++)
3063 bb_top_order_index[rpo[i]] = i;
3064
3065 free (rpo);
3066 }
3067
3068 FOR_ALL_BB_FN (bb, fun)
3069 {
3070 gimple_stmt_iterator gsi;
3071 for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
3072 gimple_set_uid (gsi_stmt (gsi), -1);
3073 for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
3074 gimple_set_uid (gsi_stmt (gsi), -1);
3075 }
3076
3077 /* We can at the moment only distribute non-nested loops, thus restrict
3078 walking to innermost loops. */
3079 FOR_EACH_LOOP (loop, LI_ONLY_INNERMOST)
3080 {
3081 /* Don't distribute loops with multiple exit edges, or cold loops. */
3082 if (!single_exit (loop)
3083 || !optimize_loop_for_speed_p (loop))
3084 continue;
3085
3086 /* Don't distribute loop if niters is unknown. */
3087 tree niters = number_of_latch_executions (loop);
3088 if (niters == NULL_TREE || niters == chrec_dont_know)
3089 continue;
3090
3091 /* Get the perfect loop nest for distribution. */
3092 loop = prepare_perfect_loop_nest (loop);
3093 for (; loop; loop = loop->inner)
3094 {
3095 auto_vec<gimple *> work_list;
3096 if (!find_seed_stmts_for_distribution (loop, &work_list))
3097 break;
3098
3099 const char *str = loop->inner ? " nest" : "";
3100 location_t loc = find_loop_location (loop);
3101 if (!cd)
3102 {
3103 calculate_dominance_info (CDI_DOMINATORS);
3104 calculate_dominance_info (CDI_POST_DOMINATORS);
3105 cd = new control_dependences ();
3106 free_dominance_info (CDI_POST_DOMINATORS);
3107 }
3108
3109 bool destroy_p;
3110 int nb_generated_loops, nb_generated_calls;
3111 nb_generated_loops = distribute_loop (loop, work_list, cd,
3112 &nb_generated_calls,
3113 &destroy_p);
3114 if (destroy_p)
3115 loops_to_be_destroyed.safe_push (loop);
3116
3117 if (nb_generated_loops + nb_generated_calls > 0)
3118 {
3119 changed = true;
3120 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS,
3121 loc, "Loop%s %d distributed: split to %d loops "
3122 "and %d library calls.\n", str, loop->num,
3123 nb_generated_loops, nb_generated_calls);
3124
3125 break;
3126 }
3127
3128 if (dump_file && (dump_flags & TDF_DETAILS))
3129 fprintf (dump_file, "Loop%s %d not distributed.\n", str, loop->num);
3130 }
3131 }
3132
3133 if (cd)
3134 delete cd;
3135
3136 if (bb_top_order_index != NULL)
3137 {
3138 free (bb_top_order_index);
3139 bb_top_order_index = NULL;
3140 bb_top_order_index_size = 0;
3141 }
3142
3143 if (changed)
3144 {
3145 /* Destroy loop bodies that could not be reused. Do this late as we
3146 otherwise can end up referring to stale data in control dependences. */
3147 unsigned i;
3148 FOR_EACH_VEC_ELT (loops_to_be_destroyed, i, loop)
3149 destroy_loop (loop);
3150
3151 /* Cached scalar evolutions now may refer to wrong or non-existing
3152 loops. */
3153 scev_reset_htab ();
3154 mark_virtual_operands_for_renaming (fun);
3155 rewrite_into_loop_closed_ssa (NULL, TODO_update_ssa);
3156 }
3157
3158 checking_verify_loop_structure ();
3159
3160 return changed ? TODO_cleanup_cfg : 0;
3161 }
3162
3163 } // anon namespace
3164
3165 gimple_opt_pass *
3166 make_pass_loop_distribution (gcc::context *ctxt)
3167 {
3168 return new pass_loop_distribution (ctxt);
3169 }
3170