1 /* Loop distribution. 2 Copyright (C) 2006-2018 Free Software Foundation, Inc. 3 Contributed by Georges-Andre Silber <Georges-Andre.Silber@ensmp.fr> 4 and Sebastian Pop <sebastian.pop@amd.com>. 5 6 This file is part of GCC. 7 8 GCC is free software; you can redistribute it and/or modify it 9 under the terms of the GNU General Public License as published by the 10 Free Software Foundation; either version 3, or (at your option) any 11 later version. 12 13 GCC is distributed in the hope that it will be useful, but WITHOUT 14 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 16 for more details. 17 18 You should have received a copy of the GNU General Public License 19 along with GCC; see the file COPYING3. If not see 20 <http://www.gnu.org/licenses/>. */ 21 22 /* This pass performs loop distribution: for example, the loop 23 24 |DO I = 2, N 25 | A(I) = B(I) + C 26 | D(I) = A(I-1)*E 27 |ENDDO 28 29 is transformed to 30 31 |DOALL I = 2, N 32 | A(I) = B(I) + C 33 |ENDDO 34 | 35 |DOALL I = 2, N 36 | D(I) = A(I-1)*E 37 |ENDDO 38 39 Loop distribution is the dual of loop fusion. It separates statements 40 of a loop (or loop nest) into multiple loops (or loop nests) with the 41 same loop header. The major goal is to separate statements which may 42 be vectorized from those that can't. This pass implements distribution 43 in the following steps: 44 45 1) Seed partitions with specific type statements. For now we support 46 two types seed statements: statement defining variable used outside 47 of loop; statement storing to memory. 48 2) Build reduced dependence graph (RDG) for loop to be distributed. 49 The vertices (RDG:V) model all statements in the loop and the edges 50 (RDG:E) model flow and control dependencies between statements. 51 3) Apart from RDG, compute data dependencies between memory references. 52 4) Starting from seed statement, build up partition by adding depended 53 statements according to RDG's dependence information. Partition is 54 classified as parallel type if it can be executed paralleled; or as 55 sequential type if it can't. Parallel type partition is further 56 classified as different builtin kinds if it can be implemented as 57 builtin function calls. 58 5) Build partition dependence graph (PG) based on data dependencies. 59 The vertices (PG:V) model all partitions and the edges (PG:E) model 60 all data dependencies between every partitions pair. In general, 61 data dependence is either compilation time known or unknown. In C 62 family languages, there exists quite amount compilation time unknown 63 dependencies because of possible alias relation of data references. 64 We categorize PG's edge to two types: "true" edge that represents 65 compilation time known data dependencies; "alias" edge for all other 66 data dependencies. 67 6) Traverse subgraph of PG as if all "alias" edges don't exist. Merge 68 partitions in each strong connected component (SCC) correspondingly. 69 Build new PG for merged partitions. 70 7) Traverse PG again and this time with both "true" and "alias" edges 71 included. We try to break SCCs by removing some edges. Because 72 SCCs by "true" edges are all fused in step 6), we can break SCCs 73 by removing some "alias" edges. It's NP-hard to choose optimal 74 edge set, fortunately simple approximation is good enough for us 75 given the small problem scale. 76 8) Collect all data dependencies of the removed "alias" edges. Create 77 runtime alias checks for collected data dependencies. 78 9) Version loop under the condition of runtime alias checks. Given 79 loop distribution generally introduces additional overhead, it is 80 only useful if vectorization is achieved in distributed loop. We 81 version loop with internal function call IFN_LOOP_DIST_ALIAS. If 82 no distributed loop can be vectorized, we simply remove distributed 83 loops and recover to the original one. 84 85 TODO: 86 1) We only distribute innermost two-level loop nest now. We should 87 extend it for arbitrary loop nests in the future. 88 2) We only fuse partitions in SCC now. A better fusion algorithm is 89 desired to minimize loop overhead, maximize parallelism and maximize 90 data reuse. */ 91 92 #include "config.h" 93 #define INCLUDE_ALGORITHM /* stable_sort */ 94 #include "system.h" 95 #include "coretypes.h" 96 #include "backend.h" 97 #include "tree.h" 98 #include "gimple.h" 99 #include "cfghooks.h" 100 #include "tree-pass.h" 101 #include "ssa.h" 102 #include "gimple-pretty-print.h" 103 #include "fold-const.h" 104 #include "cfganal.h" 105 #include "gimple-iterator.h" 106 #include "gimplify-me.h" 107 #include "stor-layout.h" 108 #include "tree-cfg.h" 109 #include "tree-ssa-loop-manip.h" 110 #include "tree-ssa-loop-ivopts.h" 111 #include "tree-ssa-loop.h" 112 #include "tree-into-ssa.h" 113 #include "tree-ssa.h" 114 #include "cfgloop.h" 115 #include "tree-scalar-evolution.h" 116 #include "params.h" 117 #include "tree-vectorizer.h" 118 119 120 #define MAX_DATAREFS_NUM \ 121 ((unsigned) PARAM_VALUE (PARAM_LOOP_MAX_DATAREFS_FOR_DATADEPS)) 122 123 /* Threshold controlling number of distributed partitions. Given it may 124 be unnecessary if a memory stream cost model is invented in the future, 125 we define it as a temporary macro, rather than a parameter. */ 126 #define NUM_PARTITION_THRESHOLD (4) 127 128 /* Hashtable helpers. */ 129 130 struct ddr_hasher : nofree_ptr_hash <struct data_dependence_relation> 131 { 132 static inline hashval_t hash (const data_dependence_relation *); 133 static inline bool equal (const data_dependence_relation *, 134 const data_dependence_relation *); 135 }; 136 137 /* Hash function for data dependence. */ 138 139 inline hashval_t 140 ddr_hasher::hash (const data_dependence_relation *ddr) 141 { 142 inchash::hash h; 143 h.add_ptr (DDR_A (ddr)); 144 h.add_ptr (DDR_B (ddr)); 145 return h.end (); 146 } 147 148 /* Hash table equality function for data dependence. */ 149 150 inline bool 151 ddr_hasher::equal (const data_dependence_relation *ddr1, 152 const data_dependence_relation *ddr2) 153 { 154 return (DDR_A (ddr1) == DDR_A (ddr2) && DDR_B (ddr1) == DDR_B (ddr2)); 155 } 156 157 /* The loop (nest) to be distributed. */ 158 static vec<loop_p> loop_nest; 159 160 /* Vector of data references in the loop to be distributed. */ 161 static vec<data_reference_p> datarefs_vec; 162 163 /* Store index of data reference in aux field. */ 164 #define DR_INDEX(dr) ((uintptr_t) (dr)->aux) 165 166 /* Hash table for data dependence relation in the loop to be distributed. */ 167 static hash_table<ddr_hasher> *ddrs_table; 168 169 /* A Reduced Dependence Graph (RDG) vertex representing a statement. */ 170 struct rdg_vertex 171 { 172 /* The statement represented by this vertex. */ 173 gimple *stmt; 174 175 /* Vector of data-references in this statement. */ 176 vec<data_reference_p> datarefs; 177 178 /* True when the statement contains a write to memory. */ 179 bool has_mem_write; 180 181 /* True when the statement contains a read from memory. */ 182 bool has_mem_reads; 183 }; 184 185 #define RDGV_STMT(V) ((struct rdg_vertex *) ((V)->data))->stmt 186 #define RDGV_DATAREFS(V) ((struct rdg_vertex *) ((V)->data))->datarefs 187 #define RDGV_HAS_MEM_WRITE(V) ((struct rdg_vertex *) ((V)->data))->has_mem_write 188 #define RDGV_HAS_MEM_READS(V) ((struct rdg_vertex *) ((V)->data))->has_mem_reads 189 #define RDG_STMT(RDG, I) RDGV_STMT (&(RDG->vertices[I])) 190 #define RDG_DATAREFS(RDG, I) RDGV_DATAREFS (&(RDG->vertices[I])) 191 #define RDG_MEM_WRITE_STMT(RDG, I) RDGV_HAS_MEM_WRITE (&(RDG->vertices[I])) 192 #define RDG_MEM_READS_STMT(RDG, I) RDGV_HAS_MEM_READS (&(RDG->vertices[I])) 193 194 /* Data dependence type. */ 195 196 enum rdg_dep_type 197 { 198 /* Read After Write (RAW). */ 199 flow_dd = 'f', 200 201 /* Control dependence (execute conditional on). */ 202 control_dd = 'c' 203 }; 204 205 /* Dependence information attached to an edge of the RDG. */ 206 207 struct rdg_edge 208 { 209 /* Type of the dependence. */ 210 enum rdg_dep_type type; 211 }; 212 213 #define RDGE_TYPE(E) ((struct rdg_edge *) ((E)->data))->type 214 215 /* Dump vertex I in RDG to FILE. */ 216 217 static void 218 dump_rdg_vertex (FILE *file, struct graph *rdg, int i) 219 { 220 struct vertex *v = &(rdg->vertices[i]); 221 struct graph_edge *e; 222 223 fprintf (file, "(vertex %d: (%s%s) (in:", i, 224 RDG_MEM_WRITE_STMT (rdg, i) ? "w" : "", 225 RDG_MEM_READS_STMT (rdg, i) ? "r" : ""); 226 227 if (v->pred) 228 for (e = v->pred; e; e = e->pred_next) 229 fprintf (file, " %d", e->src); 230 231 fprintf (file, ") (out:"); 232 233 if (v->succ) 234 for (e = v->succ; e; e = e->succ_next) 235 fprintf (file, " %d", e->dest); 236 237 fprintf (file, ")\n"); 238 print_gimple_stmt (file, RDGV_STMT (v), 0, TDF_VOPS|TDF_MEMSYMS); 239 fprintf (file, ")\n"); 240 } 241 242 /* Call dump_rdg_vertex on stderr. */ 243 244 DEBUG_FUNCTION void 245 debug_rdg_vertex (struct graph *rdg, int i) 246 { 247 dump_rdg_vertex (stderr, rdg, i); 248 } 249 250 /* Dump the reduced dependence graph RDG to FILE. */ 251 252 static void 253 dump_rdg (FILE *file, struct graph *rdg) 254 { 255 fprintf (file, "(rdg\n"); 256 for (int i = 0; i < rdg->n_vertices; i++) 257 dump_rdg_vertex (file, rdg, i); 258 fprintf (file, ")\n"); 259 } 260 261 /* Call dump_rdg on stderr. */ 262 263 DEBUG_FUNCTION void 264 debug_rdg (struct graph *rdg) 265 { 266 dump_rdg (stderr, rdg); 267 } 268 269 static void 270 dot_rdg_1 (FILE *file, struct graph *rdg) 271 { 272 int i; 273 pretty_printer buffer; 274 pp_needs_newline (&buffer) = false; 275 buffer.buffer->stream = file; 276 277 fprintf (file, "digraph RDG {\n"); 278 279 for (i = 0; i < rdg->n_vertices; i++) 280 { 281 struct vertex *v = &(rdg->vertices[i]); 282 struct graph_edge *e; 283 284 fprintf (file, "%d [label=\"[%d] ", i, i); 285 pp_gimple_stmt_1 (&buffer, RDGV_STMT (v), 0, TDF_SLIM); 286 pp_flush (&buffer); 287 fprintf (file, "\"]\n"); 288 289 /* Highlight reads from memory. */ 290 if (RDG_MEM_READS_STMT (rdg, i)) 291 fprintf (file, "%d [style=filled, fillcolor=green]\n", i); 292 293 /* Highlight stores to memory. */ 294 if (RDG_MEM_WRITE_STMT (rdg, i)) 295 fprintf (file, "%d [style=filled, fillcolor=red]\n", i); 296 297 if (v->succ) 298 for (e = v->succ; e; e = e->succ_next) 299 switch (RDGE_TYPE (e)) 300 { 301 case flow_dd: 302 /* These are the most common dependences: don't print these. */ 303 fprintf (file, "%d -> %d \n", i, e->dest); 304 break; 305 306 case control_dd: 307 fprintf (file, "%d -> %d [label=control] \n", i, e->dest); 308 break; 309 310 default: 311 gcc_unreachable (); 312 } 313 } 314 315 fprintf (file, "}\n\n"); 316 } 317 318 /* Display the Reduced Dependence Graph using dotty. */ 319 320 DEBUG_FUNCTION void 321 dot_rdg (struct graph *rdg) 322 { 323 /* When debugging, you may want to enable the following code. */ 324 #ifdef HAVE_POPEN 325 FILE *file = popen ("dot -Tx11", "w"); 326 if (!file) 327 return; 328 dot_rdg_1 (file, rdg); 329 fflush (file); 330 close (fileno (file)); 331 pclose (file); 332 #else 333 dot_rdg_1 (stderr, rdg); 334 #endif 335 } 336 337 /* Returns the index of STMT in RDG. */ 338 339 static int 340 rdg_vertex_for_stmt (struct graph *rdg ATTRIBUTE_UNUSED, gimple *stmt) 341 { 342 int index = gimple_uid (stmt); 343 gcc_checking_assert (index == -1 || RDG_STMT (rdg, index) == stmt); 344 return index; 345 } 346 347 /* Creates dependence edges in RDG for all the uses of DEF. IDEF is 348 the index of DEF in RDG. */ 349 350 static void 351 create_rdg_edges_for_scalar (struct graph *rdg, tree def, int idef) 352 { 353 use_operand_p imm_use_p; 354 imm_use_iterator iterator; 355 356 FOR_EACH_IMM_USE_FAST (imm_use_p, iterator, def) 357 { 358 struct graph_edge *e; 359 int use = rdg_vertex_for_stmt (rdg, USE_STMT (imm_use_p)); 360 361 if (use < 0) 362 continue; 363 364 e = add_edge (rdg, idef, use); 365 e->data = XNEW (struct rdg_edge); 366 RDGE_TYPE (e) = flow_dd; 367 } 368 } 369 370 /* Creates an edge for the control dependences of BB to the vertex V. */ 371 372 static void 373 create_edge_for_control_dependence (struct graph *rdg, basic_block bb, 374 int v, control_dependences *cd) 375 { 376 bitmap_iterator bi; 377 unsigned edge_n; 378 EXECUTE_IF_SET_IN_BITMAP (cd->get_edges_dependent_on (bb->index), 379 0, edge_n, bi) 380 { 381 basic_block cond_bb = cd->get_edge_src (edge_n); 382 gimple *stmt = last_stmt (cond_bb); 383 if (stmt && is_ctrl_stmt (stmt)) 384 { 385 struct graph_edge *e; 386 int c = rdg_vertex_for_stmt (rdg, stmt); 387 if (c < 0) 388 continue; 389 390 e = add_edge (rdg, c, v); 391 e->data = XNEW (struct rdg_edge); 392 RDGE_TYPE (e) = control_dd; 393 } 394 } 395 } 396 397 /* Creates the edges of the reduced dependence graph RDG. */ 398 399 static void 400 create_rdg_flow_edges (struct graph *rdg) 401 { 402 int i; 403 def_operand_p def_p; 404 ssa_op_iter iter; 405 406 for (i = 0; i < rdg->n_vertices; i++) 407 FOR_EACH_PHI_OR_STMT_DEF (def_p, RDG_STMT (rdg, i), 408 iter, SSA_OP_DEF) 409 create_rdg_edges_for_scalar (rdg, DEF_FROM_PTR (def_p), i); 410 } 411 412 /* Creates the edges of the reduced dependence graph RDG. */ 413 414 static void 415 create_rdg_cd_edges (struct graph *rdg, control_dependences *cd, loop_p loop) 416 { 417 int i; 418 419 for (i = 0; i < rdg->n_vertices; i++) 420 { 421 gimple *stmt = RDG_STMT (rdg, i); 422 if (gimple_code (stmt) == GIMPLE_PHI) 423 { 424 edge_iterator ei; 425 edge e; 426 FOR_EACH_EDGE (e, ei, gimple_bb (stmt)->preds) 427 if (flow_bb_inside_loop_p (loop, e->src)) 428 create_edge_for_control_dependence (rdg, e->src, i, cd); 429 } 430 else 431 create_edge_for_control_dependence (rdg, gimple_bb (stmt), i, cd); 432 } 433 } 434 435 /* Build the vertices of the reduced dependence graph RDG. Return false 436 if that failed. */ 437 438 static bool 439 create_rdg_vertices (struct graph *rdg, vec<gimple *> stmts, loop_p loop) 440 { 441 int i; 442 gimple *stmt; 443 444 FOR_EACH_VEC_ELT (stmts, i, stmt) 445 { 446 struct vertex *v = &(rdg->vertices[i]); 447 448 /* Record statement to vertex mapping. */ 449 gimple_set_uid (stmt, i); 450 451 v->data = XNEW (struct rdg_vertex); 452 RDGV_STMT (v) = stmt; 453 RDGV_DATAREFS (v).create (0); 454 RDGV_HAS_MEM_WRITE (v) = false; 455 RDGV_HAS_MEM_READS (v) = false; 456 if (gimple_code (stmt) == GIMPLE_PHI) 457 continue; 458 459 unsigned drp = datarefs_vec.length (); 460 if (!find_data_references_in_stmt (loop, stmt, &datarefs_vec)) 461 return false; 462 for (unsigned j = drp; j < datarefs_vec.length (); ++j) 463 { 464 data_reference_p dr = datarefs_vec[j]; 465 if (DR_IS_READ (dr)) 466 RDGV_HAS_MEM_READS (v) = true; 467 else 468 RDGV_HAS_MEM_WRITE (v) = true; 469 RDGV_DATAREFS (v).safe_push (dr); 470 } 471 } 472 return true; 473 } 474 475 /* Array mapping basic block's index to its topological order. */ 476 static int *bb_top_order_index; 477 /* And size of the array. */ 478 static int bb_top_order_index_size; 479 480 /* If X has a smaller topological sort number than Y, returns -1; 481 if greater, returns 1. */ 482 483 static int 484 bb_top_order_cmp (const void *x, const void *y) 485 { 486 basic_block bb1 = *(const basic_block *) x; 487 basic_block bb2 = *(const basic_block *) y; 488 489 gcc_assert (bb1->index < bb_top_order_index_size 490 && bb2->index < bb_top_order_index_size); 491 gcc_assert (bb1 == bb2 492 || bb_top_order_index[bb1->index] 493 != bb_top_order_index[bb2->index]); 494 495 return (bb_top_order_index[bb1->index] - bb_top_order_index[bb2->index]); 496 } 497 498 /* Initialize STMTS with all the statements of LOOP. We use topological 499 order to discover all statements. The order is important because 500 generate_loops_for_partition is using the same traversal for identifying 501 statements in loop copies. */ 502 503 static void 504 stmts_from_loop (struct loop *loop, vec<gimple *> *stmts) 505 { 506 unsigned int i; 507 basic_block *bbs = get_loop_body_in_custom_order (loop, bb_top_order_cmp); 508 509 for (i = 0; i < loop->num_nodes; i++) 510 { 511 basic_block bb = bbs[i]; 512 513 for (gphi_iterator bsi = gsi_start_phis (bb); !gsi_end_p (bsi); 514 gsi_next (&bsi)) 515 if (!virtual_operand_p (gimple_phi_result (bsi.phi ()))) 516 stmts->safe_push (bsi.phi ()); 517 518 for (gimple_stmt_iterator bsi = gsi_start_bb (bb); !gsi_end_p (bsi); 519 gsi_next (&bsi)) 520 { 521 gimple *stmt = gsi_stmt (bsi); 522 if (gimple_code (stmt) != GIMPLE_LABEL && !is_gimple_debug (stmt)) 523 stmts->safe_push (stmt); 524 } 525 } 526 527 free (bbs); 528 } 529 530 /* Free the reduced dependence graph RDG. */ 531 532 static void 533 free_rdg (struct graph *rdg) 534 { 535 int i; 536 537 for (i = 0; i < rdg->n_vertices; i++) 538 { 539 struct vertex *v = &(rdg->vertices[i]); 540 struct graph_edge *e; 541 542 for (e = v->succ; e; e = e->succ_next) 543 free (e->data); 544 545 if (v->data) 546 { 547 gimple_set_uid (RDGV_STMT (v), -1); 548 (RDGV_DATAREFS (v)).release (); 549 free (v->data); 550 } 551 } 552 553 free_graph (rdg); 554 } 555 556 /* Build the Reduced Dependence Graph (RDG) with one vertex per statement of 557 LOOP, and one edge per flow dependence or control dependence from control 558 dependence CD. During visiting each statement, data references are also 559 collected and recorded in global data DATAREFS_VEC. */ 560 561 static struct graph * 562 build_rdg (struct loop *loop, control_dependences *cd) 563 { 564 struct graph *rdg; 565 566 /* Create the RDG vertices from the stmts of the loop nest. */ 567 auto_vec<gimple *, 10> stmts; 568 stmts_from_loop (loop, &stmts); 569 rdg = new_graph (stmts.length ()); 570 if (!create_rdg_vertices (rdg, stmts, loop)) 571 { 572 free_rdg (rdg); 573 return NULL; 574 } 575 stmts.release (); 576 577 create_rdg_flow_edges (rdg); 578 if (cd) 579 create_rdg_cd_edges (rdg, cd, loop); 580 581 return rdg; 582 } 583 584 585 /* Kind of distributed loop. */ 586 enum partition_kind { 587 PKIND_NORMAL, 588 /* Partial memset stands for a paritition can be distributed into a loop 589 of memset calls, rather than a single memset call. It's handled just 590 like a normal parition, i.e, distributed as separate loop, no memset 591 call is generated. 592 593 Note: This is a hacking fix trying to distribute ZERO-ing stmt in a 594 loop nest as deep as possible. As a result, parloop achieves better 595 parallelization by parallelizing deeper loop nest. This hack should 596 be unnecessary and removed once distributed memset can be understood 597 and analyzed in data reference analysis. See PR82604 for more. */ 598 PKIND_PARTIAL_MEMSET, 599 PKIND_MEMSET, PKIND_MEMCPY, PKIND_MEMMOVE 600 }; 601 602 /* Type of distributed loop. */ 603 enum partition_type { 604 /* The distributed loop can be executed parallelly. */ 605 PTYPE_PARALLEL = 0, 606 /* The distributed loop has to be executed sequentially. */ 607 PTYPE_SEQUENTIAL 608 }; 609 610 /* Builtin info for loop distribution. */ 611 struct builtin_info 612 { 613 /* data-references a kind != PKIND_NORMAL partition is about. */ 614 data_reference_p dst_dr; 615 data_reference_p src_dr; 616 /* Base address and size of memory objects operated by the builtin. Note 617 both dest and source memory objects must have the same size. */ 618 tree dst_base; 619 tree src_base; 620 tree size; 621 /* Base and offset part of dst_base after stripping constant offset. This 622 is only used in memset builtin distribution for now. */ 623 tree dst_base_base; 624 unsigned HOST_WIDE_INT dst_base_offset; 625 }; 626 627 /* Partition for loop distribution. */ 628 struct partition 629 { 630 /* Statements of the partition. */ 631 bitmap stmts; 632 /* True if the partition defines variable which is used outside of loop. */ 633 bool reduction_p; 634 enum partition_kind kind; 635 enum partition_type type; 636 /* Data references in the partition. */ 637 bitmap datarefs; 638 /* Information of builtin parition. */ 639 struct builtin_info *builtin; 640 }; 641 642 643 /* Allocate and initialize a partition from BITMAP. */ 644 645 static partition * 646 partition_alloc (void) 647 { 648 partition *partition = XCNEW (struct partition); 649 partition->stmts = BITMAP_ALLOC (NULL); 650 partition->reduction_p = false; 651 partition->kind = PKIND_NORMAL; 652 partition->datarefs = BITMAP_ALLOC (NULL); 653 return partition; 654 } 655 656 /* Free PARTITION. */ 657 658 static void 659 partition_free (partition *partition) 660 { 661 BITMAP_FREE (partition->stmts); 662 BITMAP_FREE (partition->datarefs); 663 if (partition->builtin) 664 free (partition->builtin); 665 666 free (partition); 667 } 668 669 /* Returns true if the partition can be generated as a builtin. */ 670 671 static bool 672 partition_builtin_p (partition *partition) 673 { 674 return partition->kind > PKIND_PARTIAL_MEMSET; 675 } 676 677 /* Returns true if the partition contains a reduction. */ 678 679 static bool 680 partition_reduction_p (partition *partition) 681 { 682 return partition->reduction_p; 683 } 684 685 /* Partitions are fused because of different reasons. */ 686 enum fuse_type 687 { 688 FUSE_NON_BUILTIN = 0, 689 FUSE_REDUCTION = 1, 690 FUSE_SHARE_REF = 2, 691 FUSE_SAME_SCC = 3, 692 FUSE_FINALIZE = 4 693 }; 694 695 /* Description on different fusing reason. */ 696 static const char *fuse_message[] = { 697 "they are non-builtins", 698 "they have reductions", 699 "they have shared memory refs", 700 "they are in the same dependence scc", 701 "there is no point to distribute loop"}; 702 703 static void 704 update_type_for_merge (struct graph *, partition *, partition *); 705 706 /* Merge PARTITION into the partition DEST. RDG is the reduced dependence 707 graph and we update type for result partition if it is non-NULL. */ 708 709 static void 710 partition_merge_into (struct graph *rdg, partition *dest, 711 partition *partition, enum fuse_type ft) 712 { 713 if (dump_file && (dump_flags & TDF_DETAILS)) 714 { 715 fprintf (dump_file, "Fuse partitions because %s:\n", fuse_message[ft]); 716 fprintf (dump_file, " Part 1: "); 717 dump_bitmap (dump_file, dest->stmts); 718 fprintf (dump_file, " Part 2: "); 719 dump_bitmap (dump_file, partition->stmts); 720 } 721 722 dest->kind = PKIND_NORMAL; 723 if (dest->type == PTYPE_PARALLEL) 724 dest->type = partition->type; 725 726 bitmap_ior_into (dest->stmts, partition->stmts); 727 if (partition_reduction_p (partition)) 728 dest->reduction_p = true; 729 730 /* Further check if any data dependence prevents us from executing the 731 new partition parallelly. */ 732 if (dest->type == PTYPE_PARALLEL && rdg != NULL) 733 update_type_for_merge (rdg, dest, partition); 734 735 bitmap_ior_into (dest->datarefs, partition->datarefs); 736 } 737 738 739 /* Returns true when DEF is an SSA_NAME defined in LOOP and used after 740 the LOOP. */ 741 742 static bool 743 ssa_name_has_uses_outside_loop_p (tree def, loop_p loop) 744 { 745 imm_use_iterator imm_iter; 746 use_operand_p use_p; 747 748 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, def) 749 { 750 if (is_gimple_debug (USE_STMT (use_p))) 751 continue; 752 753 basic_block use_bb = gimple_bb (USE_STMT (use_p)); 754 if (!flow_bb_inside_loop_p (loop, use_bb)) 755 return true; 756 } 757 758 return false; 759 } 760 761 /* Returns true when STMT defines a scalar variable used after the 762 loop LOOP. */ 763 764 static bool 765 stmt_has_scalar_dependences_outside_loop (loop_p loop, gimple *stmt) 766 { 767 def_operand_p def_p; 768 ssa_op_iter op_iter; 769 770 if (gimple_code (stmt) == GIMPLE_PHI) 771 return ssa_name_has_uses_outside_loop_p (gimple_phi_result (stmt), loop); 772 773 FOR_EACH_SSA_DEF_OPERAND (def_p, stmt, op_iter, SSA_OP_DEF) 774 if (ssa_name_has_uses_outside_loop_p (DEF_FROM_PTR (def_p), loop)) 775 return true; 776 777 return false; 778 } 779 780 /* Return a copy of LOOP placed before LOOP. */ 781 782 static struct loop * 783 copy_loop_before (struct loop *loop) 784 { 785 struct loop *res; 786 edge preheader = loop_preheader_edge (loop); 787 788 initialize_original_copy_tables (); 789 res = slpeel_tree_duplicate_loop_to_edge_cfg (loop, NULL, preheader); 790 gcc_assert (res != NULL); 791 free_original_copy_tables (); 792 delete_update_ssa (); 793 794 return res; 795 } 796 797 /* Creates an empty basic block after LOOP. */ 798 799 static void 800 create_bb_after_loop (struct loop *loop) 801 { 802 edge exit = single_exit (loop); 803 804 if (!exit) 805 return; 806 807 split_edge (exit); 808 } 809 810 /* Generate code for PARTITION from the code in LOOP. The loop is 811 copied when COPY_P is true. All the statements not flagged in the 812 PARTITION bitmap are removed from the loop or from its copy. The 813 statements are indexed in sequence inside a basic block, and the 814 basic blocks of a loop are taken in dom order. */ 815 816 static void 817 generate_loops_for_partition (struct loop *loop, partition *partition, 818 bool copy_p) 819 { 820 unsigned i; 821 basic_block *bbs; 822 823 if (copy_p) 824 { 825 int orig_loop_num = loop->orig_loop_num; 826 loop = copy_loop_before (loop); 827 gcc_assert (loop != NULL); 828 loop->orig_loop_num = orig_loop_num; 829 create_preheader (loop, CP_SIMPLE_PREHEADERS); 830 create_bb_after_loop (loop); 831 } 832 else 833 { 834 /* Origin number is set to the new versioned loop's num. */ 835 gcc_assert (loop->orig_loop_num != loop->num); 836 } 837 838 /* Remove stmts not in the PARTITION bitmap. */ 839 bbs = get_loop_body_in_dom_order (loop); 840 841 if (MAY_HAVE_DEBUG_BIND_STMTS) 842 for (i = 0; i < loop->num_nodes; i++) 843 { 844 basic_block bb = bbs[i]; 845 846 for (gphi_iterator bsi = gsi_start_phis (bb); !gsi_end_p (bsi); 847 gsi_next (&bsi)) 848 { 849 gphi *phi = bsi.phi (); 850 if (!virtual_operand_p (gimple_phi_result (phi)) 851 && !bitmap_bit_p (partition->stmts, gimple_uid (phi))) 852 reset_debug_uses (phi); 853 } 854 855 for (gimple_stmt_iterator bsi = gsi_start_bb (bb); !gsi_end_p (bsi); gsi_next (&bsi)) 856 { 857 gimple *stmt = gsi_stmt (bsi); 858 if (gimple_code (stmt) != GIMPLE_LABEL 859 && !is_gimple_debug (stmt) 860 && !bitmap_bit_p (partition->stmts, gimple_uid (stmt))) 861 reset_debug_uses (stmt); 862 } 863 } 864 865 for (i = 0; i < loop->num_nodes; i++) 866 { 867 basic_block bb = bbs[i]; 868 edge inner_exit = NULL; 869 870 if (loop != bb->loop_father) 871 inner_exit = single_exit (bb->loop_father); 872 873 for (gphi_iterator bsi = gsi_start_phis (bb); !gsi_end_p (bsi);) 874 { 875 gphi *phi = bsi.phi (); 876 if (!virtual_operand_p (gimple_phi_result (phi)) 877 && !bitmap_bit_p (partition->stmts, gimple_uid (phi))) 878 remove_phi_node (&bsi, true); 879 else 880 gsi_next (&bsi); 881 } 882 883 for (gimple_stmt_iterator bsi = gsi_start_bb (bb); !gsi_end_p (bsi);) 884 { 885 gimple *stmt = gsi_stmt (bsi); 886 if (gimple_code (stmt) != GIMPLE_LABEL 887 && !is_gimple_debug (stmt) 888 && !bitmap_bit_p (partition->stmts, gimple_uid (stmt))) 889 { 890 /* In distribution of loop nest, if bb is inner loop's exit_bb, 891 we choose its exit edge/path in order to avoid generating 892 infinite loop. For all other cases, we choose an arbitrary 893 path through the empty CFG part that this unnecessary 894 control stmt controls. */ 895 if (gcond *cond_stmt = dyn_cast <gcond *> (stmt)) 896 { 897 if (inner_exit && inner_exit->flags & EDGE_TRUE_VALUE) 898 gimple_cond_make_true (cond_stmt); 899 else 900 gimple_cond_make_false (cond_stmt); 901 update_stmt (stmt); 902 } 903 else if (gimple_code (stmt) == GIMPLE_SWITCH) 904 { 905 gswitch *switch_stmt = as_a <gswitch *> (stmt); 906 gimple_switch_set_index 907 (switch_stmt, CASE_LOW (gimple_switch_label (switch_stmt, 1))); 908 update_stmt (stmt); 909 } 910 else 911 { 912 unlink_stmt_vdef (stmt); 913 gsi_remove (&bsi, true); 914 release_defs (stmt); 915 continue; 916 } 917 } 918 gsi_next (&bsi); 919 } 920 } 921 922 free (bbs); 923 } 924 925 /* If VAL memory representation contains the same value in all bytes, 926 return that value, otherwise return -1. 927 E.g. for 0x24242424 return 0x24, for IEEE double 928 747708026454360457216.0 return 0x44, etc. */ 929 930 static int 931 const_with_all_bytes_same (tree val) 932 { 933 unsigned char buf[64]; 934 int i, len; 935 936 if (integer_zerop (val) 937 || (TREE_CODE (val) == CONSTRUCTOR 938 && !TREE_CLOBBER_P (val) 939 && CONSTRUCTOR_NELTS (val) == 0)) 940 return 0; 941 942 if (real_zerop (val)) 943 { 944 /* Only return 0 for +0.0, not for -0.0, which doesn't have 945 an all bytes same memory representation. Don't transform 946 -0.0 stores into +0.0 even for !HONOR_SIGNED_ZEROS. */ 947 switch (TREE_CODE (val)) 948 { 949 case REAL_CST: 950 if (!real_isneg (TREE_REAL_CST_PTR (val))) 951 return 0; 952 break; 953 case COMPLEX_CST: 954 if (!const_with_all_bytes_same (TREE_REALPART (val)) 955 && !const_with_all_bytes_same (TREE_IMAGPART (val))) 956 return 0; 957 break; 958 case VECTOR_CST: 959 { 960 unsigned int count = vector_cst_encoded_nelts (val); 961 unsigned int j; 962 for (j = 0; j < count; ++j) 963 if (const_with_all_bytes_same (VECTOR_CST_ENCODED_ELT (val, j))) 964 break; 965 if (j == count) 966 return 0; 967 break; 968 } 969 default: 970 break; 971 } 972 } 973 974 if (CHAR_BIT != 8 || BITS_PER_UNIT != 8) 975 return -1; 976 977 len = native_encode_expr (val, buf, sizeof (buf)); 978 if (len == 0) 979 return -1; 980 for (i = 1; i < len; i++) 981 if (buf[i] != buf[0]) 982 return -1; 983 return buf[0]; 984 } 985 986 /* Generate a call to memset for PARTITION in LOOP. */ 987 988 static void 989 generate_memset_builtin (struct loop *loop, partition *partition) 990 { 991 gimple_stmt_iterator gsi; 992 tree mem, fn, nb_bytes; 993 tree val; 994 struct builtin_info *builtin = partition->builtin; 995 gimple *fn_call; 996 997 /* The new statements will be placed before LOOP. */ 998 gsi = gsi_last_bb (loop_preheader_edge (loop)->src); 999 1000 nb_bytes = builtin->size; 1001 nb_bytes = force_gimple_operand_gsi (&gsi, nb_bytes, true, NULL_TREE, 1002 false, GSI_CONTINUE_LINKING); 1003 mem = builtin->dst_base; 1004 mem = force_gimple_operand_gsi (&gsi, mem, true, NULL_TREE, 1005 false, GSI_CONTINUE_LINKING); 1006 1007 /* This exactly matches the pattern recognition in classify_partition. */ 1008 val = gimple_assign_rhs1 (DR_STMT (builtin->dst_dr)); 1009 /* Handle constants like 0x15151515 and similarly 1010 floating point constants etc. where all bytes are the same. */ 1011 int bytev = const_with_all_bytes_same (val); 1012 if (bytev != -1) 1013 val = build_int_cst (integer_type_node, bytev); 1014 else if (TREE_CODE (val) == INTEGER_CST) 1015 val = fold_convert (integer_type_node, val); 1016 else if (!useless_type_conversion_p (integer_type_node, TREE_TYPE (val))) 1017 { 1018 tree tem = make_ssa_name (integer_type_node); 1019 gimple *cstmt = gimple_build_assign (tem, NOP_EXPR, val); 1020 gsi_insert_after (&gsi, cstmt, GSI_CONTINUE_LINKING); 1021 val = tem; 1022 } 1023 1024 fn = build_fold_addr_expr (builtin_decl_implicit (BUILT_IN_MEMSET)); 1025 fn_call = gimple_build_call (fn, 3, mem, val, nb_bytes); 1026 gsi_insert_after (&gsi, fn_call, GSI_CONTINUE_LINKING); 1027 1028 if (dump_file && (dump_flags & TDF_DETAILS)) 1029 { 1030 fprintf (dump_file, "generated memset"); 1031 if (bytev == 0) 1032 fprintf (dump_file, " zero\n"); 1033 else 1034 fprintf (dump_file, "\n"); 1035 } 1036 } 1037 1038 /* Generate a call to memcpy for PARTITION in LOOP. */ 1039 1040 static void 1041 generate_memcpy_builtin (struct loop *loop, partition *partition) 1042 { 1043 gimple_stmt_iterator gsi; 1044 gimple *fn_call; 1045 tree dest, src, fn, nb_bytes; 1046 enum built_in_function kind; 1047 struct builtin_info *builtin = partition->builtin; 1048 1049 /* The new statements will be placed before LOOP. */ 1050 gsi = gsi_last_bb (loop_preheader_edge (loop)->src); 1051 1052 nb_bytes = builtin->size; 1053 nb_bytes = force_gimple_operand_gsi (&gsi, nb_bytes, true, NULL_TREE, 1054 false, GSI_CONTINUE_LINKING); 1055 dest = builtin->dst_base; 1056 src = builtin->src_base; 1057 if (partition->kind == PKIND_MEMCPY 1058 || ! ptr_derefs_may_alias_p (dest, src)) 1059 kind = BUILT_IN_MEMCPY; 1060 else 1061 kind = BUILT_IN_MEMMOVE; 1062 1063 dest = force_gimple_operand_gsi (&gsi, dest, true, NULL_TREE, 1064 false, GSI_CONTINUE_LINKING); 1065 src = force_gimple_operand_gsi (&gsi, src, true, NULL_TREE, 1066 false, GSI_CONTINUE_LINKING); 1067 fn = build_fold_addr_expr (builtin_decl_implicit (kind)); 1068 fn_call = gimple_build_call (fn, 3, dest, src, nb_bytes); 1069 gsi_insert_after (&gsi, fn_call, GSI_CONTINUE_LINKING); 1070 1071 if (dump_file && (dump_flags & TDF_DETAILS)) 1072 { 1073 if (kind == BUILT_IN_MEMCPY) 1074 fprintf (dump_file, "generated memcpy\n"); 1075 else 1076 fprintf (dump_file, "generated memmove\n"); 1077 } 1078 } 1079 1080 /* Remove and destroy the loop LOOP. */ 1081 1082 static void 1083 destroy_loop (struct loop *loop) 1084 { 1085 unsigned nbbs = loop->num_nodes; 1086 edge exit = single_exit (loop); 1087 basic_block src = loop_preheader_edge (loop)->src, dest = exit->dest; 1088 basic_block *bbs; 1089 unsigned i; 1090 1091 bbs = get_loop_body_in_dom_order (loop); 1092 1093 redirect_edge_pred (exit, src); 1094 exit->flags &= ~(EDGE_TRUE_VALUE|EDGE_FALSE_VALUE); 1095 exit->flags |= EDGE_FALLTHRU; 1096 cancel_loop_tree (loop); 1097 rescan_loop_exit (exit, false, true); 1098 1099 i = nbbs; 1100 do 1101 { 1102 /* We have made sure to not leave any dangling uses of SSA 1103 names defined in the loop. With the exception of virtuals. 1104 Make sure we replace all uses of virtual defs that will remain 1105 outside of the loop with the bare symbol as delete_basic_block 1106 will release them. */ 1107 --i; 1108 for (gphi_iterator gsi = gsi_start_phis (bbs[i]); !gsi_end_p (gsi); 1109 gsi_next (&gsi)) 1110 { 1111 gphi *phi = gsi.phi (); 1112 if (virtual_operand_p (gimple_phi_result (phi))) 1113 mark_virtual_phi_result_for_renaming (phi); 1114 } 1115 for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]); !gsi_end_p (gsi); 1116 gsi_next (&gsi)) 1117 { 1118 gimple *stmt = gsi_stmt (gsi); 1119 tree vdef = gimple_vdef (stmt); 1120 if (vdef && TREE_CODE (vdef) == SSA_NAME) 1121 mark_virtual_operand_for_renaming (vdef); 1122 } 1123 delete_basic_block (bbs[i]); 1124 } 1125 while (i != 0); 1126 1127 free (bbs); 1128 1129 set_immediate_dominator (CDI_DOMINATORS, dest, 1130 recompute_dominator (CDI_DOMINATORS, dest)); 1131 } 1132 1133 /* Generates code for PARTITION. Return whether LOOP needs to be destroyed. */ 1134 1135 static bool 1136 generate_code_for_partition (struct loop *loop, 1137 partition *partition, bool copy_p) 1138 { 1139 switch (partition->kind) 1140 { 1141 case PKIND_NORMAL: 1142 case PKIND_PARTIAL_MEMSET: 1143 /* Reductions all have to be in the last partition. */ 1144 gcc_assert (!partition_reduction_p (partition) 1145 || !copy_p); 1146 generate_loops_for_partition (loop, partition, copy_p); 1147 return false; 1148 1149 case PKIND_MEMSET: 1150 generate_memset_builtin (loop, partition); 1151 break; 1152 1153 case PKIND_MEMCPY: 1154 case PKIND_MEMMOVE: 1155 generate_memcpy_builtin (loop, partition); 1156 break; 1157 1158 default: 1159 gcc_unreachable (); 1160 } 1161 1162 /* Common tail for partitions we turn into a call. If this was the last 1163 partition for which we generate code, we have to destroy the loop. */ 1164 if (!copy_p) 1165 return true; 1166 return false; 1167 } 1168 1169 /* Return data dependence relation for data references A and B. The two 1170 data references must be in lexicographic order wrto reduced dependence 1171 graph RDG. We firstly try to find ddr from global ddr hash table. If 1172 it doesn't exist, compute the ddr and cache it. */ 1173 1174 static data_dependence_relation * 1175 get_data_dependence (struct graph *rdg, data_reference_p a, data_reference_p b) 1176 { 1177 struct data_dependence_relation ent, **slot; 1178 struct data_dependence_relation *ddr; 1179 1180 gcc_assert (DR_IS_WRITE (a) || DR_IS_WRITE (b)); 1181 gcc_assert (rdg_vertex_for_stmt (rdg, DR_STMT (a)) 1182 <= rdg_vertex_for_stmt (rdg, DR_STMT (b))); 1183 ent.a = a; 1184 ent.b = b; 1185 slot = ddrs_table->find_slot (&ent, INSERT); 1186 if (*slot == NULL) 1187 { 1188 ddr = initialize_data_dependence_relation (a, b, loop_nest); 1189 compute_affine_dependence (ddr, loop_nest[0]); 1190 *slot = ddr; 1191 } 1192 1193 return *slot; 1194 } 1195 1196 /* In reduced dependence graph RDG for loop distribution, return true if 1197 dependence between references DR1 and DR2 leads to a dependence cycle 1198 and such dependence cycle can't be resolved by runtime alias check. */ 1199 1200 static bool 1201 data_dep_in_cycle_p (struct graph *rdg, 1202 data_reference_p dr1, data_reference_p dr2) 1203 { 1204 struct data_dependence_relation *ddr; 1205 1206 /* Re-shuffle data-refs to be in topological order. */ 1207 if (rdg_vertex_for_stmt (rdg, DR_STMT (dr1)) 1208 > rdg_vertex_for_stmt (rdg, DR_STMT (dr2))) 1209 std::swap (dr1, dr2); 1210 1211 ddr = get_data_dependence (rdg, dr1, dr2); 1212 1213 /* In case of no data dependence. */ 1214 if (DDR_ARE_DEPENDENT (ddr) == chrec_known) 1215 return false; 1216 /* For unknown data dependence or known data dependence which can't be 1217 expressed in classic distance vector, we check if it can be resolved 1218 by runtime alias check. If yes, we still consider data dependence 1219 as won't introduce data dependence cycle. */ 1220 else if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know 1221 || DDR_NUM_DIST_VECTS (ddr) == 0) 1222 return !runtime_alias_check_p (ddr, NULL, true); 1223 else if (DDR_NUM_DIST_VECTS (ddr) > 1) 1224 return true; 1225 else if (DDR_REVERSED_P (ddr) 1226 || lambda_vector_zerop (DDR_DIST_VECT (ddr, 0), 1)) 1227 return false; 1228 1229 return true; 1230 } 1231 1232 /* Given reduced dependence graph RDG, PARTITION1 and PARTITION2, update 1233 PARTITION1's type after merging PARTITION2 into PARTITION1. */ 1234 1235 static void 1236 update_type_for_merge (struct graph *rdg, 1237 partition *partition1, partition *partition2) 1238 { 1239 unsigned i, j; 1240 bitmap_iterator bi, bj; 1241 data_reference_p dr1, dr2; 1242 1243 EXECUTE_IF_SET_IN_BITMAP (partition1->datarefs, 0, i, bi) 1244 { 1245 unsigned start = (partition1 == partition2) ? i + 1 : 0; 1246 1247 dr1 = datarefs_vec[i]; 1248 EXECUTE_IF_SET_IN_BITMAP (partition2->datarefs, start, j, bj) 1249 { 1250 dr2 = datarefs_vec[j]; 1251 if (DR_IS_READ (dr1) && DR_IS_READ (dr2)) 1252 continue; 1253 1254 /* Partition can only be executed sequentially if there is any 1255 data dependence cycle. */ 1256 if (data_dep_in_cycle_p (rdg, dr1, dr2)) 1257 { 1258 partition1->type = PTYPE_SEQUENTIAL; 1259 return; 1260 } 1261 } 1262 } 1263 } 1264 1265 /* Returns a partition with all the statements needed for computing 1266 the vertex V of the RDG, also including the loop exit conditions. */ 1267 1268 static partition * 1269 build_rdg_partition_for_vertex (struct graph *rdg, int v) 1270 { 1271 partition *partition = partition_alloc (); 1272 auto_vec<int, 3> nodes; 1273 unsigned i, j; 1274 int x; 1275 data_reference_p dr; 1276 1277 graphds_dfs (rdg, &v, 1, &nodes, false, NULL); 1278 1279 FOR_EACH_VEC_ELT (nodes, i, x) 1280 { 1281 bitmap_set_bit (partition->stmts, x); 1282 1283 for (j = 0; RDG_DATAREFS (rdg, x).iterate (j, &dr); ++j) 1284 { 1285 unsigned idx = (unsigned) DR_INDEX (dr); 1286 gcc_assert (idx < datarefs_vec.length ()); 1287 1288 /* Partition can only be executed sequentially if there is any 1289 unknown data reference. */ 1290 if (!DR_BASE_ADDRESS (dr) || !DR_OFFSET (dr) 1291 || !DR_INIT (dr) || !DR_STEP (dr)) 1292 partition->type = PTYPE_SEQUENTIAL; 1293 1294 bitmap_set_bit (partition->datarefs, idx); 1295 } 1296 } 1297 1298 if (partition->type == PTYPE_SEQUENTIAL) 1299 return partition; 1300 1301 /* Further check if any data dependence prevents us from executing the 1302 partition parallelly. */ 1303 update_type_for_merge (rdg, partition, partition); 1304 1305 return partition; 1306 } 1307 1308 /* Given PARTITION of LOOP and RDG, record single load/store data references 1309 for builtin partition in SRC_DR/DST_DR, return false if there is no such 1310 data references. */ 1311 1312 static bool 1313 find_single_drs (struct loop *loop, struct graph *rdg, partition *partition, 1314 data_reference_p *dst_dr, data_reference_p *src_dr) 1315 { 1316 unsigned i; 1317 data_reference_p single_ld = NULL, single_st = NULL; 1318 bitmap_iterator bi; 1319 1320 EXECUTE_IF_SET_IN_BITMAP (partition->stmts, 0, i, bi) 1321 { 1322 gimple *stmt = RDG_STMT (rdg, i); 1323 data_reference_p dr; 1324 1325 if (gimple_code (stmt) == GIMPLE_PHI) 1326 continue; 1327 1328 /* Any scalar stmts are ok. */ 1329 if (!gimple_vuse (stmt)) 1330 continue; 1331 1332 /* Otherwise just regular loads/stores. */ 1333 if (!gimple_assign_single_p (stmt)) 1334 return false; 1335 1336 /* But exactly one store and/or load. */ 1337 for (unsigned j = 0; RDG_DATAREFS (rdg, i).iterate (j, &dr); ++j) 1338 { 1339 tree type = TREE_TYPE (DR_REF (dr)); 1340 1341 /* The memset, memcpy and memmove library calls are only 1342 able to deal with generic address space. */ 1343 if (!ADDR_SPACE_GENERIC_P (TYPE_ADDR_SPACE (type))) 1344 return false; 1345 1346 if (DR_IS_READ (dr)) 1347 { 1348 if (single_ld != NULL) 1349 return false; 1350 single_ld = dr; 1351 } 1352 else 1353 { 1354 if (single_st != NULL) 1355 return false; 1356 single_st = dr; 1357 } 1358 } 1359 } 1360 1361 if (!single_st) 1362 return false; 1363 1364 /* Bail out if this is a bitfield memory reference. */ 1365 if (TREE_CODE (DR_REF (single_st)) == COMPONENT_REF 1366 && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (single_st), 1))) 1367 return false; 1368 1369 /* Data reference must be executed exactly once per iteration of each 1370 loop in the loop nest. We only need to check dominance information 1371 against the outermost one in a perfect loop nest because a bb can't 1372 dominate outermost loop's latch without dominating inner loop's. */ 1373 basic_block bb_st = gimple_bb (DR_STMT (single_st)); 1374 if (!dominated_by_p (CDI_DOMINATORS, loop->latch, bb_st)) 1375 return false; 1376 1377 if (single_ld) 1378 { 1379 gimple *store = DR_STMT (single_st), *load = DR_STMT (single_ld); 1380 /* Direct aggregate copy or via an SSA name temporary. */ 1381 if (load != store 1382 && gimple_assign_lhs (load) != gimple_assign_rhs1 (store)) 1383 return false; 1384 1385 /* Bail out if this is a bitfield memory reference. */ 1386 if (TREE_CODE (DR_REF (single_ld)) == COMPONENT_REF 1387 && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (single_ld), 1))) 1388 return false; 1389 1390 /* Load and store must be in the same loop nest. */ 1391 basic_block bb_ld = gimple_bb (DR_STMT (single_ld)); 1392 if (bb_st->loop_father != bb_ld->loop_father) 1393 return false; 1394 1395 /* Data reference must be executed exactly once per iteration. 1396 Same as single_st, we only need to check against the outermost 1397 loop. */ 1398 if (!dominated_by_p (CDI_DOMINATORS, loop->latch, bb_ld)) 1399 return false; 1400 1401 edge e = single_exit (bb_st->loop_father); 1402 bool dom_ld = dominated_by_p (CDI_DOMINATORS, e->src, bb_ld); 1403 bool dom_st = dominated_by_p (CDI_DOMINATORS, e->src, bb_st); 1404 if (dom_ld != dom_st) 1405 return false; 1406 } 1407 1408 *src_dr = single_ld; 1409 *dst_dr = single_st; 1410 return true; 1411 } 1412 1413 /* Given data reference DR in LOOP_NEST, this function checks the enclosing 1414 loops from inner to outer to see if loop's step equals to access size at 1415 each level of loop. Return 2 if we can prove this at all level loops; 1416 record access base and size in BASE and SIZE; save loop's step at each 1417 level of loop in STEPS if it is not null. For example: 1418 1419 int arr[100][100][100]; 1420 for (i = 0; i < 100; i++) ;steps[2] = 40000 1421 for (j = 100; j > 0; j--) ;steps[1] = -400 1422 for (k = 0; k < 100; k++) ;steps[0] = 4 1423 arr[i][j - 1][k] = 0; ;base = &arr, size = 4000000 1424 1425 Return 1 if we can prove the equality at the innermost loop, but not all 1426 level loops. In this case, no information is recorded. 1427 1428 Return 0 if no equality can be proven at any level loops. */ 1429 1430 static int 1431 compute_access_range (loop_p loop_nest, data_reference_p dr, tree *base, 1432 tree *size, vec<tree> *steps = NULL) 1433 { 1434 location_t loc = gimple_location (DR_STMT (dr)); 1435 basic_block bb = gimple_bb (DR_STMT (dr)); 1436 struct loop *loop = bb->loop_father; 1437 tree ref = DR_REF (dr); 1438 tree access_base = build_fold_addr_expr (ref); 1439 tree access_size = TYPE_SIZE_UNIT (TREE_TYPE (ref)); 1440 int res = 0; 1441 1442 do { 1443 tree scev_fn = analyze_scalar_evolution (loop, access_base); 1444 if (TREE_CODE (scev_fn) != POLYNOMIAL_CHREC) 1445 return res; 1446 1447 access_base = CHREC_LEFT (scev_fn); 1448 if (tree_contains_chrecs (access_base, NULL)) 1449 return res; 1450 1451 tree scev_step = CHREC_RIGHT (scev_fn); 1452 /* Only support constant steps. */ 1453 if (TREE_CODE (scev_step) != INTEGER_CST) 1454 return res; 1455 1456 enum ev_direction access_dir = scev_direction (scev_fn); 1457 if (access_dir == EV_DIR_UNKNOWN) 1458 return res; 1459 1460 if (steps != NULL) 1461 steps->safe_push (scev_step); 1462 1463 scev_step = fold_convert_loc (loc, sizetype, scev_step); 1464 /* Compute absolute value of scev step. */ 1465 if (access_dir == EV_DIR_DECREASES) 1466 scev_step = fold_build1_loc (loc, NEGATE_EXPR, sizetype, scev_step); 1467 1468 /* At each level of loop, scev step must equal to access size. In other 1469 words, DR must access consecutive memory between loop iterations. */ 1470 if (!operand_equal_p (scev_step, access_size, 0)) 1471 return res; 1472 1473 /* Access stride can be computed for data reference at least for the 1474 innermost loop. */ 1475 res = 1; 1476 1477 /* Compute DR's execution times in loop. */ 1478 tree niters = number_of_latch_executions (loop); 1479 niters = fold_convert_loc (loc, sizetype, niters); 1480 if (dominated_by_p (CDI_DOMINATORS, single_exit (loop)->src, bb)) 1481 niters = size_binop_loc (loc, PLUS_EXPR, niters, size_one_node); 1482 1483 /* Compute DR's overall access size in loop. */ 1484 access_size = fold_build2_loc (loc, MULT_EXPR, sizetype, 1485 niters, scev_step); 1486 /* Adjust base address in case of negative step. */ 1487 if (access_dir == EV_DIR_DECREASES) 1488 { 1489 tree adj = fold_build2_loc (loc, MINUS_EXPR, sizetype, 1490 scev_step, access_size); 1491 access_base = fold_build_pointer_plus_loc (loc, access_base, adj); 1492 } 1493 } while (loop != loop_nest && (loop = loop_outer (loop)) != NULL); 1494 1495 *base = access_base; 1496 *size = access_size; 1497 /* Access stride can be computed for data reference at each level loop. */ 1498 return 2; 1499 } 1500 1501 /* Allocate and return builtin struct. Record information like DST_DR, 1502 SRC_DR, DST_BASE, SRC_BASE and SIZE in the allocated struct. */ 1503 1504 static struct builtin_info * 1505 alloc_builtin (data_reference_p dst_dr, data_reference_p src_dr, 1506 tree dst_base, tree src_base, tree size) 1507 { 1508 struct builtin_info *builtin = XNEW (struct builtin_info); 1509 builtin->dst_dr = dst_dr; 1510 builtin->src_dr = src_dr; 1511 builtin->dst_base = dst_base; 1512 builtin->src_base = src_base; 1513 builtin->size = size; 1514 return builtin; 1515 } 1516 1517 /* Given data reference DR in loop nest LOOP, classify if it forms builtin 1518 memset call. */ 1519 1520 static void 1521 classify_builtin_st (loop_p loop, partition *partition, data_reference_p dr) 1522 { 1523 gimple *stmt = DR_STMT (dr); 1524 tree base, size, rhs = gimple_assign_rhs1 (stmt); 1525 1526 if (const_with_all_bytes_same (rhs) == -1 1527 && (!INTEGRAL_TYPE_P (TREE_TYPE (rhs)) 1528 || (TYPE_MODE (TREE_TYPE (rhs)) 1529 != TYPE_MODE (unsigned_char_type_node)))) 1530 return; 1531 1532 if (TREE_CODE (rhs) == SSA_NAME 1533 && !SSA_NAME_IS_DEFAULT_DEF (rhs) 1534 && flow_bb_inside_loop_p (loop, gimple_bb (SSA_NAME_DEF_STMT (rhs)))) 1535 return; 1536 1537 int res = compute_access_range (loop, dr, &base, &size); 1538 if (res == 0) 1539 return; 1540 if (res == 1) 1541 { 1542 partition->kind = PKIND_PARTIAL_MEMSET; 1543 return; 1544 } 1545 1546 poly_uint64 base_offset; 1547 unsigned HOST_WIDE_INT const_base_offset; 1548 tree base_base = strip_offset (base, &base_offset); 1549 if (!base_offset.is_constant (&const_base_offset)) 1550 return; 1551 1552 struct builtin_info *builtin; 1553 builtin = alloc_builtin (dr, NULL, base, NULL_TREE, size); 1554 builtin->dst_base_base = base_base; 1555 builtin->dst_base_offset = const_base_offset; 1556 partition->builtin = builtin; 1557 partition->kind = PKIND_MEMSET; 1558 } 1559 1560 /* Given data references DST_DR and SRC_DR in loop nest LOOP and RDG, classify 1561 if it forms builtin memcpy or memmove call. */ 1562 1563 static void 1564 classify_builtin_ldst (loop_p loop, struct graph *rdg, partition *partition, 1565 data_reference_p dst_dr, data_reference_p src_dr) 1566 { 1567 tree base, size, src_base, src_size; 1568 auto_vec<tree> dst_steps, src_steps; 1569 1570 /* Compute access range of both load and store. */ 1571 int res = compute_access_range (loop, dst_dr, &base, &size, &dst_steps); 1572 if (res != 2) 1573 return; 1574 res = compute_access_range (loop, src_dr, &src_base, &src_size, &src_steps); 1575 if (res != 2) 1576 return; 1577 1578 /* They much have the same access size. */ 1579 if (!operand_equal_p (size, src_size, 0)) 1580 return; 1581 1582 /* Load and store in loop nest must access memory in the same way, i.e, 1583 their must have the same steps in each loop of the nest. */ 1584 if (dst_steps.length () != src_steps.length ()) 1585 return; 1586 for (unsigned i = 0; i < dst_steps.length (); ++i) 1587 if (!operand_equal_p (dst_steps[i], src_steps[i], 0)) 1588 return; 1589 1590 /* Now check that if there is a dependence. */ 1591 ddr_p ddr = get_data_dependence (rdg, src_dr, dst_dr); 1592 1593 /* Classify as memcpy if no dependence between load and store. */ 1594 if (DDR_ARE_DEPENDENT (ddr) == chrec_known) 1595 { 1596 partition->builtin = alloc_builtin (dst_dr, src_dr, base, src_base, size); 1597 partition->kind = PKIND_MEMCPY; 1598 return; 1599 } 1600 1601 /* Can't do memmove in case of unknown dependence or dependence without 1602 classical distance vector. */ 1603 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know 1604 || DDR_NUM_DIST_VECTS (ddr) == 0) 1605 return; 1606 1607 unsigned i; 1608 lambda_vector dist_v; 1609 int num_lev = (DDR_LOOP_NEST (ddr)).length (); 1610 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v) 1611 { 1612 unsigned dep_lev = dependence_level (dist_v, num_lev); 1613 /* Can't do memmove if load depends on store. */ 1614 if (dep_lev > 0 && dist_v[dep_lev - 1] > 0 && !DDR_REVERSED_P (ddr)) 1615 return; 1616 } 1617 1618 partition->builtin = alloc_builtin (dst_dr, src_dr, base, src_base, size); 1619 partition->kind = PKIND_MEMMOVE; 1620 return; 1621 } 1622 1623 /* Classifies the builtin kind we can generate for PARTITION of RDG and LOOP. 1624 For the moment we detect memset, memcpy and memmove patterns. Bitmap 1625 STMT_IN_ALL_PARTITIONS contains statements belonging to all partitions. */ 1626 1627 static void 1628 classify_partition (loop_p loop, struct graph *rdg, partition *partition, 1629 bitmap stmt_in_all_partitions) 1630 { 1631 bitmap_iterator bi; 1632 unsigned i; 1633 data_reference_p single_ld = NULL, single_st = NULL; 1634 bool volatiles_p = false, has_reduction = false; 1635 1636 EXECUTE_IF_SET_IN_BITMAP (partition->stmts, 0, i, bi) 1637 { 1638 gimple *stmt = RDG_STMT (rdg, i); 1639 1640 if (gimple_has_volatile_ops (stmt)) 1641 volatiles_p = true; 1642 1643 /* If the stmt is not included by all partitions and there is uses 1644 outside of the loop, then mark the partition as reduction. */ 1645 if (stmt_has_scalar_dependences_outside_loop (loop, stmt)) 1646 { 1647 /* Due to limitation in the transform phase we have to fuse all 1648 reduction partitions. As a result, this could cancel valid 1649 loop distribution especially for loop that induction variable 1650 is used outside of loop. To workaround this issue, we skip 1651 marking partition as reudction if the reduction stmt belongs 1652 to all partitions. In such case, reduction will be computed 1653 correctly no matter how partitions are fused/distributed. */ 1654 if (!bitmap_bit_p (stmt_in_all_partitions, i)) 1655 { 1656 partition->reduction_p = true; 1657 return; 1658 } 1659 has_reduction = true; 1660 } 1661 } 1662 1663 /* Perform general partition disqualification for builtins. */ 1664 if (volatiles_p 1665 /* Simple workaround to prevent classifying the partition as builtin 1666 if it contains any use outside of loop. */ 1667 || has_reduction 1668 || !flag_tree_loop_distribute_patterns) 1669 return; 1670 1671 /* Find single load/store data references for builtin partition. */ 1672 if (!find_single_drs (loop, rdg, partition, &single_st, &single_ld)) 1673 return; 1674 1675 /* Classify the builtin kind. */ 1676 if (single_ld == NULL) 1677 classify_builtin_st (loop, partition, single_st); 1678 else 1679 classify_builtin_ldst (loop, rdg, partition, single_st, single_ld); 1680 } 1681 1682 /* Returns true when PARTITION1 and PARTITION2 access the same memory 1683 object in RDG. */ 1684 1685 static bool 1686 share_memory_accesses (struct graph *rdg, 1687 partition *partition1, partition *partition2) 1688 { 1689 unsigned i, j; 1690 bitmap_iterator bi, bj; 1691 data_reference_p dr1, dr2; 1692 1693 /* First check whether in the intersection of the two partitions are 1694 any loads or stores. Common loads are the situation that happens 1695 most often. */ 1696 EXECUTE_IF_AND_IN_BITMAP (partition1->stmts, partition2->stmts, 0, i, bi) 1697 if (RDG_MEM_WRITE_STMT (rdg, i) 1698 || RDG_MEM_READS_STMT (rdg, i)) 1699 return true; 1700 1701 /* Then check whether the two partitions access the same memory object. */ 1702 EXECUTE_IF_SET_IN_BITMAP (partition1->datarefs, 0, i, bi) 1703 { 1704 dr1 = datarefs_vec[i]; 1705 1706 if (!DR_BASE_ADDRESS (dr1) 1707 || !DR_OFFSET (dr1) || !DR_INIT (dr1) || !DR_STEP (dr1)) 1708 continue; 1709 1710 EXECUTE_IF_SET_IN_BITMAP (partition2->datarefs, 0, j, bj) 1711 { 1712 dr2 = datarefs_vec[j]; 1713 1714 if (!DR_BASE_ADDRESS (dr2) 1715 || !DR_OFFSET (dr2) || !DR_INIT (dr2) || !DR_STEP (dr2)) 1716 continue; 1717 1718 if (operand_equal_p (DR_BASE_ADDRESS (dr1), DR_BASE_ADDRESS (dr2), 0) 1719 && operand_equal_p (DR_OFFSET (dr1), DR_OFFSET (dr2), 0) 1720 && operand_equal_p (DR_INIT (dr1), DR_INIT (dr2), 0) 1721 && operand_equal_p (DR_STEP (dr1), DR_STEP (dr2), 0)) 1722 return true; 1723 } 1724 } 1725 1726 return false; 1727 } 1728 1729 /* For each seed statement in STARTING_STMTS, this function builds 1730 partition for it by adding depended statements according to RDG. 1731 All partitions are recorded in PARTITIONS. */ 1732 1733 static void 1734 rdg_build_partitions (struct graph *rdg, 1735 vec<gimple *> starting_stmts, 1736 vec<partition *> *partitions) 1737 { 1738 auto_bitmap processed; 1739 int i; 1740 gimple *stmt; 1741 1742 FOR_EACH_VEC_ELT (starting_stmts, i, stmt) 1743 { 1744 int v = rdg_vertex_for_stmt (rdg, stmt); 1745 1746 if (dump_file && (dump_flags & TDF_DETAILS)) 1747 fprintf (dump_file, 1748 "ldist asked to generate code for vertex %d\n", v); 1749 1750 /* If the vertex is already contained in another partition so 1751 is the partition rooted at it. */ 1752 if (bitmap_bit_p (processed, v)) 1753 continue; 1754 1755 partition *partition = build_rdg_partition_for_vertex (rdg, v); 1756 bitmap_ior_into (processed, partition->stmts); 1757 1758 if (dump_file && (dump_flags & TDF_DETAILS)) 1759 { 1760 fprintf (dump_file, "ldist creates useful %s partition:\n", 1761 partition->type == PTYPE_PARALLEL ? "parallel" : "sequent"); 1762 bitmap_print (dump_file, partition->stmts, " ", "\n"); 1763 } 1764 1765 partitions->safe_push (partition); 1766 } 1767 1768 /* All vertices should have been assigned to at least one partition now, 1769 other than vertices belonging to dead code. */ 1770 } 1771 1772 /* Dump to FILE the PARTITIONS. */ 1773 1774 static void 1775 dump_rdg_partitions (FILE *file, vec<partition *> partitions) 1776 { 1777 int i; 1778 partition *partition; 1779 1780 FOR_EACH_VEC_ELT (partitions, i, partition) 1781 debug_bitmap_file (file, partition->stmts); 1782 } 1783 1784 /* Debug PARTITIONS. */ 1785 extern void debug_rdg_partitions (vec<partition *> ); 1786 1787 DEBUG_FUNCTION void 1788 debug_rdg_partitions (vec<partition *> partitions) 1789 { 1790 dump_rdg_partitions (stderr, partitions); 1791 } 1792 1793 /* Returns the number of read and write operations in the RDG. */ 1794 1795 static int 1796 number_of_rw_in_rdg (struct graph *rdg) 1797 { 1798 int i, res = 0; 1799 1800 for (i = 0; i < rdg->n_vertices; i++) 1801 { 1802 if (RDG_MEM_WRITE_STMT (rdg, i)) 1803 ++res; 1804 1805 if (RDG_MEM_READS_STMT (rdg, i)) 1806 ++res; 1807 } 1808 1809 return res; 1810 } 1811 1812 /* Returns the number of read and write operations in a PARTITION of 1813 the RDG. */ 1814 1815 static int 1816 number_of_rw_in_partition (struct graph *rdg, partition *partition) 1817 { 1818 int res = 0; 1819 unsigned i; 1820 bitmap_iterator ii; 1821 1822 EXECUTE_IF_SET_IN_BITMAP (partition->stmts, 0, i, ii) 1823 { 1824 if (RDG_MEM_WRITE_STMT (rdg, i)) 1825 ++res; 1826 1827 if (RDG_MEM_READS_STMT (rdg, i)) 1828 ++res; 1829 } 1830 1831 return res; 1832 } 1833 1834 /* Returns true when one of the PARTITIONS contains all the read or 1835 write operations of RDG. */ 1836 1837 static bool 1838 partition_contains_all_rw (struct graph *rdg, 1839 vec<partition *> partitions) 1840 { 1841 int i; 1842 partition *partition; 1843 int nrw = number_of_rw_in_rdg (rdg); 1844 1845 FOR_EACH_VEC_ELT (partitions, i, partition) 1846 if (nrw == number_of_rw_in_partition (rdg, partition)) 1847 return true; 1848 1849 return false; 1850 } 1851 1852 /* Compute partition dependence created by the data references in DRS1 1853 and DRS2, modify and return DIR according to that. IF ALIAS_DDR is 1854 not NULL, we record dependence introduced by possible alias between 1855 two data references in ALIAS_DDRS; otherwise, we simply ignore such 1856 dependence as if it doesn't exist at all. */ 1857 1858 static int 1859 pg_add_dependence_edges (struct graph *rdg, int dir, 1860 bitmap drs1, bitmap drs2, vec<ddr_p> *alias_ddrs) 1861 { 1862 unsigned i, j; 1863 bitmap_iterator bi, bj; 1864 data_reference_p dr1, dr2, saved_dr1; 1865 1866 /* dependence direction - 0 is no dependence, -1 is back, 1867 1 is forth, 2 is both (we can stop then, merging will occur). */ 1868 EXECUTE_IF_SET_IN_BITMAP (drs1, 0, i, bi) 1869 { 1870 dr1 = datarefs_vec[i]; 1871 1872 EXECUTE_IF_SET_IN_BITMAP (drs2, 0, j, bj) 1873 { 1874 int res, this_dir = 1; 1875 ddr_p ddr; 1876 1877 dr2 = datarefs_vec[j]; 1878 1879 /* Skip all <read, read> data dependence. */ 1880 if (DR_IS_READ (dr1) && DR_IS_READ (dr2)) 1881 continue; 1882 1883 saved_dr1 = dr1; 1884 /* Re-shuffle data-refs to be in topological order. */ 1885 if (rdg_vertex_for_stmt (rdg, DR_STMT (dr1)) 1886 > rdg_vertex_for_stmt (rdg, DR_STMT (dr2))) 1887 { 1888 std::swap (dr1, dr2); 1889 this_dir = -this_dir; 1890 } 1891 ddr = get_data_dependence (rdg, dr1, dr2); 1892 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know) 1893 { 1894 this_dir = 0; 1895 res = data_ref_compare_tree (DR_BASE_ADDRESS (dr1), 1896 DR_BASE_ADDRESS (dr2)); 1897 /* Be conservative. If data references are not well analyzed, 1898 or the two data references have the same base address and 1899 offset, add dependence and consider it alias to each other. 1900 In other words, the dependence can not be resolved by 1901 runtime alias check. */ 1902 if (!DR_BASE_ADDRESS (dr1) || !DR_BASE_ADDRESS (dr2) 1903 || !DR_OFFSET (dr1) || !DR_OFFSET (dr2) 1904 || !DR_INIT (dr1) || !DR_INIT (dr2) 1905 || !DR_STEP (dr1) || !tree_fits_uhwi_p (DR_STEP (dr1)) 1906 || !DR_STEP (dr2) || !tree_fits_uhwi_p (DR_STEP (dr2)) 1907 || res == 0) 1908 this_dir = 2; 1909 /* Data dependence could be resolved by runtime alias check, 1910 record it in ALIAS_DDRS. */ 1911 else if (alias_ddrs != NULL) 1912 alias_ddrs->safe_push (ddr); 1913 /* Or simply ignore it. */ 1914 } 1915 else if (DDR_ARE_DEPENDENT (ddr) == NULL_TREE) 1916 { 1917 if (DDR_REVERSED_P (ddr)) 1918 this_dir = -this_dir; 1919 1920 /* Known dependences can still be unordered througout the 1921 iteration space, see gcc.dg/tree-ssa/ldist-16.c. */ 1922 if (DDR_NUM_DIST_VECTS (ddr) != 1) 1923 this_dir = 2; 1924 /* If the overlap is exact preserve stmt order. */ 1925 else if (lambda_vector_zerop (DDR_DIST_VECT (ddr, 0), 1)) 1926 ; 1927 /* Else as the distance vector is lexicographic positive swap 1928 the dependence direction. */ 1929 else 1930 this_dir = -this_dir; 1931 } 1932 else 1933 this_dir = 0; 1934 if (this_dir == 2) 1935 return 2; 1936 else if (dir == 0) 1937 dir = this_dir; 1938 else if (this_dir != 0 && dir != this_dir) 1939 return 2; 1940 /* Shuffle "back" dr1. */ 1941 dr1 = saved_dr1; 1942 } 1943 } 1944 return dir; 1945 } 1946 1947 /* Compare postorder number of the partition graph vertices V1 and V2. */ 1948 1949 static int 1950 pgcmp (const void *v1_, const void *v2_) 1951 { 1952 const vertex *v1 = (const vertex *)v1_; 1953 const vertex *v2 = (const vertex *)v2_; 1954 return v2->post - v1->post; 1955 } 1956 1957 /* Data attached to vertices of partition dependence graph. */ 1958 struct pg_vdata 1959 { 1960 /* ID of the corresponding partition. */ 1961 int id; 1962 /* The partition. */ 1963 struct partition *partition; 1964 }; 1965 1966 /* Data attached to edges of partition dependence graph. */ 1967 struct pg_edata 1968 { 1969 /* If the dependence edge can be resolved by runtime alias check, 1970 this vector contains data dependence relations for runtime alias 1971 check. On the other hand, if the dependence edge is introduced 1972 because of compilation time known data dependence, this vector 1973 contains nothing. */ 1974 vec<ddr_p> alias_ddrs; 1975 }; 1976 1977 /* Callback data for traversing edges in graph. */ 1978 struct pg_edge_callback_data 1979 { 1980 /* Bitmap contains strong connected components should be merged. */ 1981 bitmap sccs_to_merge; 1982 /* Array constains component information for all vertices. */ 1983 int *vertices_component; 1984 /* Vector to record all data dependence relations which are needed 1985 to break strong connected components by runtime alias checks. */ 1986 vec<ddr_p> *alias_ddrs; 1987 }; 1988 1989 /* Initialize vertice's data for partition dependence graph PG with 1990 PARTITIONS. */ 1991 1992 static void 1993 init_partition_graph_vertices (struct graph *pg, 1994 vec<struct partition *> *partitions) 1995 { 1996 int i; 1997 partition *partition; 1998 struct pg_vdata *data; 1999 2000 for (i = 0; partitions->iterate (i, &partition); ++i) 2001 { 2002 data = new pg_vdata; 2003 pg->vertices[i].data = data; 2004 data->id = i; 2005 data->partition = partition; 2006 } 2007 } 2008 2009 /* Add edge <I, J> to partition dependence graph PG. Attach vector of data 2010 dependence relations to the EDGE if DDRS isn't NULL. */ 2011 2012 static void 2013 add_partition_graph_edge (struct graph *pg, int i, int j, vec<ddr_p> *ddrs) 2014 { 2015 struct graph_edge *e = add_edge (pg, i, j); 2016 2017 /* If the edge is attached with data dependence relations, it means this 2018 dependence edge can be resolved by runtime alias checks. */ 2019 if (ddrs != NULL) 2020 { 2021 struct pg_edata *data = new pg_edata; 2022 2023 gcc_assert (ddrs->length () > 0); 2024 e->data = data; 2025 data->alias_ddrs = vNULL; 2026 data->alias_ddrs.safe_splice (*ddrs); 2027 } 2028 } 2029 2030 /* Callback function for graph travesal algorithm. It returns true 2031 if edge E should skipped when traversing the graph. */ 2032 2033 static bool 2034 pg_skip_alias_edge (struct graph_edge *e) 2035 { 2036 struct pg_edata *data = (struct pg_edata *)e->data; 2037 return (data != NULL && data->alias_ddrs.length () > 0); 2038 } 2039 2040 /* Callback function freeing data attached to edge E of graph. */ 2041 2042 static void 2043 free_partition_graph_edata_cb (struct graph *, struct graph_edge *e, void *) 2044 { 2045 if (e->data != NULL) 2046 { 2047 struct pg_edata *data = (struct pg_edata *)e->data; 2048 data->alias_ddrs.release (); 2049 delete data; 2050 } 2051 } 2052 2053 /* Free data attached to vertice of partition dependence graph PG. */ 2054 2055 static void 2056 free_partition_graph_vdata (struct graph *pg) 2057 { 2058 int i; 2059 struct pg_vdata *data; 2060 2061 for (i = 0; i < pg->n_vertices; ++i) 2062 { 2063 data = (struct pg_vdata *)pg->vertices[i].data; 2064 delete data; 2065 } 2066 } 2067 2068 /* Build and return partition dependence graph for PARTITIONS. RDG is 2069 reduced dependence graph for the loop to be distributed. If IGNORE_ALIAS_P 2070 is true, data dependence caused by possible alias between references 2071 is ignored, as if it doesn't exist at all; otherwise all depdendences 2072 are considered. */ 2073 2074 static struct graph * 2075 build_partition_graph (struct graph *rdg, 2076 vec<struct partition *> *partitions, 2077 bool ignore_alias_p) 2078 { 2079 int i, j; 2080 struct partition *partition1, *partition2; 2081 graph *pg = new_graph (partitions->length ()); 2082 auto_vec<ddr_p> alias_ddrs, *alias_ddrs_p; 2083 2084 alias_ddrs_p = ignore_alias_p ? NULL : &alias_ddrs; 2085 2086 init_partition_graph_vertices (pg, partitions); 2087 2088 for (i = 0; partitions->iterate (i, &partition1); ++i) 2089 { 2090 for (j = i + 1; partitions->iterate (j, &partition2); ++j) 2091 { 2092 /* dependence direction - 0 is no dependence, -1 is back, 2093 1 is forth, 2 is both (we can stop then, merging will occur). */ 2094 int dir = 0; 2095 2096 /* If the first partition has reduction, add back edge; if the 2097 second partition has reduction, add forth edge. This makes 2098 sure that reduction partition will be sorted as the last one. */ 2099 if (partition_reduction_p (partition1)) 2100 dir = -1; 2101 else if (partition_reduction_p (partition2)) 2102 dir = 1; 2103 2104 /* Cleanup the temporary vector. */ 2105 alias_ddrs.truncate (0); 2106 2107 dir = pg_add_dependence_edges (rdg, dir, partition1->datarefs, 2108 partition2->datarefs, alias_ddrs_p); 2109 2110 /* Add edge to partition graph if there exists dependence. There 2111 are two types of edges. One type edge is caused by compilation 2112 time known dependence, this type can not be resolved by runtime 2113 alias check. The other type can be resolved by runtime alias 2114 check. */ 2115 if (dir == 1 || dir == 2 2116 || alias_ddrs.length () > 0) 2117 { 2118 /* Attach data dependence relations to edge that can be resolved 2119 by runtime alias check. */ 2120 bool alias_edge_p = (dir != 1 && dir != 2); 2121 add_partition_graph_edge (pg, i, j, 2122 (alias_edge_p) ? &alias_ddrs : NULL); 2123 } 2124 if (dir == -1 || dir == 2 2125 || alias_ddrs.length () > 0) 2126 { 2127 /* Attach data dependence relations to edge that can be resolved 2128 by runtime alias check. */ 2129 bool alias_edge_p = (dir != -1 && dir != 2); 2130 add_partition_graph_edge (pg, j, i, 2131 (alias_edge_p) ? &alias_ddrs : NULL); 2132 } 2133 } 2134 } 2135 return pg; 2136 } 2137 2138 /* Sort partitions in PG in descending post order and store them in 2139 PARTITIONS. */ 2140 2141 static void 2142 sort_partitions_by_post_order (struct graph *pg, 2143 vec<struct partition *> *partitions) 2144 { 2145 int i; 2146 struct pg_vdata *data; 2147 2148 /* Now order the remaining nodes in descending postorder. */ 2149 qsort (pg->vertices, pg->n_vertices, sizeof (vertex), pgcmp); 2150 partitions->truncate (0); 2151 for (i = 0; i < pg->n_vertices; ++i) 2152 { 2153 data = (struct pg_vdata *)pg->vertices[i].data; 2154 if (data->partition) 2155 partitions->safe_push (data->partition); 2156 } 2157 } 2158 2159 /* Given reduced dependence graph RDG merge strong connected components 2160 of PARTITIONS. If IGNORE_ALIAS_P is true, data dependence caused by 2161 possible alias between references is ignored, as if it doesn't exist 2162 at all; otherwise all depdendences are considered. */ 2163 2164 static void 2165 merge_dep_scc_partitions (struct graph *rdg, 2166 vec<struct partition *> *partitions, 2167 bool ignore_alias_p) 2168 { 2169 struct partition *partition1, *partition2; 2170 struct pg_vdata *data; 2171 graph *pg = build_partition_graph (rdg, partitions, ignore_alias_p); 2172 int i, j, num_sccs = graphds_scc (pg, NULL); 2173 2174 /* Strong connected compoenent means dependence cycle, we cannot distribute 2175 them. So fuse them together. */ 2176 if ((unsigned) num_sccs < partitions->length ()) 2177 { 2178 for (i = 0; i < num_sccs; ++i) 2179 { 2180 for (j = 0; partitions->iterate (j, &partition1); ++j) 2181 if (pg->vertices[j].component == i) 2182 break; 2183 for (j = j + 1; partitions->iterate (j, &partition2); ++j) 2184 if (pg->vertices[j].component == i) 2185 { 2186 partition_merge_into (NULL, partition1, 2187 partition2, FUSE_SAME_SCC); 2188 partition1->type = PTYPE_SEQUENTIAL; 2189 (*partitions)[j] = NULL; 2190 partition_free (partition2); 2191 data = (struct pg_vdata *)pg->vertices[j].data; 2192 data->partition = NULL; 2193 } 2194 } 2195 } 2196 2197 sort_partitions_by_post_order (pg, partitions); 2198 gcc_assert (partitions->length () == (unsigned)num_sccs); 2199 free_partition_graph_vdata (pg); 2200 free_graph (pg); 2201 } 2202 2203 /* Callback function for traversing edge E in graph G. DATA is private 2204 callback data. */ 2205 2206 static void 2207 pg_collect_alias_ddrs (struct graph *g, struct graph_edge *e, void *data) 2208 { 2209 int i, j, component; 2210 struct pg_edge_callback_data *cbdata; 2211 struct pg_edata *edata = (struct pg_edata *) e->data; 2212 2213 /* If the edge doesn't have attached data dependence, it represents 2214 compilation time known dependences. This type dependence cannot 2215 be resolved by runtime alias check. */ 2216 if (edata == NULL || edata->alias_ddrs.length () == 0) 2217 return; 2218 2219 cbdata = (struct pg_edge_callback_data *) data; 2220 i = e->src; 2221 j = e->dest; 2222 component = cbdata->vertices_component[i]; 2223 /* Vertices are topologically sorted according to compilation time 2224 known dependences, so we can break strong connected components 2225 by removing edges of the opposite direction, i.e, edges pointing 2226 from vertice with smaller post number to vertice with bigger post 2227 number. */ 2228 if (g->vertices[i].post < g->vertices[j].post 2229 /* We only need to remove edges connecting vertices in the same 2230 strong connected component to break it. */ 2231 && component == cbdata->vertices_component[j] 2232 /* Check if we want to break the strong connected component or not. */ 2233 && !bitmap_bit_p (cbdata->sccs_to_merge, component)) 2234 cbdata->alias_ddrs->safe_splice (edata->alias_ddrs); 2235 } 2236 2237 /* This is the main function breaking strong conected components in 2238 PARTITIONS giving reduced depdendence graph RDG. Store data dependence 2239 relations for runtime alias check in ALIAS_DDRS. */ 2240 2241 static void 2242 break_alias_scc_partitions (struct graph *rdg, 2243 vec<struct partition *> *partitions, 2244 vec<ddr_p> *alias_ddrs) 2245 { 2246 int i, j, k, num_sccs, num_sccs_no_alias; 2247 /* Build partition dependence graph. */ 2248 graph *pg = build_partition_graph (rdg, partitions, false); 2249 2250 alias_ddrs->truncate (0); 2251 /* Find strong connected components in the graph, with all dependence edges 2252 considered. */ 2253 num_sccs = graphds_scc (pg, NULL); 2254 /* All SCCs now can be broken by runtime alias checks because SCCs caused by 2255 compilation time known dependences are merged before this function. */ 2256 if ((unsigned) num_sccs < partitions->length ()) 2257 { 2258 struct pg_edge_callback_data cbdata; 2259 auto_bitmap sccs_to_merge; 2260 auto_vec<enum partition_type> scc_types; 2261 struct partition *partition, *first; 2262 2263 /* If all partitions in a SCC have the same type, we can simply merge the 2264 SCC. This loop finds out such SCCS and record them in bitmap. */ 2265 bitmap_set_range (sccs_to_merge, 0, (unsigned) num_sccs); 2266 for (i = 0; i < num_sccs; ++i) 2267 { 2268 for (j = 0; partitions->iterate (j, &first); ++j) 2269 if (pg->vertices[j].component == i) 2270 break; 2271 for (++j; partitions->iterate (j, &partition); ++j) 2272 { 2273 if (pg->vertices[j].component != i) 2274 continue; 2275 2276 /* Note we Merge partitions of parallel type on purpose, though 2277 the result partition is sequential. The reason is vectorizer 2278 can do more accurate runtime alias check in this case. Also 2279 it results in more conservative distribution. */ 2280 if (first->type != partition->type) 2281 { 2282 bitmap_clear_bit (sccs_to_merge, i); 2283 break; 2284 } 2285 } 2286 } 2287 2288 /* Initialize callback data for traversing. */ 2289 cbdata.sccs_to_merge = sccs_to_merge; 2290 cbdata.alias_ddrs = alias_ddrs; 2291 cbdata.vertices_component = XNEWVEC (int, pg->n_vertices); 2292 /* Record the component information which will be corrupted by next 2293 graph scc finding call. */ 2294 for (i = 0; i < pg->n_vertices; ++i) 2295 cbdata.vertices_component[i] = pg->vertices[i].component; 2296 2297 /* Collect data dependences for runtime alias checks to break SCCs. */ 2298 if (bitmap_count_bits (sccs_to_merge) != (unsigned) num_sccs) 2299 { 2300 /* Run SCC finding algorithm again, with alias dependence edges 2301 skipped. This is to topologically sort partitions according to 2302 compilation time known dependence. Note the topological order 2303 is stored in the form of pg's post order number. */ 2304 num_sccs_no_alias = graphds_scc (pg, NULL, pg_skip_alias_edge); 2305 gcc_assert (partitions->length () == (unsigned) num_sccs_no_alias); 2306 /* With topological order, we can construct two subgraphs L and R. 2307 L contains edge <x, y> where x < y in terms of post order, while 2308 R contains edge <x, y> where x > y. Edges for compilation time 2309 known dependence all fall in R, so we break SCCs by removing all 2310 (alias) edges of in subgraph L. */ 2311 for_each_edge (pg, pg_collect_alias_ddrs, &cbdata); 2312 } 2313 2314 /* For SCC that doesn't need to be broken, merge it. */ 2315 for (i = 0; i < num_sccs; ++i) 2316 { 2317 if (!bitmap_bit_p (sccs_to_merge, i)) 2318 continue; 2319 2320 for (j = 0; partitions->iterate (j, &first); ++j) 2321 if (cbdata.vertices_component[j] == i) 2322 break; 2323 for (k = j + 1; partitions->iterate (k, &partition); ++k) 2324 { 2325 struct pg_vdata *data; 2326 2327 if (cbdata.vertices_component[k] != i) 2328 continue; 2329 2330 /* Update postorder number so that merged reduction partition is 2331 sorted after other partitions. */ 2332 if (!partition_reduction_p (first) 2333 && partition_reduction_p (partition)) 2334 { 2335 gcc_assert (pg->vertices[k].post < pg->vertices[j].post); 2336 pg->vertices[j].post = pg->vertices[k].post; 2337 } 2338 partition_merge_into (NULL, first, partition, FUSE_SAME_SCC); 2339 (*partitions)[k] = NULL; 2340 partition_free (partition); 2341 data = (struct pg_vdata *)pg->vertices[k].data; 2342 gcc_assert (data->id == k); 2343 data->partition = NULL; 2344 /* The result partition of merged SCC must be sequential. */ 2345 first->type = PTYPE_SEQUENTIAL; 2346 } 2347 } 2348 } 2349 2350 sort_partitions_by_post_order (pg, partitions); 2351 free_partition_graph_vdata (pg); 2352 for_each_edge (pg, free_partition_graph_edata_cb, NULL); 2353 free_graph (pg); 2354 2355 if (dump_file && (dump_flags & TDF_DETAILS)) 2356 { 2357 fprintf (dump_file, "Possible alias data dependence to break:\n"); 2358 dump_data_dependence_relations (dump_file, *alias_ddrs); 2359 } 2360 } 2361 2362 /* Compute and return an expression whose value is the segment length which 2363 will be accessed by DR in NITERS iterations. */ 2364 2365 static tree 2366 data_ref_segment_size (struct data_reference *dr, tree niters) 2367 { 2368 niters = size_binop (MINUS_EXPR, 2369 fold_convert (sizetype, niters), 2370 size_one_node); 2371 return size_binop (MULT_EXPR, 2372 fold_convert (sizetype, DR_STEP (dr)), 2373 fold_convert (sizetype, niters)); 2374 } 2375 2376 /* Return true if LOOP's latch is dominated by statement for data reference 2377 DR. */ 2378 2379 static inline bool 2380 latch_dominated_by_data_ref (struct loop *loop, data_reference *dr) 2381 { 2382 return dominated_by_p (CDI_DOMINATORS, single_exit (loop)->src, 2383 gimple_bb (DR_STMT (dr))); 2384 } 2385 2386 /* Compute alias check pairs and store them in COMP_ALIAS_PAIRS for LOOP's 2387 data dependence relations ALIAS_DDRS. */ 2388 2389 static void 2390 compute_alias_check_pairs (struct loop *loop, vec<ddr_p> *alias_ddrs, 2391 vec<dr_with_seg_len_pair_t> *comp_alias_pairs) 2392 { 2393 unsigned int i; 2394 unsigned HOST_WIDE_INT factor = 1; 2395 tree niters_plus_one, niters = number_of_latch_executions (loop); 2396 2397 gcc_assert (niters != NULL_TREE && niters != chrec_dont_know); 2398 niters = fold_convert (sizetype, niters); 2399 niters_plus_one = size_binop (PLUS_EXPR, niters, size_one_node); 2400 2401 if (dump_file && (dump_flags & TDF_DETAILS)) 2402 fprintf (dump_file, "Creating alias check pairs:\n"); 2403 2404 /* Iterate all data dependence relations and compute alias check pairs. */ 2405 for (i = 0; i < alias_ddrs->length (); i++) 2406 { 2407 ddr_p ddr = (*alias_ddrs)[i]; 2408 struct data_reference *dr_a = DDR_A (ddr); 2409 struct data_reference *dr_b = DDR_B (ddr); 2410 tree seg_length_a, seg_length_b; 2411 int comp_res = data_ref_compare_tree (DR_BASE_ADDRESS (dr_a), 2412 DR_BASE_ADDRESS (dr_b)); 2413 2414 if (comp_res == 0) 2415 comp_res = data_ref_compare_tree (DR_OFFSET (dr_a), DR_OFFSET (dr_b)); 2416 gcc_assert (comp_res != 0); 2417 2418 if (latch_dominated_by_data_ref (loop, dr_a)) 2419 seg_length_a = data_ref_segment_size (dr_a, niters_plus_one); 2420 else 2421 seg_length_a = data_ref_segment_size (dr_a, niters); 2422 2423 if (latch_dominated_by_data_ref (loop, dr_b)) 2424 seg_length_b = data_ref_segment_size (dr_b, niters_plus_one); 2425 else 2426 seg_length_b = data_ref_segment_size (dr_b, niters); 2427 2428 unsigned HOST_WIDE_INT access_size_a 2429 = tree_to_uhwi (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr_a)))); 2430 unsigned HOST_WIDE_INT access_size_b 2431 = tree_to_uhwi (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr_b)))); 2432 unsigned int align_a = TYPE_ALIGN_UNIT (TREE_TYPE (DR_REF (dr_a))); 2433 unsigned int align_b = TYPE_ALIGN_UNIT (TREE_TYPE (DR_REF (dr_b))); 2434 2435 dr_with_seg_len_pair_t dr_with_seg_len_pair 2436 (dr_with_seg_len (dr_a, seg_length_a, access_size_a, align_a), 2437 dr_with_seg_len (dr_b, seg_length_b, access_size_b, align_b)); 2438 2439 /* Canonicalize pairs by sorting the two DR members. */ 2440 if (comp_res > 0) 2441 std::swap (dr_with_seg_len_pair.first, dr_with_seg_len_pair.second); 2442 2443 comp_alias_pairs->safe_push (dr_with_seg_len_pair); 2444 } 2445 2446 if (tree_fits_uhwi_p (niters)) 2447 factor = tree_to_uhwi (niters); 2448 2449 /* Prune alias check pairs. */ 2450 prune_runtime_alias_test_list (comp_alias_pairs, factor); 2451 if (dump_file && (dump_flags & TDF_DETAILS)) 2452 fprintf (dump_file, 2453 "Improved number of alias checks from %d to %d\n", 2454 alias_ddrs->length (), comp_alias_pairs->length ()); 2455 } 2456 2457 /* Given data dependence relations in ALIAS_DDRS, generate runtime alias 2458 checks and version LOOP under condition of these runtime alias checks. */ 2459 2460 static void 2461 version_loop_by_alias_check (struct loop *loop, vec<ddr_p> *alias_ddrs) 2462 { 2463 profile_probability prob; 2464 basic_block cond_bb; 2465 struct loop *nloop; 2466 tree lhs, arg0, cond_expr = NULL_TREE; 2467 gimple_seq cond_stmts = NULL; 2468 gimple *call_stmt = NULL; 2469 auto_vec<dr_with_seg_len_pair_t> comp_alias_pairs; 2470 2471 /* Generate code for runtime alias checks if necessary. */ 2472 gcc_assert (alias_ddrs->length () > 0); 2473 2474 if (dump_file && (dump_flags & TDF_DETAILS)) 2475 fprintf (dump_file, 2476 "Version loop <%d> with runtime alias check\n", loop->num); 2477 2478 compute_alias_check_pairs (loop, alias_ddrs, &comp_alias_pairs); 2479 create_runtime_alias_checks (loop, &comp_alias_pairs, &cond_expr); 2480 cond_expr = force_gimple_operand_1 (cond_expr, &cond_stmts, 2481 is_gimple_val, NULL_TREE); 2482 2483 /* Depend on vectorizer to fold IFN_LOOP_DIST_ALIAS. */ 2484 if (flag_tree_loop_vectorize) 2485 { 2486 /* Generate internal function call for loop distribution alias check. */ 2487 call_stmt = gimple_build_call_internal (IFN_LOOP_DIST_ALIAS, 2488 2, NULL_TREE, cond_expr); 2489 lhs = make_ssa_name (boolean_type_node); 2490 gimple_call_set_lhs (call_stmt, lhs); 2491 } 2492 else 2493 lhs = cond_expr; 2494 2495 prob = profile_probability::guessed_always ().apply_scale (9, 10); 2496 initialize_original_copy_tables (); 2497 nloop = loop_version (loop, lhs, &cond_bb, prob, prob.invert (), 2498 prob, prob.invert (), true); 2499 free_original_copy_tables (); 2500 /* Record the original loop number in newly generated loops. In case of 2501 distribution, the original loop will be distributed and the new loop 2502 is kept. */ 2503 loop->orig_loop_num = nloop->num; 2504 nloop->orig_loop_num = nloop->num; 2505 nloop->dont_vectorize = true; 2506 nloop->force_vectorize = false; 2507 2508 if (call_stmt) 2509 { 2510 /* Record new loop's num in IFN_LOOP_DIST_ALIAS because the original 2511 loop could be destroyed. */ 2512 arg0 = build_int_cst (integer_type_node, loop->orig_loop_num); 2513 gimple_call_set_arg (call_stmt, 0, arg0); 2514 gimple_seq_add_stmt_without_update (&cond_stmts, call_stmt); 2515 } 2516 2517 if (cond_stmts) 2518 { 2519 gimple_stmt_iterator cond_gsi = gsi_last_bb (cond_bb); 2520 gsi_insert_seq_before (&cond_gsi, cond_stmts, GSI_SAME_STMT); 2521 } 2522 update_ssa (TODO_update_ssa); 2523 } 2524 2525 /* Return true if loop versioning is needed to distrubute PARTITIONS. 2526 ALIAS_DDRS are data dependence relations for runtime alias check. */ 2527 2528 static inline bool 2529 version_for_distribution_p (vec<struct partition *> *partitions, 2530 vec<ddr_p> *alias_ddrs) 2531 { 2532 /* No need to version loop if we have only one partition. */ 2533 if (partitions->length () == 1) 2534 return false; 2535 2536 /* Need to version loop if runtime alias check is necessary. */ 2537 return (alias_ddrs->length () > 0); 2538 } 2539 2540 /* Compare base offset of builtin mem* partitions P1 and P2. */ 2541 2542 static bool 2543 offset_cmp (struct partition *p1, struct partition *p2) 2544 { 2545 gcc_assert (p1 != NULL && p1->builtin != NULL); 2546 gcc_assert (p2 != NULL && p2->builtin != NULL); 2547 return p1->builtin->dst_base_offset < p2->builtin->dst_base_offset; 2548 } 2549 2550 /* Fuse adjacent memset builtin PARTITIONS if possible. This is a special 2551 case optimization transforming below code: 2552 2553 __builtin_memset (&obj, 0, 100); 2554 _1 = &obj + 100; 2555 __builtin_memset (_1, 0, 200); 2556 _2 = &obj + 300; 2557 __builtin_memset (_2, 0, 100); 2558 2559 into: 2560 2561 __builtin_memset (&obj, 0, 400); 2562 2563 Note we don't have dependence information between different partitions 2564 at this point, as a result, we can't handle nonadjacent memset builtin 2565 partitions since dependence might be broken. */ 2566 2567 static void 2568 fuse_memset_builtins (vec<struct partition *> *partitions) 2569 { 2570 unsigned i, j; 2571 struct partition *part1, *part2; 2572 tree rhs1, rhs2; 2573 2574 for (i = 0; partitions->iterate (i, &part1);) 2575 { 2576 if (part1->kind != PKIND_MEMSET) 2577 { 2578 i++; 2579 continue; 2580 } 2581 2582 /* Find sub-array of memset builtins of the same base. Index range 2583 of the sub-array is [i, j) with "j > i". */ 2584 for (j = i + 1; partitions->iterate (j, &part2); ++j) 2585 { 2586 if (part2->kind != PKIND_MEMSET 2587 || !operand_equal_p (part1->builtin->dst_base_base, 2588 part2->builtin->dst_base_base, 0)) 2589 break; 2590 2591 /* Memset calls setting different values can't be merged. */ 2592 rhs1 = gimple_assign_rhs1 (DR_STMT (part1->builtin->dst_dr)); 2593 rhs2 = gimple_assign_rhs1 (DR_STMT (part2->builtin->dst_dr)); 2594 if (!operand_equal_p (rhs1, rhs2, 0)) 2595 break; 2596 } 2597 2598 /* Stable sort is required in order to avoid breaking dependence. */ 2599 std::stable_sort (&(*partitions)[i], 2600 &(*partitions)[i] + j - i, offset_cmp); 2601 /* Continue with next partition. */ 2602 i = j; 2603 } 2604 2605 /* Merge all consecutive memset builtin partitions. */ 2606 for (i = 0; i < partitions->length () - 1;) 2607 { 2608 part1 = (*partitions)[i]; 2609 if (part1->kind != PKIND_MEMSET) 2610 { 2611 i++; 2612 continue; 2613 } 2614 2615 part2 = (*partitions)[i + 1]; 2616 /* Only merge memset partitions of the same base and with constant 2617 access sizes. */ 2618 if (part2->kind != PKIND_MEMSET 2619 || TREE_CODE (part1->builtin->size) != INTEGER_CST 2620 || TREE_CODE (part2->builtin->size) != INTEGER_CST 2621 || !operand_equal_p (part1->builtin->dst_base_base, 2622 part2->builtin->dst_base_base, 0)) 2623 { 2624 i++; 2625 continue; 2626 } 2627 rhs1 = gimple_assign_rhs1 (DR_STMT (part1->builtin->dst_dr)); 2628 rhs2 = gimple_assign_rhs1 (DR_STMT (part2->builtin->dst_dr)); 2629 int bytev1 = const_with_all_bytes_same (rhs1); 2630 int bytev2 = const_with_all_bytes_same (rhs2); 2631 /* Only merge memset partitions of the same value. */ 2632 if (bytev1 != bytev2 || bytev1 == -1) 2633 { 2634 i++; 2635 continue; 2636 } 2637 wide_int end1 = wi::add (part1->builtin->dst_base_offset, 2638 wi::to_wide (part1->builtin->size)); 2639 /* Only merge adjacent memset partitions. */ 2640 if (wi::ne_p (end1, part2->builtin->dst_base_offset)) 2641 { 2642 i++; 2643 continue; 2644 } 2645 /* Merge partitions[i] and partitions[i+1]. */ 2646 part1->builtin->size = fold_build2 (PLUS_EXPR, sizetype, 2647 part1->builtin->size, 2648 part2->builtin->size); 2649 partition_free (part2); 2650 partitions->ordered_remove (i + 1); 2651 } 2652 } 2653 2654 /* Fuse PARTITIONS of LOOP if necessary before finalizing distribution. 2655 ALIAS_DDRS contains ddrs which need runtime alias check. */ 2656 2657 static void 2658 finalize_partitions (struct loop *loop, vec<struct partition *> *partitions, 2659 vec<ddr_p> *alias_ddrs) 2660 { 2661 unsigned i; 2662 struct partition *partition, *a; 2663 2664 if (partitions->length () == 1 2665 || alias_ddrs->length () > 0) 2666 return; 2667 2668 unsigned num_builtin = 0, num_normal = 0, num_partial_memset = 0; 2669 bool same_type_p = true; 2670 enum partition_type type = ((*partitions)[0])->type; 2671 for (i = 0; partitions->iterate (i, &partition); ++i) 2672 { 2673 same_type_p &= (type == partition->type); 2674 if (partition_builtin_p (partition)) 2675 { 2676 num_builtin++; 2677 continue; 2678 } 2679 num_normal++; 2680 if (partition->kind == PKIND_PARTIAL_MEMSET) 2681 num_partial_memset++; 2682 } 2683 2684 /* Don't distribute current loop into too many loops given we don't have 2685 memory stream cost model. Be even more conservative in case of loop 2686 nest distribution. */ 2687 if ((same_type_p && num_builtin == 0 2688 && (loop->inner == NULL || num_normal != 2 || num_partial_memset != 1)) 2689 || (loop->inner != NULL 2690 && i >= NUM_PARTITION_THRESHOLD && num_normal > 1) 2691 || (loop->inner == NULL 2692 && i >= NUM_PARTITION_THRESHOLD && num_normal > num_builtin)) 2693 { 2694 a = (*partitions)[0]; 2695 for (i = 1; partitions->iterate (i, &partition); ++i) 2696 { 2697 partition_merge_into (NULL, a, partition, FUSE_FINALIZE); 2698 partition_free (partition); 2699 } 2700 partitions->truncate (1); 2701 } 2702 2703 /* Fuse memset builtins if possible. */ 2704 if (partitions->length () > 1) 2705 fuse_memset_builtins (partitions); 2706 } 2707 2708 /* Distributes the code from LOOP in such a way that producer statements 2709 are placed before consumer statements. Tries to separate only the 2710 statements from STMTS into separate loops. Returns the number of 2711 distributed loops. Set NB_CALLS to number of generated builtin calls. 2712 Set *DESTROY_P to whether LOOP needs to be destroyed. */ 2713 2714 static int 2715 distribute_loop (struct loop *loop, vec<gimple *> stmts, 2716 control_dependences *cd, int *nb_calls, bool *destroy_p) 2717 { 2718 ddrs_table = new hash_table<ddr_hasher> (389); 2719 struct graph *rdg; 2720 partition *partition; 2721 bool any_builtin; 2722 int i, nbp; 2723 2724 *destroy_p = false; 2725 *nb_calls = 0; 2726 loop_nest.create (0); 2727 if (!find_loop_nest (loop, &loop_nest)) 2728 { 2729 loop_nest.release (); 2730 delete ddrs_table; 2731 return 0; 2732 } 2733 2734 datarefs_vec.create (20); 2735 rdg = build_rdg (loop, cd); 2736 if (!rdg) 2737 { 2738 if (dump_file && (dump_flags & TDF_DETAILS)) 2739 fprintf (dump_file, 2740 "Loop %d not distributed: failed to build the RDG.\n", 2741 loop->num); 2742 2743 loop_nest.release (); 2744 free_data_refs (datarefs_vec); 2745 delete ddrs_table; 2746 return 0; 2747 } 2748 2749 if (datarefs_vec.length () > MAX_DATAREFS_NUM) 2750 { 2751 if (dump_file && (dump_flags & TDF_DETAILS)) 2752 fprintf (dump_file, 2753 "Loop %d not distributed: too many memory references.\n", 2754 loop->num); 2755 2756 free_rdg (rdg); 2757 loop_nest.release (); 2758 free_data_refs (datarefs_vec); 2759 delete ddrs_table; 2760 return 0; 2761 } 2762 2763 data_reference_p dref; 2764 for (i = 0; datarefs_vec.iterate (i, &dref); ++i) 2765 dref->aux = (void *) (uintptr_t) i; 2766 2767 if (dump_file && (dump_flags & TDF_DETAILS)) 2768 dump_rdg (dump_file, rdg); 2769 2770 auto_vec<struct partition *, 3> partitions; 2771 rdg_build_partitions (rdg, stmts, &partitions); 2772 2773 auto_vec<ddr_p> alias_ddrs; 2774 2775 auto_bitmap stmt_in_all_partitions; 2776 bitmap_copy (stmt_in_all_partitions, partitions[0]->stmts); 2777 for (i = 1; partitions.iterate (i, &partition); ++i) 2778 bitmap_and_into (stmt_in_all_partitions, partitions[i]->stmts); 2779 2780 any_builtin = false; 2781 FOR_EACH_VEC_ELT (partitions, i, partition) 2782 { 2783 classify_partition (loop, rdg, partition, stmt_in_all_partitions); 2784 any_builtin |= partition_builtin_p (partition); 2785 } 2786 2787 /* If we are only distributing patterns but did not detect any, 2788 simply bail out. */ 2789 if (!flag_tree_loop_distribution 2790 && !any_builtin) 2791 { 2792 nbp = 0; 2793 goto ldist_done; 2794 } 2795 2796 /* If we are only distributing patterns fuse all partitions that 2797 were not classified as builtins. This also avoids chopping 2798 a loop into pieces, separated by builtin calls. That is, we 2799 only want no or a single loop body remaining. */ 2800 struct partition *into; 2801 if (!flag_tree_loop_distribution) 2802 { 2803 for (i = 0; partitions.iterate (i, &into); ++i) 2804 if (!partition_builtin_p (into)) 2805 break; 2806 for (++i; partitions.iterate (i, &partition); ++i) 2807 if (!partition_builtin_p (partition)) 2808 { 2809 partition_merge_into (NULL, into, partition, FUSE_NON_BUILTIN); 2810 partitions.unordered_remove (i); 2811 partition_free (partition); 2812 i--; 2813 } 2814 } 2815 2816 /* Due to limitations in the transform phase we have to fuse all 2817 reduction partitions into the last partition so the existing 2818 loop will contain all loop-closed PHI nodes. */ 2819 for (i = 0; partitions.iterate (i, &into); ++i) 2820 if (partition_reduction_p (into)) 2821 break; 2822 for (i = i + 1; partitions.iterate (i, &partition); ++i) 2823 if (partition_reduction_p (partition)) 2824 { 2825 partition_merge_into (rdg, into, partition, FUSE_REDUCTION); 2826 partitions.unordered_remove (i); 2827 partition_free (partition); 2828 i--; 2829 } 2830 2831 /* Apply our simple cost model - fuse partitions with similar 2832 memory accesses. */ 2833 for (i = 0; partitions.iterate (i, &into); ++i) 2834 { 2835 bool changed = false; 2836 if (partition_builtin_p (into) || into->kind == PKIND_PARTIAL_MEMSET) 2837 continue; 2838 for (int j = i + 1; 2839 partitions.iterate (j, &partition); ++j) 2840 { 2841 if (share_memory_accesses (rdg, into, partition)) 2842 { 2843 partition_merge_into (rdg, into, partition, FUSE_SHARE_REF); 2844 partitions.unordered_remove (j); 2845 partition_free (partition); 2846 j--; 2847 changed = true; 2848 } 2849 } 2850 /* If we fused 0 1 2 in step 1 to 0,2 1 as 0 and 2 have similar 2851 accesses when 1 and 2 have similar accesses but not 0 and 1 2852 then in the next iteration we will fail to consider merging 2853 1 into 0,2. So try again if we did any merging into 0. */ 2854 if (changed) 2855 i--; 2856 } 2857 2858 /* Build the partition dependency graph and fuse partitions in strong 2859 connected component. */ 2860 if (partitions.length () > 1) 2861 { 2862 /* Don't support loop nest distribution under runtime alias check 2863 since it's not likely to enable many vectorization opportunities. */ 2864 if (loop->inner) 2865 merge_dep_scc_partitions (rdg, &partitions, false); 2866 else 2867 { 2868 merge_dep_scc_partitions (rdg, &partitions, true); 2869 if (partitions.length () > 1) 2870 break_alias_scc_partitions (rdg, &partitions, &alias_ddrs); 2871 } 2872 } 2873 2874 finalize_partitions (loop, &partitions, &alias_ddrs); 2875 2876 nbp = partitions.length (); 2877 if (nbp == 0 2878 || (nbp == 1 && !partition_builtin_p (partitions[0])) 2879 || (nbp > 1 && partition_contains_all_rw (rdg, partitions))) 2880 { 2881 nbp = 0; 2882 goto ldist_done; 2883 } 2884 2885 if (version_for_distribution_p (&partitions, &alias_ddrs)) 2886 version_loop_by_alias_check (loop, &alias_ddrs); 2887 2888 if (dump_file && (dump_flags & TDF_DETAILS)) 2889 { 2890 fprintf (dump_file, 2891 "distribute loop <%d> into partitions:\n", loop->num); 2892 dump_rdg_partitions (dump_file, partitions); 2893 } 2894 2895 FOR_EACH_VEC_ELT (partitions, i, partition) 2896 { 2897 if (partition_builtin_p (partition)) 2898 (*nb_calls)++; 2899 *destroy_p |= generate_code_for_partition (loop, partition, i < nbp - 1); 2900 } 2901 2902 ldist_done: 2903 loop_nest.release (); 2904 free_data_refs (datarefs_vec); 2905 for (hash_table<ddr_hasher>::iterator iter = ddrs_table->begin (); 2906 iter != ddrs_table->end (); ++iter) 2907 { 2908 free_dependence_relation (*iter); 2909 *iter = NULL; 2910 } 2911 delete ddrs_table; 2912 2913 FOR_EACH_VEC_ELT (partitions, i, partition) 2914 partition_free (partition); 2915 2916 free_rdg (rdg); 2917 return nbp - *nb_calls; 2918 } 2919 2920 /* Distribute all loops in the current function. */ 2921 2922 namespace { 2923 2924 const pass_data pass_data_loop_distribution = 2925 { 2926 GIMPLE_PASS, /* type */ 2927 "ldist", /* name */ 2928 OPTGROUP_LOOP, /* optinfo_flags */ 2929 TV_TREE_LOOP_DISTRIBUTION, /* tv_id */ 2930 ( PROP_cfg | PROP_ssa ), /* properties_required */ 2931 0, /* properties_provided */ 2932 0, /* properties_destroyed */ 2933 0, /* todo_flags_start */ 2934 0, /* todo_flags_finish */ 2935 }; 2936 2937 class pass_loop_distribution : public gimple_opt_pass 2938 { 2939 public: 2940 pass_loop_distribution (gcc::context *ctxt) 2941 : gimple_opt_pass (pass_data_loop_distribution, ctxt) 2942 {} 2943 2944 /* opt_pass methods: */ 2945 virtual bool gate (function *) 2946 { 2947 return flag_tree_loop_distribution 2948 || flag_tree_loop_distribute_patterns; 2949 } 2950 2951 virtual unsigned int execute (function *); 2952 2953 }; // class pass_loop_distribution 2954 2955 2956 /* Given LOOP, this function records seed statements for distribution in 2957 WORK_LIST. Return false if there is nothing for distribution. */ 2958 2959 static bool 2960 find_seed_stmts_for_distribution (struct loop *loop, vec<gimple *> *work_list) 2961 { 2962 basic_block *bbs = get_loop_body_in_dom_order (loop); 2963 2964 /* Initialize the worklist with stmts we seed the partitions with. */ 2965 for (unsigned i = 0; i < loop->num_nodes; ++i) 2966 { 2967 for (gphi_iterator gsi = gsi_start_phis (bbs[i]); 2968 !gsi_end_p (gsi); gsi_next (&gsi)) 2969 { 2970 gphi *phi = gsi.phi (); 2971 if (virtual_operand_p (gimple_phi_result (phi))) 2972 continue; 2973 /* Distribute stmts which have defs that are used outside of 2974 the loop. */ 2975 if (!stmt_has_scalar_dependences_outside_loop (loop, phi)) 2976 continue; 2977 work_list->safe_push (phi); 2978 } 2979 for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]); 2980 !gsi_end_p (gsi); gsi_next (&gsi)) 2981 { 2982 gimple *stmt = gsi_stmt (gsi); 2983 2984 /* If there is a stmt with side-effects bail out - we 2985 cannot and should not distribute this loop. */ 2986 if (gimple_has_side_effects (stmt)) 2987 { 2988 free (bbs); 2989 return false; 2990 } 2991 2992 /* Distribute stmts which have defs that are used outside of 2993 the loop. */ 2994 if (stmt_has_scalar_dependences_outside_loop (loop, stmt)) 2995 ; 2996 /* Otherwise only distribute stores for now. */ 2997 else if (!gimple_vdef (stmt)) 2998 continue; 2999 3000 work_list->safe_push (stmt); 3001 } 3002 } 3003 free (bbs); 3004 return work_list->length () > 0; 3005 } 3006 3007 /* Given innermost LOOP, return the outermost enclosing loop that forms a 3008 perfect loop nest. */ 3009 3010 static struct loop * 3011 prepare_perfect_loop_nest (struct loop *loop) 3012 { 3013 struct loop *outer = loop_outer (loop); 3014 tree niters = number_of_latch_executions (loop); 3015 3016 /* TODO: We only support the innermost 3-level loop nest distribution 3017 because of compilation time issue for now. This should be relaxed 3018 in the future. Note we only allow 3-level loop nest distribution 3019 when parallelizing loops. */ 3020 while ((loop->inner == NULL 3021 || (loop->inner->inner == NULL && flag_tree_parallelize_loops > 1)) 3022 && loop_outer (outer) 3023 && outer->inner == loop && loop->next == NULL 3024 && single_exit (outer) 3025 && optimize_loop_for_speed_p (outer) 3026 && !chrec_contains_symbols_defined_in_loop (niters, outer->num) 3027 && (niters = number_of_latch_executions (outer)) != NULL_TREE 3028 && niters != chrec_dont_know) 3029 { 3030 loop = outer; 3031 outer = loop_outer (loop); 3032 } 3033 3034 return loop; 3035 } 3036 3037 unsigned int 3038 pass_loop_distribution::execute (function *fun) 3039 { 3040 struct loop *loop; 3041 bool changed = false; 3042 basic_block bb; 3043 control_dependences *cd = NULL; 3044 auto_vec<loop_p> loops_to_be_destroyed; 3045 3046 if (number_of_loops (fun) <= 1) 3047 return 0; 3048 3049 /* Compute topological order for basic blocks. Topological order is 3050 needed because data dependence is computed for data references in 3051 lexicographical order. */ 3052 if (bb_top_order_index == NULL) 3053 { 3054 int rpo_num; 3055 int *rpo = XNEWVEC (int, last_basic_block_for_fn (cfun)); 3056 3057 bb_top_order_index = XNEWVEC (int, last_basic_block_for_fn (cfun)); 3058 bb_top_order_index_size = last_basic_block_for_fn (cfun); 3059 rpo_num = pre_and_rev_post_order_compute_fn (cfun, NULL, rpo, true); 3060 for (int i = 0; i < rpo_num; i++) 3061 bb_top_order_index[rpo[i]] = i; 3062 3063 free (rpo); 3064 } 3065 3066 FOR_ALL_BB_FN (bb, fun) 3067 { 3068 gimple_stmt_iterator gsi; 3069 for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi)) 3070 gimple_set_uid (gsi_stmt (gsi), -1); 3071 for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi)) 3072 gimple_set_uid (gsi_stmt (gsi), -1); 3073 } 3074 3075 /* We can at the moment only distribute non-nested loops, thus restrict 3076 walking to innermost loops. */ 3077 FOR_EACH_LOOP (loop, LI_ONLY_INNERMOST) 3078 { 3079 /* Don't distribute multiple exit edges loop, or cold loop. */ 3080 if (!single_exit (loop) 3081 || !optimize_loop_for_speed_p (loop)) 3082 continue; 3083 3084 /* Don't distribute loop if niters is unknown. */ 3085 tree niters = number_of_latch_executions (loop); 3086 if (niters == NULL_TREE || niters == chrec_dont_know) 3087 continue; 3088 3089 /* Get the perfect loop nest for distribution. */ 3090 loop = prepare_perfect_loop_nest (loop); 3091 for (; loop; loop = loop->inner) 3092 { 3093 auto_vec<gimple *> work_list; 3094 if (!find_seed_stmts_for_distribution (loop, &work_list)) 3095 break; 3096 3097 const char *str = loop->inner ? " nest" : ""; 3098 location_t loc = find_loop_location (loop); 3099 if (!cd) 3100 { 3101 calculate_dominance_info (CDI_DOMINATORS); 3102 calculate_dominance_info (CDI_POST_DOMINATORS); 3103 cd = new control_dependences (); 3104 free_dominance_info (CDI_POST_DOMINATORS); 3105 } 3106 3107 bool destroy_p; 3108 int nb_generated_loops, nb_generated_calls; 3109 nb_generated_loops = distribute_loop (loop, work_list, cd, 3110 &nb_generated_calls, 3111 &destroy_p); 3112 if (destroy_p) 3113 loops_to_be_destroyed.safe_push (loop); 3114 3115 if (nb_generated_loops + nb_generated_calls > 0) 3116 { 3117 changed = true; 3118 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, 3119 loc, "Loop%s %d distributed: split to %d loops " 3120 "and %d library calls.\n", str, loop->num, 3121 nb_generated_loops, nb_generated_calls); 3122 3123 break; 3124 } 3125 3126 if (dump_file && (dump_flags & TDF_DETAILS)) 3127 fprintf (dump_file, "Loop%s %d not distributed.\n", str, loop->num); 3128 } 3129 } 3130 3131 if (cd) 3132 delete cd; 3133 3134 if (bb_top_order_index != NULL) 3135 { 3136 free (bb_top_order_index); 3137 bb_top_order_index = NULL; 3138 bb_top_order_index_size = 0; 3139 } 3140 3141 if (changed) 3142 { 3143 /* Destroy loop bodies that could not be reused. Do this late as we 3144 otherwise can end up refering to stale data in control dependences. */ 3145 unsigned i; 3146 FOR_EACH_VEC_ELT (loops_to_be_destroyed, i, loop) 3147 destroy_loop (loop); 3148 3149 /* Cached scalar evolutions now may refer to wrong or non-existing 3150 loops. */ 3151 scev_reset_htab (); 3152 mark_virtual_operands_for_renaming (fun); 3153 rewrite_into_loop_closed_ssa (NULL, TODO_update_ssa); 3154 } 3155 3156 checking_verify_loop_structure (); 3157 3158 return changed ? TODO_cleanup_cfg : 0; 3159 } 3160 3161 } // anon namespace 3162 3163 gimple_opt_pass * 3164 make_pass_loop_distribution (gcc::context *ctxt) 3165 { 3166 return new pass_loop_distribution (ctxt); 3167 } 3168