1 /* Loop autoparallelization. 2 Copyright (C) 2006, 2007, 2008, 2009, 2010, 2011, 2012 3 Free Software Foundation, Inc. 4 Contributed by Sebastian Pop <pop@cri.ensmp.fr> and 5 Zdenek Dvorak <dvorakz@suse.cz>. 6 7 This file is part of GCC. 8 9 GCC is free software; you can redistribute it and/or modify it under 10 the terms of the GNU General Public License as published by the Free 11 Software Foundation; either version 3, or (at your option) any later 12 version. 13 14 GCC is distributed in the hope that it will be useful, but WITHOUT ANY 15 WARRANTY; without even the implied warranty of MERCHANTABILITY or 16 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 17 for more details. 18 19 You should have received a copy of the GNU General Public License 20 along with GCC; see the file COPYING3. If not see 21 <http://www.gnu.org/licenses/>. */ 22 23 #include "config.h" 24 #include "system.h" 25 #include "coretypes.h" 26 #include "tree-flow.h" 27 #include "cfgloop.h" 28 #include "tree-data-ref.h" 29 #include "tree-scalar-evolution.h" 30 #include "gimple-pretty-print.h" 31 #include "tree-pass.h" 32 #include "langhooks.h" 33 #include "tree-vectorizer.h" 34 35 /* This pass tries to distribute iterations of loops into several threads. 36 The implementation is straightforward -- for each loop we test whether its 37 iterations are independent, and if it is the case (and some additional 38 conditions regarding profitability and correctness are satisfied), we 39 add GIMPLE_OMP_PARALLEL and GIMPLE_OMP_FOR codes and let omp expansion 40 machinery do its job. 
41 42 The most of the complexity is in bringing the code into shape expected 43 by the omp expanders: 44 -- for GIMPLE_OMP_FOR, ensuring that the loop has only one induction 45 variable and that the exit test is at the start of the loop body 46 -- for GIMPLE_OMP_PARALLEL, replacing the references to local addressable 47 variables by accesses through pointers, and breaking up ssa chains 48 by storing the values incoming to the parallelized loop to a structure 49 passed to the new function as an argument (something similar is done 50 in omp gimplification, unfortunately only a small part of the code 51 can be shared). 52 53 TODO: 54 -- if there are several parallelizable loops in a function, it may be 55 possible to generate the threads just once (using synchronization to 56 ensure that cross-loop dependences are obeyed). 57 -- handling of common scalar dependence patterns (accumulation, ...) 58 -- handling of non-innermost loops */ 59 60 /* 61 Reduction handling: 62 currently we use vect_force_simple_reduction() to detect reduction patterns. 63 The code transformation will be introduced by an example. 64 65 66 parloop 67 { 68 int sum=1; 69 70 for (i = 0; i < N; i++) 71 { 72 x[i] = i + 3; 73 sum+=x[i]; 74 } 75 } 76 77 gimple-like code: 78 header_bb: 79 80 # sum_29 = PHI <sum_11(5), 1(3)> 81 # i_28 = PHI <i_12(5), 0(3)> 82 D.1795_8 = i_28 + 3; 83 x[i_28] = D.1795_8; 84 sum_11 = D.1795_8 + sum_29; 85 i_12 = i_28 + 1; 86 if (N_6(D) > i_12) 87 goto header_bb; 88 89 90 exit_bb: 91 92 # sum_21 = PHI <sum_11(4)> 93 printf (&"%d"[0], sum_21); 94 95 96 after reduction transformation (only relevant parts): 97 98 parloop 99 { 100 101 .... 102 103 104 # Storing the initial value given by the user. # 105 106 .paral_data_store.32.sum.27 = 1; 107 108 #pragma omp parallel num_threads(4) 109 110 #pragma omp for schedule(static) 111 112 # The neutral element corresponding to the particular 113 reduction's operation, e.g. 0 for PLUS_EXPR, 114 1 for MULT_EXPR, etc. 
     replaces the user's initial value.  #

 # sum.27_29 = PHI <sum.27_11, 0>

 sum.27_11 = D.1827_8 + sum.27_29;

 GIMPLE_OMP_CONTINUE

 # Adding this reduction phi is done at create_phi_for_local_result() #
 # sum.27_56 = PHI <sum.27_11, 0>
 GIMPLE_OMP_RETURN

 # Creating the atomic operation is done at
   create_call_for_reduction_1() #

 #pragma omp atomic_load
 D.1839_59 = *&.paral_data_load.33_51->reduction.23;
 D.1840_60 = sum.27_56 + D.1839_59;
 #pragma omp atomic_store (D.1840_60);

 GIMPLE_OMP_RETURN

 # collecting the result after the join of the threads is done at
   create_loads_for_reductions().
   The value computed by the threads is loaded from the
   shared struct.  #

 .paral_data_load.33_52 = &.paral_data_store.32;
 sum_37 = .paral_data_load.33_52->sum.27;
 sum_43 = D.1795_41 + sum_37;

 exit bb:
 # sum_21 = PHI <sum_43, sum_26>
 printf (&"%d"[0], sum_21);

 ...

}

*/

/* Minimal number of iterations of a loop that should be executed in each
   thread.  */
#define MIN_PER_THREAD 100

/* Element of the hashtable, representing a
   reduction in the current loop.  */
struct reduction_info
{
  gimple reduc_stmt;		/* Reduction statement.  */
  gimple reduc_phi;		/* The phi node defining the reduction.  */
  enum tree_code reduction_code;/* Code for the reduction operation.  */
  unsigned reduc_version;	/* SSA_NAME_VERSION of original reduc_phi
				   result.  */
  gimple keep_res;		/* The PHI_RESULT of this phi is the resulting value
				   of the reduction variable when exiting the loop.  */
  tree initial_value;		/* The initial value of the reduction var before entering the loop.  */
  tree field;			/* The name of the field in the parloop data structure intended for reduction.  */
  tree init;			/* Reduction initialization value (the neutral
				   element of the operation).  */
  gimple new_phi;		/* (helper field) Newly created phi node whose result
				   will be passed to the atomic operation.  Represents
				   the local result each thread computed for the reduction
				   operation.  */
};

/* Equality function for hashtab code: two entries describe the same
   reduction iff they refer to the same reduction phi node.  */

static int
reduction_info_eq (const void *aa, const void *bb)
{
  const struct reduction_info *a = (const struct reduction_info *) aa;
  const struct reduction_info *b = (const struct reduction_info *) bb;

  return (a->reduc_phi == b->reduc_phi);
}

/* Hash function for hashtab code: hashes on the SSA name version of the
   original reduction phi result, kept in reduc_version.  */

static hashval_t
reduction_info_hash (const void *aa)
{
  const struct reduction_info *a = (const struct reduction_info *) aa;

  return a->reduc_version;
}

/* Return the reduction_info recorded for phi node PHI in REDUCTION_LIST,
   or NULL when PHI is not a recorded reduction (or the table is empty).
   NOTE(review): the lookup hashes via gimple_uid (PHI), so the uid of the
   phi is assumed to hold the original SSA name version — confirm it is
   set up that way by the caller.  */

static struct reduction_info *
reduction_phi (htab_t reduction_list, gimple phi)
{
  struct reduction_info tmpred, *red;

  if (htab_elements (reduction_list) == 0 || phi == NULL)
    return NULL;

  tmpred.reduc_phi = phi;
  tmpred.reduc_version = gimple_uid (phi);
  red = (struct reduction_info *) htab_find (reduction_list, &tmpred);

  return red;
}

/* Element of hashtable of names to copy.  */

struct name_to_copy_elt
{
  unsigned version;	/* The version of the name to copy.  */
  tree new_name;	/* The new name used in the copy.  */
  tree field;		/* The field of the structure used to pass the
			   value.  */
};

/* Equality and hash functions for hashtab code.
*/ 225 226 static int 227 name_to_copy_elt_eq (const void *aa, const void *bb) 228 { 229 const struct name_to_copy_elt *a = (const struct name_to_copy_elt *) aa; 230 const struct name_to_copy_elt *b = (const struct name_to_copy_elt *) bb; 231 232 return a->version == b->version; 233 } 234 235 static hashval_t 236 name_to_copy_elt_hash (const void *aa) 237 { 238 const struct name_to_copy_elt *a = (const struct name_to_copy_elt *) aa; 239 240 return (hashval_t) a->version; 241 } 242 243 /* A transformation matrix, which is a self-contained ROWSIZE x COLSIZE 244 matrix. Rather than use floats, we simply keep a single DENOMINATOR that 245 represents the denominator for every element in the matrix. */ 246 typedef struct lambda_trans_matrix_s 247 { 248 lambda_matrix matrix; 249 int rowsize; 250 int colsize; 251 int denominator; 252 } *lambda_trans_matrix; 253 #define LTM_MATRIX(T) ((T)->matrix) 254 #define LTM_ROWSIZE(T) ((T)->rowsize) 255 #define LTM_COLSIZE(T) ((T)->colsize) 256 #define LTM_DENOMINATOR(T) ((T)->denominator) 257 258 /* Allocate a new transformation matrix. */ 259 260 static lambda_trans_matrix 261 lambda_trans_matrix_new (int colsize, int rowsize, 262 struct obstack * lambda_obstack) 263 { 264 lambda_trans_matrix ret; 265 266 ret = (lambda_trans_matrix) 267 obstack_alloc (lambda_obstack, sizeof (struct lambda_trans_matrix_s)); 268 LTM_MATRIX (ret) = lambda_matrix_new (rowsize, colsize, lambda_obstack); 269 LTM_ROWSIZE (ret) = rowsize; 270 LTM_COLSIZE (ret) = colsize; 271 LTM_DENOMINATOR (ret) = 1; 272 return ret; 273 } 274 275 /* Multiply a vector VEC by a matrix MAT. 276 MAT is an M*N matrix, and VEC is a vector with length N. The result 277 is stored in DEST which must be a vector of length M. 
*/

static void
lambda_matrix_vector_mult (lambda_matrix matrix, int m, int n,
			   lambda_vector vec, lambda_vector dest)
{
  int i, j;

  lambda_vector_clear (dest, m);
  for (i = 0; i < m; i++)
    for (j = 0; j < n; j++)
      dest[i] += matrix[i][j] * vec[j];
}

/* Return true if TRANS is a legal transformation matrix that respects
   the dependence vectors in DISTS and DIRS.  The conservative answer
   is false.

   "Wolfe proves that a unimodular transformation represented by the
   matrix T is legal when applied to a loop nest with a set of
   lexicographically non-negative distance vectors RDG if and only if
   for each vector d in RDG, (T.d >= 0) is lexicographically positive.
   i.e.: if and only if it transforms the lexicographically positive
   distance vectors to lexicographically positive vectors.  Note that
   a unimodular matrix must transform the zero vector (and only it) to
   the zero vector." S.Muchnick.  */

static bool
lambda_transform_legal_p (lambda_trans_matrix trans,
			  int nb_loops,
			  VEC (ddr_p, heap) *dependence_relations)
{
  unsigned int i, j;
  lambda_vector distres;
  struct data_dependence_relation *ddr;

  gcc_assert (LTM_COLSIZE (trans) == nb_loops
	      && LTM_ROWSIZE (trans) == nb_loops);

  /* When there are no dependences, the transformation is correct.  */
  if (VEC_length (ddr_p, dependence_relations) == 0)
    return true;

  ddr = VEC_index (ddr_p, dependence_relations, 0);
  if (ddr == NULL)
    return true;

  /* When there is an unknown relation in the dependence_relations, we
     know that it is not worth looking at this loop nest: give up.  */
  if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
    return false;

  distres = lambda_vector_new (nb_loops);

  /* For each distance vector in the dependence graph.  */
  FOR_EACH_VEC_ELT (ddr_p, dependence_relations, i, ddr)
    {
      /* Don't care about relations for which we know that there is no
	 dependence, nor about read-read (input dependences):
	 these data accesses can happen in any order.  */
      if (DDR_ARE_DEPENDENT (ddr) == chrec_known
	  || (DR_IS_READ (DDR_A (ddr)) && DR_IS_READ (DDR_B (ddr))))
	continue;

      /* Conservatively answer: "this transformation is not valid".  */
      if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
	return false;

      /* If the dependence could not be captured by a distance vector,
	 conservatively answer that the transform is not valid.  */
      if (DDR_NUM_DIST_VECTS (ddr) == 0)
	return false;

      /* Compute trans.dist_vect and require the image to stay
	 lexicographically positive.  */
      for (j = 0; j < DDR_NUM_DIST_VECTS (ddr); j++)
	{
	  lambda_matrix_vector_mult (LTM_MATRIX (trans), nb_loops, nb_loops,
				     DDR_DIST_VECT (ddr, j), distres);

	  if (!lambda_vector_lexico_pos (distres, nb_loops))
	    return false;
	}
    }
  return true;
}

/* Data dependency analysis.  Returns true if the iterations of LOOP
   are independent on each other (that is, if we can execute them
   in parallel).  */

static bool
loop_parallel_p (struct loop *loop, struct obstack * parloop_obstack)
{
  VEC (loop_p, heap) *loop_nest;
  VEC (ddr_p, heap) *dependence_relations;
  VEC (data_reference_p, heap) *datarefs;
  lambda_trans_matrix trans;
  bool ret = false;

  if (dump_file && (dump_flags & TDF_DETAILS))
  {
    fprintf (dump_file, "Considering loop %d\n", loop->num);
    if (!loop->inner)
      fprintf (dump_file, "loop is innermost\n");
    else
      fprintf (dump_file, "loop NOT innermost\n");
   }

  /* Check for problems with dependences.  If the loop can be reversed,
     the iterations are independent.  */
  datarefs = VEC_alloc (data_reference_p, heap, 10);
  dependence_relations = VEC_alloc (ddr_p, heap, 10 * 10);
  loop_nest = VEC_alloc (loop_p, heap, 3);
  if (! compute_data_dependences_for_loop (loop, true, &loop_nest, &datarefs,
					   &dependence_relations))
    {
      if (dump_file && (dump_flags & TDF_DETAILS))
	fprintf (dump_file, "  FAILED: cannot analyze data dependencies\n");
      ret = false;
      goto end;
    }
  if (dump_file && (dump_flags & TDF_DETAILS))
    dump_data_dependence_relations (dump_file, dependence_relations);

  /* The 1x1 matrix [-1] represents loop reversal; if that transform is
     legal, no dependence crosses an iteration boundary.  */
  trans = lambda_trans_matrix_new (1, 1, parloop_obstack);
  LTM_MATRIX (trans)[0][0] = -1;

  if (lambda_transform_legal_p (trans, 1, dependence_relations))
    {
      ret = true;
      if (dump_file && (dump_flags & TDF_DETAILS))
	fprintf (dump_file, "  SUCCESS: may be parallelized\n");
    }
  else if (dump_file && (dump_flags & TDF_DETAILS))
    fprintf (dump_file,
	     "  FAILED: data dependencies exist across iterations\n");

 end:
  VEC_free (loop_p, heap, loop_nest);
  free_dependence_relations (dependence_relations);
  free_data_refs (datarefs);

  return ret;
}

/* Return true when LOOP contains basic blocks marked with the
   BB_IRREDUCIBLE_LOOP flag.  */

static inline bool
loop_has_blocks_with_irreducible_flag (struct loop *loop)
{
  unsigned i;
  basic_block *bbs = get_loop_body_in_dom_order (loop);
  bool res = true;

  for (i = 0; i < loop->num_nodes; i++)
    if (bbs[i]->flags & BB_IRREDUCIBLE_LOOP)
      goto end;

  res = false;
 end:
  free (bbs);
  return res;
}

/* Assigns the address of OBJ in TYPE to an ssa name, and returns this name.
   The assignment statement is placed on edge ENTRY.  DECL_ADDRESS maps decls
   to their addresses that can be reused.  The address of OBJ is known to
   be invariant in the whole function.  Other needed statements are placed
   right before GSI.
*/

static tree
take_address_of (tree obj, tree type, edge entry, htab_t decl_address,
		 gimple_stmt_iterator *gsi)
{
  int uid;
  void **dslot;
  struct int_tree_map ielt, *nielt;
  tree *var_p, name, bvar, addr;
  gimple stmt;
  gimple_seq stmts;

  /* Since the address of OBJ is invariant, the trees may be shared.
     Avoid rewriting unrelated parts of the code.  */
  obj = unshare_expr (obj);
  /* Walk down to the base object of the handled-component chain.  */
  for (var_p = &obj;
       handled_component_p (*var_p);
       var_p = &TREE_OPERAND (*var_p, 0))
    continue;

  /* Canonicalize the access to base on a MEM_REF.  */
  if (DECL_P (*var_p))
    *var_p = build_simple_mem_ref (build_fold_addr_expr (*var_p));

  /* Assign a canonical SSA name to the address of the base decl used
     in the address and share it for all accesses and addresses based
     on it.  */
  uid = DECL_UID (TREE_OPERAND (TREE_OPERAND (*var_p, 0), 0));
  ielt.uid = uid;
  dslot = htab_find_slot_with_hash (decl_address, &ielt, uid, INSERT);
  if (!*dslot)
    {
      /* With GSI == NULL (a debug-stmt caller) we may not emit code, so
	 report failure instead of creating the canonical name.  */
      if (gsi == NULL)
	return NULL;
      addr = TREE_OPERAND (*var_p, 0);
      bvar = create_tmp_var (TREE_TYPE (addr),
			     get_name (TREE_OPERAND
				         (TREE_OPERAND (*var_p, 0), 0)));
      add_referenced_var (bvar);
      stmt = gimple_build_assign (bvar, addr);
      name = make_ssa_name (bvar, stmt);
      gimple_assign_set_lhs (stmt, name);
      gsi_insert_on_edge_immediate (entry, stmt);

      nielt = XNEW (struct int_tree_map);
      nielt->uid = uid;
      nielt->to = name;
      *dslot = nielt;
    }
  else
    name = ((struct int_tree_map *) *dslot)->to;

  /* Express the address in terms of the canonical SSA name.  */
  TREE_OPERAND (*var_p, 0) = name;
  if (gsi == NULL)
    return build_fold_addr_expr_with_type (obj, type);

  name = force_gimple_operand (build_addr (obj, current_function_decl),
			       &stmts, true, NULL_TREE);
  if (!gimple_seq_empty_p (stmts))
    gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);

  /* Convert to TYPE if the address type differs.  */
  if (!useless_type_conversion_p (type, TREE_TYPE (name)))
    {
      name = force_gimple_operand (fold_convert (type, name), &stmts, true,
				   NULL_TREE);
      if (!gimple_seq_empty_p (stmts))
	gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
    }

  return name;
}

/* Callback for htab_traverse.  Create the initialization statement
   for reduction described in SLOT, and place it at the preheader of
   the loop described in DATA.  */

static int
initialize_reductions (void **slot, void *data)
{
  tree init, c;
  tree bvar, type, arg;
  edge e;

  struct reduction_info *const reduc = (struct reduction_info *) *slot;
  struct loop *loop = (struct loop *) data;

  /* Create initialization in preheader:
     reduction_variable = initialization value of reduction.  */

  /* In the phi node at the header, replace the argument coming
     from the preheader with the reduction initialization value.  */

  /* Create a new variable to initialize the reduction.  */
  type = TREE_TYPE (PHI_RESULT (reduc->reduc_phi));
  bvar = create_tmp_var (type, "reduction");
  add_referenced_var (bvar);

  /* Build a dummy OMP_CLAUSE_REDUCTION just so omp_reduction_init can
     compute the neutral element for this reduction code.  */
  c = build_omp_clause (gimple_location (reduc->reduc_stmt),
			OMP_CLAUSE_REDUCTION);
  OMP_CLAUSE_REDUCTION_CODE (c) = reduc->reduction_code;
  OMP_CLAUSE_DECL (c) = SSA_NAME_VAR (gimple_assign_lhs (reduc->reduc_stmt));

  init = omp_reduction_init (c, TREE_TYPE (bvar));
  reduc->init = init;

  /* Replace the argument representing the initialization value
     with the initialization value for the reduction (neutral
     element for the particular operation, e.g. 0 for PLUS_EXPR,
     1 for MULT_EXPR, etc).
     Keep the old value in reduc->initial_value; it is taken into
     consideration after the parallel computing is done.  */

  e = loop_preheader_edge (loop);
  arg = PHI_ARG_DEF_FROM_EDGE (reduc->reduc_phi, e);

  SET_USE (PHI_ARG_DEF_PTR_FROM_EDGE
	   (reduc->reduc_phi, loop_preheader_edge (loop)), init);
  reduc->initial_value = arg;
  return 1;
}

/* Walk-tree state for eliminate_local_variables_1.  */

struct elv_data
{
  struct walk_stmt_info info;	/* Must be first: walk_gimple_op passes
				   &info and we cast back.  */
  edge entry;			/* Entry edge of the SESE region.  */
  htab_t decl_address;		/* Map of decl uid -> canonical address.  */
  gimple_stmt_iterator *gsi;	/* Where to emit statements; NULL for
				   debug statements (no code emission).  */
  bool changed;			/* Set when the statement was modified.  */
  bool reset;			/* Set when a debug bind value must be
				   dropped.  */
};

/* Eliminates references to local variables in *TP out of the single
   entry single exit region starting at DTA->ENTRY.
   DECL_ADDRESS contains addresses of the references that had their
   address taken already.  If the expression is changed, CHANGED is
   set to true.  Callback for walk_tree.  */

static tree
eliminate_local_variables_1 (tree *tp, int *walk_subtrees, void *data)
{
  struct elv_data *const dta = (struct elv_data *) data;
  tree t = *tp, var, addr, addr_type, type, obj;

  if (DECL_P (t))
    {
      *walk_subtrees = 0;

      if (!SSA_VAR_P (t) || DECL_EXTERNAL (t))
	return NULL_TREE;

      /* Replace the decl with a dereference of its (region-invariant)
	 address.  */
      type = TREE_TYPE (t);
      addr_type = build_pointer_type (type);
      addr = take_address_of (t, addr_type, dta->entry, dta->decl_address,
			      dta->gsi);
      if (dta->gsi == NULL && addr == NULL_TREE)
	{
	  dta->reset = true;
	  return NULL_TREE;
	}

      *tp = build_simple_mem_ref (addr);

      dta->changed = true;
      return NULL_TREE;
    }

  if (TREE_CODE (t) == ADDR_EXPR)
    {
      /* ADDR_EXPR may appear in two contexts:
	 -- as a gimple operand, when the address taken is a function invariant
	 -- as gimple rhs, when the resulting address is not a function
	    invariant
	 We do not need to do anything special in the latter case (the base of
	 the memory reference whose address is taken may be replaced in the
	 DECL_P case).  The former case is more complicated, as we need to
	 ensure that the new address is still a gimple operand.  Thus, it
	 is not sufficient to replace just the base of the memory reference --
	 we need to move the whole computation of the address out of the
	 loop.  */
      if (!is_gimple_val (t))
	return NULL_TREE;

      *walk_subtrees = 0;
      obj = TREE_OPERAND (t, 0);
      var = get_base_address (obj);
      if (!var || !SSA_VAR_P (var) || DECL_EXTERNAL (var))
	return NULL_TREE;

      addr_type = TREE_TYPE (t);
      addr = take_address_of (obj, addr_type, dta->entry, dta->decl_address,
			      dta->gsi);
      if (dta->gsi == NULL && addr == NULL_TREE)
	{
	  dta->reset = true;
	  return NULL_TREE;
	}
      *tp = addr;

      dta->changed = true;
      return NULL_TREE;
    }

  if (!EXPR_P (t))
    *walk_subtrees = 0;

  return NULL_TREE;
}

/* Moves the references to local variables in STMT at *GSI out of the single
   entry single exit region starting at ENTRY.  DECL_ADDRESS contains
   addresses of the references that had their address taken
   already.
*/

static void
eliminate_local_variables_stmt (edge entry, gimple_stmt_iterator *gsi,
				htab_t decl_address)
{
  struct elv_data dta;
  gimple stmt = gsi_stmt (*gsi);

  memset (&dta.info, '\0', sizeof (dta.info));
  dta.entry = entry;
  dta.decl_address = decl_address;
  dta.changed = false;
  dta.reset = false;

  if (gimple_debug_bind_p (stmt))
    {
      /* For debug binds we must not emit code: signal that by a NULL
	 gsi; if the value cannot be rewritten, drop it instead.  */
      dta.gsi = NULL;
      walk_tree (gimple_debug_bind_get_value_ptr (stmt),
		 eliminate_local_variables_1, &dta.info, NULL);
      if (dta.reset)
	{
	  gimple_debug_bind_reset_value (stmt);
	  dta.changed = true;
	}
    }
  else
    {
      dta.gsi = gsi;
      walk_gimple_op (stmt, eliminate_local_variables_1, &dta.info);
    }

  if (dta.changed)
    update_stmt (stmt);
}

/* Eliminates the references to local variables from the single entry
   single exit region between the ENTRY and EXIT edges.

   This includes:
   1) Taking address of a local variable -- these are moved out of the
      region (and temporary variable is created to hold the address if
      necessary).

   2) Dereferencing a local variable -- these are replaced with indirect
      references.  */

static void
eliminate_local_variables (edge entry, edge exit)
{
  basic_block bb;
  VEC (basic_block, heap) *body = VEC_alloc (basic_block, heap, 3);
  unsigned i;
  gimple_stmt_iterator gsi;
  bool has_debug_stmt = false;
  htab_t decl_address = htab_create (10, int_tree_map_hash, int_tree_map_eq,
				     free);
  basic_block entry_bb = entry->src;
  basic_block exit_bb = exit->dest;

  gather_blocks_in_sese_region (entry_bb, exit_bb, &body);

  /* First pass: handle all non-debug statements; debug binds are only
     noted so they can be processed after the address map is complete.  */
  FOR_EACH_VEC_ELT (basic_block, body, i, bb)
    if (bb != entry_bb && bb != exit_bb)
      for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
	if (is_gimple_debug (gsi_stmt (gsi)))
	  {
	    if (gimple_debug_bind_p (gsi_stmt (gsi)))
	      has_debug_stmt = true;
	  }
	else
	  eliminate_local_variables_stmt (entry, &gsi, decl_address);

  /* Second pass: rewrite debug binds using the addresses collected
     above (no new code may be emitted for them).  */
  if (has_debug_stmt)
    FOR_EACH_VEC_ELT (basic_block, body, i, bb)
      if (bb != entry_bb && bb != exit_bb)
	for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
	  if (gimple_debug_bind_p (gsi_stmt (gsi)))
	    eliminate_local_variables_stmt (entry, &gsi, decl_address);

  htab_delete (decl_address);
  VEC_free (basic_block, heap, body);
}

/* Returns true if expression EXPR is not defined between ENTRY and
   EXIT, i.e. if all its operands are defined outside of the region.
*/

static bool
expr_invariant_in_region_p (edge entry, edge exit, tree expr)
{
  basic_block entry_bb = entry->src;
  basic_block exit_bb = exit->dest;
  basic_block def_bb;

  if (is_gimple_min_invariant (expr))
    return true;

  if (TREE_CODE (expr) == SSA_NAME)
    {
      def_bb = gimple_bb (SSA_NAME_DEF_STMT (expr));
      /* Defined inside the region (dominated by ENTRY but not by EXIT)
	 means not invariant.  */
      if (def_bb
	  && dominated_by_p (CDI_DOMINATORS, def_bb, entry_bb)
	  && !dominated_by_p (CDI_DOMINATORS, def_bb, exit_bb))
	return false;

      return true;
    }

  return false;
}

/* If COPY_NAME_P is true, creates and returns a duplicate of NAME.
   The copies are stored to NAME_COPIES, if NAME was already duplicated,
   its duplicate stored in NAME_COPIES is returned.

   Regardless of COPY_NAME_P, the decl used as a base of the ssa name is also
   duplicated, storing the copies in DECL_COPIES.  */

static tree
separate_decls_in_region_name (tree name,
			       htab_t name_copies, htab_t decl_copies,
			       bool copy_name_p)
{
  tree copy, var, var_copy;
  unsigned idx, uid, nuid;
  struct int_tree_map ielt, *nielt;
  struct name_to_copy_elt elt, *nelt;
  void **slot, **dslot;

  if (TREE_CODE (name) != SSA_NAME)
    return name;

  idx = SSA_NAME_VERSION (name);
  elt.version = idx;
  /* Only insert a slot when we intend to create a copy.  */
  slot = htab_find_slot_with_hash (name_copies, &elt, idx,
				   copy_name_p ? INSERT : NO_INSERT);
  if (slot && *slot)
    return ((struct name_to_copy_elt *) *slot)->new_name;

  var = SSA_NAME_VAR (name);
  uid = DECL_UID (var);
  ielt.uid = uid;
  dslot = htab_find_slot_with_hash (decl_copies, &ielt, uid, INSERT);
  if (!*dslot)
    {
      var_copy = create_tmp_var (TREE_TYPE (var), get_name (var));
      DECL_GIMPLE_REG_P (var_copy) = DECL_GIMPLE_REG_P (var);
      add_referenced_var (var_copy);
      nielt = XNEW (struct int_tree_map);
      nielt->uid = uid;
      nielt->to = var_copy;
      *dslot = nielt;

      /* Ensure that when we meet this decl next time, we won't duplicate
	 it again.  */
      nuid = DECL_UID (var_copy);
      ielt.uid = nuid;
      dslot = htab_find_slot_with_hash (decl_copies, &ielt, nuid, INSERT);
      gcc_assert (!*dslot);
      nielt = XNEW (struct int_tree_map);
      nielt->uid = nuid;
      nielt->to = var_copy;
      *dslot = nielt;
    }
  else
    var_copy = ((struct int_tree_map *) *dslot)->to;

  if (copy_name_p)
    {
      copy = duplicate_ssa_name (name, NULL);
      nelt = XNEW (struct name_to_copy_elt);
      nelt->version = idx;
      nelt->new_name = copy;
      nelt->field = NULL_TREE;
      *slot = nelt;
    }
  else
    {
      /* NO_INSERT lookup above must have found nothing here.  */
      gcc_assert (!slot);
      copy = name;
    }

  /* In either case rebase the (copied or original) name on the
     duplicated decl.  */
  SSA_NAME_VAR (copy) = var_copy;
  return copy;
}

/* Finds the ssa names used in STMT that are defined outside the
   region between ENTRY and EXIT and replaces such ssa names with
   their duplicates.  The duplicates are stored to NAME_COPIES.  Base
   decls of all ssa names used in STMT (including those defined in
   LOOP) are replaced with the new temporary variables; the
   replacement decls are stored in DECL_COPIES.
*/

static void
separate_decls_in_region_stmt (edge entry, edge exit, gimple stmt,
			       htab_t name_copies, htab_t decl_copies)
{
  use_operand_p use;
  def_operand_p def;
  ssa_op_iter oi;
  tree name, copy;
  bool copy_name_p;

  mark_virtual_ops_for_renaming (stmt);

  /* Defs are never copied (they live inside the region), but their base
     decls still get rebased onto the duplicated variables.  */
  FOR_EACH_PHI_OR_STMT_DEF (def, stmt, oi, SSA_OP_DEF)
  {
    name = DEF_FROM_PTR (def);
    gcc_assert (TREE_CODE (name) == SSA_NAME);
    copy = separate_decls_in_region_name (name, name_copies, decl_copies,
					  false);
    gcc_assert (copy == name);
  }

  /* Uses defined outside the region are replaced by their copies.  */
  FOR_EACH_PHI_OR_STMT_USE (use, stmt, oi, SSA_OP_USE)
  {
    name = USE_FROM_PTR (use);
    if (TREE_CODE (name) != SSA_NAME)
      continue;

    copy_name_p = expr_invariant_in_region_p (entry, exit, name);
    copy = separate_decls_in_region_name (name, name_copies, decl_copies,
					  copy_name_p);
    SET_USE (use, copy);
  }
}

/* Rewrites debug statement STMT to refer to the variable and ssa-name
   copies recorded in DECL_COPIES and NAME_COPIES.  Returns true when
   STMT is left alone: it is not a (source) debug bind we handle, or its
   variable has no recorded decl copy.  Returns false after STMT has
   been rewritten; if some used ssa name has no recorded copy, the bind
   value is reset instead.  */

static bool
separate_decls_in_region_debug (gimple stmt, htab_t name_copies,
				htab_t decl_copies)
{
  use_operand_p use;
  ssa_op_iter oi;
  tree var, name;
  struct int_tree_map ielt;
  struct name_to_copy_elt elt;
  void **slot, **dslot;

  if (gimple_debug_bind_p (stmt))
    var = gimple_debug_bind_get_var (stmt);
  else if (gimple_debug_source_bind_p (stmt))
    var = gimple_debug_source_bind_get_var (stmt);
  else
    return true;
  if (TREE_CODE (var) == DEBUG_EXPR_DECL || TREE_CODE (var) == LABEL_DECL)
    return true;
  gcc_assert (DECL_P (var) && SSA_VAR_P (var));
  ielt.uid = DECL_UID (var);
  dslot = htab_find_slot_with_hash (decl_copies, &ielt, ielt.uid, NO_INSERT);
  if (!dslot)
    return true;
  if (gimple_debug_bind_p (stmt))
    gimple_debug_bind_set_var (stmt, ((struct int_tree_map *) *dslot)->to);
  else if (gimple_debug_source_bind_p (stmt))
    gimple_debug_source_bind_set_var (stmt, ((struct int_tree_map *) *dslot)->to);

  FOR_EACH_PHI_OR_STMT_USE (use, stmt, oi, SSA_OP_USE)
  {
    name = USE_FROM_PTR (use);
    if (TREE_CODE (name) != SSA_NAME)
      continue;

    elt.version = SSA_NAME_VERSION (name);
    slot = htab_find_slot_with_hash (name_copies, &elt, elt.version, NO_INSERT);
    if (!slot)
      {
	/* No copy for this name: the debug value cannot be expressed in
	   the new function, so drop it.  */
	gimple_debug_bind_reset_value (stmt);
	update_stmt (stmt);
	break;
      }

    SET_USE (use, ((struct name_to_copy_elt *) *slot)->new_name);
  }

  return false;
}

/* Callback for htab_traverse.  Adds a field corresponding to the reduction
   specified in SLOT.  The type is passed in DATA.
*/

static int
add_field_for_reduction (void **slot, void *data)
{
  struct reduction_info *const red = (struct reduction_info *) *slot;
  tree const type = (tree) data;
  tree var = SSA_NAME_VAR (gimple_assign_lhs (red->reduc_stmt));
  tree field = build_decl (gimple_location (red->reduc_stmt),
			   FIELD_DECL, DECL_NAME (var), TREE_TYPE (var));

  insert_field_into_struct (type, field);

  red->field = field;

  return 1;
}

/* Callback for htab_traverse.  Adds a field corresponding to a ssa name
   described in SLOT.  The type is passed in DATA.  */

static int
add_field_for_name (void **slot, void *data)
{
  struct name_to_copy_elt *const elt = (struct name_to_copy_elt *) *slot;
  tree type = (tree) data;
  tree name = ssa_name (elt->version);
  tree var = SSA_NAME_VAR (name);
  tree field = build_decl (DECL_SOURCE_LOCATION (var),
			   FIELD_DECL, DECL_NAME (var), TREE_TYPE (var));

  insert_field_into_struct (type, field);
  elt->field = field;

  return 1;
}

/* Callback for htab_traverse.  A local result is the intermediate result
   computed by a single
   thread, or the initial value in case no iteration was executed.
   This function creates a phi node reflecting these values.
   The phi's result will be stored in NEW_PHI field of the
   reduction's data structure.  */

static int
create_phi_for_local_result (void **slot, void *data)
{
  struct reduction_info *const reduc = (struct reduction_info *) *slot;
  const struct loop *const loop = (const struct loop *) data;
  edge e;
  gimple new_phi;
  basic_block store_bb;
  tree local_res;
  source_location locus;

  /* STORE_BB is the block where the phi
     should be stored.  It is the destination of the loop exit.
     (Find the fallthru edge from GIMPLE_OMP_CONTINUE).  */
  store_bb = FALLTHRU_EDGE (loop->latch)->dest;

  /* STORE_BB has two predecessors.  One coming from the loop
     (the reduction's result is computed at the loop),
     and another coming from a block preceding the loop,
     when no iterations
     are executed (the initial value should be taken).  */
  if (EDGE_PRED (store_bb, 0) == FALLTHRU_EDGE (loop->latch))
    e = EDGE_PRED (store_bb, 1);
  else
    e = EDGE_PRED (store_bb, 0);
  local_res
    = make_ssa_name (SSA_NAME_VAR (gimple_assign_lhs (reduc->reduc_stmt)),
		     NULL);
  locus = gimple_location (reduc->reduc_stmt);
  new_phi = create_phi_node (local_res, store_bb);
  SSA_NAME_DEF_STMT (local_res) = new_phi;
  add_phi_arg (new_phi, reduc->init, e, locus);
  add_phi_arg (new_phi, gimple_assign_lhs (reduc->reduc_stmt),
	       FALLTHRU_EDGE (loop->latch), locus);
  reduc->new_phi = new_phi;

  return 1;
}

/* Descriptor of the shared data structure used to pass values between
   the original function and the parallelized body: STORE/LOAD are the
   decls accessed in STORE_BB/LOAD_BB respectively.  */

struct clsn_data
{
  tree store;
  tree load;

  basic_block store_bb;
  basic_block load_bb;
};

/* Callback for htab_traverse.  Create an atomic instruction for the
   reduction described in SLOT.
   DATA annotates the place in memory the atomic operation relates to,
   and the basic block it needs to be generated in.  */

static int
create_call_for_reduction_1 (void **slot, void *data)
{
  struct reduction_info *const reduc = (struct reduction_info *) *slot;
  struct clsn_data *const clsn_data = (struct clsn_data *) data;
  gimple_stmt_iterator gsi;
  tree type = TREE_TYPE (PHI_RESULT (reduc->reduc_phi));
  tree load_struct;
  basic_block bb;
  basic_block new_bb;
  edge e;
  tree t, addr, ref, x;
  tree tmp_load, name;
  gimple load;

  load_struct = build_simple_mem_ref (clsn_data->load);
  t = build3 (COMPONENT_REF, type, load_struct, reduc->field, NULL_TREE);

  addr = build_addr (t, current_function_decl);

  /* Create phi node.  */
  bb = clsn_data->load_bb;

  /* NOTE(review): T is a tree, not a gimple stmt; split_block takes a
     void * second argument -- confirm this splits BB at the intended
     point.  */
  e = split_block (bb, t);
  new_bb = e->dest;

  tmp_load = create_tmp_var (TREE_TYPE (TREE_TYPE (addr)), NULL);
  add_referenced_var (tmp_load);
  tmp_load = make_ssa_name (tmp_load, NULL);
  load = gimple_build_omp_atomic_load (tmp_load, addr);
  SSA_NAME_DEF_STMT (tmp_load) = load;
  gsi = gsi_start_bb (new_bb);
  gsi_insert_after (&gsi, load, GSI_NEW_STMT);

  e = split_block (new_bb, load);
  new_bb = e->dest;
  gsi = gsi_start_bb (new_bb);
  ref = tmp_load;
  /* Combine the loaded shared value with this thread's local result.  */
  x = fold_build2 (reduc->reduction_code,
		   TREE_TYPE (PHI_RESULT (reduc->new_phi)), ref,
		   PHI_RESULT (reduc->new_phi));

  name = force_gimple_operand_gsi (&gsi, x, true, NULL_TREE, true,
				   GSI_CONTINUE_LINKING);

  gsi_insert_after (&gsi, gimple_build_omp_atomic_store (name), GSI_NEW_STMT);
  return 1;
}

/* Create the atomic operation at the join point of the threads.
   REDUCTION_LIST describes the reductions in the LOOP.
   LD_ST_DATA describes the shared data structure where
   shared data is stored in and loaded from.  */
static void
create_call_for_reduction (struct loop *loop, htab_t reduction_list,
			   struct clsn_data *ld_st_data)
{
  htab_traverse (reduction_list, create_phi_for_local_result, loop);
  /* Find the fallthru edge from GIMPLE_OMP_CONTINUE.  */
  ld_st_data->load_bb = FALLTHRU_EDGE (loop->latch)->dest;
  htab_traverse (reduction_list, create_call_for_reduction_1, ld_st_data);
}

/* Callback for htab_traverse.  Loads the final reduction value at the
   join point of all threads, and inserts it in the right place.
*/

static int
create_loads_for_reductions (void **slot, void *data)
{
  struct reduction_info *const red = (struct reduction_info *) *slot;
  struct clsn_data *const clsn_data = (struct clsn_data *) data;
  gimple stmt;
  gimple_stmt_iterator gsi;
  tree type = TREE_TYPE (gimple_assign_lhs (red->reduc_stmt));
  tree load_struct;
  tree name;
  tree x;

  gsi = gsi_after_labels (clsn_data->load_bb);
  load_struct = build_simple_mem_ref (clsn_data->load);
  load_struct = build3 (COMPONENT_REF, type, load_struct, red->field,
			NULL_TREE);

  /* Assign the loaded final value directly to the ssa name that the
     KEEP_RES exit phi used to define.  */
  x = load_struct;
  name = PHI_RESULT (red->keep_res);
  stmt = gimple_build_assign (name, x);
  SSA_NAME_DEF_STMT (name) = stmt;

  gsi_insert_after (&gsi, stmt, GSI_NEW_STMT);

  /* The phi is now superseded by the load above; remove it (without
     releasing its result, which the new assignment reuses).  */
  for (gsi = gsi_start_phis (gimple_bb (red->keep_res));
       !gsi_end_p (gsi); gsi_next (&gsi))
    if (gsi_stmt (gsi) == red->keep_res)
      {
	remove_phi_node (&gsi, false);
	return 1;
      }
  gcc_unreachable ();
}

/* Load the reduction result that was stored in LD_ST_DATA.
   REDUCTION_LIST describes the list of reductions that the
   loads should be generated for.  */
static void
create_final_loads_for_reduction (htab_t reduction_list,
				  struct clsn_data *ld_st_data)
{
  gimple_stmt_iterator gsi;
  tree t;
  gimple stmt;

  /* Re-materialize the pointer to the shared structure in LOAD_BB (after
     the join of the threads), then load each reduction result from it.  */
  gsi = gsi_after_labels (ld_st_data->load_bb);
  t = build_fold_addr_expr (ld_st_data->store);
  stmt = gimple_build_assign (ld_st_data->load, t);

  gsi_insert_before (&gsi, stmt, GSI_NEW_STMT);
  SSA_NAME_DEF_STMT (ld_st_data->load) = stmt;

  htab_traverse (reduction_list, create_loads_for_reductions, ld_st_data);

}

/* Callback for htab_traverse.  Store the neutral value for the
   particular reduction's operation, e.g. 0 for PLUS_EXPR,
   1 for MULT_EXPR, etc. into the reduction field.
   The reduction is specified in SLOT.  The store information is
   passed in DATA.  */

static int
create_stores_for_reduction (void **slot, void *data)
{
  struct reduction_info *const red = (struct reduction_info *) *slot;
  struct clsn_data *const clsn_data = (struct clsn_data *) data;
  tree t;
  gimple stmt;
  gimple_stmt_iterator gsi;
  tree type = TREE_TYPE (gimple_assign_lhs (red->reduc_stmt));

  /* Append the store at the end of STORE_BB, before the loop runs.  */
  gsi = gsi_last_bb (clsn_data->store_bb);
  t = build3 (COMPONENT_REF, type, clsn_data->store, red->field, NULL_TREE);
  stmt = gimple_build_assign (t, red->initial_value);
  mark_virtual_ops_for_renaming (stmt);
  gsi_insert_after (&gsi, stmt, GSI_NEW_STMT);

  return 1;
}

/* Callback for htab_traverse.  Creates loads to a field of LOAD in LOAD_BB and
   store to a field of STORE in STORE_BB for the ssa name and its duplicate
   specified in SLOT.  */

static int
create_loads_and_stores_for_name (void **slot, void *data)
{
  struct name_to_copy_elt *const elt = (struct name_to_copy_elt *) *slot;
  struct clsn_data *const clsn_data = (struct clsn_data *) data;
  tree t;
  gimple stmt;
  gimple_stmt_iterator gsi;
  tree type = TREE_TYPE (elt->new_name);
  tree load_struct;

  /* Store the original ssa name into its field before the loop ...  */
  gsi = gsi_last_bb (clsn_data->store_bb);
  t = build3 (COMPONENT_REF, type, clsn_data->store, elt->field, NULL_TREE);
  stmt = gimple_build_assign (t, ssa_name (elt->version));
  mark_virtual_ops_for_renaming (stmt);
  gsi_insert_after (&gsi, stmt, GSI_NEW_STMT);

  /* ... and load it back into the duplicate name inside the region.  */
  gsi = gsi_last_bb (clsn_data->load_bb);
  load_struct = build_simple_mem_ref (clsn_data->load);
  t = build3 (COMPONENT_REF, type, load_struct, elt->field, NULL_TREE);
  stmt = gimple_build_assign (elt->new_name, t);
  SSA_NAME_DEF_STMT (elt->new_name) = stmt;
  gsi_insert_after (&gsi, stmt, GSI_NEW_STMT);

  return 1;
}

/* Moves all the variables used in LOOP and defined outside of it
(including
   the initial values of loop phi nodes, and *PER_THREAD if it is a ssa
   name) to a structure created for this purpose.  The code

   while (1)
     {
       use (a);
       use (b);
     }

   is transformed this way:

   bb0:
   old.a = a;
   old.b = b;

   bb1:
   a' = new->a;
   b' = new->b;
   while (1)
     {
       use (a');
       use (b');
     }

   `old' is stored to *ARG_STRUCT and `new' is stored to NEW_ARG_STRUCT.  The
   pointer `new' is intentionally not initialized (the loop will be split to a
   separate function later, and `new' will be initialized from its arguments).
   LD_ST_DATA holds information about the shared data structure used to pass
   information among the threads.  It is initialized here, and
   gen_parallel_loop will pass it to create_call_for_reduction that
   needs this information.  REDUCTION_LIST describes the reductions
   in LOOP.  */

static void
separate_decls_in_region (edge entry, edge exit, htab_t reduction_list,
			  tree *arg_struct, tree *new_arg_struct,
			  struct clsn_data *ld_st_data)

{
  /* BB0 receives the stores (outside the region), BB1 the loads.  */
  basic_block bb1 = split_edge (entry);
  basic_block bb0 = single_pred (bb1);
  htab_t name_copies = htab_create (10, name_to_copy_elt_hash,
				    name_to_copy_elt_eq, free);
  htab_t decl_copies = htab_create (10, int_tree_map_hash, int_tree_map_eq,
				    free);
  unsigned i;
  tree type, type_name, nvar;
  gimple_stmt_iterator gsi;
  struct clsn_data clsn_data;
  VEC (basic_block, heap) *body = VEC_alloc (basic_block, heap, 3);
  basic_block bb;
  basic_block entry_bb = bb1;
  basic_block exit_bb = exit->dest;
  bool has_debug_stmt = false;

  /* ENTRY is re-pointed past the block just split off of it.  */
  entry = single_succ_edge (entry_bb);
  gather_blocks_in_sese_region (entry_bb, exit_bb, &body);

  /* First pass: collect/rename the non-debug statements and phis.  */
  FOR_EACH_VEC_ELT (basic_block, body, i, bb)
    {
      if (bb != entry_bb && bb != exit_bb)
	{
	  for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
	    separate_decls_in_region_stmt (entry, exit, gsi_stmt (gsi),
					   name_copies, decl_copies);

	  for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
	    {
	      gimple stmt = gsi_stmt (gsi);

	      if (is_gimple_debug (stmt))
		has_debug_stmt = true;
	      else
		separate_decls_in_region_stmt (entry, exit, stmt,
					       name_copies, decl_copies);
	    }
	}
    }

  /* Now process debug bind stmts.  We must not create decls while
     processing debug stmts, so we defer their processing so as to
     make sure we will have debug info for as many variables as
     possible (all of those that were dealt with in the loop above),
     and discard those for which we know there's nothing we can
     do.  */
  if (has_debug_stmt)
    FOR_EACH_VEC_ELT (basic_block, body, i, bb)
      if (bb != entry_bb && bb != exit_bb)
	{
	  for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi);)
	    {
	      gimple stmt = gsi_stmt (gsi);

	      if (is_gimple_debug (stmt))
		{
		  if (separate_decls_in_region_debug (stmt, name_copies,
						      decl_copies))
		    {
		      /* No usable replacement -- drop the debug bind.  */
		      gsi_remove (&gsi, true);
		      continue;
		    }
		}

	      gsi_next (&gsi);
	    }
	}

  VEC_free (basic_block, heap, body);

  if (htab_elements (name_copies) == 0 && htab_elements (reduction_list) == 0)
    {
      /* It may happen that there is nothing to copy (if there are only
	 loop carried and external variables in the loop).  */
      *arg_struct = NULL;
      *new_arg_struct = NULL;
    }
  else
    {
      /* Create the type for the structure to store the ssa names to.  */
      type = lang_hooks.types.make_type (RECORD_TYPE);
      type_name = build_decl (UNKNOWN_LOCATION,
			      TYPE_DECL, create_tmp_var_name (".paral_data"),
			      type);
      TYPE_NAME (type) = type_name;

      htab_traverse (name_copies, add_field_for_name, type);
      if (reduction_list && htab_elements (reduction_list) > 0)
	{
	  /* Create the fields for reductions.  */
	  htab_traverse (reduction_list, add_field_for_reduction,
			 type);
	}
      layout_type (type);

      /* Create the loads and stores.  */
      *arg_struct = create_tmp_var (type, ".paral_data_store");
      add_referenced_var (*arg_struct);
      nvar = create_tmp_var (build_pointer_type (type), ".paral_data_load");
      add_referenced_var (nvar);
      *new_arg_struct = make_ssa_name (nvar, NULL);

      ld_st_data->store = *arg_struct;
      ld_st_data->load = *new_arg_struct;
      ld_st_data->store_bb = bb0;
      ld_st_data->load_bb = bb1;

      htab_traverse (name_copies, create_loads_and_stores_for_name,
		     ld_st_data);

      /* Load the calculation from memory (after the join of the threads).  */

      if (reduction_list && htab_elements (reduction_list) > 0)
	{
	  htab_traverse (reduction_list, create_stores_for_reduction,
			 ld_st_data);
	  /* A fresh pointer ssa name for the loads after the join; the
	     stored pointer is only valid inside the child function.  */
	  clsn_data.load = make_ssa_name (nvar, NULL);
	  clsn_data.load_bb = exit->dest;
	  clsn_data.store = ld_st_data->store;
	  create_final_loads_for_reduction (reduction_list, &clsn_data);
	}
    }

  htab_delete (decl_copies);
  htab_delete (name_copies);
}

/* Bitmap containing uids of functions created by parallelization.  We cannot
   allocate it from the default obstack, as it must live across compilation
   of several functions; we make it gc allocated instead.  */

static GTY(()) bitmap parallelized_functions;

/* Returns true if FN was created by create_loop_fn.
*/

static bool
parallelized_function_p (tree fn)
{
  if (!parallelized_functions || !DECL_ARTIFICIAL (fn))
    return false;

  return bitmap_bit_p (parallelized_functions, DECL_UID (fn));
}

/* Creates and returns an empty function that will receive the body of
   a parallelized loop.  */

static tree
create_loop_fn (location_t loc)
{
  char buf[100];
  char *tname;
  tree decl, type, name, t;
  struct function *act_cfun = cfun;
  static unsigned loopfn_num;

  /* Derive a unique, assembler-safe name from the current function.  */
  snprintf (buf, 100, "%s.$loopfn", current_function_name ());
  ASM_FORMAT_PRIVATE_NAME (tname, buf, loopfn_num++);
  clean_symbol_name (tname);
  name = get_identifier (tname);
  /* The child function takes a single pointer argument (the shared data
     structure) and returns nothing.  */
  type = build_function_type_list (void_type_node, ptr_type_node, NULL_TREE);

  decl = build_decl (loc, FUNCTION_DECL, name, type);
  if (!parallelized_functions)
    parallelized_functions = BITMAP_GGC_ALLOC ();
  bitmap_set_bit (parallelized_functions, DECL_UID (decl));

  TREE_STATIC (decl) = 1;
  TREE_USED (decl) = 1;
  DECL_ARTIFICIAL (decl) = 1;
  DECL_IGNORED_P (decl) = 0;
  TREE_PUBLIC (decl) = 0;
  DECL_UNINLINABLE (decl) = 1;
  DECL_EXTERNAL (decl) = 0;
  DECL_CONTEXT (decl) = NULL_TREE;
  DECL_INITIAL (decl) = make_node (BLOCK);

  t = build_decl (loc, RESULT_DECL, NULL_TREE, void_type_node);
  DECL_ARTIFICIAL (t) = 1;
  DECL_IGNORED_P (t) = 1;
  DECL_RESULT (decl) = t;

  t = build_decl (loc, PARM_DECL, get_identifier (".paral_data_param"),
		  ptr_type_node);
  DECL_ARTIFICIAL (t) = 1;
  DECL_ARG_TYPE (t) = ptr_type_node;
  DECL_CONTEXT (t) = decl;
  TREE_USED (t) = 1;
  DECL_ARGUMENTS (decl) = t;

  allocate_struct_function (decl, false);

  /* The call to allocate_struct_function clobbers CFUN, so we need to restore
     it.  */
  set_cfun (act_cfun);

  return decl;
}

/* Moves the exit condition of LOOP to the beginning of its header, and
   duplicates the part of the last iteration that gets disabled to the
   exit of the loop.  NIT is the number of iterations of the loop
   (used to initialize the variables in the duplicated part).

   TODO: the common case is that latch of the loop is empty and immediately
   follows the loop exit.  In this case, it would be better not to copy the
   body of the loop, but only move the entry of the loop directly before the
   exit check and increase the number of iterations of the loop by one.
   This may need some additional preconditioning in case NIT = ~0.
   REDUCTION_LIST describes the reductions in LOOP.  */

static void
transform_to_exit_first_loop (struct loop *loop, htab_t reduction_list, tree nit)
{
  basic_block *bbs, *nbbs, ex_bb, orig_header;
  unsigned n;
  bool ok;
  edge exit = single_dom_exit (loop), hpred;
  tree control, control_name, res, t;
  gimple phi, nphi, cond_stmt, stmt, cond_nit;
  gimple_stmt_iterator gsi;
  tree nit_1;
  edge exit_1;
  tree new_rhs;

  split_block_after_labels (loop->header);
  orig_header = single_succ (loop->header);
  hpred = single_succ_edge (loop->header);

  cond_stmt = last_stmt (exit->src);
  control = gimple_cond_lhs (cond_stmt);
  gcc_assert (gimple_cond_rhs (cond_stmt) == nit);

  /* Make sure that we have phi nodes on exit for all loop header phis
     (create_parallel_loop requires that).  */
  for (gsi = gsi_start_phis (loop->header); !gsi_end_p (gsi); gsi_next (&gsi))
    {
      phi = gsi_stmt (gsi);
      res = PHI_RESULT (phi);
      /* Give the header phi a fresh result and let the old result be
	 defined by a new phi in ORIG_HEADER fed by it.  */
      t = make_ssa_name (SSA_NAME_VAR (res), phi);
      SET_PHI_RESULT (phi, t);
      nphi = create_phi_node (res, orig_header);
      SSA_NAME_DEF_STMT (res) = nphi;
      add_phi_arg (nphi, t, hpred, UNKNOWN_LOCATION);

      if (res == control)
	{
	  /* The exit test must use the renamed control variable.  */
	  gimple_cond_set_lhs (cond_stmt, t);
	  update_stmt (cond_stmt);
	  control = t;
	}
    }

  /* Setting the condition towards peeling the last iteration:
     If the block consisting of the exit condition has the latch as
     successor, then the body of the loop is executed before
     the exit condition is tested.  In such case, moving the
     condition to the entry, causes that the loop will iterate
     one less iteration (which is the wanted outcome, since we
     peel out the last iteration).  If the body is executed after
     the condition, moving the condition to the entry requires
     decrementing one iteration.  */
  exit_1 = EDGE_SUCC (exit->src, EDGE_SUCC (exit->src, 0) == exit);
  if (exit_1->dest == loop->latch)
    new_rhs = gimple_cond_rhs (cond_stmt);
  else
    {
      new_rhs = fold_build2 (MINUS_EXPR, TREE_TYPE (gimple_cond_rhs (cond_stmt)),
			     gimple_cond_rhs (cond_stmt),
			     build_int_cst (TREE_TYPE (gimple_cond_rhs (cond_stmt)), 1));
      if (TREE_CODE (gimple_cond_rhs (cond_stmt)) == SSA_NAME)
	{
	  /* NIT - 1 is not a constant; materialize it in the preheader.  */
	  basic_block preheader;
	  gimple_stmt_iterator gsi1;

	  preheader = loop_preheader_edge (loop)->src;
	  gsi1 = gsi_after_labels (preheader);
	  new_rhs = force_gimple_operand_gsi (&gsi1, new_rhs, true,
					      NULL_TREE, false, GSI_CONTINUE_LINKING);
	}
    }
  gimple_cond_set_rhs (cond_stmt, unshare_expr (new_rhs));
  gimple_cond_set_lhs (cond_stmt, unshare_expr (gimple_cond_lhs (cond_stmt)));

  bbs = get_loop_body_in_dom_order (loop);

  for (n = 0; bbs[n] != loop->latch; n++)
    continue;
  nbbs = XNEWVEC (basic_block, n);
  ok = gimple_duplicate_sese_tail (single_succ_edge (loop->header), exit,
				   bbs + 1, n, nbbs);
  gcc_assert (ok);
  free (bbs);
  ex_bb = nbbs[0];
  free (nbbs);

  /* Other than reductions, the only gimple reg that should be copied
     out of the loop is the control variable.  */

  control_name = NULL_TREE;
  for (gsi = gsi_start_phis (ex_bb); !gsi_end_p (gsi); )
    {
      phi = gsi_stmt (gsi);
      res = PHI_RESULT (phi);
      if (!is_gimple_reg (res))
	{
	  gsi_next (&gsi);
	  continue;
	}

      /* Check if it is a part of reduction.  If it is,
	 keep the phi at the reduction's keep_res field.  The
	 PHI_RESULT of this phi is the resulting value of the reduction
	 variable when exiting the loop.  */

      /* The exit edge may have changed above; recompute it.  */
      exit = single_dom_exit (loop);

      if (htab_elements (reduction_list) > 0)
	{
	  struct reduction_info *red;

	  tree val = PHI_ARG_DEF_FROM_EDGE (phi, exit);
	  red = reduction_phi (reduction_list, SSA_NAME_DEF_STMT (val));
	  if (red)
	    {
	      red->keep_res = phi;
	      gsi_next (&gsi);
	      continue;
	    }
	}
      gcc_assert (control_name == NULL_TREE
		  && SSA_NAME_VAR (res) == SSA_NAME_VAR (control));
      control_name = res;
      remove_phi_node (&gsi, false);
    }
  gcc_assert (control_name != NULL_TREE);

  /* Initialize the control variable to number of iterations
     according to the rhs of the exit condition.  */
  gsi = gsi_after_labels (ex_bb);
  cond_nit = last_stmt (exit->src);
  nit_1 = gimple_cond_rhs (cond_nit);
  nit_1 = force_gimple_operand_gsi (&gsi,
				    fold_convert (TREE_TYPE (control_name), nit_1),
				    false, NULL_TREE, false, GSI_SAME_STMT);
  stmt = gimple_build_assign (control_name, nit_1);
  gsi_insert_before (&gsi, stmt, GSI_NEW_STMT);
  SSA_NAME_DEF_STMT (control_name) = stmt;
}

/* Create the parallel constructs for LOOP as described in gen_parallel_loop.
   LOOP_FN and DATA are the arguments of GIMPLE_OMP_PARALLEL.
   NEW_DATA is the variable that should be initialized from the argument
   of LOOP_FN.  N_THREADS is the requested number of threads.  Returns the
   basic block containing GIMPLE_OMP_PARALLEL tree.  */

static basic_block
create_parallel_loop (struct loop *loop, tree loop_fn, tree data,
		      tree new_data, unsigned n_threads, location_t loc)
{
  gimple_stmt_iterator gsi;
  basic_block bb, paral_bb, for_bb, ex_bb;
  tree t, param;
  gimple stmt, for_stmt, phi, cond_stmt;
  tree cvar, cvar_init, initvar, cvar_next, cvar_base, type;
  edge exit, nexit, guard, end, e;

  /* Prepare the GIMPLE_OMP_PARALLEL statement.  */
  bb = loop_preheader_edge (loop)->src;
  paral_bb = single_pred (bb);
  gsi = gsi_last_bb (paral_bb);

  t = build_omp_clause (loc, OMP_CLAUSE_NUM_THREADS);
  OMP_CLAUSE_NUM_THREADS_EXPR (t)
    = build_int_cst (integer_type_node, n_threads);
  stmt = gimple_build_omp_parallel (NULL, t, loop_fn, data);
  gimple_set_location (stmt, loc);

  gsi_insert_after (&gsi, stmt, GSI_NEW_STMT);

  /* Initialize NEW_DATA.  */
  if (data)
    {
      gsi = gsi_after_labels (bb);

      param = make_ssa_name (DECL_ARGUMENTS (loop_fn), NULL);
      stmt = gimple_build_assign (param, build_fold_addr_expr (data));
      gsi_insert_before (&gsi, stmt, GSI_SAME_STMT);
      SSA_NAME_DEF_STMT (param) = stmt;

      stmt = gimple_build_assign (new_data,
				  fold_convert (TREE_TYPE (new_data), param));
      gsi_insert_before (&gsi, stmt, GSI_SAME_STMT);
      SSA_NAME_DEF_STMT (new_data) = stmt;
    }

  /* Emit GIMPLE_OMP_RETURN for GIMPLE_OMP_PARALLEL.  */
  bb = split_loop_exit_edge (single_dom_exit (loop));
  gsi = gsi_last_bb (bb);
  stmt = gimple_build_omp_return (false);
  gimple_set_location (stmt, loc);
  gsi_insert_after (&gsi, stmt, GSI_NEW_STMT);

  /* Extract data for GIMPLE_OMP_FOR.  */
  gcc_assert (loop->header == single_dom_exit (loop)->src);
  cond_stmt = last_stmt (loop->header);

  cvar = gimple_cond_lhs (cond_stmt);
  cvar_base = SSA_NAME_VAR (cvar);
  phi = SSA_NAME_DEF_STMT (cvar);
  cvar_init = PHI_ARG_DEF_FROM_EDGE (phi, loop_preheader_edge (loop));
  initvar = make_ssa_name (cvar_base, NULL);
  SET_USE (PHI_ARG_DEF_PTR_FROM_EDGE (phi, loop_preheader_edge (loop)),
	   initvar);
  cvar_next = PHI_ARG_DEF_FROM_EDGE (phi, loop_latch_edge (loop));

  /* The increment now happens in GIMPLE_OMP_CONTINUE; drop the old one.  */
  gsi = gsi_last_nondebug_bb (loop->latch);
  gcc_assert (gsi_stmt (gsi) == SSA_NAME_DEF_STMT (cvar_next));
  gsi_remove (&gsi, true);

  /* Prepare cfg.  */
  for_bb = split_edge (loop_preheader_edge (loop));
  ex_bb = split_loop_exit_edge (single_dom_exit (loop));
  extract_true_false_edges_from_block (loop->header, &nexit, &exit);
  gcc_assert (exit == single_dom_exit (loop));

  guard = make_edge (for_bb, ex_bb, 0);
  single_succ_edge (loop->latch)->flags = 0;
  end = make_edge (loop->latch, ex_bb, EDGE_FALLTHRU);
  for (gsi = gsi_start_phis (ex_bb); !gsi_end_p (gsi); gsi_next (&gsi))
    {
      source_location locus;
      tree def;
      phi = gsi_stmt (gsi);
      stmt = SSA_NAME_DEF_STMT (PHI_ARG_DEF_FROM_EDGE (phi, exit));

      def = PHI_ARG_DEF_FROM_EDGE (stmt, loop_preheader_edge (loop));
      locus = gimple_phi_arg_location_from_edge (stmt,
						 loop_preheader_edge (loop));
      add_phi_arg (phi, def, guard, locus);

      def = PHI_ARG_DEF_FROM_EDGE (stmt, loop_latch_edge (loop));
      locus = gimple_phi_arg_location_from_edge (stmt, loop_latch_edge (loop));
      add_phi_arg (phi, def, end, locus);
    }
  e = redirect_edge_and_branch (exit, nexit->dest);
  PENDING_STMT (e) = NULL;

  /* Emit GIMPLE_OMP_FOR.  */
  gimple_cond_set_lhs (cond_stmt, cvar_base);
  type = TREE_TYPE (cvar);
  t = build_omp_clause (loc, OMP_CLAUSE_SCHEDULE);
  OMP_CLAUSE_SCHEDULE_KIND (t) = OMP_CLAUSE_SCHEDULE_STATIC;

  for_stmt = gimple_build_omp_for (NULL, t, 1, NULL);
  gimple_set_location (for_stmt, loc);
  gimple_omp_for_set_index (for_stmt, 0, initvar);
  gimple_omp_for_set_initial (for_stmt, 0, cvar_init);
  gimple_omp_for_set_final (for_stmt, 0, gimple_cond_rhs (cond_stmt));
  gimple_omp_for_set_cond (for_stmt, 0, gimple_cond_code (cond_stmt));
  gimple_omp_for_set_incr (for_stmt, 0, build2 (PLUS_EXPR, type,
						cvar_base,
						build_int_cst (type, 1)));

  gsi = gsi_last_bb (for_bb);
  gsi_insert_after (&gsi, for_stmt, GSI_NEW_STMT);
  SSA_NAME_DEF_STMT (initvar) = for_stmt;

  /* Emit GIMPLE_OMP_CONTINUE.  */
  gsi = gsi_last_bb (loop->latch);
  stmt = gimple_build_omp_continue (cvar_next, cvar);
  gimple_set_location (stmt, loc);
  gsi_insert_after (&gsi, stmt, GSI_NEW_STMT);
  SSA_NAME_DEF_STMT (cvar_next) = stmt;

  /* Emit GIMPLE_OMP_RETURN for GIMPLE_OMP_FOR.  */
  gsi = gsi_last_bb (ex_bb);
  stmt = gimple_build_omp_return (true);
  gimple_set_location (stmt, loc);
  gsi_insert_after (&gsi, stmt, GSI_NEW_STMT);

  return paral_bb;
}

/* Generates code to execute the iterations of LOOP in N_THREADS
   threads in parallel.

   NITER describes number of iterations of LOOP.
   REDUCTION_LIST describes the reductions existent in the LOOP.  */

static void
gen_parallel_loop (struct loop *loop, htab_t reduction_list,
		   unsigned n_threads, struct tree_niter_desc *niter)
{
  loop_iterator li;
  tree many_iterations_cond, type, nit;
  tree arg_struct, new_arg_struct;
  gimple_seq stmts;
  basic_block parallel_head;
  edge entry, exit;
  struct clsn_data clsn_data;
  unsigned prob;
  location_t loc;
  gimple cond_stmt;

  /* From

     ---------------------------------------------------------------------
     loop
       {
	 IV = phi (INIT, IV + STEP)
	 BODY1;
	 if (COND)
	   break;
	 BODY2;
       }
     ---------------------------------------------------------------------

     with # of iterations NITER (possibly with MAY_BE_ZERO assumption),
     we generate the following code:

     ---------------------------------------------------------------------

     if (MAY_BE_ZERO
	 || NITER < MIN_PER_THREAD * N_THREADS)
       goto original;

     BODY1;
     store all local loop-invariant variables used in body of the loop to DATA.
     GIMPLE_OMP_PARALLEL (OMP_CLAUSE_NUM_THREADS (N_THREADS), LOOPFN, DATA);
     load the variables from DATA.
     GIMPLE_OMP_FOR (IV = INIT; COND; IV += STEP) (OMP_CLAUSE_SCHEDULE (static))
     BODY2;
     BODY1;
     GIMPLE_OMP_CONTINUE;
     GIMPLE_OMP_RETURN         -- GIMPLE_OMP_FOR
     GIMPLE_OMP_RETURN         -- GIMPLE_OMP_PARALLEL
     goto end;

     original:
     loop
       {
	 IV = phi (INIT, IV + STEP)
	 BODY1;
	 if (COND)
	   break;
	 BODY2;
       }

     end:

   */

  /* Create two versions of the loop -- in the old one, we know that the
     number of iterations is large enough, and we will transform it into the
     loop that will be split to loop_fn, the new one will be used for the
     remaining iterations.  */

  type = TREE_TYPE (niter->niter);
  nit = force_gimple_operand (unshare_expr (niter->niter), &stmts, true,
			      NULL_TREE);
  if (stmts)
    gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), stmts);

  many_iterations_cond =
    fold_build2 (GE_EXPR, boolean_type_node,
		 nit, build_int_cst (type, MIN_PER_THREAD * n_threads));
  many_iterations_cond
    = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
		   invert_truthvalue (unshare_expr (niter->may_be_zero)),
		   many_iterations_cond);
  many_iterations_cond
    = force_gimple_operand (many_iterations_cond, &stmts, false, NULL_TREE);
  if (stmts)
    gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), stmts);
  if (!is_gimple_condexpr (many_iterations_cond))
    {
      many_iterations_cond
	= force_gimple_operand (many_iterations_cond, &stmts,
				true, NULL_TREE);
      if (stmts)
	gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), stmts);
    }

  initialize_original_copy_tables ();

  /* We assume that the loop usually iterates a lot.  */
  prob = 4 * REG_BR_PROB_BASE / 5;
  loop_version (loop, many_iterations_cond, NULL,
		prob, prob, REG_BR_PROB_BASE - prob, true);
  update_ssa (TODO_update_ssa);
  free_original_copy_tables ();

  /* Base all the induction variables in LOOP on a single control one.  */
  canonicalize_loop_ivs (loop, &nit, true);

  /* Ensure that the exit condition is the first statement in the loop.  */
  transform_to_exit_first_loop (loop, reduction_list, nit);

  /* Generate initializations for reductions.  */
  if (htab_elements (reduction_list) > 0)
    htab_traverse (reduction_list, initialize_reductions, loop);

  /* Eliminate the references to local variables from the loop.  */
  gcc_assert (single_exit (loop));
  entry = loop_preheader_edge (loop);
  exit = single_dom_exit (loop);

  eliminate_local_variables (entry, exit);
  /* In the old loop, move all variables non-local to the loop to a structure
     and back, and create separate decls for the variables used in loop.  */
  separate_decls_in_region (entry, exit, reduction_list, &arg_struct,
			    &new_arg_struct, &clsn_data);

  /* Create the parallel constructs.  */
  loc = UNKNOWN_LOCATION;
  cond_stmt = last_stmt (loop->header);
  if (cond_stmt)
    loc = gimple_location (cond_stmt);
  parallel_head = create_parallel_loop (loop, create_loop_fn (loc), arg_struct,
					new_arg_struct, n_threads, loc);
  if (htab_elements (reduction_list) > 0)
    create_call_for_reduction (loop, reduction_list, &clsn_data);

  scev_reset ();

  /* Cancel the loop (it is simpler to do it here rather than to teach the
     expander to do it).  */
  cancel_loop_tree (loop);

  /* Free loop bound estimations that could contain references to
     removed statements.  */
  FOR_EACH_LOOP (li, loop, 0)
    free_numbers_of_iterations_estimates_loop (loop);

  /* Expand the parallel constructs.  We do it directly here instead of running
     a separate expand_omp pass, since it is more efficient, and less likely to
     cause troubles with further analyses not being able to deal with the
     OMP trees.  */

  omp_expand_local (parallel_head);
}

/* Returns true when LOOP contains vector phi nodes.  */

/* NOTE(review): LOOP is marked ATTRIBUTE_UNUSED but is in fact used below
   (get_loop_body_in_dom_order, loop->num_nodes); the attribute is merely
   misleading and could be dropped.  */
static bool
loop_has_vector_phi_nodes (struct loop *loop ATTRIBUTE_UNUSED)
{
  unsigned i;
  basic_block *bbs = get_loop_body_in_dom_order (loop);
  gimple_stmt_iterator gsi;
  bool res = true;

  for (i = 0; i < loop->num_nodes; i++)
    for (gsi = gsi_start_phis (bbs[i]); !gsi_end_p (gsi); gsi_next (&gsi))
      if (TREE_CODE (TREE_TYPE (PHI_RESULT (gsi_stmt (gsi)))) == VECTOR_TYPE)
	goto end;

  res = false;
 end:
  free (bbs);
  return res;
}

/* Create a reduction_info struct, initialize it with REDUC_STMT
   and PHI, insert it to the REDUCTION_LIST.  */

static void
build_new_reduction (htab_t reduction_list, gimple reduc_stmt, gimple phi)
{
  PTR *slot;
  struct reduction_info *new_reduction;

  gcc_assert (reduc_stmt);

  if (dump_file && (dump_flags & TDF_DETAILS))
    {
      fprintf (dump_file,
	       "Detected reduction. reduction stmt is: \n");
      print_gimple_stmt (dump_file, reduc_stmt, 0, 0);
      fprintf (dump_file, "\n");
    }

  new_reduction = XCNEW (struct reduction_info);

  new_reduction->reduc_stmt = reduc_stmt;
  new_reduction->reduc_phi = phi;
  new_reduction->reduc_version = SSA_NAME_VERSION (gimple_phi_result (phi));
  new_reduction->reduction_code = gimple_assign_rhs_code (reduc_stmt);
  slot = htab_find_slot (reduction_list, new_reduction, INSERT);
  *slot = new_reduction;
}

/* Callback for htab_traverse.  Sets gimple_uid of reduc_phi stmts.
*/

static int
set_reduc_phi_uids (void **slot, void *data ATTRIBUTE_UNUSED)
{
  struct reduction_info *const red = (struct reduction_info *) *slot;
  gimple_set_uid (red->reduc_phi, red->reduc_version);
  return 1;
}

/* Detect all reductions in the LOOP, insert them into REDUCTION_LIST.  */

static void
gather_scalar_reductions (loop_p loop, htab_t reduction_list)
{
  gimple_stmt_iterator gsi;
  loop_vec_info simple_loop_info;

  vect_dump = NULL;
  /* NOTE(review): vect_analyze_loop_form may fail; the loop below guards
     against SIMPLE_LOOP_INFO being NULL, and destroy_loop_vec_info is
     presumably a no-op for NULL -- confirm.  */
  simple_loop_info = vect_analyze_loop_form (loop);

  for (gsi = gsi_start_phis (loop->header); !gsi_end_p (gsi); gsi_next (&gsi))
    {
      gimple phi = gsi_stmt (gsi);
      affine_iv iv;
      tree res = PHI_RESULT (phi);
      bool double_reduc;

      if (!is_gimple_reg (res))
	continue;

      /* Only phis that are not simple induction variables can be
	 reductions.  */
      if (!simple_iv (loop, loop, res, &iv, true)
	  && simple_loop_info)
	{
	  gimple reduc_stmt = vect_force_simple_reduction (simple_loop_info,
							   phi, true,
							   &double_reduc);
	  if (reduc_stmt && !double_reduc)
	    build_new_reduction (reduction_list, reduc_stmt, phi);
	}
    }
  destroy_loop_vec_info (simple_loop_info, true);

  /* As gimple_uid is used by the vectorizer in between vect_analyze_loop_form
     and destroy_loop_vec_info, we can set gimple_uid of reduc_phi stmts
     only now.  */
  htab_traverse (reduction_list, set_reduc_phi_uids, NULL);
}

/* Try to initialize NITER for code generation part.  */

static bool
try_get_loop_niter (loop_p loop, struct tree_niter_desc *niter)
{
  edge exit = single_dom_exit (loop);

  gcc_assert (exit);

  /* We need to know # of iterations, and there should be no uses of values
     defined inside loop outside of it, unless the values are invariants of
     the loop.  */
  if (!number_of_iterations_exit (loop, exit, niter, false))
    {
      if (dump_file && (dump_flags & TDF_DETAILS))
	fprintf (dump_file, "  FAILED: number of iterations not known\n");
      return false;
    }

  return true;
}

/* Try to initialize REDUCTION_LIST for code generation part.
   REDUCTION_LIST describes the reductions.  */

static bool
try_create_reduction_list (loop_p loop, htab_t reduction_list)
{
  edge exit = single_dom_exit (loop);
  gimple_stmt_iterator gsi;

  gcc_assert (exit);

  gather_scalar_reductions (loop, reduction_list);


  /* Every scalar value live across the exit must be part of a detected
     reduction, otherwise we cannot parallelize.  */
  for (gsi = gsi_start_phis (exit->dest); !gsi_end_p (gsi); gsi_next (&gsi))
    {
      gimple phi = gsi_stmt (gsi);
      struct reduction_info *red;
      imm_use_iterator imm_iter;
      use_operand_p use_p;
      gimple reduc_phi;
      tree val = PHI_ARG_DEF_FROM_EDGE (phi, exit);

      if (is_gimple_reg (val))
	{
	  if (dump_file && (dump_flags & TDF_DETAILS))
	    {
	      fprintf (dump_file, "phi is ");
	      print_gimple_stmt (dump_file, phi, 0, 0);
	      fprintf (dump_file, "arg of phi to exit:   value ");
	      print_generic_expr (dump_file, val, 0);
	      fprintf (dump_file, " used outside loop\n");
	      fprintf (dump_file,
		       "  checking if it a part of reduction pattern:  \n");
	    }
	  if (htab_elements (reduction_list) == 0)
	    {
	      if (dump_file && (dump_flags & TDF_DETAILS))
		fprintf (dump_file,
			 "  FAILED: it is not a part of reduction.\n");
	      return false;
	    }
	  /* Find the phi inside the loop that VAL feeds; that is the
	     candidate reduction phi.  NOTE(review): REDUC_PHI may stay
	     NULL if VAL has no non-debug use inside the loop -- assumes
	     reduction_phi copes with NULL; confirm.  */
	  reduc_phi = NULL;
	  FOR_EACH_IMM_USE_FAST (use_p, imm_iter, val)
	    {
	      if (!gimple_debug_bind_p (USE_STMT (use_p))
		  && flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
		{
		  reduc_phi = USE_STMT (use_p);
		  break;
		}
	    }
	  red = reduction_phi (reduction_list, reduc_phi);
	  if (red == NULL)
	    {
	      if (dump_file && (dump_flags & TDF_DETAILS))
		fprintf (dump_file,
			 "  FAILED: it is not a part of reduction.\n");
	      return false;
	    }
	  if (dump_file && (dump_flags & TDF_DETAILS))
	    {
	      fprintf (dump_file, "reduction phi is  ");
	      print_gimple_stmt (dump_file, red->reduc_phi, 0, 0);
	      fprintf (dump_file, "reduction stmt is  ");
	      print_gimple_stmt (dump_file, red->reduc_stmt, 0, 0);
	    }
	}
    }

  /* The iterations of the loop may communicate only through bivs whose
     iteration space can be distributed efficiently.  */
  for (gsi = gsi_start_phis (loop->header); !gsi_end_p (gsi); gsi_next (&gsi))
    {
      gimple phi = gsi_stmt (gsi);
      tree def = PHI_RESULT (phi);
      affine_iv iv;

      if (is_gimple_reg (def) && !simple_iv (loop, loop, def, &iv, true))
	{
	  struct reduction_info *red;

	  red = reduction_phi (reduction_list, phi);
	  if (red == NULL)
	    {
	      if (dump_file && (dump_flags & TDF_DETAILS))
		fprintf (dump_file,
			 "  FAILED: scalar dependency between iterations\n");
	      return false;
	    }
	}
    }


  return true;
}

/* Detect parallel loops and generate parallel code using libgomp
   primitives.  Returns true if some loop was parallelized, false
   otherwise.  */

bool
parallelize_loops (void)
{
  unsigned n_threads = flag_tree_parallelize_loops;
  bool changed = false;
  struct loop *loop;
  struct tree_niter_desc niter_desc;
  loop_iterator li;
  htab_t reduction_list;
  struct obstack parloop_obstack;
  HOST_WIDE_INT estimated;
  LOC loop_loc;

  /* Do not parallelize loops in the functions created by parallelization.
*/ 2141 if (parallelized_function_p (cfun->decl)) 2142 return false; 2143 if (cfun->has_nonlocal_label) 2144 return false; 2145 2146 gcc_obstack_init (&parloop_obstack); 2147 reduction_list = htab_create (10, reduction_info_hash, 2148 reduction_info_eq, free); 2149 init_stmt_vec_info_vec (); 2150 2151 FOR_EACH_LOOP (li, loop, 0) 2152 { 2153 htab_empty (reduction_list); 2154 if (dump_file && (dump_flags & TDF_DETAILS)) 2155 { 2156 fprintf (dump_file, "Trying loop %d as candidate\n",loop->num); 2157 if (loop->inner) 2158 fprintf (dump_file, "loop %d is not innermost\n",loop->num); 2159 else 2160 fprintf (dump_file, "loop %d is innermost\n",loop->num); 2161 } 2162 2163 /* If we use autopar in graphite pass, we use its marked dependency 2164 checking results. */ 2165 if (flag_loop_parallelize_all && !loop->can_be_parallel) 2166 { 2167 if (dump_file && (dump_flags & TDF_DETAILS)) 2168 fprintf (dump_file, "loop is not parallel according to graphite\n"); 2169 continue; 2170 } 2171 2172 if (!single_dom_exit (loop)) 2173 { 2174 2175 if (dump_file && (dump_flags & TDF_DETAILS)) 2176 fprintf (dump_file, "loop is !single_dom_exit\n"); 2177 2178 continue; 2179 } 2180 2181 if (/* And of course, the loop must be parallelizable. */ 2182 !can_duplicate_loop_p (loop) 2183 || loop_has_blocks_with_irreducible_flag (loop) 2184 || (loop_preheader_edge (loop)->src->flags & BB_IRREDUCIBLE_LOOP) 2185 /* FIXME: the check for vector phi nodes could be removed. */ 2186 || loop_has_vector_phi_nodes (loop) 2187 /* FIXME: transform_to_exit_first_loop does not handle not 2188 header-copied loops correctly - see PR46886. */ 2189 || !do_while_loop_p (loop)) 2190 continue; 2191 estimated = max_stmt_executions_int (loop, false); 2192 /* FIXME: Bypass this check as graphite doesn't update the 2193 count and frequency correctly now. 
*/ 2194 if (!flag_loop_parallelize_all 2195 && ((estimated !=-1 2196 && estimated <= (HOST_WIDE_INT) n_threads * MIN_PER_THREAD) 2197 /* Do not bother with loops in cold areas. */ 2198 || optimize_loop_nest_for_size_p (loop))) 2199 continue; 2200 2201 if (!try_get_loop_niter (loop, &niter_desc)) 2202 continue; 2203 2204 if (!try_create_reduction_list (loop, reduction_list)) 2205 continue; 2206 2207 if (!flag_loop_parallelize_all 2208 && !loop_parallel_p (loop, &parloop_obstack)) 2209 continue; 2210 2211 changed = true; 2212 if (dump_file && (dump_flags & TDF_DETAILS)) 2213 { 2214 if (loop->inner) 2215 fprintf (dump_file, "parallelizing outer loop %d\n",loop->header->index); 2216 else 2217 fprintf (dump_file, "parallelizing inner loop %d\n",loop->header->index); 2218 loop_loc = find_loop_location (loop); 2219 if (loop_loc != UNKNOWN_LOC) 2220 fprintf (dump_file, "\nloop at %s:%d: ", 2221 LOC_FILE (loop_loc), LOC_LINE (loop_loc)); 2222 } 2223 gen_parallel_loop (loop, reduction_list, 2224 n_threads, &niter_desc); 2225 verify_flow_info (); 2226 verify_dominators (CDI_DOMINATORS); 2227 verify_loop_structure (); 2228 verify_loop_closed_ssa (true); 2229 } 2230 2231 free_stmt_vec_info_vec (); 2232 htab_delete (reduction_list); 2233 obstack_free (&parloop_obstack, NULL); 2234 2235 /* Parallelization will cause new function calls to be inserted through 2236 which local variables will escape. Reset the points-to solution 2237 for ESCAPED. */ 2238 if (changed) 2239 pt_solution_reset (&cfun->gimple_df->escaped); 2240 2241 return changed; 2242 } 2243 2244 #include "gt-tree-parloops.h" 2245