/* SLP - Basic Block Vectorization
   Copyright (C) 2007-2018 Free Software Foundation, Inc.
   Contributed by Dorit Naishlos <dorit@il.ibm.com>
   and Ira Rosen <irar@il.ibm.com>

   This file is part of GCC.

   GCC is free software; you can redistribute it and/or modify it under
   the terms of the GNU General Public License as published by the Free
   Software Foundation; either version 3, or (at your option) any later
   version.

   GCC is distributed in the hope that it will be useful, but WITHOUT ANY
   WARRANTY; without even the implied warranty of MERCHANTABILITY or
   FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
   for more details.

   You should have received a copy of the GNU General Public License
   along with GCC; see the file COPYING3.  If not see
   <http://www.gnu.org/licenses/>.  */

#include "config.h"
#include "system.h"
#include "coretypes.h"
#include "backend.h"
#include "target.h"
#include "rtl.h"
#include "tree.h"
#include "gimple.h"
#include "tree-pass.h"
#include "ssa.h"
#include "optabs-tree.h"
#include "insn-config.h"
#include "recog.h"		/* FIXME: for insn_data */
#include "params.h"
#include "fold-const.h"
#include "stor-layout.h"
#include "gimple-iterator.h"
#include "cfgloop.h"
#include "tree-vectorizer.h"
#include "langhooks.h"
#include "gimple-walk.h"
#include "dbgcnt.h"
#include "tree-vector-builder.h"
#include "vec-perm-indices.h"
#include "gimple-fold.h"
#include "internal-fn.h"


/* Recursively free the memory allocated for the SLP tree rooted at NODE.
   Children are freed first (post-order); each scalar stmt referenced by
   NODE has its SLP use count decremented so that shared stmts keep an
   accurate count across the remaining SLP instances.  */

static void
vect_free_slp_tree (slp_tree node)
{
  int i;
  slp_tree child;

  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    vect_free_slp_tree (child);

  gimple *stmt;
  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
    /* After transform some stmts are removed and thus their vinfo is gone.  */
    if (vinfo_for_stmt (stmt))
      {
	/* The count was incremented in vect_create_new_slp_node; it must
	   still be positive here or bookkeeping went wrong.  */
	gcc_assert (STMT_VINFO_NUM_SLP_USES (vinfo_for_stmt (stmt)) > 0);
	STMT_VINFO_NUM_SLP_USES (vinfo_for_stmt (stmt))--;
      }

  SLP_TREE_CHILDREN (node).release ();
  SLP_TREE_SCALAR_STMTS (node).release ();
  SLP_TREE_VEC_STMTS (node).release ();
  SLP_TREE_LOAD_PERMUTATION (node).release ();

  free (node);
}


/* Free the memory allocated for the SLP instance INSTANCE, including the
   whole SLP tree it owns.  */

void
vect_free_slp_instance (slp_instance instance)
{
  vect_free_slp_tree (SLP_INSTANCE_TREE (instance));
  SLP_INSTANCE_LOADS (instance).release ();
  free (instance);
}


/* Create an SLP node for SCALAR_STMTS.  The node takes ownership of the
   SCALAR_STMTS vector.  Returns NULL if the representative first stmt is
   of a kind SLP cannot handle.  */

static slp_tree
vect_create_new_slp_node (vec<gimple *> scalar_stmts)
{
  slp_tree node;
  gimple *stmt = scalar_stmts[0];
  unsigned int nops;

  /* Derive the number of operands (and thus future children) from the
     representative first statement.  */
  if (is_gimple_call (stmt))
    nops = gimple_call_num_args (stmt);
  else if (is_gimple_assign (stmt))
    {
      nops = gimple_num_ops (stmt) - 1;
      /* A COND_EXPR's embedded comparison contributes an extra operand.  */
      if (gimple_assign_rhs_code (stmt) == COND_EXPR)
	nops++;
    }
  else if (gimple_code (stmt) == GIMPLE_PHI)
    /* PHIs terminate the recursion; they get no children here.  */
    nops = 0;
  else
    return NULL;

  node = XNEW (struct _slp_tree);
  SLP_TREE_SCALAR_STMTS (node) = scalar_stmts;
  SLP_TREE_VEC_STMTS (node).create (0);
  SLP_TREE_CHILDREN (node).create (nops);
  SLP_TREE_LOAD_PERMUTATION (node) = vNULL;
  SLP_TREE_TWO_OPERATORS (node) = false;
  SLP_TREE_DEF_TYPE (node) = vect_internal_def;

  /* Record one SLP use per referenced scalar stmt; undone in
     vect_free_slp_tree.  */
  unsigned i;
  FOR_EACH_VEC_ELT (scalar_stmts, i, stmt)
    STMT_VINFO_NUM_SLP_USES (vinfo_for_stmt (stmt))++;

  return node;
}


/* This structure is used in creation of an SLP tree.  Each instance
   corresponds to the same operand in a group of scalar stmts in an SLP
   node.  */
typedef struct _slp_oprnd_info
{
  /* Def-stmts for the operands.
 */
  vec<gimple *> def_stmts;
  /* Information about the first statement, its vector def-type, type, the
     operand itself in case it's constant, and an indication if it's a pattern
     stmt.  */
  tree first_op_type;
  enum vect_def_type first_dt;
  bool first_pattern;
  bool second_pattern;
} *slp_oprnd_info;


/* Allocate operands info for NOPS operands, and GROUP_SIZE def-stmts for each
   operand.  The returned vector and its elements are freed by
   vect_free_oprnd_info.  */
static vec<slp_oprnd_info>
vect_create_oprnd_info (int nops, int group_size)
{
  int i;
  slp_oprnd_info oprnd_info;
  vec<slp_oprnd_info> oprnds_info;

  oprnds_info.create (nops);
  for (i = 0; i < nops; i++)
    {
      oprnd_info = XNEW (struct _slp_oprnd_info);
      oprnd_info->def_stmts.create (group_size);
      oprnd_info->first_dt = vect_uninitialized_def;
      oprnd_info->first_op_type = NULL_TREE;
      oprnd_info->first_pattern = false;
      oprnd_info->second_pattern = false;
      oprnds_info.quick_push (oprnd_info);
    }

  return oprnds_info;
}


/* Free operands info allocated by vect_create_oprnd_info.  */

static void
vect_free_oprnd_info (vec<slp_oprnd_info> &oprnds_info)
{
  int i;
  slp_oprnd_info oprnd_info;

  FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
    {
      oprnd_info->def_stmts.release ();
      XDELETE (oprnd_info);
    }

  oprnds_info.release ();
}


/* Find the place of the data-ref in STMT in the interleaving chain that starts
   from FIRST_STMT.  Return -1 if the data-ref is not a part of the chain.  */

int
vect_get_place_in_interleaving_chain (gimple *stmt, gimple *first_stmt)
{
  gimple *next_stmt = first_stmt;
  int result = 0;

  /* STMT must belong to the chain headed by FIRST_STMT.  */
  if (first_stmt != GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
    return -1;

  do
    {
      if (next_stmt == stmt)
	return result;
      next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
      if (next_stmt)
	/* Account for gaps (unused elements) between chain members.  */
	result += GROUP_GAP (vinfo_for_stmt (next_stmt));
    }
  while (next_stmt);

  return -1;
}

/* Check whether it is possible to load COUNT elements of type ELT_MODE
   using the method implemented by duplicate_and_interleave.  Return true
   if so, returning the number of intermediate vectors in *NVECTORS_OUT
   (if nonnull) and the type of each intermediate vector in *VECTOR_TYPE_OUT
   (if nonnull).  */

bool
can_duplicate_and_interleave_p (unsigned int count, machine_mode elt_mode,
				unsigned int *nvectors_out,
				tree *vector_type_out,
				tree *permutes)
{
  poly_int64 elt_bytes = count * GET_MODE_SIZE (elt_mode);
  poly_int64 nelts;
  unsigned int nvectors = 1;
  /* Try progressively wider "super-elements": each iteration halves
     ELT_BYTES (treating pairs of elements as one) and doubles NVECTORS,
     until a vector mode with supported permutes is found or the element
     can no longer be split.  */
  for (;;)
    {
      scalar_int_mode int_mode;
      poly_int64 elt_bits = elt_bytes * BITS_PER_UNIT;
      if (multiple_p (current_vector_size, elt_bytes, &nelts)
	  && int_mode_for_size (elt_bits, 0).exists (&int_mode))
	{
	  tree int_type = build_nonstandard_integer_type
	    (GET_MODE_BITSIZE (int_mode), 1);
	  tree vector_type = build_vector_type (int_type, nelts);
	  if (VECTOR_MODE_P (TYPE_MODE (vector_type)))
	    {
	      /* Build the two interleaving permutations; the (nelts, 2, 3)
		 encoding describes the stepped series expected by
		 vec_perm_builder.  */
	      vec_perm_builder sel1 (nelts, 2, 3);
	      vec_perm_builder sel2 (nelts, 2, 3);
	      poly_int64 half_nelts = exact_div (nelts, 2);
	      for (unsigned int i = 0; i < 3; ++i)
		{
		  sel1.quick_push (i);
		  sel1.quick_push (i + nelts);
		  sel2.quick_push (half_nelts + i);
		  sel2.quick_push (half_nelts + i + nelts);
		}
	      vec_perm_indices indices1 (sel1, 2, nelts);
	      vec_perm_indices indices2 (sel2, 2, nelts);
	      if (can_vec_perm_const_p (TYPE_MODE (vector_type), indices1)
		  && can_vec_perm_const_p (TYPE_MODE (vector_type), indices2))
		{
		  if (nvectors_out)
		    *nvectors_out = nvectors;
		  if (vector_type_out)
		    *vector_type_out = vector_type;
		  if (permutes)
		    {
		      permutes[0] = vect_gen_perm_mask_checked (vector_type,
								indices1);
		      permutes[1] = vect_gen_perm_mask_checked (vector_type,
								indices2);
		    }
		  return true;
		}
	    }
	}
      if (!multiple_p (elt_bytes, 2, &elt_bytes))
	return false;
      nvectors *= 2;
    }
}

/* Get the defs for the rhs of STMT (collect them in OPRNDS_INFO), check that
   they are of a valid type and that they match the defs of the first stmt of
   the SLP group (stored in OPRNDS_INFO).  This function tries to match stmts
   by swapping operands of STMTS[STMT_NUM] when possible.  Non-zero *SWAP
   indicates swap is required for cond_expr stmts.  Specifically, *SWAP
   is 1 if STMT is cond and operands of comparison need to be swapped;
   *SWAP is 2 if STMT is cond and code of comparison needs to be inverted.
   If there is any operand swap in this function, *SWAP is set to non-zero
   value.
   If there was a fatal error return -1; if the error could be corrected by
   swapping operands of father node of this one, return 1; if everything is
   ok return 0.
 */
static int
vect_get_and_check_slp_defs (vec_info *vinfo, unsigned char *swap,
			     vec<gimple *> stmts, unsigned stmt_num,
			     vec<slp_oprnd_info> *oprnds_info)
{
  gimple *stmt = stmts[stmt_num];
  tree oprnd;
  unsigned int i, number_of_oprnds;
  gimple *def_stmt;
  enum vect_def_type dt = vect_uninitialized_def;
  bool pattern = false;
  slp_oprnd_info oprnd_info;
  int first_op_idx = 1;
  bool commutative = false;
  bool first_op_cond = false;
  bool first = stmt_num == 0;
  bool second = stmt_num == 1;

  if (is_gimple_call (stmt))
    {
      number_of_oprnds = gimple_call_num_args (stmt);
      /* Call arguments start at gimple op index 3.  */
      first_op_idx = 3;
    }
  else if (is_gimple_assign (stmt))
    {
      enum tree_code code = gimple_assign_rhs_code (stmt);
      number_of_oprnds = gimple_num_ops (stmt) - 1;
      /* Swap can only be done for cond_expr if asked to, otherwise we
	 could result in different comparison code to the first stmt.  */
      if (gimple_assign_rhs_code (stmt) == COND_EXPR
	  && COMPARISON_CLASS_P (gimple_assign_rhs1 (stmt)))
	{
	  first_op_cond = true;
	  /* The embedded comparison contributes an extra operand.  */
	  number_of_oprnds++;
	}
      else
	commutative = commutative_tree_code (code);
    }
  else
    return -1;

  bool swapped = (*swap != 0);
  /* A pre-requested swap (from vect_build_slp_tree_1) only happens for
     COND_EXPR stmts.  */
  gcc_assert (!swapped || first_op_cond);
  for (i = 0; i < number_of_oprnds; i++)
    {
again:
      if (first_op_cond)
	{
	  /* Map indicating how operands of cond_expr should be swapped:
	     row 0 = no swap, row 1 = swap the comparison operands,
	     row 2 = swap the then/else values (used when inverting).  */
	  int maps[3][4] = {{0, 1, 2, 3}, {1, 0, 2, 3}, {0, 1, 3, 2}};
	  int *map = maps[*swap];

	  /* Operands 0/1 live inside the comparison tree; 2/3 are the
	     then/else gimple operands.  */
	  if (i < 2)
	    oprnd = TREE_OPERAND (gimple_op (stmt, first_op_idx), map[i]);
	  else
	    oprnd = gimple_op (stmt, map[i]);
	}
      else
	oprnd = gimple_op (stmt, first_op_idx + (swapped ? !i : i));

      oprnd_info = (*oprnds_info)[i];

      if (!vect_is_simple_use (oprnd, vinfo, &def_stmt, &dt))
	{
	  if (dump_enabled_p ())
	    {
	      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			       "Build SLP failed: can't analyze def for ");
	      dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, oprnd);
	      dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
	    }

	  return -1;
	}

      /* Check if DEF_STMT is a part of a pattern in LOOP and get the def stmt
	 from the pattern.  Check that all the stmts of the node are in the
	 pattern.  */
      if (def_stmt && gimple_bb (def_stmt)
	  && vect_stmt_in_region_p (vinfo, def_stmt)
	  && vinfo_for_stmt (def_stmt)
	  && STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (def_stmt))
	  && !STMT_VINFO_RELEVANT (vinfo_for_stmt (def_stmt))
	  && !STMT_VINFO_LIVE_P (vinfo_for_stmt (def_stmt)))
	{
	  pattern = true;
	  if (!first && !oprnd_info->first_pattern
	      /* Allow different pattern state for the defs of the
		 first stmt in reduction chains.  */
	      && (oprnd_info->first_dt != vect_reduction_def
		  || (!second && !oprnd_info->second_pattern)))
	    {
	      /* Retry once with the commutative operands swapped before
		 giving up on this stmt.  */
	      if (i == 0
		  && !swapped
		  && commutative)
		{
		  swapped = true;
		  goto again;
		}

	      if (dump_enabled_p ())
		{
		  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				   "Build SLP failed: some of the stmts"
				   " are in a pattern, and others are not ");
		  dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, oprnd);
		  dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
		}

	      return 1;
	    }

	  /* Use the pattern stmt as the def from here on.  */
	  def_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (def_stmt));
	  dt = STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt));

	  if (dt == vect_unknown_def_type)
	    {
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				 "Unsupported pattern.\n");
	      return -1;
	    }

	  switch (gimple_code (def_stmt))
	    {
	    case GIMPLE_PHI:
	    case GIMPLE_ASSIGN:
	      break;

	    default:
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				 "unsupported defining stmt:\n");
	      return -1;
	    }
	}

      if (second)
	oprnd_info->second_pattern = pattern;

      if (first)
	{
	  /* First stmt of the group: record what the rest must match.  */
	  oprnd_info->first_dt = dt;
	  oprnd_info->first_pattern = pattern;
	  oprnd_info->first_op_type = TREE_TYPE (oprnd);
	}
      else
	{
	  /* Not first stmt of the group, check that the def-stmt/s match
	     the def-stmt/s of the first stmt.  Allow different definition
	     types for reduction chains: the first stmt must be a
	     vect_reduction_def (a phi node), and the rest
	     vect_internal_def.  */
	  tree type = TREE_TYPE (oprnd);
	  if ((oprnd_info->first_dt != dt
	       && !(oprnd_info->first_dt == vect_reduction_def
		    && dt == vect_internal_def)
	       && !((oprnd_info->first_dt == vect_external_def
		     || oprnd_info->first_dt == vect_constant_def)
		    && (dt == vect_external_def
			|| dt == vect_constant_def)))
	      || !types_compatible_p (oprnd_info->first_op_type, type))
	    {
	      /* Try swapping operands if we got a mismatch.  */
	      if (i == 0
		  && !swapped
		  && commutative)
		{
		  swapped = true;
		  goto again;
		}

	      if (dump_enabled_p ())
		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				 "Build SLP failed: different types\n");

	      return 1;
	    }
	  /* With variable-length vectors, invariant/constant operands must
	     be buildable via duplicate_and_interleave.  */
	  if ((dt == vect_constant_def
	       || dt == vect_external_def)
	      && !current_vector_size.is_constant ()
	      && (TREE_CODE (type) == BOOLEAN_TYPE
		  || !can_duplicate_and_interleave_p (stmts.length (),
						      TYPE_MODE (type))))
	    {
	      if (dump_enabled_p ())
		{
		  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				   "Build SLP failed: invalid type of def "
				   "for variable-length SLP ");
		  dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, oprnd);
		  dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
		}
	      return -1;
	    }
	}

      /* Check the types of the definitions.  */
      switch (dt)
	{
	case vect_constant_def:
	case vect_external_def:
	  /* No def stmt to record for invariants.  */
	  break;

	case vect_reduction_def:
	case vect_induction_def:
	case vect_internal_def:
	  oprnd_info->def_stmts.quick_push (def_stmt);
	  break;

	default:
	  /* FORNOW: Not supported.  */
	  if (dump_enabled_p ())
	    {
	      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			       "Build SLP failed: illegal type of def ");
	      dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, oprnd);
	      dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
	    }

	  return -1;
	}
    }

  /* Swap operands.  This mutates STMT in place, so it is only legal while
     no other SLP instance has committed to the current operand order.  */
  if (swapped)
    {
      /* If there are already uses of this stmt in a SLP instance then
	 we've committed to the operand order and can't swap it.  */
      if (STMT_VINFO_NUM_SLP_USES (vinfo_for_stmt (stmt)) != 0)
	{
	  if (dump_enabled_p ())
	    {
	      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			       "Build SLP failed: cannot swap operands of "
			       "shared stmt ");
	      dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
	    }
	  return -1;
	}

      if (first_op_cond)
	{
	  tree cond = gimple_assign_rhs1 (stmt);
	  enum tree_code code = TREE_CODE (cond);

	  /* Swap.  */
	  if (*swap == 1)
	    {
	      swap_ssa_operands (stmt, &TREE_OPERAND (cond, 0),
				 &TREE_OPERAND (cond, 1));
	      TREE_SET_CODE (cond, swap_tree_comparison (code));
	    }
	  /* Invert.  */
	  else
	    {
	      swap_ssa_operands (stmt, gimple_assign_rhs2_ptr (stmt),
				 gimple_assign_rhs3_ptr (stmt));
	      bool honor_nans = HONOR_NANS (TREE_OPERAND (cond, 0));
	      code = invert_tree_comparison (TREE_CODE (cond), honor_nans);
	      /* Inversion must be representable; callers only request it
		 when it is (see vect_build_slp_tree_1).  */
	      gcc_assert (code != ERROR_MARK);
	      TREE_SET_CODE (cond, code);
	    }
	}
      else
	swap_ssa_operands (stmt, gimple_assign_rhs1_ptr (stmt),
			   gimple_assign_rhs2_ptr (stmt));
      if (dump_enabled_p ())
	{
	  dump_printf_loc (MSG_NOTE, vect_location,
			   "swapped operands to match def types in ");
	  dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
	}
    }

  *swap = swapped;
  return 0;
}

/* A subroutine of vect_build_slp_tree for checking VECTYPE, which is the
   caller's attempt to find the vector type in STMT with the narrowest
   element type.  Return true if VECTYPE is nonnull and if it is valid
   for VINFO.  When returning true, update MAX_NUNITS to reflect the
   number of units in VECTYPE.  VINFO, GROUP_SIZE and MAX_NUNITS are
   as for vect_build_slp_tree.
 */

static bool
vect_record_max_nunits (vec_info *vinfo, gimple *stmt, unsigned int group_size,
			tree vectype, poly_uint64 *max_nunits)
{
  if (!vectype)
    {
      if (dump_enabled_p ())
	{
	  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			   "Build SLP failed: unsupported data-type in ");
	  dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
	  dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
	}
      /* Fatal mismatch.  */
      return false;
    }

  /* If populating the vector type requires unrolling then fail
     before adjusting *max_nunits for basic-block vectorization.  */
  poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
  unsigned HOST_WIDE_INT const_nunits;
  if (is_a <bb_vec_info> (vinfo)
      && (!nunits.is_constant (&const_nunits)
	  || const_nunits > group_size))
    {
      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
		       "Build SLP failed: unrolling required "
		       "in basic block SLP\n");
      /* Fatal mismatch.  */
      return false;
    }

  /* In case of multiple types we need to detect the smallest type.  */
  vect_update_max_nunits (max_nunits, vectype);
  return true;
}

/* Verify if the scalar stmts STMTS are isomorphic, require data
   permutation or are of unsupported types of operation.  Return
   true if they are, otherwise return false and indicate in *MATCHES
   which stmts are not isomorphic to the first one.  If MATCHES[0]
   is false then this indicates the comparison could not be
   carried out or the stmts will never be vectorized by SLP.

   Note COND_EXPR is possibly isomorphic to another one after swapping its
   operands.  Set SWAP[i] to 1 if stmt I is COND_EXPR and isomorphic to
   the first stmt by swapping the two operands of comparison; set SWAP[i]
   to 2 if stmt I is isomorphic to the first stmt by inverting the code
   of comparison.  Take A1 >= B1 ? X1 : Y1 as an example, it can be swapped
   to (B1 <= A1 ? X1 : Y1); or be inverted to (A1 < B1) ? Y1 : X1.  */

static bool
vect_build_slp_tree_1 (vec_info *vinfo, unsigned char *swap,
		       vec<gimple *> stmts, unsigned int group_size,
		       unsigned nops, poly_uint64 *max_nunits,
		       bool *matches, bool *two_operators)
{
  unsigned int i;
  gimple *first_stmt = stmts[0], *stmt = stmts[0];
  enum tree_code first_stmt_code = ERROR_MARK;
  enum tree_code alt_stmt_code = ERROR_MARK;
  enum tree_code rhs_code = ERROR_MARK;
  enum tree_code first_cond_code = ERROR_MARK;
  tree lhs;
  bool need_same_oprnds = false;
  tree vectype = NULL_TREE, scalar_type, first_op1 = NULL_TREE;
  optab optab;
  int icode;
  machine_mode optab_op2_mode;
  machine_mode vec_mode;
  HOST_WIDE_INT dummy;
  gimple *first_load = NULL, *prev_first_load = NULL;

  /* For every stmt in NODE find its def stmt/s.  */
  FOR_EACH_VEC_ELT (stmts, i, stmt)
    {
      swap[i] = 0;
      matches[i] = false;

      if (dump_enabled_p ())
	{
	  dump_printf_loc (MSG_NOTE, vect_location, "Build SLP for ");
	  dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
	}

      /* Fail to vectorize statements marked as unvectorizable.  */
      if (!STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (stmt)))
	{
	  if (dump_enabled_p ())
	    {
	      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			       "Build SLP failed: unvectorizable statement ");
	      dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
	    }
	  /* Fatal mismatch.  */
	  matches[0] = false;
	  return false;
	}

      lhs = gimple_get_lhs (stmt);
      if (lhs == NULL_TREE)
	{
	  if (dump_enabled_p ())
	    {
	      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			       "Build SLP failed: not GIMPLE_ASSIGN nor "
			       "GIMPLE_CALL ");
	      dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
	    }
	  /* Fatal mismatch.  */
	  matches[0] = false;
	  return false;
	}

      scalar_type = vect_get_smallest_scalar_type (stmt, &dummy, &dummy);
      vectype = get_vectype_for_scalar_type (scalar_type);
      if (!vect_record_max_nunits (vinfo, stmt, group_size, vectype,
				   max_nunits))
	{
	  /* Fatal mismatch.  */
	  matches[0] = false;
	  return false;
	}

      if (gcall *call_stmt = dyn_cast <gcall *> (stmt))
	{
	  rhs_code = CALL_EXPR;
	  /* Only plain, nothrow, non-variadic-chain direct calls are
	     handled by SLP.  */
	  if (gimple_call_internal_p (call_stmt)
	      || gimple_call_tail_p (call_stmt)
	      || gimple_call_noreturn_p (call_stmt)
	      || !gimple_call_nothrow_p (call_stmt)
	      || gimple_call_chain (call_stmt))
	    {
	      if (dump_enabled_p ())
		{
		  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				   "Build SLP failed: unsupported call type ");
		  dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
				    call_stmt, 0);
		}
	      /* Fatal mismatch.  */
	      matches[0] = false;
	      return false;
	    }
	}
      else
	rhs_code = gimple_assign_rhs_code (stmt);

      /* Check the operation.  */
      if (i == 0)
	{
	  first_stmt_code = rhs_code;

	  /* Shift arguments should be equal in all the packed stmts for a
	     vector shift with scalar shift operand.  */
	  if (rhs_code == LSHIFT_EXPR || rhs_code == RSHIFT_EXPR
	      || rhs_code == LROTATE_EXPR
	      || rhs_code == RROTATE_EXPR)
	    {
	      vec_mode = TYPE_MODE (vectype);

	      /* First see if we have a vector/vector shift.  */
	      optab = optab_for_tree_code (rhs_code, vectype,
					   optab_vector);

	      if (!optab
		  || optab_handler (optab, vec_mode) == CODE_FOR_nothing)
		{
		  /* No vector/vector shift, try for a vector/scalar shift.  */
		  optab = optab_for_tree_code (rhs_code, vectype,
					       optab_scalar);

		  if (!optab)
		    {
		      if (dump_enabled_p ())
			dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
					 "Build SLP failed: no optab.\n");
		      /* Fatal mismatch.  */
		      matches[0] = false;
		      return false;
		    }
		  icode = (int) optab_handler (optab, vec_mode);
		  if (icode == CODE_FOR_nothing)
		    {
		      if (dump_enabled_p ())
			dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
					 "Build SLP failed: "
					 "op not supported by target.\n");
		      /* Fatal mismatch.  */
		      matches[0] = false;
		      return false;
		    }
		  optab_op2_mode = insn_data[icode].operand[2].mode;
		  if (!VECTOR_MODE_P (optab_op2_mode))
		    {
		      /* Vector/scalar shift: all stmts must share the same
			 shift amount.  */
		      need_same_oprnds = true;
		      first_op1 = gimple_assign_rhs2 (stmt);
		    }
		}
	    }
	  else if (rhs_code == WIDEN_LSHIFT_EXPR)
	    {
	      need_same_oprnds = true;
	      first_op1 = gimple_assign_rhs2 (stmt);
	    }
	}
      else
	{
	  /* Remember the first differing operation so a plus/minus mix can
	     later be handled as a two-operator node.  */
	  if (first_stmt_code != rhs_code
	      && alt_stmt_code == ERROR_MARK)
	    alt_stmt_code = rhs_code;
	  if (first_stmt_code != rhs_code
	      && (first_stmt_code != IMAGPART_EXPR
		  || rhs_code != REALPART_EXPR)
	      && (first_stmt_code != REALPART_EXPR
		  || rhs_code != IMAGPART_EXPR)
	      /* Handle mismatches in plus/minus by computing both
		 and merging the results.  */
	      && !((first_stmt_code == PLUS_EXPR
		    || first_stmt_code == MINUS_EXPR)
		   && (alt_stmt_code == PLUS_EXPR
		       || alt_stmt_code == MINUS_EXPR)
		   && rhs_code == alt_stmt_code)
	      && !(STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (stmt))
		   && (first_stmt_code == ARRAY_REF
		       || first_stmt_code == BIT_FIELD_REF
		       || first_stmt_code == INDIRECT_REF
		       || first_stmt_code == COMPONENT_REF
		       || first_stmt_code == MEM_REF)))
	    {
	      if (dump_enabled_p ())
		{
		  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				   "Build SLP failed: different operation "
				   "in stmt ");
		  dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
		  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				   "original stmt ");
		  dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
				    first_stmt, 0);
		}
	      /* Mismatch.  */
	      continue;
	    }

	  if (need_same_oprnds
	      && !operand_equal_p (first_op1, gimple_assign_rhs2 (stmt), 0))
	    {
	      if (dump_enabled_p ())
		{
		  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				   "Build SLP failed: different shift "
				   "arguments in ");
		  dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
		}
	      /* Mismatch.  */
	      continue;
	    }

	  if (rhs_code == CALL_EXPR)
	    {
	      /* All calls in the node must invoke the same function with
		 the same signature.  */
	      gimple *first_stmt = stmts[0];
	      if (gimple_call_num_args (stmt) != nops
		  || !operand_equal_p (gimple_call_fn (first_stmt),
				       gimple_call_fn (stmt), 0)
		  || gimple_call_fntype (first_stmt)
		     != gimple_call_fntype (stmt))
		{
		  if (dump_enabled_p ())
		    {
		      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				       "Build SLP failed: different calls in ");
		      dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
					stmt, 0);
		    }
		  /* Mismatch.  */
		  continue;
		}
	    }
	}

      /* Grouped store or load.  */
      if (STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (stmt)))
	{
	  if (REFERENCE_CLASS_P (lhs))
	    {
	      /* Store.  */
	      ;
	    }
	  else
	    {
	      /* Load.  */
	      first_load = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt));
	      if (prev_first_load)
		{
		  /* Check that there are no loads from different interleaving
		     chains in the same node.  */
		  if (prev_first_load != first_load)
		    {
		      if (dump_enabled_p ())
			{
			  dump_printf_loc (MSG_MISSED_OPTIMIZATION,
					   vect_location,
					   "Build SLP failed: different "
					   "interleaving chains in one node ");
			  dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
					    stmt, 0);
			}
		      /* Mismatch.  */
		      continue;
		    }
		}
	      else
		prev_first_load = first_load;
	    }
	} /* Grouped access.  */
      else
	{
	  if (TREE_CODE_CLASS (rhs_code) == tcc_reference)
	    {
	      /* Not grouped load.  */
	      if (dump_enabled_p ())
		{
		  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				   "Build SLP failed: not grouped load ");
		  dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
		}

	      /* FORNOW: Not grouped loads are not supported.  */
	      /* Fatal mismatch.  */
	      matches[0] = false;
	      return false;
	    }

	  /* Not memory operation.  */
	  if (TREE_CODE_CLASS (rhs_code) != tcc_binary
	      && TREE_CODE_CLASS (rhs_code) != tcc_unary
	      && TREE_CODE_CLASS (rhs_code) != tcc_expression
	      && TREE_CODE_CLASS (rhs_code) != tcc_comparison
	      && rhs_code != CALL_EXPR)
	    {
	      if (dump_enabled_p ())
		{
		  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				   "Build SLP failed: operation");
		  dump_printf (MSG_MISSED_OPTIMIZATION, " unsupported ");
		  dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
		}
	      /* Fatal mismatch.  */
	      matches[0] = false;
	      return false;
	    }

	  if (rhs_code == COND_EXPR)
	    {
	      /* COND_EXPRs can match the first stmt directly, after
		 swapping the comparison operands, or after inverting the
		 comparison code; record which in SWAP[i].  */
	      tree cond_expr = gimple_assign_rhs1 (stmt);
	      enum tree_code cond_code = TREE_CODE (cond_expr);
	      enum tree_code swap_code = ERROR_MARK;
	      enum tree_code invert_code = ERROR_MARK;

	      if (i == 0)
		first_cond_code = TREE_CODE (cond_expr);
	      else if (TREE_CODE_CLASS (cond_code) == tcc_comparison)
		{
		  bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr, 0));
		  swap_code = swap_tree_comparison (cond_code);
		  invert_code = invert_tree_comparison (cond_code, honor_nans);
		}

	      if (first_cond_code == cond_code)
		;
	      /* Isomorphic can be achieved by swapping.  */
	      else if (first_cond_code == swap_code)
		swap[i] = 1;
	      /* Isomorphic can be achieved by inverting.  */
	      else if (first_cond_code == invert_code)
		swap[i] = 2;
	      else
		{
		  if (dump_enabled_p ())
		    {
		      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				       "Build SLP failed: different"
				       " operation");
		      dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
					stmt, 0);
		    }
		  /* Mismatch.  */
		  continue;
		}
	    }
	}

      matches[i] = true;
    }

  for (i = 0; i < group_size; ++i)
    if (!matches[i])
      return false;

  /* If we allowed a two-operation SLP node verify the target can cope
     with the permute we are going to use.  */
  poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
  if (alt_stmt_code != ERROR_MARK
      && TREE_CODE_CLASS (alt_stmt_code) != tcc_reference)
    {
      unsigned HOST_WIDE_INT count;
      if (!nunits.is_constant (&count))
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "Build SLP failed: different operations "
			     "not allowed with variable-length SLP.\n");
	  return false;
	}
      /* Build the blend selecting from the first-op result or the
	 alt-op result per lane.  */
      vec_perm_builder sel (count, count, 1);
      for (i = 0; i < count; ++i)
	{
	  unsigned int elt = i;
	  if (gimple_assign_rhs_code (stmts[i % group_size]) == alt_stmt_code)
	    elt += count;
	  sel.quick_push (elt);
	}
      vec_perm_indices indices (sel, 2, count);
      if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
	{
	  /* Mark every alt-op stmt as the mismatching ones.  */
	  for (i = 0; i < group_size; ++i)
	    if (gimple_assign_rhs_code (stmts[i]) == alt_stmt_code)
	      {
		matches[i] = false;
		if (dump_enabled_p ())
		  {
		    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				     "Build SLP failed: different operation "
				     "in stmt ");
		    dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
				      stmts[i], 0);
		    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				     "original stmt ");
		    dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
				      first_stmt, 0);
		  }
	      }
	  return false;
	}
      *two_operators = true;
    }

  return true;
}

/* Traits for the hash_set to record failed SLP builds for a stmt set.
   Note we never remove apart from at destruction time so we do not
   need a special value for deleted that differs from empty.  */
struct bst_traits
{
  typedef vec <gimple *> value_type;
  typedef vec <gimple *> compare_type;
  static inline hashval_t hash (value_type);
  static inline bool equal (value_type existing, value_type candidate);
  static inline bool is_empty (value_type x) { return !x.exists (); }
  static inline bool is_deleted (value_type x) { return !x.exists (); }
  static inline void mark_empty (value_type &x) { x.release (); }
  static inline void mark_deleted (value_type &x) { x.release (); }
  static inline void remove (value_type &x) { x.release (); }
};
/* Hash a stmt vector by combining the UIDs of its members.  */
inline hashval_t
bst_traits::hash (value_type x)
{
  inchash::hash h;
  for (unsigned i = 0; i < x.length (); ++i)
    h.add_int (gimple_uid (x[i]));
  return h.end ();
}
/* Two stmt vectors are equal iff they have the same stmts in the same
   order.  */
inline bool
bst_traits::equal (value_type existing, value_type candidate)
{
  if (existing.length () != candidate.length ())
    return false;
  for (unsigned i = 0; i < existing.length (); ++i)
    if (existing[i] != candidate[i])
      return false;
  return true;
}

typedef hash_set <vec <gimple *>, bst_traits> scalar_stmts_set_t;
/* Set of stmt vectors for which SLP build already failed; consulted to
   avoid exponential re-analysis (PR81723).  */
static scalar_stmts_set_t *bst_fail;

static slp_tree
vect_build_slp_tree_2 (vec_info *vinfo,
		       vec<gimple *> stmts, unsigned int group_size,
		       poly_uint64 *max_nunits,
		       vec<slp_tree> *loads,
		       bool *matches, unsigned *npermutes, unsigned *tree_size,
		       unsigned max_tree_size);

/* Memoizing wrapper around vect_build_slp_tree_2: skip stmt sets that
   already failed to build, and record new failures in BST_FAIL.  */

static slp_tree
vect_build_slp_tree (vec_info *vinfo,
		     vec<gimple *> stmts, unsigned int group_size,
		     poly_uint64 *max_nunits, vec<slp_tree> *loads,
		     bool *matches, unsigned *npermutes, unsigned *tree_size,
		     unsigned max_tree_size)
{
  if (bst_fail->contains (stmts))
    return NULL;
  slp_tree res = vect_build_slp_tree_2 (vinfo, stmts, group_size, max_nunits,
					loads, matches, npermutes, tree_size,
					max_tree_size);
  /* When SLP build fails for stmts record this, otherwise SLP build
     can be exponential in time when we allow to construct parts from
     scalars, see PR81723.  */
  if (! res)
    {
      /* The set takes ownership, so copy STMTS.  */
      vec <gimple *> x;
      x.create (stmts.length ());
      x.splice (stmts);
      bst_fail->add (x);
    }
  return res;
}

/* Recursively build an SLP tree starting from NODE.
   Fail (and return a value not equal to zero) if def-stmts are not
   isomorphic, require data permutation or are of unsupported types of
   operation.  Otherwise, return 0.
   The value returned is the depth in the SLP tree where a mismatch
   was found.  */

static slp_tree
vect_build_slp_tree_2 (vec_info *vinfo,
		       vec<gimple *> stmts, unsigned int group_size,
		       poly_uint64 *max_nunits,
		       vec<slp_tree> *loads,
		       bool *matches, unsigned *npermutes, unsigned *tree_size,
		       unsigned max_tree_size)
{
  unsigned nops, i, this_tree_size = 0;
  poly_uint64 this_max_nunits = *max_nunits;
  gimple *stmt;
  slp_tree node;

  matches[0] = false;

  /* Operand count mirrors vect_create_new_slp_node.  */
  stmt = stmts[0];
  if (is_gimple_call (stmt))
    nops = gimple_call_num_args (stmt);
  else if (is_gimple_assign (stmt))
    {
      nops = gimple_num_ops (stmt) - 1;
      if (gimple_assign_rhs_code (stmt) == COND_EXPR)
	nops++;
    }
  else if (gimple_code (stmt) == GIMPLE_PHI)
    nops = 0;
  else
    return NULL;

  /* If the SLP node is a PHI (induction or reduction), terminate
     the recursion.
 */
  if (gimple_code (stmt) == GIMPLE_PHI)
    {
      tree scalar_type = TREE_TYPE (PHI_RESULT (stmt));
      tree vectype = get_vectype_for_scalar_type (scalar_type);
      if (!vect_record_max_nunits (vinfo, stmt, group_size, vectype,
				   max_nunits))
	return NULL;

      vect_def_type def_type = STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmt));
      /* Induction from different IVs is not supported.  */
      if (def_type == vect_induction_def)
	{
	  FOR_EACH_VEC_ELT (stmts, i, stmt)
	    if (stmt != stmts[0])
	      return NULL;
	}
      else
	{
	  /* Else def types have to match.  */
	  FOR_EACH_VEC_ELT (stmts, i, stmt)
	    {
	      /* But for reduction chains only check on the first stmt.  */
	      if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt))
		  && GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) != stmt)
		continue;
	      if (STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmt)) != def_type)
		return NULL;
	    }
	}
      node = vect_create_new_slp_node (stmts);
      return node;
    }


  bool two_operators = false;
  unsigned char *swap = XALLOCAVEC (unsigned char, group_size);
  if (!vect_build_slp_tree_1 (vinfo, swap,
			      stmts, group_size, nops,
			      &this_max_nunits, matches, &two_operators))
    return NULL;

  /* If the SLP node is a load, terminate the recursion.  */
  if (STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (stmt))
      && DR_IS_READ (STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt))))
    {
      *max_nunits = this_max_nunits;
      node = vect_create_new_slp_node (stmts);
      loads->safe_push (node);
      return node;
    }

  /* Get at the operands, verifying they are compatible.  */
  vec<slp_oprnd_info> oprnds_info = vect_create_oprnd_info (nops, group_size);
  slp_oprnd_info oprnd_info;
  FOR_EACH_VEC_ELT (stmts, i, stmt)
    {
      int res = vect_get_and_check_slp_defs (vinfo, &swap[i],
					     stmts, i, &oprnds_info);
      /* res == -1 signals a fatal (whole-group) mismatch; otherwise only
	 lane I failed.  */
      if (res != 0)
	matches[(res == -1) ? 0 : i] = false;
      if (!matches[0])
	break;
    }
  for (i = 0; i < group_size; ++i)
    if (!matches[i])
      {
	vect_free_oprnd_info (oprnds_info);
	return NULL;
      }

  auto_vec<slp_tree, 4> children;
  auto_vec<slp_tree> this_loads;

  stmt = stmts[0];

  if (tree_size)
    max_tree_size -= *tree_size;

  /* Create SLP_TREE nodes for the definition node/s.  */
  FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
    {
      slp_tree child;
      unsigned old_nloads = this_loads.length ();
      unsigned old_tree_size = this_tree_size;
      unsigned int j;

      if (oprnd_info->first_dt != vect_internal_def
	  && oprnd_info->first_dt != vect_reduction_def
	  && oprnd_info->first_dt != vect_induction_def)
	continue;

      if (++this_tree_size > max_tree_size)
	{
	  FOR_EACH_VEC_ELT (children, j, child)
	    vect_free_slp_tree (child);
	  vect_free_oprnd_info (oprnds_info);
	  return NULL;
	}

      if ((child = vect_build_slp_tree (vinfo, oprnd_info->def_stmts,
					group_size, &this_max_nunits,
					&this_loads, matches, npermutes,
					&this_tree_size,
					max_tree_size)) != NULL)
	{
	  /* If we have all children of child built up from scalars then just
	     throw that away and build it up this node from scalars.  */
	  if (!SLP_TREE_CHILDREN (child).is_empty ()
	      /* ??? Rejecting patterns this way doesn't work.  We'd have to
		 do extra work to cancel the pattern so the uses see the
		 scalar version.  */
	      && !is_pattern_stmt_p
		    (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (child)[0])))
	    {
	      slp_tree grandchild;

	      FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (child), j, grandchild)
		if (SLP_TREE_DEF_TYPE (grandchild) == vect_internal_def)
		  break;
	      /* GRANDCHILD is NULL iff the loop ran to completion, i.e. all
		 grandchildren are external.  */
	      if (!grandchild)
		{
		  /* Roll back.  */
		  this_loads.truncate (old_nloads);
		  this_tree_size = old_tree_size;
		  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (child), j, grandchild)
		    vect_free_slp_tree (grandchild);
		  SLP_TREE_CHILDREN (child).truncate (0);

		  dump_printf_loc (MSG_NOTE, vect_location,
				   "Building parent vector operands from "
				   "scalars instead\n");
		  oprnd_info->def_stmts = vNULL;
		  SLP_TREE_DEF_TYPE (child) = vect_external_def;
		  children.safe_push (child);
		  continue;
		}
	    }

	  oprnd_info->def_stmts = vNULL;
	  children.safe_push (child);
	  continue;
	}

      /* If the SLP build failed fatally and we analyze a basic-block
	 simply treat nodes we fail to build as externally defined
	 (and thus build vectors from the scalar defs).
	 The cost model will reject outright expensive cases.
	 ??? This doesn't treat cases where permutation ultimatively
	 fails (or we don't try permutation below).  Ideally we'd
	 even compute a permutation that will end up with the maximum
	 SLP tree size...  */
      if (is_a <bb_vec_info> (vinfo)
	  && !matches[0]
	  /* ??? Rejecting patterns this way doesn't work.  We'd have to
	     do extra work to cancel the pattern so the uses see the
	     scalar version.  */
	  && !is_pattern_stmt_p (vinfo_for_stmt (stmt)))
	{
	  dump_printf_loc (MSG_NOTE, vect_location,
			   "Building vector operands from scalars\n");
	  child = vect_create_new_slp_node (oprnd_info->def_stmts);
	  SLP_TREE_DEF_TYPE (child) = vect_external_def;
	  children.safe_push (child);
	  oprnd_info->def_stmts = vNULL;
	  continue;
	}

      /* If the SLP build for operand zero failed and operand zero
	 and one can be commutated try that for the scalar stmts
	 that failed the match.  */
      if (i == 0
	  /* A first scalar stmt mismatch signals a fatal mismatch.  */
	  && matches[0]
	  /* ??? For COND_EXPRs we can swap the comparison operands
	     as well as the arms under some constraints.  */
	  && nops == 2
	  && oprnds_info[1]->first_dt == vect_internal_def
	  && is_gimple_assign (stmt)
	  /* Do so only if the number of not successful permutes was not
	     more than a cut-off as re-trying the recursive match on
	     possibly each level of the tree would expose exponential
	     behavior.  */
	  && *npermutes < 4)
	{
	  /* See whether we can swap the matching or the non-matching
	     stmt operands.  */
	  bool swap_not_matching = true;
	  do
	    {
	      for (j = 0; j < group_size; ++j)
		{
		  if (matches[j] != !swap_not_matching)
		    continue;
		  gimple *stmt = stmts[j];
		  /* Verify if we can swap operands of this stmt.  */
		  if (!is_gimple_assign (stmt)
		      || !commutative_tree_code (gimple_assign_rhs_code (stmt)))
		    {
		      if (!swap_not_matching)
			goto fail;
		      swap_not_matching = false;
		      break;
		    }
		  /* Verify if we can safely swap or if we committed to a
		     specific operand order already.
		     ??? Instead of modifying GIMPLE stmts here we could
		     record whether we want to swap operands in the SLP
		     node and temporarily do that when processing it
		     (or wrap operand accessors in a helper).  */
		  else if (swap[j] != 0
			   || STMT_VINFO_NUM_SLP_USES (vinfo_for_stmt (stmt)))
		    {
		      if (!swap_not_matching)
			{
			  if (dump_enabled_p ())
			    {
			      dump_printf_loc (MSG_MISSED_OPTIMIZATION,
					       vect_location,
					       "Build SLP failed: cannot swap "
					       "operands of shared stmt ");
			      dump_gimple_stmt (MSG_MISSED_OPTIMIZATION,
						TDF_SLIM, stmts[j], 0);
			    }
			  goto fail;
			}
		      swap_not_matching = false;
		      break;
		    }
		}
	    }
	  while (j != group_size);

	  /* Swap mismatched definition stmts.  */
	  dump_printf_loc (MSG_NOTE, vect_location,
			   "Re-trying with swapped operands of stmts ");
	  for (j = 0; j < group_size; ++j)
	    if (matches[j] == !swap_not_matching)
	      {
		std::swap (oprnds_info[0]->def_stmts[j],
			   oprnds_info[1]->def_stmts[j]);
		dump_printf (MSG_NOTE, "%d ", j);
	      }
	  dump_printf (MSG_NOTE, "\n");
	  /* And try again with scratch 'matches' ... */
	  bool *tem = XALLOCAVEC (bool, group_size);
	  if ((child = vect_build_slp_tree (vinfo, oprnd_info->def_stmts,
					    group_size, &this_max_nunits,
					    &this_loads, tem, npermutes,
					    &this_tree_size,
					    max_tree_size)) != NULL)
	    {
	      /* ... so if successful we can apply the operand swapping
		 to the GIMPLE IL.  This is necessary because for example
		 vect_get_slp_defs uses operand indexes and thus expects
		 canonical operand order.  This is also necessary even
		 if we end up building the operand from scalars as
		 we'll continue to process swapped operand two.  */
	      for (j = 0; j < group_size; ++j)
		{
		  gimple *stmt = stmts[j];
		  gimple_set_plf (stmt, GF_PLF_1, false);
		}
	      for (j = 0; j < group_size; ++j)
		{
		  gimple *stmt = stmts[j];
		  if (matches[j] == !swap_not_matching)
		    {
		      /* Avoid swapping operands twice.  */
		      if (gimple_plf (stmt, GF_PLF_1))
			continue;
		      swap_ssa_operands (stmt, gimple_assign_rhs1_ptr (stmt),
					 gimple_assign_rhs2_ptr (stmt));
		      gimple_set_plf (stmt, GF_PLF_1, true);
		    }
		}
	      /* Verify we swap all duplicates or none.  */
	      if (flag_checking)
		for (j = 0; j < group_size; ++j)
		  {
		    gimple *stmt = stmts[j];
		    gcc_assert (gimple_plf (stmt, GF_PLF_1)
				== (matches[j] == !swap_not_matching));
		  }

	      /* If we have all children of child built up from scalars then
		 just throw that away and build it up this node from scalars.  */
	      if (!SLP_TREE_CHILDREN (child).is_empty ()
		  /* ??? Rejecting patterns this way doesn't work.  We'd have
		     to do extra work to cancel the pattern so the uses see the
		     scalar version.  */
		  && !is_pattern_stmt_p
			(vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (child)[0])))
		{
		  unsigned int j;
		  slp_tree grandchild;

		  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (child), j, grandchild)
		    if (SLP_TREE_DEF_TYPE (grandchild) == vect_internal_def)
		      break;
		  if (!grandchild)
		    {
		      /* Roll back.  */
		      this_loads.truncate (old_nloads);
		      this_tree_size = old_tree_size;
		      FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (child), j,
					grandchild)
			vect_free_slp_tree (grandchild);
		      SLP_TREE_CHILDREN (child).truncate (0);

		      dump_printf_loc (MSG_NOTE, vect_location,
				       "Building parent vector operands from "
				       "scalars instead\n");
		      oprnd_info->def_stmts = vNULL;
		      SLP_TREE_DEF_TYPE (child) = vect_external_def;
		      children.safe_push (child);
		      continue;
		    }
		}

	      oprnd_info->def_stmts = vNULL;
	      children.safe_push (child);
	      continue;
	    }

	  ++*npermutes;
	}

fail:
      gcc_assert (child == NULL);
      FOR_EACH_VEC_ELT (children, j, child)
	vect_free_slp_tree (child);
      vect_free_oprnd_info (oprnds_info);
      return NULL;
    }

  vect_free_oprnd_info (oprnds_info);

  if (tree_size)
    *tree_size += this_tree_size;
  *max_nunits = this_max_nunits;
  loads->safe_splice (this_loads);

  node = vect_create_new_slp_node (stmts);
  SLP_TREE_TWO_OPERATORS (node) = two_operators;
  SLP_TREE_CHILDREN (node).splice (children);
  return node;
}

/* Dump a slp tree NODE using flags specified in DUMP_KIND.  */

static void
vect_print_slp_tree (dump_flags_t dump_kind, location_t loc, slp_tree node)
{
  int i;
  gimple *stmt;
  slp_tree child;

  dump_printf_loc (dump_kind, loc, "node%s\n",
		   SLP_TREE_DEF_TYPE (node) != vect_internal_def
		   ?
" (external)" : ""); 1492 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt) 1493 { 1494 dump_printf_loc (dump_kind, loc, "\tstmt %d ", i); 1495 dump_gimple_stmt (dump_kind, TDF_SLIM, stmt, 0); 1496 } 1497 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child) 1498 vect_print_slp_tree (dump_kind, loc, child); 1499 } 1500 1501 1502 /* Mark the tree rooted at NODE with MARK (PURE_SLP or HYBRID). 1503 If MARK is HYBRID, it refers to a specific stmt in NODE (the stmt at index 1504 J). Otherwise, MARK is PURE_SLP and J is -1, which indicates that all the 1505 stmts in NODE are to be marked. */ 1506 1507 static void 1508 vect_mark_slp_stmts (slp_tree node, enum slp_vect_type mark, int j) 1509 { 1510 int i; 1511 gimple *stmt; 1512 slp_tree child; 1513 1514 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def) 1515 return; 1516 1517 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt) 1518 if (j < 0 || i == j) 1519 STMT_SLP_TYPE (vinfo_for_stmt (stmt)) = mark; 1520 1521 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child) 1522 vect_mark_slp_stmts (child, mark, j); 1523 } 1524 1525 1526 /* Mark the statements of the tree rooted at NODE as relevant (vect_used). */ 1527 1528 static void 1529 vect_mark_slp_stmts_relevant (slp_tree node) 1530 { 1531 int i; 1532 gimple *stmt; 1533 stmt_vec_info stmt_info; 1534 slp_tree child; 1535 1536 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def) 1537 return; 1538 1539 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt) 1540 { 1541 stmt_info = vinfo_for_stmt (stmt); 1542 gcc_assert (!STMT_VINFO_RELEVANT (stmt_info) 1543 || STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope); 1544 STMT_VINFO_RELEVANT (stmt_info) = vect_used_in_scope; 1545 } 1546 1547 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child) 1548 vect_mark_slp_stmts_relevant (child); 1549 } 1550 1551 1552 /* Rearrange the statements of NODE according to PERMUTATION. 
*/ 1553 1554 static void 1555 vect_slp_rearrange_stmts (slp_tree node, unsigned int group_size, 1556 vec<unsigned> permutation) 1557 { 1558 gimple *stmt; 1559 vec<gimple *> tmp_stmts; 1560 unsigned int i; 1561 slp_tree child; 1562 1563 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child) 1564 vect_slp_rearrange_stmts (child, group_size, permutation); 1565 1566 gcc_assert (group_size == SLP_TREE_SCALAR_STMTS (node).length ()); 1567 tmp_stmts.create (group_size); 1568 tmp_stmts.quick_grow_cleared (group_size); 1569 1570 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt) 1571 tmp_stmts[permutation[i]] = stmt; 1572 1573 SLP_TREE_SCALAR_STMTS (node).release (); 1574 SLP_TREE_SCALAR_STMTS (node) = tmp_stmts; 1575 } 1576 1577 1578 /* Attempt to reorder stmts in a reduction chain so that we don't 1579 require any load permutation. Return true if that was possible, 1580 otherwise return false. */ 1581 1582 static bool 1583 vect_attempt_slp_rearrange_stmts (slp_instance slp_instn) 1584 { 1585 unsigned int group_size = SLP_INSTANCE_GROUP_SIZE (slp_instn); 1586 unsigned int i, j; 1587 unsigned int lidx; 1588 slp_tree node, load; 1589 1590 /* Compare all the permutation sequences to the first one. We know 1591 that at least one load is permuted. */ 1592 node = SLP_INSTANCE_LOADS (slp_instn)[0]; 1593 if (!node->load_permutation.exists ()) 1594 return false; 1595 for (i = 1; SLP_INSTANCE_LOADS (slp_instn).iterate (i, &load); ++i) 1596 { 1597 if (!load->load_permutation.exists ()) 1598 return false; 1599 FOR_EACH_VEC_ELT (load->load_permutation, j, lidx) 1600 if (lidx != node->load_permutation[j]) 1601 return false; 1602 } 1603 1604 /* Check that the loads in the first sequence are different and there 1605 are no gaps between them. 
*/ 1606 auto_sbitmap load_index (group_size); 1607 bitmap_clear (load_index); 1608 FOR_EACH_VEC_ELT (node->load_permutation, i, lidx) 1609 { 1610 if (lidx >= group_size) 1611 return false; 1612 if (bitmap_bit_p (load_index, lidx)) 1613 return false; 1614 1615 bitmap_set_bit (load_index, lidx); 1616 } 1617 for (i = 0; i < group_size; i++) 1618 if (!bitmap_bit_p (load_index, i)) 1619 return false; 1620 1621 /* This permutation is valid for reduction. Since the order of the 1622 statements in the nodes is not important unless they are memory 1623 accesses, we can rearrange the statements in all the nodes 1624 according to the order of the loads. */ 1625 vect_slp_rearrange_stmts (SLP_INSTANCE_TREE (slp_instn), group_size, 1626 node->load_permutation); 1627 1628 /* We are done, no actual permutations need to be generated. */ 1629 poly_uint64 unrolling_factor = SLP_INSTANCE_UNROLLING_FACTOR (slp_instn); 1630 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node) 1631 { 1632 gimple *first_stmt = SLP_TREE_SCALAR_STMTS (node)[0]; 1633 first_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (first_stmt)); 1634 /* But we have to keep those permutations that are required because 1635 of handling of gaps. */ 1636 if (known_eq (unrolling_factor, 1U) 1637 || (group_size == GROUP_SIZE (vinfo_for_stmt (first_stmt)) 1638 && GROUP_GAP (vinfo_for_stmt (first_stmt)) == 0)) 1639 SLP_TREE_LOAD_PERMUTATION (node).release (); 1640 else 1641 for (j = 0; j < SLP_TREE_LOAD_PERMUTATION (node).length (); ++j) 1642 SLP_TREE_LOAD_PERMUTATION (node)[j] = j; 1643 } 1644 1645 return true; 1646 } 1647 1648 /* Check if the required load permutations in the SLP instance 1649 SLP_INSTN are supported. 
 */

static bool
vect_supported_load_permutation_p (slp_instance slp_instn)
{
  unsigned int group_size = SLP_INSTANCE_GROUP_SIZE (slp_instn);
  unsigned int i, j, k, next;
  slp_tree node;
  gimple *stmt, *load, *next_load;

  if (dump_enabled_p ())
    {
      dump_printf_loc (MSG_NOTE, vect_location, "Load permutation ");
      FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
	if (node->load_permutation.exists ())
	  FOR_EACH_VEC_ELT (node->load_permutation, j, next)
	    dump_printf (MSG_NOTE, "%d ", next);
	else
	  /* No permutation recorded means the identity permutation.  */
	  for (k = 0; k < group_size; ++k)
	    dump_printf (MSG_NOTE, "%d ", k);
      dump_printf (MSG_NOTE, "\n");
    }

  /* In case of reduction every load permutation is allowed, since the order
     of the reduction statements is not important (as opposed to the case of
     grouped stores).  The only condition we need to check is that all the
     load nodes are of the same size and have the same permutation (and then
     rearrange all the nodes of the SLP instance according to this
     permutation).  */

  /* Check that all the load nodes are of the same size.  */
  /* ??? Can't we assert this? */
  FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
    if (SLP_TREE_SCALAR_STMTS (node).length () != (unsigned) group_size)
      return false;

  node = SLP_INSTANCE_TREE (slp_instn);
  stmt = SLP_TREE_SCALAR_STMTS (node)[0];

  /* Reduction (there are no data-refs in the root).
     In reduction chain the order of the loads is not important.  */
  if (!STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt))
      && !GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
    vect_attempt_slp_rearrange_stmts (slp_instn);

  /* In basic block vectorization we allow any subchain of an interleaving
     chain.
     FORNOW: not supported in loop SLP because of realignment complications.  */
  if (STMT_VINFO_BB_VINFO (vinfo_for_stmt (stmt)))
    {
      /* Check whether the loads in an instance form a subchain and thus
	 no permutation is necessary.  */
      FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
	{
	  if (!SLP_TREE_LOAD_PERMUTATION (node).exists ())
	    continue;
	  bool subchain_p = true;
	  next_load = NULL;
	  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load)
	    {
	      /* Consecutive in the interleaving chain with gap 1?  */
	      if (j != 0
		  && (next_load != load
		      || GROUP_GAP (vinfo_for_stmt (load)) != 1))
		{
		  subchain_p = false;
		  break;
		}
	      next_load = GROUP_NEXT_ELEMENT (vinfo_for_stmt (load));
	    }
	  if (subchain_p)
	    SLP_TREE_LOAD_PERMUTATION (node).release ();
	  else
	    {
	      stmt_vec_info group_info
		= vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (node)[0]);
	      group_info = vinfo_for_stmt (GROUP_FIRST_ELEMENT (group_info));
	      unsigned HOST_WIDE_INT nunits;
	      unsigned k, maxk = 0;
	      FOR_EACH_VEC_ELT (SLP_TREE_LOAD_PERMUTATION (node), j, k)
		if (k > maxk)
		  maxk = k;
	      /* In BB vectorization we may not actually use a loaded vector
		 accessing elements in excess of GROUP_SIZE.  */
	      tree vectype = STMT_VINFO_VECTYPE (group_info);
	      if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant (&nunits)
		  || maxk >= (GROUP_SIZE (group_info) & ~(nunits - 1)))
		{
		  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				   "BB vectorization with gaps at the end of "
				   "a load is not supported\n");
		  return false;
		}

	      /* Verify the permutation can be generated.  */
	      vec<tree> tem;
	      unsigned n_perms;
	      if (!vect_transform_slp_perm_load (node, tem, NULL,
						 1, slp_instn, true, &n_perms))
		{
		  dump_printf_loc (MSG_MISSED_OPTIMIZATION,
				   vect_location,
				   "unsupported load permutation\n");
		  return false;
		}
	    }
	}
      return true;
    }

  /* For loop vectorization verify we can generate the permutation.  Be
     conservative about the vectorization factor, there are permutations
     that will use three vector inputs only starting from a specific factor
     and the vectorization factor is not yet final.
     ??? The SLP instance unrolling factor might not be the maximum one.  */
  unsigned n_perms;
  poly_uint64 test_vf
    = force_common_multiple (SLP_INSTANCE_UNROLLING_FACTOR (slp_instn),
			     LOOP_VINFO_VECT_FACTOR
			       (STMT_VINFO_LOOP_VINFO (vinfo_for_stmt (stmt))));
  FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
    if (node->load_permutation.exists ()
	&& !vect_transform_slp_perm_load (node, vNULL, NULL, test_vf,
					  slp_instn, true, &n_perms))
      return false;

  return true;
}


/* Find the last store in SLP INSTANCE.  */

gimple *
vect_find_last_scalar_stmt_in_slp (slp_tree node)
{
  gimple *last = NULL, *stmt;

  for (int i = 0; SLP_TREE_SCALAR_STMTS (node).iterate (i, &stmt); i++)
    {
      stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
      /* For pattern stmts compare against the original scalar stmt's
	 position in the IL.  */
      if (is_pattern_stmt_p (stmt_vinfo))
	last = get_later_stmt (STMT_VINFO_RELATED_STMT (stmt_vinfo), last);
      else
	last = get_later_stmt (stmt, last);
    }

  return last;
}

/* Compute the cost for the SLP node NODE in the SLP instance INSTANCE.  */

static void
vect_analyze_slp_cost_1 (slp_instance instance, slp_tree node,
			 stmt_vector_for_cost *prologue_cost_vec,
			 stmt_vector_for_cost *body_cost_vec,
			 unsigned ncopies_for_cost,
			 scalar_stmts_set_t* visited)
{
  unsigned i, j;
  slp_tree child;
  gimple *stmt;
  stmt_vec_info stmt_info;
  tree lhs;

  /* If we already costed the exact same set of scalar stmts we're done.
     We share the generated vector stmts for those.
 */
  if (visited->contains (SLP_TREE_SCALAR_STMTS (node)))
    return;

  /* The visited set takes ownership of a copy of the stmt vector.  */
  visited->add (SLP_TREE_SCALAR_STMTS (node).copy ());

  /* Recurse down the SLP tree.  */
  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    if (SLP_TREE_DEF_TYPE (child) == vect_internal_def)
      vect_analyze_slp_cost_1 (instance, child, prologue_cost_vec,
			       body_cost_vec, ncopies_for_cost, visited);

  /* Look at the first scalar stmt to determine the cost.  */
  stmt = SLP_TREE_SCALAR_STMTS (node)[0];
  stmt_info = vinfo_for_stmt (stmt);
  if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
    {
      vect_memory_access_type memory_access_type
	= (STMT_VINFO_STRIDED_P (stmt_info)
	   ? VMAT_STRIDED_SLP
	   : VMAT_CONTIGUOUS);
      if (DR_IS_WRITE (STMT_VINFO_DATA_REF (stmt_info)))
	vect_model_store_cost (stmt_info, ncopies_for_cost,
			       memory_access_type, VLS_STORE,
			       node, prologue_cost_vec, body_cost_vec);
      else
	{
	  gcc_checking_assert (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)));
	  if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
	    {
	      /* If the load is permuted then the alignment is determined by
		 the first group element not by the first scalar stmt DR.  */
	      stmt = GROUP_FIRST_ELEMENT (stmt_info);
	      stmt_info = vinfo_for_stmt (stmt);
	      /* Record the cost for the permutation.  */
	      unsigned n_perms;
	      vect_transform_slp_perm_load (node, vNULL, NULL,
					    ncopies_for_cost, instance, true,
					    &n_perms);
	      record_stmt_cost (body_cost_vec, n_perms, vec_perm,
				stmt_info, 0, vect_body);
	      unsigned assumed_nunits
		= vect_nunits_for_cost (STMT_VINFO_VECTYPE (stmt_info));
	      /* And adjust the number of loads performed.  This handles
		 redundancies as well as loads that are later dead.  */
	      auto_sbitmap perm (GROUP_SIZE (stmt_info));
	      bitmap_clear (perm);
	      for (i = 0; i < SLP_TREE_LOAD_PERMUTATION (node).length (); ++i)
		bitmap_set_bit (perm, SLP_TREE_LOAD_PERMUTATION (node)[i]);
	      /* Count one load per vector-sized window that contains at
		 least one used element.  */
	      ncopies_for_cost = 0;
	      bool load_seen = false;
	      for (i = 0; i < GROUP_SIZE (stmt_info); ++i)
		{
		  if (i % assumed_nunits == 0)
		    {
		      if (load_seen)
			ncopies_for_cost++;
		      load_seen = false;
		    }
		  if (bitmap_bit_p (perm, i))
		    load_seen = true;
		}
	      if (load_seen)
		ncopies_for_cost++;
	      gcc_assert (ncopies_for_cost
			  <= (GROUP_SIZE (stmt_info) - GROUP_GAP (stmt_info)
			      + assumed_nunits - 1) / assumed_nunits);
	      poly_uint64 uf = SLP_INSTANCE_UNROLLING_FACTOR (instance);
	      ncopies_for_cost *= estimated_poly_value (uf);
	    }
	  /* Record the cost for the vector loads.  */
	  vect_model_load_cost (stmt_info, ncopies_for_cost,
				memory_access_type, node, prologue_cost_vec,
				body_cost_vec);
	  return;
	}
    }
  else if (STMT_VINFO_TYPE (stmt_info) == induc_vec_info_type)
    {
      /* ncopies_for_cost is the number of IVs we generate.  */
      record_stmt_cost (body_cost_vec, ncopies_for_cost, vector_stmt,
			stmt_info, 0, vect_body);

      /* Prologue cost for the initial values and step vector.  */
      record_stmt_cost (prologue_cost_vec, ncopies_for_cost,
			CONSTANT_CLASS_P
			  (STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED
			     (stmt_info))
			? vector_load : vec_construct,
			stmt_info, 0, vect_prologue);
      record_stmt_cost (prologue_cost_vec, 1,
			CONSTANT_CLASS_P
			  (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info))
			? vector_load : vec_construct,
			stmt_info, 0, vect_prologue);

      /* ??? No easy way to get at the actual number of vector stmts
	 to be generated and thus the derived IVs.  */
    }
  else
    {
      record_stmt_cost (body_cost_vec, ncopies_for_cost, vector_stmt,
			stmt_info, 0, vect_body);
      if (SLP_TREE_TWO_OPERATORS (node))
	{
	  /* Two-operator nodes emit both operations plus a blend.  */
	  record_stmt_cost (body_cost_vec, ncopies_for_cost, vector_stmt,
			    stmt_info, 0, vect_body);
	  record_stmt_cost (body_cost_vec, ncopies_for_cost, vec_perm,
			    stmt_info, 0, vect_body);
	}
    }

  /* Push SLP node def-type to stmts.  */
  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    if (SLP_TREE_DEF_TYPE (child) != vect_internal_def)
      FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (child), j, stmt)
	STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmt)) = SLP_TREE_DEF_TYPE (child);

  /* Scan operands and account for prologue cost of constants/externals.
     ??? This over-estimates cost for multiple uses and should be
     re-engineered.  */
  stmt = SLP_TREE_SCALAR_STMTS (node)[0];
  lhs = gimple_get_lhs (stmt);
  for (i = 0; i < gimple_num_ops (stmt); ++i)
    {
      tree op = gimple_op (stmt, i);
      gimple *def_stmt;
      enum vect_def_type dt;
      if (!op || op == lhs)
	continue;
      if (vect_is_simple_use (op, stmt_info->vinfo, &def_stmt, &dt)
	  && (dt == vect_constant_def || dt == vect_external_def))
	{
	  /* Without looking at the actual initializer a vector of
	     constants can be implemented as load from the constant pool.
	     When all elements are the same we can use a splat.  */
	  tree vectype = get_vectype_for_scalar_type (TREE_TYPE (op));
	  unsigned group_size = SLP_TREE_SCALAR_STMTS (node).length ();
	  unsigned num_vects_to_check;
	  unsigned HOST_WIDE_INT const_nunits;
	  unsigned nelt_limit;
	  if (TYPE_VECTOR_SUBPARTS (vectype).is_constant (&const_nunits)
	      && ! multiple_p (const_nunits, group_size))
	    {
	      num_vects_to_check = SLP_TREE_NUMBER_OF_VEC_STMTS (node);
	      nelt_limit = const_nunits;
	    }
	  else
	    {
	      /* If either the vector has variable length or the vectors
		 are composed of repeated whole groups we only need to
		 cost construction once.  All vectors will be the same.  */
	      num_vects_to_check = 1;
	      nelt_limit = group_size;
	    }
	  tree elt = NULL_TREE;
	  unsigned nelt = 0;
	  for (unsigned j = 0; j < num_vects_to_check * nelt_limit; ++j)
	    {
	      unsigned si = j % group_size;
	      if (nelt == 0)
		elt = gimple_op (SLP_TREE_SCALAR_STMTS (node)[si], i);
	      /* ??? We're just tracking whether all operands of a single
		 vector initializer are the same, ideally we'd check if
		 we emitted the same one already.  */
	      else if (elt != gimple_op (SLP_TREE_SCALAR_STMTS (node)[si], i))
		elt = NULL_TREE;
	      nelt++;
	      if (nelt == nelt_limit)
		{
		  /* ??? We need to pass down stmt_info for a vector type
		     even if it points to the wrong stmt.  */
		  record_stmt_cost (prologue_cost_vec, 1,
				    dt == vect_external_def
				    ? (elt ? scalar_to_vec : vec_construct)
				    : vector_load,
				    stmt_info, 0, vect_prologue);
		  nelt = 0;
		}
	    }
	}
    }

  /* Restore stmt def-types.  */
  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    if (SLP_TREE_DEF_TYPE (child) != vect_internal_def)
      FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (child), j, stmt)
	STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmt)) = vect_internal_def;
}

/* Compute the cost for the SLP instance INSTANCE.
*/

/* Compute the cost of INSTANCE, recording it via the target cost hook
   wrapper DATA.  VISITED tracks SLP subtrees already costed so shared
   subtrees are not accounted twice.  */

static void
vect_analyze_slp_cost (slp_instance instance, void *data,
		       scalar_stmts_set_t *visited)
{
  stmt_vector_for_cost body_cost_vec, prologue_cost_vec;
  unsigned ncopies_for_cost;
  stmt_info_for_cost *si;
  unsigned i;

  /* Calculate the number of vector stmts to create based on the unrolling
     factor (number of vectors is 1 if NUNITS >= GROUP_SIZE, and is
     GROUP_SIZE / NUNITS otherwise).  */
  unsigned group_size = SLP_INSTANCE_GROUP_SIZE (instance);
  slp_tree node = SLP_INSTANCE_TREE (instance);
  stmt_vec_info stmt_info = vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (node)[0]);
  /* Get the estimated vectorization factor, which is always one for
     basic-block vectorization.  */
  unsigned int assumed_vf;
  if (STMT_VINFO_LOOP_VINFO (stmt_info))
    assumed_vf = vect_vf_for_cost (STMT_VINFO_LOOP_VINFO (stmt_info));
  else
    assumed_vf = 1;
  /* For reductions look at a reduction operand in case the reduction
     operation is widening like DOT_PROD or SAD.  */
  tree vectype_for_cost = STMT_VINFO_VECTYPE (stmt_info);
  if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
    {
      gimple *stmt = SLP_TREE_SCALAR_STMTS (node)[0];
      switch (gimple_assign_rhs_code (stmt))
	{
	case DOT_PROD_EXPR:
	case SAD_EXPR:
	  vectype_for_cost = get_vectype_for_scalar_type
	      (TREE_TYPE (gimple_assign_rhs1 (stmt)));
	  break;
	default:;
	}
    }
  /* Use the least common multiple so a GROUP_SIZE that is not a multiple
     of the vector element count still yields a whole number of copies.  */
  unsigned int assumed_nunits = vect_nunits_for_cost (vectype_for_cost);
  ncopies_for_cost = (least_common_multiple (assumed_nunits,
					     group_size * assumed_vf)
		      / assumed_nunits);

  prologue_cost_vec.create (10);
  body_cost_vec.create (10);
  vect_analyze_slp_cost_1 (instance, SLP_INSTANCE_TREE (instance),
			   &prologue_cost_vec, &body_cost_vec,
			   ncopies_for_cost, visited);

  /* Record the prologue costs, which were delayed until we were
     sure that SLP was successful.  */
  FOR_EACH_VEC_ELT (prologue_cost_vec, i, si)
    {
      struct _stmt_vec_info *stmt_info
	= si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
      (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
			    si->misalign, vect_prologue);
    }

  /* Record the instance's instructions in the target cost model.  */
  FOR_EACH_VEC_ELT (body_cost_vec, i, si)
    {
      struct _stmt_vec_info *stmt_info
	= si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
      (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
			    si->misalign, vect_body);
    }

  prologue_cost_vec.release ();
  body_cost_vec.release ();
}

/* Splits a group of stores, currently beginning at FIRST_STMT, into two groups:
   one (still beginning at FIRST_STMT) of size GROUP1_SIZE (also containing
   the first GROUP1_SIZE stmts, since stores are consecutive),
   the second containing the remainder.
   Return the first stmt in the second group.  */

static gimple *
vect_split_slp_store_group (gimple *first_stmt, unsigned group1_size)
{
  stmt_vec_info first_vinfo = vinfo_for_stmt (first_stmt);
  /* Only a group leader may be split, and both halves must end up
     non-empty.  */
  gcc_assert (GROUP_FIRST_ELEMENT (first_vinfo) == first_stmt);
  gcc_assert (group1_size > 0);
  int group2_size = GROUP_SIZE (first_vinfo) - group1_size;
  gcc_assert (group2_size > 0);
  GROUP_SIZE (first_vinfo) = group1_size;

  /* Advance to the last element of the first group; the GROUP_GAP
     asserts verify the stores are consecutive.  */
  gimple *stmt = first_stmt;
  for (unsigned i = group1_size; i > 1; i--)
    {
      stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt));
      gcc_assert (GROUP_GAP (vinfo_for_stmt (stmt)) == 1);
    }
  /* STMT is now the last element of the first group.  */
  gimple *group2 = GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt));
  GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt)) = 0;

  /* Re-link the remainder with GROUP2 as its new group leader.  */
  GROUP_SIZE (vinfo_for_stmt (group2)) = group2_size;
  for (stmt = group2; stmt; stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt)))
    {
      GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = group2;
      gcc_assert (GROUP_GAP (vinfo_for_stmt (stmt)) == 1);
    }

  /* For the second group, the GROUP_GAP is that before the original group,
     plus skipping over the first vector.  */
  GROUP_GAP (vinfo_for_stmt (group2)) =
    GROUP_GAP (first_vinfo) + group1_size;

  /* GROUP_GAP of the first group now has to skip over the second group too.  */
  GROUP_GAP (first_vinfo) += group2_size;

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location, "Split group into %d and %d\n",
		     group1_size, group2_size);

  return group2;
}

/* Calculate the unrolling factor for an SLP instance with GROUP_SIZE
   statements and a vector of NUNITS elements.  */

static poly_uint64
calculate_unrolling_factor (poly_uint64 nunits, unsigned int group_size)
{
  return exact_div (common_multiple (nunits, group_size), group_size);
}

/* Analyze an SLP instance starting from a group of grouped stores.  Call
   vect_build_slp_tree to build a tree of packed stmts if possible.
   Return FALSE if it's impossible to SLP any stmt in the loop.
*/

static bool
vect_analyze_slp_instance (vec_info *vinfo,
			   gimple *stmt, unsigned max_tree_size)
{
  slp_instance new_instance;
  slp_tree node;
  unsigned int group_size = GROUP_SIZE (vinfo_for_stmt (stmt));
  tree vectype, scalar_type = NULL_TREE;
  gimple *next;
  unsigned int i;
  vec<slp_tree> loads;
  struct data_reference *dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt));
  vec<gimple *> scalar_stmts;

  if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
    {
      /* STMT leads a grouped access (store group or reduction chain).  */
      if (dr)
	{
	  scalar_type = TREE_TYPE (DR_REF (dr));
	  vectype = get_vectype_for_scalar_type (scalar_type);
	}
      else
	{
	  gcc_assert (is_a <loop_vec_info> (vinfo));
	  vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
	}

      group_size = GROUP_SIZE (vinfo_for_stmt (stmt));
    }
  else
    {
      /* STMT stands for the whole set of loop reductions.  */
      gcc_assert (is_a <loop_vec_info> (vinfo));
      vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
      group_size = as_a <loop_vec_info> (vinfo)->reductions.length ();
    }

  if (!vectype)
    {
      if (dump_enabled_p ())
	{
	  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			   "Build SLP failed: unsupported data-type ");
	  dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, scalar_type);
	  dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
	}

      return false;
    }
  poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);

  /* Create a node (a root of the SLP tree) for the packed grouped stores.  */
  scalar_stmts.create (group_size);
  next = stmt;
  if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
    {
      /* Collect the stores and store them in SLP_TREE_SCALAR_STMTS.
	 Pattern stmts, when present, replace the original stmts.  */
      while (next)
	{
	  if (STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (next))
	      && STMT_VINFO_RELATED_STMT (vinfo_for_stmt (next)))
	    scalar_stmts.safe_push (
		STMT_VINFO_RELATED_STMT (vinfo_for_stmt (next)));
	  else
	    scalar_stmts.safe_push (next);
	  next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
	}
      /* Mark the first element of the reduction chain as reduction to properly
	 transform the node.  In the reduction analysis phase only the last
	 element of the chain is marked as reduction.  */
      if (!STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (stmt)))
	STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmt)) = vect_reduction_def;
    }
  else
    {
      /* Collect reduction statements.  */
      vec<gimple *> reductions = as_a <loop_vec_info> (vinfo)->reductions;
      for (i = 0; reductions.iterate (i, &next); i++)
	scalar_stmts.safe_push (next);
    }

  loads.create (group_size);

  /* Build the tree for the SLP instance.  MATCHES records, per group
     position, whether the stmt could be packed; it is used below for
     splitting a failed BB store group.  */
  bool *matches = XALLOCAVEC (bool, group_size);
  unsigned npermutes = 0;
  /* BST_FAIL memoizes scalar stmt sets that already failed to build.  */
  bst_fail = new scalar_stmts_set_t ();
  poly_uint64 max_nunits = nunits;
  node = vect_build_slp_tree (vinfo, scalar_stmts, group_size,
			      &max_nunits, &loads, matches, &npermutes,
			      NULL, max_tree_size);
  delete bst_fail;
  if (node != NULL)
    {
      /* Calculate the unrolling factor based on the smallest type.  */
      poly_uint64 unrolling_factor
	= calculate_unrolling_factor (max_nunits, group_size);

      if (maybe_ne (unrolling_factor, 1U)
	  && is_a <bb_vec_info> (vinfo))
	{
	  /* Basic blocks cannot be unrolled, so the group must fill a
	     whole number of vectors.  */
	  unsigned HOST_WIDE_INT const_max_nunits;
	  if (!max_nunits.is_constant (&const_max_nunits)
	      || const_max_nunits > group_size)
	    {
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				 "Build SLP failed: store group "
				 "size not a multiple of the vector size "
				 "in basic block SLP\n");
	      vect_free_slp_tree (node);
	      loads.release ();
	      return false;
	    }
	  /* Fatal mismatch.  */
	  matches[group_size / const_max_nunits * const_max_nunits] = false;
	  vect_free_slp_tree (node);
	  loads.release ();
	}
      else
	{
	  /* Create a new SLP instance.  */
	  new_instance = XNEW (struct _slp_instance);
	  SLP_INSTANCE_TREE (new_instance) = node;
	  SLP_INSTANCE_GROUP_SIZE (new_instance) = group_size;
	  SLP_INSTANCE_UNROLLING_FACTOR (new_instance) = unrolling_factor;
	  SLP_INSTANCE_LOADS (new_instance) = loads;

	  /* Compute the load permutation.  */
	  slp_tree load_node;
	  bool loads_permuted = false;
	  FOR_EACH_VEC_ELT (loads, i, load_node)
	    {
	      vec<unsigned> load_permutation;
	      int j;
	      gimple *load, *first_stmt;
	      bool this_load_permuted = false;
	      load_permutation.create (group_size);
	      first_stmt = GROUP_FIRST_ELEMENT
		  (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (load_node)[0]));
	      FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (load_node), j, load)
		{
		  int load_place = vect_get_place_in_interleaving_chain
		      (load, first_stmt);
		  gcc_assert (load_place != -1);
		  if (load_place != j)
		    this_load_permuted = true;
		  load_permutation.safe_push (load_place);
		}
	      if (!this_load_permuted
		  /* The load requires permutation when unrolling exposes
		     a gap either because the group is larger than the SLP
		     group-size or because there is a gap between the groups.  */
		  && (known_eq (unrolling_factor, 1U)
		      || (group_size == GROUP_SIZE (vinfo_for_stmt (first_stmt))
			  && GROUP_GAP (vinfo_for_stmt (first_stmt)) == 0)))
		{
		  load_permutation.release ();
		  continue;
		}
	      SLP_TREE_LOAD_PERMUTATION (load_node) = load_permutation;
	      loads_permuted = true;
	    }

	  if (loads_permuted)
	    {
	      if (!vect_supported_load_permutation_p (new_instance))
		{
		  if (dump_enabled_p ())
		    {
		      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				       "Build SLP failed: unsupported load "
				       "permutation ");
		      dump_gimple_stmt (MSG_MISSED_OPTIMIZATION,
					TDF_SLIM, stmt, 0);
		    }
		  vect_free_slp_instance (new_instance);
		  return false;
		}
	    }

	  /* If the loads and stores can be handled with load/store-lanes
	     instructions do not generate this SLP instance.  */
	  if (is_a <loop_vec_info> (vinfo)
	      && loads_permuted
	      && dr && vect_store_lanes_supported (vectype, group_size, false))
	    {
	      slp_tree load_node;
	      FOR_EACH_VEC_ELT (loads, i, load_node)
		{
		  gimple *first_stmt = GROUP_FIRST_ELEMENT
		      (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (load_node)[0]));
		  stmt_vec_info stmt_vinfo = vinfo_for_stmt (first_stmt);
		  /* Use SLP for strided accesses (or if we
		     can't load-lanes).  */
		  if (STMT_VINFO_STRIDED_P (stmt_vinfo)
		      || ! vect_load_lanes_supported
			    (STMT_VINFO_VECTYPE (stmt_vinfo),
			     GROUP_SIZE (stmt_vinfo), false))
		    break;
		}
	      /* The loop above broke out early iff some load cannot use
		 load-lanes; I == LOADS.LENGTH () means all can.  */
	      if (i == loads.length ())
		{
		  if (dump_enabled_p ())
		    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				     "Built SLP cancelled: can use "
				     "load/store-lanes\n");
		  vect_free_slp_instance (new_instance);
		  return false;
		}
	    }

	  vinfo->slp_instances.safe_push (new_instance);

	  if (dump_enabled_p ())
	    {
	      dump_printf_loc (MSG_NOTE, vect_location,
			       "Final SLP tree for instance:\n");
	      vect_print_slp_tree (MSG_NOTE, vect_location, node);
	    }

	  return true;
	}
    }
  else
    {
      /* Failed to SLP.  */
      /* Free the allocated memory.  */
      scalar_stmts.release ();
      loads.release ();
    }

  /* For basic block SLP, try to break the group up into multiples of the
     vector size.  */
  unsigned HOST_WIDE_INT const_nunits;
  if (is_a <bb_vec_info> (vinfo)
      && GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt))
      && STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (stmt))
      && nunits.is_constant (&const_nunits))
    {
      /* We consider breaking the group only on VF boundaries from the existing
	 start.  */
      for (i = 0; i < group_size; i++)
	if (!matches[i]) break;

      if (i >= const_nunits && i < group_size)
	{
	  /* Split into two groups at the first vector boundary before i.
	     The mask trick below requires a power-of-two vector size.  */
	  gcc_assert ((const_nunits & (const_nunits - 1)) == 0);
	  unsigned group1_size = i & ~(const_nunits - 1);

	  gimple *rest = vect_split_slp_store_group (stmt, group1_size);
	  bool res = vect_analyze_slp_instance (vinfo, stmt, max_tree_size);
	  /* If the first non-match was in the middle of a vector,
	     skip the rest of that vector.  */
	  if (group1_size < i)
	    {
	      i = group1_size + const_nunits;
	      if (i < group_size)
		rest = vect_split_slp_store_group (rest, const_nunits);
	    }
	  if (i < group_size)
	    res |= vect_analyze_slp_instance (vinfo, rest, max_tree_size);
	  return res;
	}
      /* Even though the first vector did not all match, we might be able to SLP
	 (some) of the remainder.  FORNOW ignore this possibility.  */
    }

  return false;
}


/* Check if there are stmts in the loop that can be vectorized using SLP.
   Build SLP trees of packed scalar stmts if SLP is possible.  */

bool
vect_analyze_slp (vec_info *vinfo, unsigned max_tree_size)
{
  unsigned int i;
  gimple *first_element;

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location, "=== vect_analyze_slp ===\n");

  /* Find SLP sequences starting from groups of grouped stores.  */
  FOR_EACH_VEC_ELT (vinfo->grouped_stores, i, first_element)
    vect_analyze_slp_instance (vinfo, first_element, max_tree_size);

  if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
    {
      if (loop_vinfo->reduction_chains.length () > 0)
	{
	  /* Find SLP sequences starting from reduction chains.  */
	  FOR_EACH_VEC_ELT (loop_vinfo->reduction_chains, i, first_element)
	    if (! vect_analyze_slp_instance (vinfo, first_element,
					     max_tree_size))
	      {
		/* Dissolve reduction chain group.
*/
		gimple *next, *stmt = first_element;
		/* Unlink every element of the failed chain so the stmts
		   can still be vectorized as a plain loop reduction.  */
		while (stmt)
		  {
		    stmt_vec_info vinfo = vinfo_for_stmt (stmt);
		    next = GROUP_NEXT_ELEMENT (vinfo);
		    GROUP_FIRST_ELEMENT (vinfo) = NULL;
		    GROUP_NEXT_ELEMENT (vinfo) = NULL;
		    stmt = next;
		  }
		STMT_VINFO_DEF_TYPE (vinfo_for_stmt (first_element))
		  = vect_internal_def;
	      }
	}

      /* Find SLP sequences starting from groups of reductions.  */
      if (loop_vinfo->reductions.length () > 1)
	vect_analyze_slp_instance (vinfo, loop_vinfo->reductions[0],
				   max_tree_size);
    }

  return true;
}


/* For each possible SLP instance decide whether to SLP it and calculate overall
   unrolling factor needed to SLP the loop.  Return TRUE if decided to SLP at
   least one instance.  */

bool
vect_make_slp_decision (loop_vec_info loop_vinfo)
{
  unsigned int i;
  poly_uint64 unrolling_factor = 1;
  vec<slp_instance> slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
  slp_instance instance;
  int decided_to_slp = 0;

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location, "=== vect_make_slp_decision ==="
		     "\n");

  FOR_EACH_VEC_ELT (slp_instances, i, instance)
    {
      /* FORNOW: SLP if you can.  */
      /* All unroll factors have the form current_vector_size * X for some
	 rational X, so they must have a common multiple.  */
      unrolling_factor
	= force_common_multiple (unrolling_factor,
				 SLP_INSTANCE_UNROLLING_FACTOR (instance));

      /* Mark all the stmts that belong to INSTANCE as PURE_SLP stmts.  Later we
	 call vect_detect_hybrid_slp () to find stmts that need hybrid SLP and
	 loop-based vectorization.  Such stmts will be marked as HYBRID.  */
      vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance), pure_slp, -1);
      decided_to_slp++;
    }

  LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo) = unrolling_factor;

  if (decided_to_slp && dump_enabled_p ())
    {
      dump_printf_loc (MSG_NOTE, vect_location,
		       "Decided to SLP %d instances. Unrolling factor ",
		       decided_to_slp);
      dump_dec (MSG_NOTE, unrolling_factor);
      dump_printf (MSG_NOTE, "\n");
    }

  return (decided_to_slp > 0);
}


/* Find stmts that must be both vectorized and SLPed (since they feed stmts that
   can't be SLPed) in the tree rooted at NODE.  Mark such stmts as HYBRID.
   I is the lane (group position) examined in each node on this walk;
   STYPE, once hybrid, is propagated down the subtree.  */

static void
vect_detect_hybrid_slp_stmts (slp_tree node, unsigned i, slp_vect_type stype)
{
  gimple *stmt = SLP_TREE_SCALAR_STMTS (node)[i];
  imm_use_iterator imm_iter;
  gimple *use_stmt;
  stmt_vec_info use_vinfo, stmt_vinfo = vinfo_for_stmt (stmt);
  slp_tree child;
  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  int j;

  /* Propagate hybrid down the SLP tree.  */
  if (stype == hybrid)
    ;
  else if (HYBRID_SLP_STMT (stmt_vinfo))
    stype = hybrid;
  else
    {
      /* Check if a pure SLP stmt has uses in non-SLP stmts.  */
      gcc_checking_assert (PURE_SLP_STMT (stmt_vinfo));
      /* If we get a pattern stmt here we have to use the LHS of the
	 original stmt for immediate uses.  */
      if (! STMT_VINFO_IN_PATTERN_P (stmt_vinfo)
	  && STMT_VINFO_RELATED_STMT (stmt_vinfo))
	stmt = STMT_VINFO_RELATED_STMT (stmt_vinfo);
      tree def;
      if (gimple_code (stmt) == GIMPLE_PHI)
	def = gimple_phi_result (stmt);
      else
	def = SINGLE_SSA_TREE_OPERAND (stmt, SSA_OP_DEF);
      if (def)
	FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, def)
	  {
	    /* Uses outside the loop cannot force hybrid SLP.  */
	    if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
	      continue;
	    use_vinfo = vinfo_for_stmt (use_stmt);
	    if (STMT_VINFO_IN_PATTERN_P (use_vinfo)
		&& STMT_VINFO_RELATED_STMT (use_vinfo))
	      use_vinfo = vinfo_for_stmt (STMT_VINFO_RELATED_STMT (use_vinfo));
	    /* A relevant non-SLP use (other than a reduction PHI)
	       makes this stmt hybrid.  */
	    if (!STMT_SLP_TYPE (use_vinfo)
		&& (STMT_VINFO_RELEVANT (use_vinfo)
		    || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (use_vinfo)))
		&& !(gimple_code (use_stmt) == GIMPLE_PHI
		     && STMT_VINFO_DEF_TYPE (use_vinfo) == vect_reduction_def))
	      {
		if (dump_enabled_p ())
		  {
		    dump_printf_loc (MSG_NOTE, vect_location, "use of SLP "
				     "def in non-SLP stmt: ");
		    dump_gimple_stmt (MSG_NOTE, TDF_SLIM, use_stmt, 0);
		  }
		stype = hybrid;
	      }
	  }
    }

  if (stype == hybrid
      && !HYBRID_SLP_STMT (stmt_vinfo))
    {
      if (dump_enabled_p ())
	{
	  dump_printf_loc (MSG_NOTE, vect_location, "marking hybrid: ");
	  dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
	}
      STMT_SLP_TYPE (stmt_vinfo) = hybrid;
    }

  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child)
    if (SLP_TREE_DEF_TYPE (child) != vect_external_def)
      vect_detect_hybrid_slp_stmts (child, i, stype);
}

/* Helpers for vect_detect_hybrid_slp walking pattern stmt uses.
*/

/* Operand walker callback: mark the definition of an SSA operand *TP as
   hybrid when it is a pure-SLP stmt defined inside the loop recorded in
   DATA's walk_stmt_info.  LHS operands are skipped (is_lhs).  */

static tree
vect_detect_hybrid_slp_1 (tree *tp, int *, void *data)
{
  walk_stmt_info *wi = (walk_stmt_info *)data;
  struct loop *loopp = (struct loop *)wi->info;

  if (wi->is_lhs)
    return NULL_TREE;

  if (TREE_CODE (*tp) == SSA_NAME
      && !SSA_NAME_IS_DEFAULT_DEF (*tp))
    {
      gimple *def_stmt = SSA_NAME_DEF_STMT (*tp);
      if (flow_bb_inside_loop_p (loopp, gimple_bb (def_stmt))
	  && PURE_SLP_STMT (vinfo_for_stmt (def_stmt)))
	{
	  if (dump_enabled_p ())
	    {
	      dump_printf_loc (MSG_NOTE, vect_location, "marking hybrid: ");
	      dump_gimple_stmt (MSG_NOTE, TDF_SLIM, def_stmt, 0);
	    }
	  STMT_SLP_TYPE (vinfo_for_stmt (def_stmt)) = hybrid;
	}
    }

  return NULL_TREE;
}

/* Stmt walker callback: decide whether the operands of the stmt at *GSI
   should be walked by vect_detect_hybrid_slp_1.  Setting *HANDLED skips
   the operand walk for stmts that cannot force hybrid SLP.  */

static tree
vect_detect_hybrid_slp_2 (gimple_stmt_iterator *gsi, bool *handled,
			  walk_stmt_info *)
{
  stmt_vec_info use_vinfo = vinfo_for_stmt (gsi_stmt (*gsi));
  /* If the stmt is in a SLP instance then this isn't a reason
     to mark use definitions in other SLP instances as hybrid.  */
  if (! STMT_SLP_TYPE (use_vinfo)
      && (STMT_VINFO_RELEVANT (use_vinfo)
	  || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (use_vinfo)))
      && ! (gimple_code (gsi_stmt (*gsi)) == GIMPLE_PHI
	    && STMT_VINFO_DEF_TYPE (use_vinfo) == vect_reduction_def))
    ;
  else
    *handled = true;
  return NULL_TREE;
}

/* Find stmts that must be both vectorized and SLPed.  */

void
vect_detect_hybrid_slp (loop_vec_info loop_vinfo)
{
  unsigned int i;
  vec<slp_instance> slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
  slp_instance instance;

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location, "=== vect_detect_hybrid_slp ==="
		     "\n");

  /* First walk all pattern stmts in the loop and mark defs of uses as
     hybrid because immediate uses in them are not recorded.  */
  for (i = 0; i < LOOP_VINFO_LOOP (loop_vinfo)->num_nodes; ++i)
    {
      basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
      for (gimple_stmt_iterator gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
	   gsi_next (&gsi))
	{
	  gimple *stmt = gsi_stmt (gsi);
	  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
	  if (STMT_VINFO_IN_PATTERN_P (stmt_info))
	    {
	      walk_stmt_info wi;
	      memset (&wi, 0, sizeof (wi));
	      wi.info = LOOP_VINFO_LOOP (loop_vinfo);
	      gimple_stmt_iterator gsi2
		= gsi_for_stmt (STMT_VINFO_RELATED_STMT (stmt_info));
	      /* Walk both the main pattern stmt and its def sequence.  */
	      walk_gimple_stmt (&gsi2, vect_detect_hybrid_slp_2,
				vect_detect_hybrid_slp_1, &wi);
	      walk_gimple_seq (STMT_VINFO_PATTERN_DEF_SEQ (stmt_info),
			       vect_detect_hybrid_slp_2,
			       vect_detect_hybrid_slp_1, &wi);
	    }
	}
    }

  /* Then walk the SLP instance trees marking stmts with uses in
     non-SLP stmts as hybrid, also propagating hybrid down the
     SLP tree, collecting the above info on-the-fly.  */
  FOR_EACH_VEC_ELT (slp_instances, i, instance)
    {
      for (unsigned i = 0; i < SLP_INSTANCE_GROUP_SIZE (instance); ++i)
	vect_detect_hybrid_slp_stmts (SLP_INSTANCE_TREE (instance),
				      i, pure_slp);
    }
}


/* Initialize a bb_vec_info struct for the statements between
   REGION_BEGIN_IN (inclusive) and REGION_END_IN (exclusive).
*/

_bb_vec_info::_bb_vec_info (gimple_stmt_iterator region_begin_in,
			    gimple_stmt_iterator region_end_in)
  : vec_info (vec_info::bb, init_cost (NULL)),
    bb (gsi_bb (region_begin_in)),
    region_begin (region_begin_in),
    region_end (region_end_in)
{
  gimple_stmt_iterator gsi;

  /* Give each stmt in the region a fresh stmt_vec_info and uid 0,
     marking it as belonging to this vectorization region.  */
  for (gsi = region_begin; gsi_stmt (gsi) != gsi_stmt (region_end);
       gsi_next (&gsi))
    {
      gimple *stmt = gsi_stmt (gsi);
      gimple_set_uid (stmt, 0);
      set_vinfo_for_stmt (stmt, new_stmt_vec_info (stmt, this));
    }

  bb->aux = this;
}


/* Free BB_VINFO struct, as well as all the stmt_vec_info structs of all the
   stmts in the basic block.  */

_bb_vec_info::~_bb_vec_info ()
{
  for (gimple_stmt_iterator si = region_begin;
       gsi_stmt (si) != gsi_stmt (region_end); gsi_next (&si))
    {
      gimple *stmt = gsi_stmt (si);
      stmt_vec_info stmt_info = vinfo_for_stmt (stmt);

      if (stmt_info)
	/* Free stmt_vec_info.  */
	free_stmt_vec_info (stmt);

      /* Reset region marker.  */
      gimple_set_uid (stmt, -1);
    }

  bb->aux = NULL;
}


/* Analyze statements contained in SLP tree NODE after recursively analyzing
   the subtree.  NODE_INSTANCE contains NODE and VINFO contains INSTANCE.

   Return true if the operations are supported.
*/ 2733 2734 static bool 2735 vect_slp_analyze_node_operations (vec_info *vinfo, slp_tree node, 2736 slp_instance node_instance) 2737 { 2738 bool dummy; 2739 int i, j; 2740 gimple *stmt; 2741 slp_tree child; 2742 2743 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def) 2744 return true; 2745 2746 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child) 2747 if (!vect_slp_analyze_node_operations (vinfo, child, node_instance)) 2748 return false; 2749 2750 stmt = SLP_TREE_SCALAR_STMTS (node)[0]; 2751 stmt_vec_info stmt_info = vinfo_for_stmt (stmt); 2752 gcc_assert (stmt_info); 2753 gcc_assert (STMT_SLP_TYPE (stmt_info) != loop_vect); 2754 2755 /* For BB vectorization vector types are assigned here. 2756 Memory accesses already got their vector type assigned 2757 in vect_analyze_data_refs. */ 2758 bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info); 2759 if (bb_vinfo 2760 && ! STMT_VINFO_DATA_REF (stmt_info)) 2761 { 2762 gcc_assert (PURE_SLP_STMT (stmt_info)); 2763 2764 tree scalar_type = TREE_TYPE (gimple_get_lhs (stmt)); 2765 if (dump_enabled_p ()) 2766 { 2767 dump_printf_loc (MSG_NOTE, vect_location, 2768 "get vectype for scalar type: "); 2769 dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type); 2770 dump_printf (MSG_NOTE, "\n"); 2771 } 2772 2773 tree vectype = get_vectype_for_scalar_type (scalar_type); 2774 if (!vectype) 2775 { 2776 if (dump_enabled_p ()) 2777 { 2778 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 2779 "not SLPed: unsupported data-type "); 2780 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, 2781 scalar_type); 2782 dump_printf (MSG_MISSED_OPTIMIZATION, "\n"); 2783 } 2784 return false; 2785 } 2786 2787 if (dump_enabled_p ()) 2788 { 2789 dump_printf_loc (MSG_NOTE, vect_location, "vectype: "); 2790 dump_generic_expr (MSG_NOTE, TDF_SLIM, vectype); 2791 dump_printf (MSG_NOTE, "\n"); 2792 } 2793 2794 gimple *sstmt; 2795 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, sstmt) 2796 STMT_VINFO_VECTYPE (vinfo_for_stmt (sstmt)) = vectype; 2797 
} 2798 2799 /* Calculate the number of vector statements to be created for the 2800 scalar stmts in this node. For SLP reductions it is equal to the 2801 number of vector statements in the children (which has already been 2802 calculated by the recursive call). Otherwise it is the number of 2803 scalar elements in one scalar iteration (GROUP_SIZE) multiplied by 2804 VF divided by the number of elements in a vector. */ 2805 if (GROUP_FIRST_ELEMENT (stmt_info) 2806 && !STMT_VINFO_GROUPED_ACCESS (stmt_info)) 2807 SLP_TREE_NUMBER_OF_VEC_STMTS (node) 2808 = SLP_TREE_NUMBER_OF_VEC_STMTS (SLP_TREE_CHILDREN (node)[0]); 2809 else 2810 { 2811 poly_uint64 vf; 2812 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo)) 2813 vf = loop_vinfo->vectorization_factor; 2814 else 2815 vf = 1; 2816 unsigned int group_size = SLP_INSTANCE_GROUP_SIZE (node_instance); 2817 tree vectype = STMT_VINFO_VECTYPE (stmt_info); 2818 SLP_TREE_NUMBER_OF_VEC_STMTS (node) 2819 = vect_get_num_vectors (vf * group_size, vectype); 2820 } 2821 2822 /* ??? We have to catch the case late where two first scalar stmts appear 2823 in multiple SLP children with different def type and fail. Remember 2824 original def types first since SLP_TREE_DEF_TYPE doesn't necessarily 2825 match it when that is vect_internal_def. */ 2826 auto_vec<vect_def_type, 4> dt; 2827 dt.safe_grow (SLP_TREE_CHILDREN (node).length ()); 2828 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child) 2829 dt[j] 2830 = STMT_VINFO_DEF_TYPE (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (child)[0])); 2831 2832 /* Push SLP node def-type to stmt operands. */ 2833 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child) 2834 if (SLP_TREE_DEF_TYPE (child) != vect_internal_def) 2835 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (child)[0])) 2836 = SLP_TREE_DEF_TYPE (child); 2837 2838 /* Check everything worked out. 
*/ 2839 bool res = true; 2840 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child) 2841 if (SLP_TREE_DEF_TYPE (child) != vect_internal_def) 2842 { 2843 if (STMT_VINFO_DEF_TYPE 2844 (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (child)[0])) 2845 != SLP_TREE_DEF_TYPE (child)) 2846 res = false; 2847 } 2848 else if (STMT_VINFO_DEF_TYPE 2849 (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (child)[0])) != dt[j]) 2850 res = false; 2851 if (!res && dump_enabled_p ()) 2852 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 2853 "not vectorized: same operand with different " 2854 "def type in stmt.\n"); 2855 if (res) 2856 res = vect_analyze_stmt (stmt, &dummy, node, node_instance); 2857 2858 /* Restore def-types. */ 2859 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child) 2860 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (child)[0])) 2861 = dt[j]; 2862 2863 return res; 2864 } 2865 2866 2867 /* Analyze statements in SLP instances of VINFO. Return true if the 2868 operations are supported. */ 2869 2870 bool 2871 vect_slp_analyze_operations (vec_info *vinfo) 2872 { 2873 slp_instance instance; 2874 int i; 2875 2876 if (dump_enabled_p ()) 2877 dump_printf_loc (MSG_NOTE, vect_location, 2878 "=== vect_slp_analyze_operations ===\n"); 2879 2880 for (i = 0; vinfo->slp_instances.iterate (i, &instance); ) 2881 { 2882 if (!vect_slp_analyze_node_operations (vinfo, 2883 SLP_INSTANCE_TREE (instance), 2884 instance)) 2885 { 2886 dump_printf_loc (MSG_NOTE, vect_location, 2887 "removing SLP instance operations starting from: "); 2888 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, 2889 SLP_TREE_SCALAR_STMTS 2890 (SLP_INSTANCE_TREE (instance))[0], 0); 2891 vect_free_slp_instance (instance); 2892 vinfo->slp_instances.ordered_remove (i); 2893 } 2894 else 2895 i++; 2896 } 2897 2898 if (dump_enabled_p ()) 2899 dump_printf_loc (MSG_NOTE, vect_location, 2900 "=== vect_analyze_slp_cost ===\n"); 2901 2902 /* Compute the costs of the SLP instances. 
*/
  /* VISITED is shared across all instances so subtrees common to
     several instances are costed only once.  */
  scalar_stmts_set_t *visited = new scalar_stmts_set_t ();
  for (i = 0; vinfo->slp_instances.iterate (i, &instance); ++i)
    vect_analyze_slp_cost (instance, vinfo->target_cost_data, visited);
  delete visited;

  return !vinfo->slp_instances.is_empty ();
}


/* Compute the scalar cost of the SLP node NODE and its children
   and return it.  Do not account defs that are marked in LIFE and
   update LIFE according to uses of NODE.  */

static unsigned
vect_bb_slp_scalar_cost (basic_block bb,
			 slp_tree node, vec<bool, va_heap> *life)
{
  unsigned scalar_cost = 0;
  unsigned i;
  gimple *stmt;
  slp_tree child;

  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
    {
      unsigned stmt_cost;
      ssa_op_iter op_iter;
      def_operand_p def_p;
      stmt_vec_info stmt_info;

      if ((*life)[i])
	continue;

      /* If there is a non-vectorized use of the defs then the scalar
	 stmt is kept live in which case we do not account it or any
	 required defs in the SLP children in the scalar cost.  This
	 way we make the vectorization more costly when compared to
	 the scalar cost.  */
      FOR_EACH_SSA_DEF_OPERAND (def_p, stmt, op_iter, SSA_OP_DEF)
	{
	  imm_use_iterator use_iter;
	  gimple *use_stmt;
	  FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, DEF_FROM_PTR (def_p))
	    if (!is_gimple_debug (use_stmt)
		&& (! vect_stmt_in_region_p (vinfo_for_stmt (stmt)->vinfo,
					     use_stmt)
		    || ! PURE_SLP_STMT (vinfo_for_stmt (use_stmt))))
	      {
		(*life)[i] = true;
		BREAK_FROM_IMM_USE_STMT (use_iter);
	      }
	}
      if ((*life)[i])
	continue;

      /* Count scalar stmts only once.  The visited flag is cleared
	 again by the caller after the whole walk.  */
      if (gimple_visited_p (stmt))
	continue;
      gimple_set_visited (stmt, true);

      stmt_info = vinfo_for_stmt (stmt);
      if (STMT_VINFO_DATA_REF (stmt_info))
	{
	  if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
	    stmt_cost = vect_get_stmt_cost (scalar_load);
	  else
	    stmt_cost = vect_get_stmt_cost (scalar_store);
	}
      else
	stmt_cost = vect_get_stmt_cost (scalar_stmt);

      scalar_cost += stmt_cost;
    }

  auto_vec<bool, 20> subtree_life;
  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    {
      if (SLP_TREE_DEF_TYPE (child) == vect_internal_def)
	{
	  /* Do not directly pass LIFE to the recursive call, copy it to
	     confine changes in the callee to the current child/subtree.  */
	  subtree_life.safe_splice (*life);
	  scalar_cost += vect_bb_slp_scalar_cost (bb, child, &subtree_life);
	  subtree_life.truncate (0);
	}
    }

  return scalar_cost;
}

/* Check if vectorization of the basic block is profitable.  */

static bool
vect_bb_vectorization_profitable_p (bb_vec_info bb_vinfo)
{
  vec<slp_instance> slp_instances = BB_VINFO_SLP_INSTANCES (bb_vinfo);
  slp_instance instance;
  int i;
  unsigned int vec_inside_cost = 0, vec_outside_cost = 0, scalar_cost = 0;
  unsigned int vec_prologue_cost = 0, vec_epilogue_cost = 0;

  /* Calculate scalar cost.  */
  FOR_EACH_VEC_ELT (slp_instances, i, instance)
    {
      auto_vec<bool, 20> life;
      life.safe_grow_cleared (SLP_INSTANCE_GROUP_SIZE (instance));
      scalar_cost += vect_bb_slp_scalar_cost (BB_VINFO_BB (bb_vinfo),
					      SLP_INSTANCE_TREE (instance),
					      &life);
    }

  /* Unset visited flag.  */
  for (gimple_stmt_iterator gsi = bb_vinfo->region_begin;
       gsi_stmt (gsi) != gsi_stmt (bb_vinfo->region_end); gsi_next (&gsi))
    gimple_set_visited (gsi_stmt (gsi), false);

  /* Complete the target-specific cost calculation.  */
  finish_cost (BB_VINFO_TARGET_COST_DATA (bb_vinfo), &vec_prologue_cost,
	       &vec_inside_cost, &vec_epilogue_cost);

  vec_outside_cost = vec_prologue_cost + vec_epilogue_cost;

  if (dump_enabled_p ())
    {
      dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
      dump_printf (MSG_NOTE, "  Vector inside of basic block cost: %d\n",
		   vec_inside_cost);
      dump_printf (MSG_NOTE, "  Vector prologue cost: %d\n", vec_prologue_cost);
      dump_printf (MSG_NOTE, "  Vector epilogue cost: %d\n", vec_epilogue_cost);
      dump_printf (MSG_NOTE, "  Scalar cost of basic block: %d\n", scalar_cost);
    }

  /* Vectorization is profitable if its cost is more than the cost of scalar
     version.  Note that we err on the vector side for equal cost because
     the cost estimate is otherwise quite pessimistic (constant uses are
     free on the scalar side but cost a load on the vector side for
     example).  */
  if (vec_outside_cost + vec_inside_cost > scalar_cost)
    return false;

  return true;
}

/* Check if the basic block can be vectorized.  Returns a bb_vec_info
   if so and sets fatal to true if failure is independent of
   current_vector_size.  */

static bb_vec_info
vect_slp_analyze_bb_1 (gimple_stmt_iterator region_begin,
		       gimple_stmt_iterator region_end,
		       vec<data_reference_p> datarefs, int n_stmts,
		       bool &fatal)
{
  bb_vec_info bb_vinfo;
  slp_instance instance;
  int i;
  poly_uint64 min_vf = 2;

  /* The first group of checks is independent of the vector size.
*/ 3061 fatal = true; 3062 3063 if (n_stmts > PARAM_VALUE (PARAM_SLP_MAX_INSNS_IN_BB)) 3064 { 3065 if (dump_enabled_p ()) 3066 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 3067 "not vectorized: too many instructions in " 3068 "basic block.\n"); 3069 free_data_refs (datarefs); 3070 return NULL; 3071 } 3072 3073 bb_vinfo = new _bb_vec_info (region_begin, region_end); 3074 if (!bb_vinfo) 3075 return NULL; 3076 3077 BB_VINFO_DATAREFS (bb_vinfo) = datarefs; 3078 3079 /* Analyze the data references. */ 3080 3081 if (!vect_analyze_data_refs (bb_vinfo, &min_vf)) 3082 { 3083 if (dump_enabled_p ()) 3084 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 3085 "not vectorized: unhandled data-ref in basic " 3086 "block.\n"); 3087 3088 delete bb_vinfo; 3089 return NULL; 3090 } 3091 3092 if (BB_VINFO_DATAREFS (bb_vinfo).length () < 2) 3093 { 3094 if (dump_enabled_p ()) 3095 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 3096 "not vectorized: not enough data-refs in " 3097 "basic block.\n"); 3098 3099 delete bb_vinfo; 3100 return NULL; 3101 } 3102 3103 if (!vect_analyze_data_ref_accesses (bb_vinfo)) 3104 { 3105 if (dump_enabled_p ()) 3106 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 3107 "not vectorized: unhandled data access in " 3108 "basic block.\n"); 3109 3110 delete bb_vinfo; 3111 return NULL; 3112 } 3113 3114 /* If there are no grouped stores in the region there is no need 3115 to continue with pattern recog as vect_analyze_slp will fail 3116 anyway. */ 3117 if (bb_vinfo->grouped_stores.is_empty ()) 3118 { 3119 if (dump_enabled_p ()) 3120 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 3121 "not vectorized: no grouped stores in " 3122 "basic block.\n"); 3123 3124 delete bb_vinfo; 3125 return NULL; 3126 } 3127 3128 /* While the rest of the analysis below depends on it in some way. 
*/ 3129 fatal = false; 3130 3131 vect_pattern_recog (bb_vinfo); 3132 3133 /* Check the SLP opportunities in the basic block, analyze and build SLP 3134 trees. */ 3135 if (!vect_analyze_slp (bb_vinfo, n_stmts)) 3136 { 3137 if (dump_enabled_p ()) 3138 { 3139 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 3140 "Failed to SLP the basic block.\n"); 3141 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 3142 "not vectorized: failed to find SLP opportunities " 3143 "in basic block.\n"); 3144 } 3145 3146 delete bb_vinfo; 3147 return NULL; 3148 } 3149 3150 vect_record_base_alignments (bb_vinfo); 3151 3152 /* Analyze and verify the alignment of data references and the 3153 dependence in the SLP instances. */ 3154 for (i = 0; BB_VINFO_SLP_INSTANCES (bb_vinfo).iterate (i, &instance); ) 3155 { 3156 if (! vect_slp_analyze_and_verify_instance_alignment (instance) 3157 || ! vect_slp_analyze_instance_dependence (instance)) 3158 { 3159 dump_printf_loc (MSG_NOTE, vect_location, 3160 "removing SLP instance operations starting from: "); 3161 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, 3162 SLP_TREE_SCALAR_STMTS 3163 (SLP_INSTANCE_TREE (instance))[0], 0); 3164 vect_free_slp_instance (instance); 3165 BB_VINFO_SLP_INSTANCES (bb_vinfo).ordered_remove (i); 3166 continue; 3167 } 3168 3169 /* Mark all the statements that we want to vectorize as pure SLP and 3170 relevant. */ 3171 vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance), pure_slp, -1); 3172 vect_mark_slp_stmts_relevant (SLP_INSTANCE_TREE (instance)); 3173 3174 i++; 3175 } 3176 if (! BB_VINFO_SLP_INSTANCES (bb_vinfo).length ()) 3177 { 3178 delete bb_vinfo; 3179 return NULL; 3180 } 3181 3182 if (!vect_slp_analyze_operations (bb_vinfo)) 3183 { 3184 if (dump_enabled_p ()) 3185 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 3186 "not vectorized: bad operation in basic block.\n"); 3187 3188 delete bb_vinfo; 3189 return NULL; 3190 } 3191 3192 /* Cost model: check if the vectorization is worthwhile. 
*/ 3193 if (!unlimited_cost_model (NULL) 3194 && !vect_bb_vectorization_profitable_p (bb_vinfo)) 3195 { 3196 if (dump_enabled_p ()) 3197 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 3198 "not vectorized: vectorization is not " 3199 "profitable.\n"); 3200 3201 delete bb_vinfo; 3202 return NULL; 3203 } 3204 3205 if (dump_enabled_p ()) 3206 dump_printf_loc (MSG_NOTE, vect_location, 3207 "Basic block will be vectorized using SLP\n"); 3208 3209 return bb_vinfo; 3210 } 3211 3212 3213 /* Main entry for the BB vectorizer. Analyze and transform BB, returns 3214 true if anything in the basic-block was vectorized. */ 3215 3216 bool 3217 vect_slp_bb (basic_block bb) 3218 { 3219 bb_vec_info bb_vinfo; 3220 gimple_stmt_iterator gsi; 3221 bool any_vectorized = false; 3222 auto_vector_sizes vector_sizes; 3223 3224 if (dump_enabled_p ()) 3225 dump_printf_loc (MSG_NOTE, vect_location, "===vect_slp_analyze_bb===\n"); 3226 3227 /* Autodetect first vector size we try. */ 3228 current_vector_size = 0; 3229 targetm.vectorize.autovectorize_vector_sizes (&vector_sizes); 3230 unsigned int next_size = 0; 3231 3232 gsi = gsi_start_bb (bb); 3233 3234 poly_uint64 autodetected_vector_size = 0; 3235 while (1) 3236 { 3237 if (gsi_end_p (gsi)) 3238 break; 3239 3240 gimple_stmt_iterator region_begin = gsi; 3241 vec<data_reference_p> datarefs = vNULL; 3242 int insns = 0; 3243 3244 for (; !gsi_end_p (gsi); gsi_next (&gsi)) 3245 { 3246 gimple *stmt = gsi_stmt (gsi); 3247 if (is_gimple_debug (stmt)) 3248 continue; 3249 insns++; 3250 3251 if (gimple_location (stmt) != UNKNOWN_LOCATION) 3252 vect_location = gimple_location (stmt); 3253 3254 if (!find_data_references_in_stmt (NULL, stmt, &datarefs)) 3255 break; 3256 } 3257 3258 /* Skip leading unhandled stmts. 
*/ 3259 if (gsi_stmt (region_begin) == gsi_stmt (gsi)) 3260 { 3261 gsi_next (&gsi); 3262 continue; 3263 } 3264 3265 gimple_stmt_iterator region_end = gsi; 3266 3267 bool vectorized = false; 3268 bool fatal = false; 3269 bb_vinfo = vect_slp_analyze_bb_1 (region_begin, region_end, 3270 datarefs, insns, fatal); 3271 if (bb_vinfo 3272 && dbg_cnt (vect_slp)) 3273 { 3274 if (dump_enabled_p ()) 3275 dump_printf_loc (MSG_NOTE, vect_location, "SLPing BB part\n"); 3276 3277 vect_schedule_slp (bb_vinfo); 3278 3279 if (dump_enabled_p ()) 3280 dump_printf_loc (MSG_NOTE, vect_location, 3281 "basic block part vectorized\n"); 3282 3283 vectorized = true; 3284 } 3285 delete bb_vinfo; 3286 3287 any_vectorized |= vectorized; 3288 3289 if (next_size == 0) 3290 autodetected_vector_size = current_vector_size; 3291 3292 if (next_size < vector_sizes.length () 3293 && known_eq (vector_sizes[next_size], autodetected_vector_size)) 3294 next_size += 1; 3295 3296 if (vectorized 3297 || next_size == vector_sizes.length () 3298 || known_eq (current_vector_size, 0U) 3299 /* If vect_slp_analyze_bb_1 signaled that analysis for all 3300 vector sizes will fail do not bother iterating. */ 3301 || fatal) 3302 { 3303 if (gsi_end_p (region_end)) 3304 break; 3305 3306 /* Skip the unhandled stmt. */ 3307 gsi_next (&gsi); 3308 3309 /* And reset vector sizes. */ 3310 current_vector_size = 0; 3311 next_size = 0; 3312 } 3313 else 3314 { 3315 /* Try the next biggest vector size. */ 3316 current_vector_size = vector_sizes[next_size++]; 3317 if (dump_enabled_p ()) 3318 { 3319 dump_printf_loc (MSG_NOTE, vect_location, 3320 "***** Re-trying analysis with " 3321 "vector size "); 3322 dump_dec (MSG_NOTE, current_vector_size); 3323 dump_printf (MSG_NOTE, "\n"); 3324 } 3325 3326 /* Start over. */ 3327 gsi = region_begin; 3328 } 3329 } 3330 3331 return any_vectorized; 3332 } 3333 3334 3335 /* Return 1 if vector type of boolean constant which is OPNUM 3336 operand in statement STMT is a boolean vector. 
*/ 3337 3338 static bool 3339 vect_mask_constant_operand_p (gimple *stmt, int opnum) 3340 { 3341 stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt); 3342 enum tree_code code = gimple_expr_code (stmt); 3343 tree op, vectype; 3344 gimple *def_stmt; 3345 enum vect_def_type dt; 3346 3347 /* For comparison and COND_EXPR type is chosen depending 3348 on the other comparison operand. */ 3349 if (TREE_CODE_CLASS (code) == tcc_comparison) 3350 { 3351 if (opnum) 3352 op = gimple_assign_rhs1 (stmt); 3353 else 3354 op = gimple_assign_rhs2 (stmt); 3355 3356 if (!vect_is_simple_use (op, stmt_vinfo->vinfo, &def_stmt, 3357 &dt, &vectype)) 3358 gcc_unreachable (); 3359 3360 return !vectype || VECTOR_BOOLEAN_TYPE_P (vectype); 3361 } 3362 3363 if (code == COND_EXPR) 3364 { 3365 tree cond = gimple_assign_rhs1 (stmt); 3366 3367 if (TREE_CODE (cond) == SSA_NAME) 3368 op = cond; 3369 else if (opnum) 3370 op = TREE_OPERAND (cond, 1); 3371 else 3372 op = TREE_OPERAND (cond, 0); 3373 3374 if (!vect_is_simple_use (op, stmt_vinfo->vinfo, &def_stmt, 3375 &dt, &vectype)) 3376 gcc_unreachable (); 3377 3378 return !vectype || VECTOR_BOOLEAN_TYPE_P (vectype); 3379 } 3380 3381 return VECTOR_BOOLEAN_TYPE_P (STMT_VINFO_VECTYPE (stmt_vinfo)); 3382 } 3383 3384 /* Build a variable-length vector in which the elements in ELTS are repeated 3385 to a fill NRESULTS vectors of type VECTOR_TYPE. Store the vectors in 3386 RESULTS and add any new instructions to SEQ. 3387 3388 The approach we use is: 3389 3390 (1) Find a vector mode VM with integer elements of mode IM. 3391 3392 (2) Replace ELTS[0:NELTS] with ELTS'[0:NELTS'], where each element of 3393 ELTS' has mode IM. This involves creating NELTS' VIEW_CONVERT_EXPRs 3394 from small vectors to IM. 3395 3396 (3) Duplicate each ELTS'[I] into a vector of mode VM. 3397 3398 (4) Use a tree of interleaving VEC_PERM_EXPRs to create VMs with the 3399 correct byte contents. 3400 3401 (5) Use VIEW_CONVERT_EXPR to cast the final VMs to the required type. 
3402 3403 We try to find the largest IM for which this sequence works, in order 3404 to cut down on the number of interleaves. */ 3405 3406 void 3407 duplicate_and_interleave (gimple_seq *seq, tree vector_type, vec<tree> elts, 3408 unsigned int nresults, vec<tree> &results) 3409 { 3410 unsigned int nelts = elts.length (); 3411 tree element_type = TREE_TYPE (vector_type); 3412 3413 /* (1) Find a vector mode VM with integer elements of mode IM. */ 3414 unsigned int nvectors = 1; 3415 tree new_vector_type; 3416 tree permutes[2]; 3417 if (!can_duplicate_and_interleave_p (nelts, TYPE_MODE (element_type), 3418 &nvectors, &new_vector_type, 3419 permutes)) 3420 gcc_unreachable (); 3421 3422 /* Get a vector type that holds ELTS[0:NELTS/NELTS']. */ 3423 unsigned int partial_nelts = nelts / nvectors; 3424 tree partial_vector_type = build_vector_type (element_type, partial_nelts); 3425 3426 tree_vector_builder partial_elts; 3427 auto_vec<tree, 32> pieces (nvectors * 2); 3428 pieces.quick_grow (nvectors * 2); 3429 for (unsigned int i = 0; i < nvectors; ++i) 3430 { 3431 /* (2) Replace ELTS[0:NELTS] with ELTS'[0:NELTS'], where each element of 3432 ELTS' has mode IM. */ 3433 partial_elts.new_vector (partial_vector_type, partial_nelts, 1); 3434 for (unsigned int j = 0; j < partial_nelts; ++j) 3435 partial_elts.quick_push (elts[i * partial_nelts + j]); 3436 tree t = gimple_build_vector (seq, &partial_elts); 3437 t = gimple_build (seq, VIEW_CONVERT_EXPR, 3438 TREE_TYPE (new_vector_type), t); 3439 3440 /* (3) Duplicate each ELTS'[I] into a vector of mode VM. */ 3441 pieces[i] = gimple_build_vector_from_val (seq, new_vector_type, t); 3442 } 3443 3444 /* (4) Use a tree of VEC_PERM_EXPRs to create a single VM with the 3445 correct byte contents. 
3446 3447 We need to repeat the following operation log2(nvectors) times: 3448 3449 out[i * 2] = VEC_PERM_EXPR (in[i], in[i + hi_start], lo_permute); 3450 out[i * 2 + 1] = VEC_PERM_EXPR (in[i], in[i + hi_start], hi_permute); 3451 3452 However, if each input repeats every N elements and the VF is 3453 a multiple of N * 2, the HI result is the same as the LO. */ 3454 unsigned int in_start = 0; 3455 unsigned int out_start = nvectors; 3456 unsigned int hi_start = nvectors / 2; 3457 /* A bound on the number of outputs needed to produce NRESULTS results 3458 in the final iteration. */ 3459 unsigned int noutputs_bound = nvectors * nresults; 3460 for (unsigned int in_repeat = 1; in_repeat < nvectors; in_repeat *= 2) 3461 { 3462 noutputs_bound /= 2; 3463 unsigned int limit = MIN (noutputs_bound, nvectors); 3464 for (unsigned int i = 0; i < limit; ++i) 3465 { 3466 if ((i & 1) != 0 3467 && multiple_p (TYPE_VECTOR_SUBPARTS (new_vector_type), 3468 2 * in_repeat)) 3469 { 3470 pieces[out_start + i] = pieces[out_start + i - 1]; 3471 continue; 3472 } 3473 3474 tree output = make_ssa_name (new_vector_type); 3475 tree input1 = pieces[in_start + (i / 2)]; 3476 tree input2 = pieces[in_start + (i / 2) + hi_start]; 3477 gassign *stmt = gimple_build_assign (output, VEC_PERM_EXPR, 3478 input1, input2, 3479 permutes[i & 1]); 3480 gimple_seq_add_stmt (seq, stmt); 3481 pieces[out_start + i] = output; 3482 } 3483 std::swap (in_start, out_start); 3484 } 3485 3486 /* (5) Use VIEW_CONVERT_EXPR to cast the final VM to the required type. */ 3487 results.reserve (nresults); 3488 for (unsigned int i = 0; i < nresults; ++i) 3489 if (i < nvectors) 3490 results.quick_push (gimple_build (seq, VIEW_CONVERT_EXPR, vector_type, 3491 pieces[in_start + i])); 3492 else 3493 results.quick_push (results[i - nvectors]); 3494 } 3495 3496 3497 /* For constant and loop invariant defs of SLP_NODE this function returns 3498 (vector) defs (VEC_OPRNDS) that will be used in the vectorized stmts. 
3499 OP_NUM determines if we gather defs for operand 0 or operand 1 of the RHS of 3500 scalar stmts. NUMBER_OF_VECTORS is the number of vector defs to create. 3501 REDUC_INDEX is the index of the reduction operand in the statements, unless 3502 it is -1. */ 3503 3504 static void 3505 vect_get_constant_vectors (tree op, slp_tree slp_node, 3506 vec<tree> *vec_oprnds, 3507 unsigned int op_num, unsigned int number_of_vectors) 3508 { 3509 vec<gimple *> stmts = SLP_TREE_SCALAR_STMTS (slp_node); 3510 gimple *stmt = stmts[0]; 3511 stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt); 3512 unsigned HOST_WIDE_INT nunits; 3513 tree vec_cst; 3514 unsigned j, number_of_places_left_in_vector; 3515 tree vector_type; 3516 tree vop; 3517 int group_size = stmts.length (); 3518 unsigned int vec_num, i; 3519 unsigned number_of_copies = 1; 3520 vec<tree> voprnds; 3521 voprnds.create (number_of_vectors); 3522 bool constant_p, is_store; 3523 tree neutral_op = NULL; 3524 enum tree_code code = gimple_expr_code (stmt); 3525 gimple_seq ctor_seq = NULL; 3526 auto_vec<tree, 16> permute_results; 3527 3528 /* Check if vector type is a boolean vector. */ 3529 if (VECT_SCALAR_BOOLEAN_TYPE_P (TREE_TYPE (op)) 3530 && vect_mask_constant_operand_p (stmt, op_num)) 3531 vector_type 3532 = build_same_sized_truth_vector_type (STMT_VINFO_VECTYPE (stmt_vinfo)); 3533 else 3534 vector_type = get_vectype_for_scalar_type (TREE_TYPE (op)); 3535 3536 if (STMT_VINFO_DATA_REF (stmt_vinfo)) 3537 { 3538 is_store = true; 3539 op = gimple_assign_rhs1 (stmt); 3540 } 3541 else 3542 is_store = false; 3543 3544 gcc_assert (op); 3545 3546 /* NUMBER_OF_COPIES is the number of times we need to use the same values in 3547 created vectors. It is greater than 1 if unrolling is performed. 3548 3549 For example, we have two scalar operands, s1 and s2 (e.g., group of 3550 strided accesses of size two), while NUNITS is four (i.e., four scalars 3551 of this type can be packed in a vector). 
The output vector will contain 3552 two copies of each scalar operand: {s1, s2, s1, s2}. (NUMBER_OF_COPIES 3553 will be 2). 3554 3555 If GROUP_SIZE > NUNITS, the scalars will be split into several vectors 3556 containing the operands. 3557 3558 For example, NUNITS is four as before, and the group size is 8 3559 (s1, s2, ..., s8). We will create two vectors {s1, s2, s3, s4} and 3560 {s5, s6, s7, s8}. */ 3561 3562 /* When using duplicate_and_interleave, we just need one element for 3563 each scalar statement. */ 3564 if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits)) 3565 nunits = group_size; 3566 3567 number_of_copies = nunits * number_of_vectors / group_size; 3568 3569 number_of_places_left_in_vector = nunits; 3570 constant_p = true; 3571 tree_vector_builder elts (vector_type, nunits, 1); 3572 elts.quick_grow (nunits); 3573 bool place_after_defs = false; 3574 for (j = 0; j < number_of_copies; j++) 3575 { 3576 for (i = group_size - 1; stmts.iterate (i, &stmt); i--) 3577 { 3578 if (is_store) 3579 op = gimple_assign_rhs1 (stmt); 3580 else 3581 { 3582 switch (code) 3583 { 3584 case COND_EXPR: 3585 { 3586 tree cond = gimple_assign_rhs1 (stmt); 3587 if (TREE_CODE (cond) == SSA_NAME) 3588 op = gimple_op (stmt, op_num + 1); 3589 else if (op_num == 0 || op_num == 1) 3590 op = TREE_OPERAND (cond, op_num); 3591 else 3592 { 3593 if (op_num == 2) 3594 op = gimple_assign_rhs2 (stmt); 3595 else 3596 op = gimple_assign_rhs3 (stmt); 3597 } 3598 } 3599 break; 3600 3601 case CALL_EXPR: 3602 op = gimple_call_arg (stmt, op_num); 3603 break; 3604 3605 case LSHIFT_EXPR: 3606 case RSHIFT_EXPR: 3607 case LROTATE_EXPR: 3608 case RROTATE_EXPR: 3609 op = gimple_op (stmt, op_num + 1); 3610 /* Unlike the other binary operators, shifts/rotates have 3611 the shift count being int, instead of the same type as 3612 the lhs, so make sure the scalar is the right type if 3613 we are dealing with vectors of 3614 long long/long/short/char. 
*/ 3615 if (op_num == 1 && TREE_CODE (op) == INTEGER_CST) 3616 op = fold_convert (TREE_TYPE (vector_type), op); 3617 break; 3618 3619 default: 3620 op = gimple_op (stmt, op_num + 1); 3621 break; 3622 } 3623 } 3624 3625 /* Create 'vect_ = {op0,op1,...,opn}'. */ 3626 number_of_places_left_in_vector--; 3627 tree orig_op = op; 3628 if (!types_compatible_p (TREE_TYPE (vector_type), TREE_TYPE (op))) 3629 { 3630 if (CONSTANT_CLASS_P (op)) 3631 { 3632 if (VECTOR_BOOLEAN_TYPE_P (vector_type)) 3633 { 3634 /* Can't use VIEW_CONVERT_EXPR for booleans because 3635 of possibly different sizes of scalar value and 3636 vector element. */ 3637 if (integer_zerop (op)) 3638 op = build_int_cst (TREE_TYPE (vector_type), 0); 3639 else if (integer_onep (op)) 3640 op = build_all_ones_cst (TREE_TYPE (vector_type)); 3641 else 3642 gcc_unreachable (); 3643 } 3644 else 3645 op = fold_unary (VIEW_CONVERT_EXPR, 3646 TREE_TYPE (vector_type), op); 3647 gcc_assert (op && CONSTANT_CLASS_P (op)); 3648 } 3649 else 3650 { 3651 tree new_temp = make_ssa_name (TREE_TYPE (vector_type)); 3652 gimple *init_stmt; 3653 if (VECTOR_BOOLEAN_TYPE_P (vector_type)) 3654 { 3655 tree true_val 3656 = build_all_ones_cst (TREE_TYPE (vector_type)); 3657 tree false_val 3658 = build_zero_cst (TREE_TYPE (vector_type)); 3659 gcc_assert (INTEGRAL_TYPE_P (TREE_TYPE (op))); 3660 init_stmt = gimple_build_assign (new_temp, COND_EXPR, 3661 op, true_val, 3662 false_val); 3663 } 3664 else 3665 { 3666 op = build1 (VIEW_CONVERT_EXPR, TREE_TYPE (vector_type), 3667 op); 3668 init_stmt 3669 = gimple_build_assign (new_temp, VIEW_CONVERT_EXPR, 3670 op); 3671 } 3672 gimple_seq_add_stmt (&ctor_seq, init_stmt); 3673 op = new_temp; 3674 } 3675 } 3676 elts[number_of_places_left_in_vector] = op; 3677 if (!CONSTANT_CLASS_P (op)) 3678 constant_p = false; 3679 if (TREE_CODE (orig_op) == SSA_NAME 3680 && !SSA_NAME_IS_DEFAULT_DEF (orig_op) 3681 && STMT_VINFO_BB_VINFO (stmt_vinfo) 3682 && (STMT_VINFO_BB_VINFO (stmt_vinfo)->bb 3683 == gimple_bb 
(SSA_NAME_DEF_STMT (orig_op)))) 3684 place_after_defs = true; 3685 3686 if (number_of_places_left_in_vector == 0) 3687 { 3688 if (constant_p 3689 ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type), nunits) 3690 : known_eq (TYPE_VECTOR_SUBPARTS (vector_type), nunits)) 3691 vec_cst = gimple_build_vector (&ctor_seq, &elts); 3692 else 3693 { 3694 if (vec_oprnds->is_empty ()) 3695 duplicate_and_interleave (&ctor_seq, vector_type, elts, 3696 number_of_vectors, 3697 permute_results); 3698 vec_cst = permute_results[number_of_vectors - j - 1]; 3699 } 3700 tree init; 3701 gimple_stmt_iterator gsi; 3702 if (place_after_defs) 3703 { 3704 gsi = gsi_for_stmt 3705 (vect_find_last_scalar_stmt_in_slp (slp_node)); 3706 init = vect_init_vector (stmt, vec_cst, vector_type, &gsi); 3707 } 3708 else 3709 init = vect_init_vector (stmt, vec_cst, vector_type, NULL); 3710 if (ctor_seq != NULL) 3711 { 3712 gsi = gsi_for_stmt (SSA_NAME_DEF_STMT (init)); 3713 gsi_insert_seq_before (&gsi, ctor_seq, GSI_SAME_STMT); 3714 ctor_seq = NULL; 3715 } 3716 voprnds.quick_push (init); 3717 place_after_defs = false; 3718 number_of_places_left_in_vector = nunits; 3719 constant_p = true; 3720 elts.new_vector (vector_type, nunits, 1); 3721 elts.quick_grow (nunits); 3722 } 3723 } 3724 } 3725 3726 /* Since the vectors are created in the reverse order, we should invert 3727 them. */ 3728 vec_num = voprnds.length (); 3729 for (j = vec_num; j != 0; j--) 3730 { 3731 vop = voprnds[j - 1]; 3732 vec_oprnds->quick_push (vop); 3733 } 3734 3735 voprnds.release (); 3736 3737 /* In case that VF is greater than the unrolling factor needed for the SLP 3738 group of stmts, NUMBER_OF_VECTORS to be created is greater than 3739 NUMBER_OF_SCALARS/NUNITS or NUNITS/NUMBER_OF_SCALARS, and hence we have 3740 to replicate the vectors. 
*/ 3741 while (number_of_vectors > vec_oprnds->length ()) 3742 { 3743 tree neutral_vec = NULL; 3744 3745 if (neutral_op) 3746 { 3747 if (!neutral_vec) 3748 neutral_vec = build_vector_from_val (vector_type, neutral_op); 3749 3750 vec_oprnds->quick_push (neutral_vec); 3751 } 3752 else 3753 { 3754 for (i = 0; vec_oprnds->iterate (i, &vop) && i < vec_num; i++) 3755 vec_oprnds->quick_push (vop); 3756 } 3757 } 3758 } 3759 3760 3761 /* Get vectorized definitions from SLP_NODE that contains corresponding 3762 vectorized def-stmts. */ 3763 3764 static void 3765 vect_get_slp_vect_defs (slp_tree slp_node, vec<tree> *vec_oprnds) 3766 { 3767 tree vec_oprnd; 3768 gimple *vec_def_stmt; 3769 unsigned int i; 3770 3771 gcc_assert (SLP_TREE_VEC_STMTS (slp_node).exists ()); 3772 3773 FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (slp_node), i, vec_def_stmt) 3774 { 3775 gcc_assert (vec_def_stmt); 3776 if (gimple_code (vec_def_stmt) == GIMPLE_PHI) 3777 vec_oprnd = gimple_phi_result (vec_def_stmt); 3778 else 3779 vec_oprnd = gimple_get_lhs (vec_def_stmt); 3780 vec_oprnds->quick_push (vec_oprnd); 3781 } 3782 } 3783 3784 3785 /* Get vectorized definitions for SLP_NODE. 3786 If the scalar definitions are loop invariants or constants, collect them and 3787 call vect_get_constant_vectors() to create vector stmts. 3788 Otherwise, the def-stmts must be already vectorized and the vectorized stmts 3789 must be stored in the corresponding child of SLP_NODE, and we call 3790 vect_get_slp_vect_defs () to retrieve them. 
*/ 3791 3792 void 3793 vect_get_slp_defs (vec<tree> ops, slp_tree slp_node, 3794 vec<vec<tree> > *vec_oprnds) 3795 { 3796 gimple *first_stmt; 3797 int number_of_vects = 0, i; 3798 unsigned int child_index = 0; 3799 HOST_WIDE_INT lhs_size_unit, rhs_size_unit; 3800 slp_tree child = NULL; 3801 vec<tree> vec_defs; 3802 tree oprnd; 3803 bool vectorized_defs; 3804 3805 first_stmt = SLP_TREE_SCALAR_STMTS (slp_node)[0]; 3806 FOR_EACH_VEC_ELT (ops, i, oprnd) 3807 { 3808 /* For each operand we check if it has vectorized definitions in a child 3809 node or we need to create them (for invariants and constants). We 3810 check if the LHS of the first stmt of the next child matches OPRND. 3811 If it does, we found the correct child. Otherwise, we call 3812 vect_get_constant_vectors (), and not advance CHILD_INDEX in order 3813 to check this child node for the next operand. */ 3814 vectorized_defs = false; 3815 if (SLP_TREE_CHILDREN (slp_node).length () > child_index) 3816 { 3817 child = SLP_TREE_CHILDREN (slp_node)[child_index]; 3818 3819 /* We have to check both pattern and original def, if available. */ 3820 if (SLP_TREE_DEF_TYPE (child) == vect_internal_def) 3821 { 3822 gimple *first_def = SLP_TREE_SCALAR_STMTS (child)[0]; 3823 gimple *related 3824 = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (first_def)); 3825 tree first_def_op; 3826 3827 if (gimple_code (first_def) == GIMPLE_PHI) 3828 first_def_op = gimple_phi_result (first_def); 3829 else 3830 first_def_op = gimple_get_lhs (first_def); 3831 if (operand_equal_p (oprnd, first_def_op, 0) 3832 || (related 3833 && operand_equal_p (oprnd, gimple_get_lhs (related), 0))) 3834 { 3835 /* The number of vector defs is determined by the number of 3836 vector statements in the node from which we get those 3837 statements. 
*/ 3838 number_of_vects = SLP_TREE_NUMBER_OF_VEC_STMTS (child); 3839 vectorized_defs = true; 3840 child_index++; 3841 } 3842 } 3843 else 3844 child_index++; 3845 } 3846 3847 if (!vectorized_defs) 3848 { 3849 if (i == 0) 3850 { 3851 number_of_vects = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node); 3852 /* Number of vector stmts was calculated according to LHS in 3853 vect_schedule_slp_instance (), fix it by replacing LHS with 3854 RHS, if necessary. See vect_get_smallest_scalar_type () for 3855 details. */ 3856 vect_get_smallest_scalar_type (first_stmt, &lhs_size_unit, 3857 &rhs_size_unit); 3858 if (rhs_size_unit != lhs_size_unit) 3859 { 3860 number_of_vects *= rhs_size_unit; 3861 number_of_vects /= lhs_size_unit; 3862 } 3863 } 3864 } 3865 3866 /* Allocate memory for vectorized defs. */ 3867 vec_defs = vNULL; 3868 vec_defs.create (number_of_vects); 3869 3870 /* For reduction defs we call vect_get_constant_vectors (), since we are 3871 looking for initial loop invariant values. */ 3872 if (vectorized_defs) 3873 /* The defs are already vectorized. */ 3874 vect_get_slp_vect_defs (child, &vec_defs); 3875 else 3876 /* Build vectors from scalar defs. */ 3877 vect_get_constant_vectors (oprnd, slp_node, &vec_defs, i, 3878 number_of_vects); 3879 3880 vec_oprnds->quick_push (vec_defs); 3881 } 3882 } 3883 3884 /* Generate vector permute statements from a list of loads in DR_CHAIN. 3885 If ANALYZE_ONLY is TRUE, only check that it is possible to create valid 3886 permute statements for the SLP node NODE of the SLP instance 3887 SLP_NODE_INSTANCE. 
*/ 3888 3889 bool 3890 vect_transform_slp_perm_load (slp_tree node, vec<tree> dr_chain, 3891 gimple_stmt_iterator *gsi, poly_uint64 vf, 3892 slp_instance slp_node_instance, bool analyze_only, 3893 unsigned *n_perms) 3894 { 3895 gimple *stmt = SLP_TREE_SCALAR_STMTS (node)[0]; 3896 stmt_vec_info stmt_info = vinfo_for_stmt (stmt); 3897 tree mask_element_type = NULL_TREE, mask_type; 3898 int vec_index = 0; 3899 tree vectype = STMT_VINFO_VECTYPE (stmt_info); 3900 int group_size = SLP_INSTANCE_GROUP_SIZE (slp_node_instance); 3901 unsigned int mask_element; 3902 machine_mode mode; 3903 unsigned HOST_WIDE_INT nunits, const_vf; 3904 3905 if (!STMT_VINFO_GROUPED_ACCESS (stmt_info)) 3906 return false; 3907 3908 stmt_info = vinfo_for_stmt (GROUP_FIRST_ELEMENT (stmt_info)); 3909 3910 mode = TYPE_MODE (vectype); 3911 3912 /* At the moment, all permutations are represented using per-element 3913 indices, so we can't cope with variable vector lengths or 3914 vectorization factors. */ 3915 if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant (&nunits) 3916 || !vf.is_constant (&const_vf)) 3917 return false; 3918 3919 /* The generic VEC_PERM_EXPR code always uses an integral type of the 3920 same size as the vector element being permuted. */ 3921 mask_element_type = lang_hooks.types.type_for_mode 3922 (int_mode_for_mode (TYPE_MODE (TREE_TYPE (vectype))).require (), 1); 3923 mask_type = get_vectype_for_scalar_type (mask_element_type); 3924 vec_perm_builder mask (nunits, nunits, 1); 3925 mask.quick_grow (nunits); 3926 vec_perm_indices indices; 3927 3928 /* Initialize the vect stmts of NODE to properly insert the generated 3929 stmts later. */ 3930 if (! analyze_only) 3931 for (unsigned i = SLP_TREE_VEC_STMTS (node).length (); 3932 i < SLP_TREE_NUMBER_OF_VEC_STMTS (node); i++) 3933 SLP_TREE_VEC_STMTS (node).quick_push (NULL); 3934 3935 /* Generate permutation masks for every NODE. Number of masks for each NODE 3936 is equal to GROUP_SIZE. 
3937 E.g., we have a group of three nodes with three loads from the same 3938 location in each node, and the vector size is 4. I.e., we have a 3939 a0b0c0a1b1c1... sequence and we need to create the following vectors: 3940 for a's: a0a0a0a1 a1a1a2a2 a2a3a3a3 3941 for b's: b0b0b0b1 b1b1b2b2 b2b3b3b3 3942 ... 3943 3944 The masks for a's should be: {0,0,0,3} {3,3,6,6} {6,9,9,9}. 3945 The last mask is illegal since we assume two operands for permute 3946 operation, and the mask element values can't be outside that range. 3947 Hence, the last mask must be converted into {2,5,5,5}. 3948 For the first two permutations we need the first and the second input 3949 vectors: {a0,b0,c0,a1} and {b1,c1,a2,b2}, and for the last permutation 3950 we need the second and the third vectors: {b1,c1,a2,b2} and 3951 {c2,a3,b3,c3}. */ 3952 3953 int vect_stmts_counter = 0; 3954 unsigned int index = 0; 3955 int first_vec_index = -1; 3956 int second_vec_index = -1; 3957 bool noop_p = true; 3958 *n_perms = 0; 3959 3960 for (unsigned int j = 0; j < const_vf; j++) 3961 { 3962 for (int k = 0; k < group_size; k++) 3963 { 3964 unsigned int i = (SLP_TREE_LOAD_PERMUTATION (node)[k] 3965 + j * STMT_VINFO_GROUP_SIZE (stmt_info)); 3966 vec_index = i / nunits; 3967 mask_element = i % nunits; 3968 if (vec_index == first_vec_index 3969 || first_vec_index == -1) 3970 { 3971 first_vec_index = vec_index; 3972 } 3973 else if (vec_index == second_vec_index 3974 || second_vec_index == -1) 3975 { 3976 second_vec_index = vec_index; 3977 mask_element += nunits; 3978 } 3979 else 3980 { 3981 if (dump_enabled_p ()) 3982 { 3983 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 3984 "permutation requires at " 3985 "least three vectors "); 3986 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, 3987 stmt, 0); 3988 } 3989 gcc_assert (analyze_only); 3990 return false; 3991 } 3992 3993 gcc_assert (mask_element < 2 * nunits); 3994 if (mask_element != index) 3995 noop_p = false; 3996 mask[index++] = mask_element; 
3997 3998 if (index == nunits && !noop_p) 3999 { 4000 indices.new_vector (mask, 2, nunits); 4001 if (!can_vec_perm_const_p (mode, indices)) 4002 { 4003 if (dump_enabled_p ()) 4004 { 4005 dump_printf_loc (MSG_MISSED_OPTIMIZATION, 4006 vect_location, 4007 "unsupported vect permute { "); 4008 for (i = 0; i < nunits; ++i) 4009 { 4010 dump_dec (MSG_MISSED_OPTIMIZATION, mask[i]); 4011 dump_printf (MSG_MISSED_OPTIMIZATION, " "); 4012 } 4013 dump_printf (MSG_MISSED_OPTIMIZATION, "}\n"); 4014 } 4015 gcc_assert (analyze_only); 4016 return false; 4017 } 4018 4019 ++*n_perms; 4020 } 4021 4022 if (index == nunits) 4023 { 4024 if (!analyze_only) 4025 { 4026 tree mask_vec = NULL_TREE; 4027 4028 if (! noop_p) 4029 mask_vec = vec_perm_indices_to_tree (mask_type, indices); 4030 4031 if (second_vec_index == -1) 4032 second_vec_index = first_vec_index; 4033 4034 /* Generate the permute statement if necessary. */ 4035 tree first_vec = dr_chain[first_vec_index]; 4036 tree second_vec = dr_chain[second_vec_index]; 4037 gimple *perm_stmt; 4038 if (! noop_p) 4039 { 4040 tree perm_dest 4041 = vect_create_destination_var (gimple_assign_lhs (stmt), 4042 vectype); 4043 perm_dest = make_ssa_name (perm_dest); 4044 perm_stmt = gimple_build_assign (perm_dest, 4045 VEC_PERM_EXPR, 4046 first_vec, second_vec, 4047 mask_vec); 4048 vect_finish_stmt_generation (stmt, perm_stmt, gsi); 4049 } 4050 else 4051 /* If mask was NULL_TREE generate the requested 4052 identity transform. */ 4053 perm_stmt = SSA_NAME_DEF_STMT (first_vec); 4054 4055 /* Store the vector statement in NODE. */ 4056 SLP_TREE_VEC_STMTS (node)[vect_stmts_counter++] = perm_stmt; 4057 } 4058 4059 index = 0; 4060 first_vec_index = -1; 4061 second_vec_index = -1; 4062 noop_p = true; 4063 } 4064 } 4065 } 4066 4067 return true; 4068 } 4069 4070 typedef hash_map <vec <gimple *>, slp_tree, 4071 simple_hashmap_traits <bst_traits, slp_tree> > 4072 scalar_stmts_to_slp_tree_map_t; 4073 4074 /* Vectorize SLP instance tree in postorder. 
*/ 4075 4076 static bool 4077 vect_schedule_slp_instance (slp_tree node, slp_instance instance, 4078 scalar_stmts_to_slp_tree_map_t *bst_map) 4079 { 4080 gimple *stmt; 4081 bool grouped_store, is_store; 4082 gimple_stmt_iterator si; 4083 stmt_vec_info stmt_info; 4084 unsigned int group_size; 4085 tree vectype; 4086 int i, j; 4087 slp_tree child; 4088 4089 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def) 4090 return false; 4091 4092 /* See if we have already vectorized the same set of stmts and reuse their 4093 vectorized stmts. */ 4094 if (slp_tree *leader = bst_map->get (SLP_TREE_SCALAR_STMTS (node))) 4095 { 4096 SLP_TREE_VEC_STMTS (node).safe_splice (SLP_TREE_VEC_STMTS (*leader)); 4097 SLP_TREE_NUMBER_OF_VEC_STMTS (node) 4098 = SLP_TREE_NUMBER_OF_VEC_STMTS (*leader); 4099 return false; 4100 } 4101 4102 bst_map->put (SLP_TREE_SCALAR_STMTS (node).copy (), node); 4103 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child) 4104 vect_schedule_slp_instance (child, instance, bst_map); 4105 4106 /* Push SLP node def-type to stmts. */ 4107 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child) 4108 if (SLP_TREE_DEF_TYPE (child) != vect_internal_def) 4109 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (child), j, stmt) 4110 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmt)) = SLP_TREE_DEF_TYPE (child); 4111 4112 stmt = SLP_TREE_SCALAR_STMTS (node)[0]; 4113 stmt_info = vinfo_for_stmt (stmt); 4114 4115 /* VECTYPE is the type of the destination. 
*/ 4116 vectype = STMT_VINFO_VECTYPE (stmt_info); 4117 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype); 4118 group_size = SLP_INSTANCE_GROUP_SIZE (instance); 4119 4120 if (!SLP_TREE_VEC_STMTS (node).exists ()) 4121 SLP_TREE_VEC_STMTS (node).create (SLP_TREE_NUMBER_OF_VEC_STMTS (node)); 4122 4123 if (dump_enabled_p ()) 4124 { 4125 dump_printf_loc (MSG_NOTE,vect_location, 4126 "------>vectorizing SLP node starting from: "); 4127 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0); 4128 } 4129 4130 /* Vectorized stmts go before the last scalar stmt which is where 4131 all uses are ready. */ 4132 si = gsi_for_stmt (vect_find_last_scalar_stmt_in_slp (node)); 4133 4134 /* Mark the first element of the reduction chain as reduction to properly 4135 transform the node. In the analysis phase only the last element of the 4136 chain is marked as reduction. */ 4137 if (GROUP_FIRST_ELEMENT (stmt_info) && !STMT_VINFO_GROUPED_ACCESS (stmt_info) 4138 && GROUP_FIRST_ELEMENT (stmt_info) == stmt) 4139 { 4140 STMT_VINFO_DEF_TYPE (stmt_info) = vect_reduction_def; 4141 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type; 4142 } 4143 4144 /* Handle two-operation SLP nodes by vectorizing the group with 4145 both operations and then performing a merge. 
*/ 4146 if (SLP_TREE_TWO_OPERATORS (node)) 4147 { 4148 enum tree_code code0 = gimple_assign_rhs_code (stmt); 4149 enum tree_code ocode = ERROR_MARK; 4150 gimple *ostmt; 4151 vec_perm_builder mask (group_size, group_size, 1); 4152 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, ostmt) 4153 if (gimple_assign_rhs_code (ostmt) != code0) 4154 { 4155 mask.quick_push (1); 4156 ocode = gimple_assign_rhs_code (ostmt); 4157 } 4158 else 4159 mask.quick_push (0); 4160 if (ocode != ERROR_MARK) 4161 { 4162 vec<gimple *> v0; 4163 vec<gimple *> v1; 4164 unsigned j; 4165 tree tmask = NULL_TREE; 4166 vect_transform_stmt (stmt, &si, &grouped_store, node, instance); 4167 v0 = SLP_TREE_VEC_STMTS (node).copy (); 4168 SLP_TREE_VEC_STMTS (node).truncate (0); 4169 gimple_assign_set_rhs_code (stmt, ocode); 4170 vect_transform_stmt (stmt, &si, &grouped_store, node, instance); 4171 gimple_assign_set_rhs_code (stmt, code0); 4172 v1 = SLP_TREE_VEC_STMTS (node).copy (); 4173 SLP_TREE_VEC_STMTS (node).truncate (0); 4174 tree meltype = build_nonstandard_integer_type 4175 (GET_MODE_BITSIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype))), 1); 4176 tree mvectype = get_same_sized_vectype (meltype, vectype); 4177 unsigned k = 0, l; 4178 for (j = 0; j < v0.length (); ++j) 4179 { 4180 /* Enforced by vect_build_slp_tree, which rejects variable-length 4181 vectors for SLP_TREE_TWO_OPERATORS. */ 4182 unsigned int const_nunits = nunits.to_constant (); 4183 tree_vector_builder melts (mvectype, const_nunits, 1); 4184 for (l = 0; l < const_nunits; ++l) 4185 { 4186 if (k >= group_size) 4187 k = 0; 4188 tree t = build_int_cst (meltype, 4189 mask[k++] * const_nunits + l); 4190 melts.quick_push (t); 4191 } 4192 tmask = melts.build (); 4193 4194 /* ??? Not all targets support a VEC_PERM_EXPR with a 4195 constant mask that would translate to a vec_merge RTX 4196 (with their vec_perm_const_ok). We can either not 4197 vectorize in that case or let veclower do its job. 
4198 Unfortunately that isn't too great and at least for 4199 plus/minus we'd eventually like to match targets 4200 vector addsub instructions. */ 4201 gimple *vstmt; 4202 vstmt = gimple_build_assign (make_ssa_name (vectype), 4203 VEC_PERM_EXPR, 4204 gimple_assign_lhs (v0[j]), 4205 gimple_assign_lhs (v1[j]), tmask); 4206 vect_finish_stmt_generation (stmt, vstmt, &si); 4207 SLP_TREE_VEC_STMTS (node).quick_push (vstmt); 4208 } 4209 v0.release (); 4210 v1.release (); 4211 return false; 4212 } 4213 } 4214 is_store = vect_transform_stmt (stmt, &si, &grouped_store, node, instance); 4215 4216 /* Restore stmt def-types. */ 4217 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child) 4218 if (SLP_TREE_DEF_TYPE (child) != vect_internal_def) 4219 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (child), j, stmt) 4220 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmt)) = vect_internal_def; 4221 4222 return is_store; 4223 } 4224 4225 /* Replace scalar calls from SLP node NODE with setting of their lhs to zero. 4226 For loop vectorization this is done in vectorizable_call, but for SLP 4227 it needs to be deferred until end of vect_schedule_slp, because multiple 4228 SLP instances may refer to the same scalar stmt. 
   The call's stmt_vec_info is transferred to the replacement assignment
   so later passes still find a valid vinfo for it.  */

static void
vect_remove_slp_scalar_calls (slp_tree node)
{
  gimple *stmt, *new_stmt;
  gimple_stmt_iterator gsi;
  int i;
  slp_tree child;
  tree lhs;
  stmt_vec_info stmt_info;

  /* Only internal nodes carry scalar stmts that were vectorized.  */
  if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
    return;

  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    vect_remove_slp_scalar_calls (child);

  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
    {
      /* Skip non-calls and stmts already removed from the IL (NULL bb).  */
      if (!is_gimple_call (stmt) || gimple_bb (stmt) == NULL)
	continue;
      stmt_info = vinfo_for_stmt (stmt);
      /* Only replace calls fully covered by SLP; pattern stmts are
	 handled via their related stmt.  */
      if (stmt_info == NULL
	  || is_pattern_stmt_p (stmt_info)
	  || !PURE_SLP_STMT (stmt_info))
	continue;
      lhs = gimple_call_lhs (stmt);
      /* Replace the call with "lhs = 0" and move the stmt_vec_info over
	 to the new assignment.  */
      new_stmt = gimple_build_assign (lhs, build_zero_cst (TREE_TYPE (lhs)));
      set_vinfo_for_stmt (new_stmt, stmt_info);
      set_vinfo_for_stmt (stmt, NULL);
      STMT_VINFO_STMT (stmt_info) = new_stmt;
      gsi = gsi_for_stmt (stmt);
      gsi_replace (&gsi, new_stmt, false);
      SSA_NAME_DEF_STMT (gimple_assign_lhs (new_stmt)) = new_stmt;
    }
}

/* Generate vector code for all SLP instances in the loop/basic block.
   Returns true if the last scheduled instance rooted at a store.  */

bool
vect_schedule_slp (vec_info *vinfo)
{
  vec<slp_instance> slp_instances;
  slp_instance instance;
  unsigned int i;
  bool is_store = false;

  /* Shared across instances so identical subtrees in different instances
     are vectorized only once.  */
  scalar_stmts_to_slp_tree_map_t *bst_map
    = new scalar_stmts_to_slp_tree_map_t ();
  slp_instances = vinfo->slp_instances;
  FOR_EACH_VEC_ELT (slp_instances, i, instance)
    {
      /* Schedule the tree of INSTANCE.  */
      is_store = vect_schedule_slp_instance (SLP_INSTANCE_TREE (instance),
					     instance, bst_map);
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location,
			 "vectorizing stmts using SLP.\n");
    }
  delete bst_map;

  FOR_EACH_VEC_ELT (slp_instances, i, instance)
    {
      slp_tree root = SLP_INSTANCE_TREE (instance);
      gimple *store;
      unsigned int j;
      gimple_stmt_iterator gsi;

      /* Remove scalar call stmts.  Do not do this for basic-block
	 vectorization as not all uses may be vectorized.
	 ??? Why should this be necessary?  DCE should be able to
	 remove the stmts itself.
	 ??? For BB vectorization we can as well remove scalar
	 stmts starting from the SLP tree root if they have no
	 uses.  */
      if (is_a <loop_vec_info> (vinfo))
	vect_remove_slp_scalar_calls (root);

      /* Remove the vectorized scalar stores of this instance; at most
	 GROUP_SIZE of them, and only stmts that are data refs.  */
      for (j = 0; SLP_TREE_SCALAR_STMTS (root).iterate (j, &store)
		  && j < SLP_INSTANCE_GROUP_SIZE (instance); j++)
	{
	  if (!STMT_VINFO_DATA_REF (vinfo_for_stmt (store)))
	    break;

	  /* For pattern stmts remove the original stmt they replaced.  */
	  if (is_pattern_stmt_p (vinfo_for_stmt (store)))
	    store = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (store));
	  /* Free the attached stmt_vec_info and remove the stmt.  */
	  gsi = gsi_for_stmt (store);
	  unlink_stmt_vdef (store);
	  gsi_remove (&gsi, true);
	  release_defs (store);
	  free_stmt_vec_info (store);
	}
    }

  return is_store;
}