/* Loop Vectorization
   Copyright (C) 2003-2018 Free Software Foundation, Inc.
   Contributed by Dorit Naishlos <dorit@il.ibm.com> and
   Ira Rosen <irar@il.ibm.com>

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free
Software Foundation; either version 3, or (at your option) any later
version.

GCC is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING3.  If not see
<http://www.gnu.org/licenses/>.  */

#include "config.h"
#include "system.h"
#include "coretypes.h"
#include "backend.h"
#include "target.h"
#include "rtl.h"
#include "tree.h"
#include "gimple.h"
#include "cfghooks.h"
#include "tree-pass.h"
#include "ssa.h"
#include "optabs-tree.h"
#include "diagnostic-core.h"
#include "fold-const.h"
#include "stor-layout.h"
#include "cfganal.h"
#include "gimplify.h"
#include "gimple-iterator.h"
#include "gimplify-me.h"
#include "tree-ssa-loop-ivopts.h"
#include "tree-ssa-loop-manip.h"
#include "tree-ssa-loop-niter.h"
#include "tree-ssa-loop.h"
#include "cfgloop.h"
#include "params.h"
#include "tree-scalar-evolution.h"
#include "tree-vectorizer.h"
#include "gimple-fold.h"
#include "cgraph.h"
#include "tree-cfg.h"
#include "tree-if-conv.h"
#include "internal-fn.h"
#include "tree-vector-builder.h"
#include "vec-perm-indices.h"
#include "tree-eh.h"

/* Loop Vectorization Pass.

   This pass tries to vectorize loops.

   For example, the vectorizer transforms the following simple loop:

        short a[N]; short b[N]; short c[N]; int i;

        for (i=0; i<N; i++){
          a[i] = b[i] + c[i];
        }

   as if it were manually vectorized by rewriting the source code into:

        typedef int __attribute__((mode(V8HI))) v8hi;
        short a[N]; short b[N]; short c[N]; int i;
        v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
        v8hi va, vb, vc;

        for (i=0; i<N/8; i++){
          vb = pb[i];
          vc = pc[i];
          va = vb + vc;
          pa[i] = va;
        }

   The main entry to this pass is vectorize_loops(), in which
   the vectorizer applies a set of analyses on a given set of loops,
   followed by the actual vectorization transformation for the loops that
   had successfully passed the analysis phase.
   Throughout this pass we make a distinction between two types of
   data: scalars (which are represented by SSA_NAMES), and memory references
   ("data-refs").  These two types of data require different handling both
   during analysis and transformation.  The types of data-refs that the
   vectorizer currently supports are ARRAY_REFS whose base is an array DECL
   (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
   accesses are required to have a simple (consecutive) access pattern.

   Analysis phase:
   ===============
   The driver for the analysis phase is vect_analyze_loop().
   It applies a set of analyses, some of which rely on the scalar evolution
   analyzer (scev) developed by Sebastian Pop.

   During the analysis phase the vectorizer records some information
   per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
   loop, as well as general information about the loop as a whole, which is
   recorded in a "loop_vec_info" struct attached to each loop.

   Transformation phase:
   =====================
   The loop transformation phase scans all the stmts in the loop, and
   creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
   the loop that needs to be vectorized.  It inserts the vector code sequence
   just before the scalar stmt S, and records a pointer to the vector code
   in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
   attached to S).  This pointer will be used for the vectorization of
   following stmts which use the def of stmt S.  Stmt S is removed if it
   writes to memory; otherwise, we rely on dead code elimination for
   removing it.

   For example, say stmt S1 was vectorized into stmt VS1:

   VS1: vb = px[i];
   S1:  b = x[i];    STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
   S2:  a = b;

   To vectorize stmt S2, the vectorizer first finds the stmt that defines
   the operand 'b' (S1), and gets the relevant vector def 'vb' from the
   vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)).  The
   resulting sequence would be:

   VS1: vb = px[i];
   S1:  b = x[i];       STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
   VS2: va = vb;
   S2:  a = b;          STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2

   Operands that are not SSA_NAMEs are data-refs that appear in
   load/store operations (like 'x[i]' in S1), and are handled differently.

   Target modeling:
   =================
   Currently the only target specific information that is used is the
   size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
   Targets that can support different sizes of vectors will, for now, need
   to specify one value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".  More
   flexibility will be added in the future.

   Since we only vectorize operations whose vector form can be
   expressed using existing tree codes, to verify that an operation is
   supported, the vectorizer checks the relevant optab at the relevant
   machine_mode (e.g., optab_handler (add_optab, V8HImode)).  If
   the value found is CODE_FOR_nothing, then there's no target support, and
   we can't vectorize the stmt.

   For additional information on this project see:
   http://gcc.gnu.org/projects/tree-ssa/vectorization.html
*/
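
/* Illustrative sketch of the optab check described above, kept out of the
   build with #if 0.  The helper name is hypothetical; optab_handler,
   add_optab, V8HImode and CODE_FOR_nothing are the interfaces the comment
   above refers to.  */
#if 0
static bool
example_v8hi_add_supported_p (void)
{
  /* CODE_FOR_nothing means the target has no instruction for an addition
     in V8HImode, so a stmt needing that operation cannot be vectorized.  */
  return optab_handler (add_optab, V8HImode) != CODE_FOR_nothing;
}
#endif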

static void vect_estimate_min_profitable_iters (loop_vec_info, int *, int *);

/* Function vect_determine_vectorization_factor

   Determine the vectorization factor (VF).  VF is the number of data elements
   that are operated upon in parallel in a single iteration of the vectorized
   loop.  For example, when vectorizing a loop that operates on 4-byte
   elements, on a target with a vector size (VS) of 16 bytes, the VF is set
   to 4, since 4 elements can fit in a single vector register.

   We currently support vectorization of loops in which all types operated
   upon are of the same size.  Therefore this function currently sets VF
   according to the size of the types operated upon, and fails if there are
   multiple sizes in the loop.

   VF is also the factor by which the loop iterations are strip-mined, e.g.:
   original loop:
        for (i=0; i<N; i++){
          a[i] = b[i] + c[i];
        }

   vectorized loop:
        for (i=0; i<N; i+=VF){
          a[i:VF] = b[i:VF] + c[i:VF];
        }
*/

static bool
vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
{
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
  unsigned nbbs = loop->num_nodes;
  poly_uint64 vectorization_factor = 1;
  tree scalar_type = NULL_TREE;
  gphi *phi;
  tree vectype;
  stmt_vec_info stmt_info;
  unsigned i;
  HOST_WIDE_INT dummy;
  gimple *stmt, *pattern_stmt = NULL;
  gimple_seq pattern_def_seq = NULL;
  gimple_stmt_iterator pattern_def_si = gsi_none ();
  bool analyze_pattern_stmt = false;
  bool bool_result;
  auto_vec<stmt_vec_info> mask_producers;

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
                     "=== vect_determine_vectorization_factor ===\n");

  for (i = 0; i < nbbs; i++)
    {
      basic_block bb = bbs[i];

      for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
           gsi_next (&si))
        {
          phi = si.phi ();
          stmt_info = vinfo_for_stmt (phi);
          if (dump_enabled_p ())
            {
              dump_printf_loc (MSG_NOTE, vect_location, "==> examining phi: ");
              dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
            }

          gcc_assert (stmt_info);

          if (STMT_VINFO_RELEVANT_P (stmt_info)
              || STMT_VINFO_LIVE_P (stmt_info))
            {
              gcc_assert (!STMT_VINFO_VECTYPE (stmt_info));
              scalar_type = TREE_TYPE (PHI_RESULT (phi));

              if (dump_enabled_p ())
                {
                  dump_printf_loc (MSG_NOTE, vect_location,
                                   "get vectype for scalar type: ");
                  dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
                  dump_printf (MSG_NOTE, "\n");
                }

              vectype = get_vectype_for_scalar_type (scalar_type);
              if (!vectype)
                {
                  if (dump_enabled_p ())
                    {
                      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                       "not vectorized: unsupported "
                                       "data-type ");
                      dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                                         scalar_type);
                      dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
                    }
                  return false;
                }
              STMT_VINFO_VECTYPE (stmt_info) = vectype;

              if (dump_enabled_p ())
                {
                  dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
                  dump_generic_expr (MSG_NOTE, TDF_SLIM, vectype);
                  dump_printf (MSG_NOTE, "\n");
                }

              if (dump_enabled_p ())
                {
                  dump_printf_loc (MSG_NOTE, vect_location, "nunits = ");
                  dump_dec (MSG_NOTE, TYPE_VECTOR_SUBPARTS (vectype));
                  dump_printf (MSG_NOTE, "\n");
                }

              vect_update_max_nunits (&vectorization_factor, vectype);
            }
        }

      for (gimple_stmt_iterator si = gsi_start_bb (bb);
           !gsi_end_p (si) || analyze_pattern_stmt;)
        {
          tree vf_vectype;

          if (analyze_pattern_stmt)
            stmt = pattern_stmt;
          else
            stmt = gsi_stmt (si);

          stmt_info = vinfo_for_stmt (stmt);

          if (dump_enabled_p ())
            {
              dump_printf_loc (MSG_NOTE, vect_location,
                               "==> examining statement: ");
              dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
            }

          gcc_assert (stmt_info);

          /* Skip stmts which do not need to be vectorized.  */
          if ((!STMT_VINFO_RELEVANT_P (stmt_info)
               && !STMT_VINFO_LIVE_P (stmt_info))
              || gimple_clobber_p (stmt))
            {
              if (STMT_VINFO_IN_PATTERN_P (stmt_info)
                  && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
                  && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
                      || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
                {
                  stmt = pattern_stmt;
                  stmt_info = vinfo_for_stmt (pattern_stmt);
                  if (dump_enabled_p ())
                    {
                      dump_printf_loc (MSG_NOTE, vect_location,
                                       "==> examining pattern statement: ");
                      dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
                    }
                }
              else
                {
                  if (dump_enabled_p ())
                    dump_printf_loc (MSG_NOTE, vect_location, "skip.\n");
                  gsi_next (&si);
                  continue;
                }
            }
          else if (STMT_VINFO_IN_PATTERN_P (stmt_info)
                   && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
                   && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
                       || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
            analyze_pattern_stmt = true;

          /* If a pattern statement has def stmts, analyze them too.  */
          if (is_pattern_stmt_p (stmt_info))
            {
              if (pattern_def_seq == NULL)
                {
                  pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
                  pattern_def_si = gsi_start (pattern_def_seq);
                }
              else if (!gsi_end_p (pattern_def_si))
                gsi_next (&pattern_def_si);
              if (pattern_def_seq != NULL)
                {
                  gimple *pattern_def_stmt = NULL;
                  stmt_vec_info pattern_def_stmt_info = NULL;

                  while (!gsi_end_p (pattern_def_si))
                    {
                      pattern_def_stmt = gsi_stmt (pattern_def_si);
                      pattern_def_stmt_info
                        = vinfo_for_stmt (pattern_def_stmt);
                      if (STMT_VINFO_RELEVANT_P (pattern_def_stmt_info)
                          || STMT_VINFO_LIVE_P (pattern_def_stmt_info))
                        break;
                      gsi_next (&pattern_def_si);
                    }

                  if (!gsi_end_p (pattern_def_si))
                    {
                      if (dump_enabled_p ())
                        {
                          dump_printf_loc (MSG_NOTE, vect_location,
                                           "==> examining pattern def stmt: ");
                          dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
                                            pattern_def_stmt, 0);
                        }

                      stmt = pattern_def_stmt;
                      stmt_info = pattern_def_stmt_info;
                    }
                  else
                    {
                      pattern_def_si = gsi_none ();
                      analyze_pattern_stmt = false;
                    }
                }
              else
                analyze_pattern_stmt = false;
            }

          if (gimple_get_lhs (stmt) == NULL_TREE
              /* MASK_STORE has no lhs, but is ok.  */
              && (!is_gimple_call (stmt)
                  || !gimple_call_internal_p (stmt)
                  || gimple_call_internal_fn (stmt) != IFN_MASK_STORE))
            {
              if (is_gimple_call (stmt))
                {
                  /* Ignore calls with no lhs.  These must be calls to
                     #pragma omp simd functions, and what vectorization factor
                     it really needs can't be determined until
                     vectorizable_simd_clone_call.  */
                  if (!analyze_pattern_stmt && gsi_end_p (pattern_def_si))
                    {
                      pattern_def_seq = NULL;
                      gsi_next (&si);
                    }
                  continue;
                }
              if (dump_enabled_p ())
                {
                  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                   "not vectorized: irregular stmt.");
                  dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt,
                                    0);
                }
              return false;
            }

          if (VECTOR_MODE_P (TYPE_MODE (gimple_expr_type (stmt))))
            {
              if (dump_enabled_p ())
                {
                  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                   "not vectorized: vector stmt in loop:");
                  dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
                }
              return false;
            }

          bool_result = false;

          if (STMT_VINFO_VECTYPE (stmt_info))
            {
              /* The only case when a vectype had been already set is for
                 stmts that contain a dataref, or for "pattern-stmts" (stmts
                 generated by the vectorizer to represent/replace a certain
                 idiom).  */
              gcc_assert (STMT_VINFO_DATA_REF (stmt_info)
                          || is_pattern_stmt_p (stmt_info)
                          || !gsi_end_p (pattern_def_si));
              vectype = STMT_VINFO_VECTYPE (stmt_info);
            }
          else
            {
              gcc_assert (!STMT_VINFO_DATA_REF (stmt_info));
              if (gimple_call_internal_p (stmt, IFN_MASK_STORE))
                scalar_type = TREE_TYPE (gimple_call_arg (stmt, 3));
              else
                scalar_type = TREE_TYPE (gimple_get_lhs (stmt));

              /* Bool ops don't participate in vectorization factor
                 computation.  For comparisons, use the compared types to
                 compute a factor.  */
              if (VECT_SCALAR_BOOLEAN_TYPE_P (scalar_type)
                  && is_gimple_assign (stmt)
                  && gimple_assign_rhs_code (stmt) != COND_EXPR)
                {
                  if (STMT_VINFO_RELEVANT_P (stmt_info)
                      || STMT_VINFO_LIVE_P (stmt_info))
                    mask_producers.safe_push (stmt_info);
                  bool_result = true;

                  if (TREE_CODE_CLASS (gimple_assign_rhs_code (stmt))
                      == tcc_comparison
                      && !VECT_SCALAR_BOOLEAN_TYPE_P
                            (TREE_TYPE (gimple_assign_rhs1 (stmt))))
                    scalar_type = TREE_TYPE (gimple_assign_rhs1 (stmt));
                  else
                    {
                      if (!analyze_pattern_stmt && gsi_end_p (pattern_def_si))
                        {
                          pattern_def_seq = NULL;
                          gsi_next (&si);
                        }
                      continue;
                    }
                }

              if (dump_enabled_p ())
                {
                  dump_printf_loc (MSG_NOTE, vect_location,
                                   "get vectype for scalar type: ");
                  dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
                  dump_printf (MSG_NOTE, "\n");
                }
              vectype = get_vectype_for_scalar_type (scalar_type);
              if (!vectype)
                {
                  if (dump_enabled_p ())
                    {
                      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                       "not vectorized: unsupported "
                                       "data-type ");
                      dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                                         scalar_type);
                      dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
                    }
                  return false;
                }

              if (!bool_result)
                STMT_VINFO_VECTYPE (stmt_info) = vectype;

              if (dump_enabled_p ())
                {
                  dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
                  dump_generic_expr (MSG_NOTE, TDF_SLIM, vectype);
                  dump_printf (MSG_NOTE, "\n");
                }
            }

          /* Don't try to compute the VF from scalar types if the stmt
             produces a boolean vector; use the result vectype instead.  */
          if (VECTOR_BOOLEAN_TYPE_P (vectype))
            vf_vectype = vectype;
          else
            {
              /* The vectorization factor is according to the smallest
                 scalar type (or the largest vector size, but we only
                 support one vector size per loop).  */
              if (!bool_result)
                scalar_type = vect_get_smallest_scalar_type (stmt, &dummy,
                                                             &dummy);
              if (dump_enabled_p ())
                {
                  dump_printf_loc (MSG_NOTE, vect_location,
                                   "get vectype for scalar type: ");
                  dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
                  dump_printf (MSG_NOTE, "\n");
                }
              vf_vectype = get_vectype_for_scalar_type (scalar_type);
            }
          if (!vf_vectype)
            {
              if (dump_enabled_p ())
                {
                  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                   "not vectorized: unsupported data-type ");
                  dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                                     scalar_type);
                  dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
                }
              return false;
            }

          if (maybe_ne (GET_MODE_SIZE (TYPE_MODE (vectype)),
                        GET_MODE_SIZE (TYPE_MODE (vf_vectype))))
            {
              if (dump_enabled_p ())
                {
                  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                   "not vectorized: different sized vector "
                                   "types in statement, ");
                  dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                                     vectype);
                  dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
                  dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                                     vf_vectype);
                  dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
                }
              return false;
            }

          if (dump_enabled_p ())
            {
              dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
              dump_generic_expr (MSG_NOTE, TDF_SLIM, vf_vectype);
              dump_printf (MSG_NOTE, "\n");
            }

          if (dump_enabled_p ())
            {
              dump_printf_loc (MSG_NOTE, vect_location, "nunits = ");
              dump_dec (MSG_NOTE, TYPE_VECTOR_SUBPARTS (vf_vectype));
              dump_printf (MSG_NOTE, "\n");
            }

          vect_update_max_nunits (&vectorization_factor, vf_vectype);

          if (!analyze_pattern_stmt && gsi_end_p (pattern_def_si))
            {
              pattern_def_seq = NULL;
              gsi_next (&si);
            }
        }
    }

  /* TODO: Analyze cost.  Decide if worth while to vectorize.  */
  if (dump_enabled_p ())
    {
      dump_printf_loc (MSG_NOTE, vect_location, "vectorization factor = ");
      dump_dec (MSG_NOTE, vectorization_factor);
      dump_printf (MSG_NOTE, "\n");
    }

  if (known_le (vectorization_factor, 1U))
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "not vectorized: unsupported data-type\n");
      return false;
    }
  LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;

  for (i = 0; i < mask_producers.length (); i++)
    {
      tree mask_type = NULL;

      stmt = STMT_VINFO_STMT (mask_producers[i]);

      if (is_gimple_assign (stmt)
          && TREE_CODE_CLASS (gimple_assign_rhs_code (stmt)) == tcc_comparison
          && !VECT_SCALAR_BOOLEAN_TYPE_P
                (TREE_TYPE (gimple_assign_rhs1 (stmt))))
        {
          scalar_type = TREE_TYPE (gimple_assign_rhs1 (stmt));
          mask_type = get_mask_type_for_scalar_type (scalar_type);

          if (!mask_type)
            {
              if (dump_enabled_p ())
                dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                 "not vectorized: unsupported mask\n");
              return false;
            }
        }
      else
        {
          tree rhs;
          ssa_op_iter iter;
          gimple *def_stmt;
          enum vect_def_type dt;

          FOR_EACH_SSA_TREE_OPERAND (rhs, stmt, iter, SSA_OP_USE)
            {
              if (!vect_is_simple_use (rhs, mask_producers[i]->vinfo,
                                       &def_stmt, &dt, &vectype))
                {
                  if (dump_enabled_p ())
                    {
                      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                       "not vectorized: can't compute mask "
                                       "type for statement, ");
                      dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt,
                                        0);
                    }
                  return false;
                }

              /* No vectype probably means external definition.
                 Allow it in case there is another operand which
                 allows us to determine the mask type.  */
              if (!vectype)
                continue;

              if (!mask_type)
                mask_type = vectype;
              else if (maybe_ne (TYPE_VECTOR_SUBPARTS (mask_type),
                                 TYPE_VECTOR_SUBPARTS (vectype)))
                {
                  if (dump_enabled_p ())
                    {
                      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                       "not vectorized: different sized mask "
                                       "types in statement, ");
                      dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                                         mask_type);
                      dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
                      dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                                         vectype);
                      dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
                    }
                  return false;
                }
              else if (VECTOR_BOOLEAN_TYPE_P (mask_type)
                       != VECTOR_BOOLEAN_TYPE_P (vectype))
                {
                  if (dump_enabled_p ())
                    {
                      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                       "not vectorized: mixed mask and "
                                       "nonmask vector types in statement, ");
                      dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                                         mask_type);
                      dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
                      dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                                         vectype);
                      dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
                    }
                  return false;
                }
            }

          /* We may compare a boolean value loaded as a vector of integers.
             Fix mask_type in such a case.  */
          if (mask_type
              && !VECTOR_BOOLEAN_TYPE_P (mask_type)
              && gimple_code (stmt) == GIMPLE_ASSIGN
              && TREE_CODE_CLASS (gimple_assign_rhs_code (stmt))
                 == tcc_comparison)
            mask_type = build_same_sized_truth_vector_type (mask_type);
        }

      /* No mask_type should mean a loop invariant predicate.
         This is probably a subject for optimization in
         if-conversion.  */
      if (!mask_type)
        {
          if (dump_enabled_p ())
            {
              dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                               "not vectorized: can't compute mask type "
                               "for statement, ");
              dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt,
                                0);
            }
          return false;
        }

      STMT_VINFO_VECTYPE (mask_producers[i]) = mask_type;
    }

  return true;
}
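
/* Illustrative sketch, not part of the pass and kept out of the build with
   #if 0: for a target with a single fixed vector size, the vectorization
   factor described in the comment above vect_determine_vectorization_factor
   is just the number of smallest-type elements that fit in one vector,
   e.g. 16-byte vectors and 4-byte elements give VF == 4.  The helper name
   and parameters below are hypothetical.  */
#if 0
static unsigned int
example_vf_for_fixed_vector_size (unsigned int vector_size_in_bytes,
                                  unsigned int smallest_scalar_size_in_bytes)
{
  return vector_size_in_bytes / smallest_scalar_size_in_bytes;
}
#endif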


/* Function vect_is_simple_iv_evolution.

   FORNOW: A simple evolution of an induction variable in the loop is
   considered a polynomial evolution.  */

static bool
vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
                             tree * step)
{
  tree init_expr;
  tree step_expr;
  tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);
  basic_block bb;

  /* When there is no evolution in this loop, the evolution function
     is not "simple".  */
  if (evolution_part == NULL_TREE)
    return false;

  /* When the evolution is a polynomial of degree >= 2
     the evolution function is not "simple".  */
  if (tree_is_chrec (evolution_part))
    return false;

  step_expr = evolution_part;
  init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));

  if (dump_enabled_p ())
    {
      dump_printf_loc (MSG_NOTE, vect_location, "step: ");
      dump_generic_expr (MSG_NOTE, TDF_SLIM, step_expr);
      dump_printf (MSG_NOTE, ", init: ");
      dump_generic_expr (MSG_NOTE, TDF_SLIM, init_expr);
      dump_printf (MSG_NOTE, "\n");
    }

  *init = init_expr;
  *step = step_expr;

  if (TREE_CODE (step_expr) != INTEGER_CST
      && (TREE_CODE (step_expr) != SSA_NAME
          || ((bb = gimple_bb (SSA_NAME_DEF_STMT (step_expr)))
              && flow_bb_inside_loop_p (get_loop (cfun, loop_nb), bb))
          || (!INTEGRAL_TYPE_P (TREE_TYPE (step_expr))
              && (!SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr))
                  || !flag_associative_math)))
      && (TREE_CODE (step_expr) != REAL_CST
          || !flag_associative_math))
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "step unknown.\n");
      return false;
    }

  return true;
}

/* Function vect_analyze_scalar_cycles_1.

   Examine the cross iteration def-use cycles of scalar variables
   in LOOP.  LOOP_VINFO represents the loop that is now being
   considered for vectorization (can be LOOP, or an outer-loop
   enclosing LOOP).  */

static void
vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, struct loop *loop)
{
  basic_block bb = loop->header;
  tree init, step;
  auto_vec<gimple *, 64> worklist;
  gphi_iterator gsi;
  bool double_reduc;

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
                     "=== vect_analyze_scalar_cycles ===\n");

  /* First - identify all inductions.  Reduction detection assumes that all
     the inductions have been identified, therefore, this order must not be
     changed.  */
  for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
    {
      gphi *phi = gsi.phi ();
      tree access_fn = NULL;
      tree def = PHI_RESULT (phi);
      stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi);

      if (dump_enabled_p ())
        {
          dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: ");
          dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
        }

      /* Skip virtual phi's.  The data dependences that are associated with
         virtual defs/uses (i.e., memory accesses) are analyzed elsewhere.  */
      if (virtual_operand_p (def))
        continue;

      STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type;

      /* Analyze the evolution function.  */
      access_fn = analyze_scalar_evolution (loop, def);
      if (access_fn)
        {
          STRIP_NOPS (access_fn);
          if (dump_enabled_p ())
            {
              dump_printf_loc (MSG_NOTE, vect_location,
                               "Access function of PHI: ");
              dump_generic_expr (MSG_NOTE, TDF_SLIM, access_fn);
              dump_printf (MSG_NOTE, "\n");
            }
          STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
            = initial_condition_in_loop_num (access_fn, loop->num);
          STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo)
            = evolution_part_in_loop_num (access_fn, loop->num);
        }

      if (!access_fn
          || !vect_is_simple_iv_evolution (loop->num, access_fn, &init, &step)
          || (LOOP_VINFO_LOOP (loop_vinfo) != loop
              && TREE_CODE (step) != INTEGER_CST))
        {
          worklist.safe_push (phi);
          continue;
        }

      gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
                  != NULL_TREE);
      gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo) != NULL_TREE);

      if (dump_enabled_p ())
        dump_printf_loc (MSG_NOTE, vect_location, "Detected induction.\n");
      STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
    }


  /* Second - identify all reductions and nested cycles.  */
  while (worklist.length () > 0)
    {
      gimple *phi = worklist.pop ();
      tree def = PHI_RESULT (phi);
      stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi);
      gimple *reduc_stmt;

      if (dump_enabled_p ())
        {
          dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: ");
          dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
        }

      gcc_assert (!virtual_operand_p (def)
                  && STMT_VINFO_DEF_TYPE (stmt_vinfo)
                     == vect_unknown_def_type);

      reduc_stmt = vect_force_simple_reduction (loop_vinfo, phi,
                                                &double_reduc, false);
      if (reduc_stmt)
        {
          if (double_reduc)
            {
              if (dump_enabled_p ())
                dump_printf_loc (MSG_NOTE, vect_location,
                                 "Detected double reduction.\n");

              STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
              STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
                vect_double_reduction_def;
            }
          else
            {
              if (loop != LOOP_VINFO_LOOP (loop_vinfo))
                {
                  if (dump_enabled_p ())
                    dump_printf_loc (MSG_NOTE, vect_location,
                                     "Detected vectorizable nested cycle.\n");

                  STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
                  STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
                    vect_nested_cycle;
                }
              else
                {
                  if (dump_enabled_p ())
                    dump_printf_loc (MSG_NOTE, vect_location,
                                     "Detected reduction.\n");

                  STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
                  STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
                    vect_reduction_def;
                  /* Store the reduction cycles for possible vectorization in
                     loop-aware SLP if it was not detected as reduction
                     chain.  */
                  if (! GROUP_FIRST_ELEMENT (vinfo_for_stmt (reduc_stmt)))
                    LOOP_VINFO_REDUCTIONS (loop_vinfo).safe_push (reduc_stmt);
                }
            }
        }
      else
        if (dump_enabled_p ())
          dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                           "Unknown def-use cycle pattern.\n");
    }
}


/* Function vect_analyze_scalar_cycles.

   Examine the cross iteration def-use cycles of scalar variables, by
   analyzing the loop-header PHIs of scalar variables.  Classify each
   cycle as one of the following: invariant, induction, reduction, unknown.
   We do that for the loop represented by LOOP_VINFO, and also for its
   inner loop, if it exists.
   Examples for scalar cycles:

   Example1: reduction:

              loop1:
              for (i=0; i<N; i++)
                 sum += a[i];

   Example2: induction:

              loop2:
              for (i=0; i<N; i++)
                 a[i] = i;  */

static void
vect_analyze_scalar_cycles (loop_vec_info loop_vinfo)
{
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);

  vect_analyze_scalar_cycles_1 (loop_vinfo, loop);

  /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
     Reductions in such an inner-loop therefore have different properties
     than the reductions in the nest that gets vectorized:
     1. When vectorized, they are executed in the same order as in the
        original scalar loop, so we can't change the order of computation
        when vectorizing them.
     2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
        current checks are too strict.  */

  if (loop->inner)
    vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner);
}

/* Transfer group and reduction information from STMT to its pattern stmt.  */

static void
vect_fixup_reduc_chain (gimple *stmt)
{
  gimple *firstp = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
  gimple *stmtp;
  gcc_assert (!GROUP_FIRST_ELEMENT (vinfo_for_stmt (firstp))
              && GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)));
  GROUP_SIZE (vinfo_for_stmt (firstp)) = GROUP_SIZE (vinfo_for_stmt (stmt));
  do
    {
      stmtp = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
      GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmtp)) = firstp;
      stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt));
      if (stmt)
        GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmtp))
          = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
    }
  while (stmt);
  STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmtp)) = vect_reduction_def;
}

/* Fixup scalar cycles that now have their stmts detected as patterns.  */

static void
vect_fixup_scalar_cycles_with_patterns (loop_vec_info loop_vinfo)
{
  gimple *first;
  unsigned i;

  FOR_EACH_VEC_ELT (LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo), i, first)
    if (STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (first)))
      {
        gimple *next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (first));
        while (next)
          {
            if (! STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (next)))
              break;
            next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
          }
        /* If not all stmts in the chain are patterns, try to handle
           the chain without patterns.  */
        if (! next)
          {
            vect_fixup_reduc_chain (first);
            LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo)[i]
              = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (first));
          }
      }
}

/* Function vect_get_loop_niters.

   Determine how many iterations the loop is executed and place it
   in NUMBER_OF_ITERATIONS.  Place the number of latch iterations
   in NUMBER_OF_ITERATIONSM1.  Place the condition under which the
   niter information holds in ASSUMPTIONS.

   Return the loop exit condition.  */


static gcond *
vect_get_loop_niters (struct loop *loop, tree *assumptions,
                      tree *number_of_iterations, tree *number_of_iterationsm1)
{
  edge exit = single_exit (loop);
  struct tree_niter_desc niter_desc;
  tree niter_assumptions, niter, may_be_zero;
  gcond *cond = get_loop_exit_condition (loop);

  *assumptions = boolean_true_node;
  *number_of_iterationsm1 = chrec_dont_know;
  *number_of_iterations = chrec_dont_know;
  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
                     "=== get_loop_niters ===\n");

  if (!exit)
    return cond;

  niter = chrec_dont_know;
  may_be_zero = NULL_TREE;
  niter_assumptions = boolean_true_node;
  if (!number_of_iterations_exit_assumptions (loop, exit, &niter_desc, NULL)
      || chrec_contains_undetermined (niter_desc.niter))
    return cond;

  niter_assumptions = niter_desc.assumptions;
  may_be_zero = niter_desc.may_be_zero;
  niter = niter_desc.niter;

  if (may_be_zero && integer_zerop (may_be_zero))
    may_be_zero = NULL_TREE;

  if (may_be_zero)
    {
      if (COMPARISON_CLASS_P (may_be_zero))
        {
          /* Try to combine may_be_zero with assumptions, this can simplify
             computation of niter expression.  */
          if (niter_assumptions && !integer_nonzerop (niter_assumptions))
            niter_assumptions = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
                                             niter_assumptions,
                                             fold_build1 (TRUTH_NOT_EXPR,
                                                          boolean_type_node,
                                                          may_be_zero));
          else
            niter = fold_build3 (COND_EXPR, TREE_TYPE (niter), may_be_zero,
                                 build_int_cst (TREE_TYPE (niter), 0),
                                 rewrite_to_non_trapping_overflow (niter));

          may_be_zero = NULL_TREE;
        }
      else if (integer_nonzerop (may_be_zero))
        {
          *number_of_iterationsm1 = build_int_cst (TREE_TYPE (niter), 0);
          *number_of_iterations = build_int_cst (TREE_TYPE (niter), 1);
          return cond;
        }
      else
        return cond;
    }

  *assumptions = niter_assumptions;
  *number_of_iterationsm1 = niter;

  /* We want the number of loop header executions which is the number
     of latch executions plus one.
     ???  For UINT_MAX latch executions this number overflows to zero
     for loops like do { n++; } while (n != 0);  */
  if (niter && !chrec_contains_undetermined (niter))
    niter = fold_build2 (PLUS_EXPR, TREE_TYPE (niter), unshare_expr (niter),
                         build_int_cst (TREE_TYPE (niter), 1));
  *number_of_iterations = niter;

  return cond;
}

/* Function bb_in_loop_p

   Used as predicate for dfs order traversal of the loop bbs.  */

static bool
bb_in_loop_p (const_basic_block bb, const void *data)
{
  const struct loop *const loop = (const struct loop *)data;
  if (flow_bb_inside_loop_p (loop, bb))
    return true;
  return false;
}


/* Create and initialize a new loop_vec_info struct for LOOP_IN, as well as
   stmt_vec_info structs for all the stmts in LOOP_IN.  */

_loop_vec_info::_loop_vec_info (struct loop *loop_in)
  : vec_info (vec_info::loop, init_cost (loop_in)),
    loop (loop_in),
    bbs (XCNEWVEC (basic_block, loop->num_nodes)),
    num_itersm1 (NULL_TREE),
    num_iters (NULL_TREE),
    num_iters_unchanged (NULL_TREE),
    num_iters_assumptions (NULL_TREE),
    th (0),
    versioning_threshold (0),
    vectorization_factor (0),
    max_vectorization_factor (0),
    mask_skip_niters (NULL_TREE),
    mask_compare_type (NULL_TREE),
    unaligned_dr (NULL),
    peeling_for_alignment (0),
    ptr_mask (0),
    ivexpr_map (NULL),
    slp_unrolling_factor (1),
    single_scalar_iteration_cost (0),
    vectorizable (false),
    can_fully_mask_p (true),
    fully_masked_p (false),
    peeling_for_gaps (false),
    peeling_for_niter (false),
    operands_swapped (false),
    no_data_dependencies (false),
    has_mask_store (false),
    scalar_loop (NULL),
    orig_loop_info (NULL)
{
  /* Create/Update stmt_info for all stmts in the loop.  */
  basic_block *body = get_loop_body (loop);
  for (unsigned int i = 0; i < loop->num_nodes; i++)
    {
      basic_block bb = body[i];
      gimple_stmt_iterator si;

      for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
        {
          gimple *phi = gsi_stmt (si);
          gimple_set_uid (phi, 0);
          set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, this));
        }

      for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
        {
          gimple *stmt = gsi_stmt (si);
          gimple_set_uid (stmt, 0);
          set_vinfo_for_stmt (stmt, new_stmt_vec_info (stmt, this));
        }
    }
  free (body);

  /* CHECKME: We want to visit all BBs before their successors (except for
     latch blocks, for which this assertion wouldn't hold).  In the simple
     case of the loop forms we allow, a dfs order of the BBs would be the
     same as reversed postorder traversal, so we are safe.  */

  unsigned int nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p,
                                          bbs, loop->num_nodes, loop);
  gcc_assert (nbbs == loop->num_nodes);
}

/* Free all levels of MASKS.  */

void
release_vec_loop_masks (vec_loop_masks *masks)
{
  rgroup_masks *rgm;
  unsigned int i;
  FOR_EACH_VEC_ELT (*masks, i, rgm)
    rgm->masks.release ();
  masks->release ();
}

/* Free all memory used by the _loop_vec_info, as well as all the
   stmt_vec_info structs of all the stmts in the loop.  */

_loop_vec_info::~_loop_vec_info ()
{
  int nbbs;
  gimple_stmt_iterator si;
  int j;

  nbbs = loop->num_nodes;
  for (j = 0; j < nbbs; j++)
    {
      basic_block bb = bbs[j];
      for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
        free_stmt_vec_info (gsi_stmt (si));

      for (si = gsi_start_bb (bb); !gsi_end_p (si); )
        {
          gimple *stmt = gsi_stmt (si);

          /* We may have broken canonical form by moving a constant
             into RHS1 of a commutative op.  Fix such occurrences.  */
          if (operands_swapped && is_gimple_assign (stmt))
            {
              enum tree_code code = gimple_assign_rhs_code (stmt);

              if ((code == PLUS_EXPR
                   || code == POINTER_PLUS_EXPR
                   || code == MULT_EXPR)
                  && CONSTANT_CLASS_P (gimple_assign_rhs1 (stmt)))
                swap_ssa_operands (stmt,
                                   gimple_assign_rhs1_ptr (stmt),
                                   gimple_assign_rhs2_ptr (stmt));
              else if (code == COND_EXPR
                       && CONSTANT_CLASS_P (gimple_assign_rhs2 (stmt)))
                {
                  tree cond_expr = gimple_assign_rhs1 (stmt);
                  enum tree_code cond_code = TREE_CODE (cond_expr);

                  if (TREE_CODE_CLASS (cond_code) == tcc_comparison)
                    {
                      bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr,
                                                                  0));
                      cond_code = invert_tree_comparison (cond_code,
                                                          honor_nans);
                      if (cond_code != ERROR_MARK)
                        {
                          TREE_SET_CODE (cond_expr, cond_code);
                          swap_ssa_operands (stmt,
                                             gimple_assign_rhs2_ptr (stmt),
                                             gimple_assign_rhs3_ptr (stmt));
                        }
                    }
                }
            }

          /* Free stmt_vec_info.  */
          free_stmt_vec_info (stmt);
          gsi_next (&si);
        }
    }

  free (bbs);

  release_vec_loop_masks (&masks);
  delete ivexpr_map;

  loop->aux = NULL;
}

/* Return an invariant or register for EXPR and emit necessary
   computations in the LOOP_VINFO loop preheader.  */

tree
cse_and_gimplify_to_preheader (loop_vec_info loop_vinfo, tree expr)
{
  if (is_gimple_reg (expr)
      || is_gimple_min_invariant (expr))
    return expr;

  if (! loop_vinfo->ivexpr_map)
    loop_vinfo->ivexpr_map = new hash_map<tree_operand_hash, tree>;
  tree &cached = loop_vinfo->ivexpr_map->get_or_insert (expr);
  if (! cached)
    {
      gimple_seq stmts = NULL;
      cached = force_gimple_operand (unshare_expr (expr),
                                     &stmts, true, NULL_TREE);
      if (stmts)
        {
          edge e = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
          gsi_insert_seq_on_edge_immediate (e, stmts);
        }
    }
  return cached;
}

/* Return true if we can use CMP_TYPE as the comparison type to produce
   all masks required to mask LOOP_VINFO.  */

static bool
can_produce_all_loop_masks_p (loop_vec_info loop_vinfo, tree cmp_type)
{
  rgroup_masks *rgm;
  unsigned int i;
  FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), i, rgm)
    if (rgm->mask_type != NULL_TREE
        && !direct_internal_fn_supported_p (IFN_WHILE_ULT,
                                            cmp_type, rgm->mask_type,
                                            OPTIMIZE_FOR_SPEED))
      return false;
  return true;
}

/* Calculate the maximum number of scalars per iteration for every
   rgroup in LOOP_VINFO.  */

static unsigned int
vect_get_max_nscalars_per_iter (loop_vec_info loop_vinfo)
{
  unsigned int res = 1;
  unsigned int i;
  rgroup_masks *rgm;
  FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), i, rgm)
    res = MAX (res, rgm->max_nscalars_per_iter);
  return res;
}

/* Each statement in LOOP_VINFO can be masked where necessary.  Check
   whether we can actually generate the masks required.  Return true if so,
   storing the type of the scalar IV in LOOP_VINFO_MASK_COMPARE_TYPE.  */

static bool
vect_verify_full_masking (loop_vec_info loop_vinfo)
{
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  unsigned int min_ni_width;

  /* Use a normal loop if there are no statements that need masking.
     This only happens in rare degenerate cases: it means that the loop
     has no loads, no stores, and no live-out values.  */
  if (LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
    return false;

  /* Get the maximum number of iterations that is representable
     in the counter type.  */
  tree ni_type = TREE_TYPE (LOOP_VINFO_NITERSM1 (loop_vinfo));
  widest_int max_ni = wi::to_widest (TYPE_MAX_VALUE (ni_type)) + 1;

  /* Get a more refined estimate for the number of iterations.  */
  widest_int max_back_edges;
  if (max_loop_iterations (loop, &max_back_edges))
    max_ni = wi::smin (max_ni, max_back_edges + 1);

  /* Account for rgroup masks, in which each bit is replicated N times.  */
  max_ni *= vect_get_max_nscalars_per_iter (loop_vinfo);

  /* Work out how many bits we need to represent the limit.  */
  min_ni_width = wi::min_precision (max_ni, UNSIGNED);

  /* Find a scalar mode for which WHILE_ULT is supported.  */
  opt_scalar_int_mode cmp_mode_iter;
  tree cmp_type = NULL_TREE;
  FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
    {
      unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
      if (cmp_bits >= min_ni_width
          && targetm.scalar_mode_supported_p (cmp_mode_iter.require ()))
        {
          tree this_type = build_nonstandard_integer_type (cmp_bits, true);
          if (this_type
              && can_produce_all_loop_masks_p (loop_vinfo, this_type))
            {
              /* Although we could stop as soon as we find a valid mode,
                 it's often better to continue until we hit Pmode, since the
                 operands to the WHILE are more likely to be reusable in
                 address calculations.  */
              cmp_type = this_type;
              if (cmp_bits >= GET_MODE_BITSIZE (Pmode))
                break;
            }
        }
    }

  if (!cmp_type)
    return false;

  LOOP_VINFO_MASK_COMPARE_TYPE (loop_vinfo) = cmp_type;
  return true;
}

/* Calculate the cost of one scalar iteration of the loop.  */
static void
vect_compute_single_scalar_iteration_cost (loop_vec_info loop_vinfo)
{
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
  int nbbs = loop->num_nodes, factor;
  int innerloop_iters, i;

  /* Gather costs for statements in the scalar loop.  */

  /* FORNOW.  */
  innerloop_iters = 1;
  if (loop->inner)
    innerloop_iters = 50; /* FIXME */

  for (i = 0; i < nbbs; i++)
    {
      gimple_stmt_iterator si;
      basic_block bb = bbs[i];

      if (bb->loop_father == loop->inner)
        factor = innerloop_iters;
      else
        factor = 1;

      for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
        {
          gimple *stmt = gsi_stmt (si);
          stmt_vec_info stmt_info = vinfo_for_stmt (stmt);

          if (!is_gimple_assign (stmt) && !is_gimple_call (stmt))
            continue;

          /* Skip stmts that are not vectorized inside the loop.  */
          if (stmt_info
              && !STMT_VINFO_RELEVANT_P (stmt_info)
              && (!STMT_VINFO_LIVE_P (stmt_info)
                  || !VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
              && !STMT_VINFO_IN_PATTERN_P (stmt_info))
            continue;

          vect_cost_for_stmt kind;
          if (STMT_VINFO_DATA_REF (stmt_info))
            {
              if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
                kind = scalar_load;
              else
                kind = scalar_store;
            }
          else
            kind = scalar_stmt;

          record_stmt_cost (&LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
                            factor, kind, stmt_info, 0, vect_prologue);
        }
    }

  /* Now accumulate cost.  */
  void *target_cost_data = init_cost (loop);
  stmt_info_for_cost *si;
  int j;
  FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
                    j, si)
    {
      struct _stmt_vec_info *stmt_info
        = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
      (void) add_stmt_cost (target_cost_data, si->count,
                            si->kind, stmt_info, si->misalign,
                            vect_body);
    }
  unsigned dummy, body_cost = 0;
  finish_cost (target_cost_data, &dummy, &body_cost, &dummy);
  destroy_cost_data (target_cost_data);
  LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo) = body_cost;
}


/* Function vect_analyze_loop_form_1.

   Verify that certain CFG restrictions hold, including:
   - the loop has a pre-header
   - the loop has a single entry and exit
   - the loop exit condition is simple enough
   - the number of iterations can be analyzed, i.e., a countable loop.  The
     niter could be analyzed under some assumptions.  */

bool
vect_analyze_loop_form_1 (struct loop *loop, gcond **loop_cond,
                          tree *assumptions, tree *number_of_iterationsm1,
                          tree *number_of_iterations, gcond **inner_loop_cond)
{
  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
                     "=== vect_analyze_loop_form ===\n");

  /* Different restrictions apply when we are considering an inner-most loop,
     vs. an outer (nested) loop.
     (FORNOW.  May want to relax some of these restrictions in the future).  */

  if (!loop->inner)
    {
      /* Inner-most loop.  We currently require that the number of BBs is
         exactly 2 (the header and latch).  Vectorizable inner-most loops
         look like this:

                     (pre-header)
                        |
                       header <--------+
                        | |            |
                        | +--> latch --+
                        |
                     (exit-bb)  */

      if (loop->num_nodes != 2)
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "not vectorized: control flow in loop.\n");
          return false;
        }

      if (empty_block_p (loop->header))
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "not vectorized: empty loop.\n");
          return false;
        }
    }
  else
    {
      struct loop *innerloop = loop->inner;
      edge entryedge;

      /* Nested loop.  We currently require that the loop is doubly-nested,
         contains a single inner loop, and the number of BBs is exactly 5.
         Vectorizable outer-loops look like this:

                        (pre-header)
                           |
                          header <---+
                           |         |
                          inner-loop |
                           |         |
                          tail ------+
                           |
                        (exit-bb)

         The inner-loop has the properties expected of inner-most loops
         as described above.  */

      if ((loop->inner)->inner || (loop->inner)->next)
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "not vectorized: multiple nested loops.\n");
          return false;
        }

      if (loop->num_nodes != 5)
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "not vectorized: control flow in loop.\n");
          return false;
        }

      entryedge = loop_preheader_edge (innerloop);
      if (entryedge->src != loop->header
          || !single_exit (innerloop)
          || single_exit (innerloop)->dest != EDGE_PRED (loop->latch, 0)->src)
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "not vectorized: unsupported outerloop form.\n");
          return false;
        }

      /* Analyze the inner-loop.  */
      tree inner_niterm1, inner_niter, inner_assumptions;
      if (! vect_analyze_loop_form_1 (loop->inner, inner_loop_cond,
                                      &inner_assumptions, &inner_niterm1,
                                      &inner_niter, NULL)
          /* Don't support analyzing niter under assumptions for inner
             loop.  */
          || !integer_onep (inner_assumptions))
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "not vectorized: Bad inner loop.\n");
          return false;
        }

      if (!expr_invariant_in_loop_p (loop, inner_niter))
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "not vectorized: inner-loop count not"
                             " invariant.\n");
          return false;
        }

      if (dump_enabled_p ())
        dump_printf_loc (MSG_NOTE, vect_location,
                         "Considering outer-loop vectorization.\n");
    }

  if (!single_exit (loop)
      || EDGE_COUNT (loop->header->preds) != 2)
    {
      if (dump_enabled_p ())
        {
          if (!single_exit (loop))
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "not vectorized: multiple exits.\n");
          else if (EDGE_COUNT (loop->header->preds) != 2)
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "not vectorized: too many incoming edges.\n");
        }
      return false;
    }

  /* We assume that the loop exit condition is at the end of the loop, i.e.,
     that the loop is represented as a do-while (with a proper if-guard
     before the loop if needed), where the loop header contains all the
     executable statements, and the latch is empty.  */
  if (!empty_block_p (loop->latch)
      || !gimple_seq_empty_p (phi_nodes (loop->latch)))
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "not vectorized: latch block not empty.\n");
      return false;
    }

  /* Make sure the exit is not abnormal.  */
  edge e = single_exit (loop);
  if (e->flags & EDGE_ABNORMAL)
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "not vectorized: abnormal loop exit edge.\n");
      return false;
    }

  *loop_cond = vect_get_loop_niters (loop, assumptions, number_of_iterations,
                                     number_of_iterationsm1);
  if (!*loop_cond)
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "not vectorized: complicated exit condition.\n");
      return false;
    }

  if (integer_zerop (*assumptions)
      || !*number_of_iterations
      || chrec_contains_undetermined (*number_of_iterations))
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "not vectorized: number of iterations cannot be "
                         "computed.\n");
      return false;
    }

  if (integer_zerop (*number_of_iterations))
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "not vectorized: number of iterations = 0.\n");
      return false;
    }

  return true;
}

/* Analyze LOOP form and return a loop_vec_info if it is of suitable form.  */

loop_vec_info
vect_analyze_loop_form (struct loop *loop)
{
  tree assumptions, number_of_iterations, number_of_iterationsm1;
  gcond *loop_cond, *inner_loop_cond = NULL;

  if (! vect_analyze_loop_form_1 (loop, &loop_cond,
                                  &assumptions, &number_of_iterationsm1,
                                  &number_of_iterations, &inner_loop_cond))
    return NULL;

  loop_vec_info loop_vinfo = new _loop_vec_info (loop);
  LOOP_VINFO_NITERSM1 (loop_vinfo) = number_of_iterationsm1;
  LOOP_VINFO_NITERS (loop_vinfo) = number_of_iterations;
  LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = number_of_iterations;
  if (!integer_onep (assumptions))
    {
      /* We consider vectorizing this loop by versioning it under
         some assumptions.  In order to do this, we need to clear
         existing information computed by the scev and niter analyzers.  */
      scev_reset_htab ();
      free_numbers_of_iterations_estimates (loop);
      /* Also set a flag for this loop so that the following scev and niter
         analyses are done under the assumptions.  */
      loop_constraint_set (loop, LOOP_C_FINITE);
      /* Also record the assumptions for versioning.  */
      LOOP_VINFO_NITERS_ASSUMPTIONS (loop_vinfo) = assumptions;
    }

  if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
    {
      if (dump_enabled_p ())
        {
          dump_printf_loc (MSG_NOTE, vect_location,
                           "Symbolic number of iterations is ");
          dump_generic_expr (MSG_NOTE, TDF_DETAILS, number_of_iterations);
          dump_printf (MSG_NOTE, "\n");
        }
    }

  STMT_VINFO_TYPE (vinfo_for_stmt (loop_cond)) = loop_exit_ctrl_vec_info_type;
  if (inner_loop_cond)
    STMT_VINFO_TYPE (vinfo_for_stmt (inner_loop_cond))
      = loop_exit_ctrl_vec_info_type;

  gcc_assert (!loop->aux);
  loop->aux = loop_vinfo;
  return loop_vinfo;
}



/* Scan the loop stmts and, depending on whether there are any (non-)SLP
   statements, update the vectorization factor.  */

static void
vect_update_vf_for_slp (loop_vec_info loop_vinfo)
{
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
  int nbbs = loop->num_nodes;
  poly_uint64 vectorization_factor;
  int i;

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
                     "=== vect_update_vf_for_slp ===\n");

  vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
  gcc_assert (known_ne (vectorization_factor, 0U));

  /* If all the stmts in the loop can be SLPed, we perform only SLP, and
     the vectorization factor of the loop is the unrolling factor required
     by the SLP instances.  If that unrolling factor is 1, we say that we
     perform pure SLP on the loop; cross-iteration parallelism is not
     exploited.  */
  bool only_slp_in_loop = true;
  for (i = 0; i < nbbs; i++)
    {
      basic_block bb = bbs[i];
      for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
           gsi_next (&si))
        {
          gimple *stmt = gsi_stmt (si);
          stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
          if (STMT_VINFO_IN_PATTERN_P (stmt_info)
              && STMT_VINFO_RELATED_STMT (stmt_info))
            {
              stmt = STMT_VINFO_RELATED_STMT (stmt_info);
              stmt_info = vinfo_for_stmt (stmt);
            }
          if ((STMT_VINFO_RELEVANT_P (stmt_info)
               || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
              && !PURE_SLP_STMT (stmt_info))
            /* STMT needs both SLP and loop-based vectorization.  */
            only_slp_in_loop = false;
        }
    }

  if (only_slp_in_loop)
    {
      dump_printf_loc (MSG_NOTE, vect_location,
                       "Loop contains only SLP stmts\n");
      vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo);
    }
  else
    {
      dump_printf_loc (MSG_NOTE, vect_location,
                       "Loop contains SLP and non-SLP stmts\n");
      /* Both the vectorization factor and unroll factor have the form
         current_vector_size * X for some rational X, so they must have
         a common multiple.  */
      vectorization_factor
        = force_common_multiple (vectorization_factor,
                                 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo));
    }

  LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
  if (dump_enabled_p ())
    {
      dump_printf_loc (MSG_NOTE, vect_location,
                       "Updating vectorization factor to ");
      dump_dec (MSG_NOTE, vectorization_factor);
      dump_printf (MSG_NOTE, ".\n");
    }
}

/* Return true if STMT_INFO describes a double reduction phi and if
   the other phi in the reduction is also relevant for vectorization.
   This rejects cases such as:

      outer1:
        x_1 = PHI <x_3(outer2), ...>;
        ...

      inner:
        x_2 = ...;
        ...

      outer2:
        x_3 = PHI <x_2(inner)>;

   if nothing in x_2 or elsewhere makes x_1 relevant.  */

static bool
vect_active_double_reduction_p (stmt_vec_info stmt_info)
{
  if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
    return false;

  gimple *other_phi = STMT_VINFO_REDUC_DEF (stmt_info);
  return STMT_VINFO_RELEVANT_P (vinfo_for_stmt (other_phi));
}

/* Function vect_analyze_loop_operations.

   Scan the loop stmts and make sure they are all vectorizable.  */
*/ 1818 1819 static bool 1820 vect_analyze_loop_operations (loop_vec_info loop_vinfo) 1821 { 1822 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo); 1823 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo); 1824 int nbbs = loop->num_nodes; 1825 int i; 1826 stmt_vec_info stmt_info; 1827 bool need_to_vectorize = false; 1828 bool ok; 1829 1830 if (dump_enabled_p ()) 1831 dump_printf_loc (MSG_NOTE, vect_location, 1832 "=== vect_analyze_loop_operations ===\n"); 1833 1834 for (i = 0; i < nbbs; i++) 1835 { 1836 basic_block bb = bbs[i]; 1837 1838 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si); 1839 gsi_next (&si)) 1840 { 1841 gphi *phi = si.phi (); 1842 ok = true; 1843 1844 stmt_info = vinfo_for_stmt (phi); 1845 if (dump_enabled_p ()) 1846 { 1847 dump_printf_loc (MSG_NOTE, vect_location, "examining phi: "); 1848 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0); 1849 } 1850 if (virtual_operand_p (gimple_phi_result (phi))) 1851 continue; 1852 1853 /* Inner-loop loop-closed exit phi in outer-loop vectorization 1854 (i.e., a phi in the tail of the outer-loop). */ 1855 if (! is_loop_header_bb_p (bb)) 1856 { 1857 /* FORNOW: we currently don't support the case that these phis 1858 are not used in the outerloop (unless it is double reduction, 1859 i.e., this phi is vect_reduction_def), cause this case 1860 requires to actually do something here. */ 1861 if (STMT_VINFO_LIVE_P (stmt_info) 1862 && !vect_active_double_reduction_p (stmt_info)) 1863 { 1864 if (dump_enabled_p ()) 1865 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 1866 "Unsupported loop-closed phi in " 1867 "outer-loop.\n"); 1868 return false; 1869 } 1870 1871 /* If PHI is used in the outer loop, we check that its operand 1872 is defined in the inner loop. */ 1873 if (STMT_VINFO_RELEVANT_P (stmt_info)) 1874 { 1875 tree phi_op; 1876 gimple *op_def_stmt; 1877 1878 if (gimple_phi_num_args (phi) != 1) 1879 return false; 1880 1881 phi_op = PHI_ARG_DEF (phi, 0); 1882 if (TREE_CODE (phi_op) != SSA_NAME) 1883 return false; 1884 1885 op_def_stmt = SSA_NAME_DEF_STMT (phi_op); 1886 if (gimple_nop_p (op_def_stmt) 1887 || !flow_bb_inside_loop_p (loop, gimple_bb (op_def_stmt)) 1888 || !vinfo_for_stmt (op_def_stmt)) 1889 return false; 1890 1891 if (STMT_VINFO_RELEVANT (vinfo_for_stmt (op_def_stmt)) 1892 != vect_used_in_outer 1893 && STMT_VINFO_RELEVANT (vinfo_for_stmt (op_def_stmt)) 1894 != vect_used_in_outer_by_reduction) 1895 return false; 1896 } 1897 1898 continue; 1899 } 1900 1901 gcc_assert (stmt_info); 1902 1903 if ((STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope 1904 || STMT_VINFO_LIVE_P (stmt_info)) 1905 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def) 1906 { 1907 /* A scalar-dependence cycle that we don't support. */ 1908 if (dump_enabled_p ()) 1909 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 1910 "not vectorized: scalar dependence cycle.\n"); 1911 return false; 1912 } 1913 1914 if (STMT_VINFO_RELEVANT_P (stmt_info)) 1915 { 1916 need_to_vectorize = true; 1917 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def 1918 && ! PURE_SLP_STMT (stmt_info)) 1919 ok = vectorizable_induction (phi, NULL, NULL, NULL); 1920 else if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def 1921 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle) 1922 && ! PURE_SLP_STMT (stmt_info)) 1923 ok = vectorizable_reduction (phi, NULL, NULL, NULL, NULL); 1924 } 1925 1926 /* SLP PHIs are tested by vect_slp_analyze_node_operations. 
*/ 1927 if (ok 1928 && STMT_VINFO_LIVE_P (stmt_info) 1929 && !PURE_SLP_STMT (stmt_info)) 1930 ok = vectorizable_live_operation (phi, NULL, NULL, -1, NULL); 1931 1932 if (!ok) 1933 { 1934 if (dump_enabled_p ()) 1935 { 1936 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 1937 "not vectorized: relevant phi not " 1938 "supported: "); 1939 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, phi, 0); 1940 } 1941 return false; 1942 } 1943 } 1944 1945 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si); 1946 gsi_next (&si)) 1947 { 1948 gimple *stmt = gsi_stmt (si); 1949 if (!gimple_clobber_p (stmt) 1950 && !vect_analyze_stmt (stmt, &need_to_vectorize, NULL, NULL)) 1951 return false; 1952 } 1953 } /* bbs */ 1954 1955 /* All operations in the loop are either irrelevant (deal with loop 1956 control, or dead), or only used outside the loop and can be moved 1957 out of the loop (e.g. invariants, inductions). The loop can be 1958 optimized away by scalar optimizations. We're better off not 1959 touching this loop. */ 1960 if (!need_to_vectorize) 1961 { 1962 if (dump_enabled_p ()) 1963 dump_printf_loc (MSG_NOTE, vect_location, 1964 "All the computation can be taken out of the loop.\n"); 1965 if (dump_enabled_p ()) 1966 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 1967 "not vectorized: redundant loop. no profit to " 1968 "vectorize.\n"); 1969 return false; 1970 } 1971 1972 return true; 1973 } 1974 1975 /* Analyze the cost of the loop described by LOOP_VINFO. Decide if it 1976 is worthwhile to vectorize. Return 1 if definitely yes, 0 if 1977 definitely no, or -1 if it's worth retrying. */ 1978 1979 static int 1980 vect_analyze_loop_costing (loop_vec_info loop_vinfo) 1981 { 1982 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo); 1983 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo); 1984 1985 /* Only fully-masked loops can have iteration counts less than the 1986 vectorization factor. */ 1987 if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)) 1988 { 1989 HOST_WIDE_INT max_niter; 1990 1991 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)) 1992 max_niter = LOOP_VINFO_INT_NITERS (loop_vinfo); 1993 else 1994 max_niter = max_stmt_executions_int (loop); 1995 1996 if (max_niter != -1 1997 && (unsigned HOST_WIDE_INT) max_niter < assumed_vf) 1998 { 1999 if (dump_enabled_p ()) 2000 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 2001 "not vectorized: iteration count smaller than " 2002 "vectorization factor.\n"); 2003 return 0; 2004 } 2005 } 2006 2007 int min_profitable_iters, min_profitable_estimate; 2008 vect_estimate_min_profitable_iters (loop_vinfo, &min_profitable_iters, 2009 &min_profitable_estimate); 2010 2011 if (min_profitable_iters < 0) 2012 { 2013 if (dump_enabled_p ()) 2014 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 2015 "not vectorized: vectorization not profitable.\n"); 2016 if (dump_enabled_p ()) 2017 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 2018 "not vectorized: vector version will never be " 2019 "profitable.\n"); 2020 return -1; 2021 } 2022 2023 int min_scalar_loop_bound = (PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND) 2024 * assumed_vf); 2025 2026 /* Use the cost model only if it is more conservative than user specified 2027 threshold. 
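   As a worked example with made-up numbers: with
   --param min-vect-loop-bound=3 and an assumed VF of 4,
   min_scalar_loop_bound is 3 * 4 = 12; if the cost model computed
   min_profitable_iters = 9, the threshold below becomes MAX (12, 9) = 12,
   so a loop known to run fewer than 12 iterations is rejected.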
*/ 2028 unsigned int th = (unsigned) MAX (min_scalar_loop_bound, 2029 min_profitable_iters); 2030 2031 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = th; 2032 2033 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) 2034 && LOOP_VINFO_INT_NITERS (loop_vinfo) < th) 2035 { 2036 if (dump_enabled_p ()) 2037 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 2038 "not vectorized: vectorization not profitable.\n"); 2039 if (dump_enabled_p ()) 2040 dump_printf_loc (MSG_NOTE, vect_location, 2041 "not vectorized: iteration count smaller than user " 2042 "specified loop bound parameter or minimum profitable " 2043 "iterations (whichever is more conservative).\n"); 2044 return 0; 2045 } 2046 2047 HOST_WIDE_INT estimated_niter = estimated_stmt_executions_int (loop); 2048 if (estimated_niter == -1) 2049 estimated_niter = likely_max_stmt_executions_int (loop); 2050 if (estimated_niter != -1 2051 && ((unsigned HOST_WIDE_INT) estimated_niter 2052 < MAX (th, (unsigned) min_profitable_estimate))) 2053 { 2054 if (dump_enabled_p ()) 2055 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 2056 "not vectorized: estimated iteration count too " 2057 "small.\n"); 2058 if (dump_enabled_p ()) 2059 dump_printf_loc (MSG_NOTE, vect_location, 2060 "not vectorized: estimated iteration count smaller " 2061 "than specified loop bound parameter or minimum " 2062 "profitable iterations (whichever is more " 2063 "conservative).\n"); 2064 return -1; 2065 } 2066 2067 return 1; 2068 } 2069 2070 2071 /* Function vect_analyze_loop_2. 2072 2073 Apply a set of analyses on LOOP, and create a loop_vec_info struct 2074 for it. The different analyses will record information in the 2075 loop_vec_info struct. */ 2076 static bool 2077 vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal) 2078 { 2079 bool ok; 2080 int res; 2081 unsigned int max_vf = MAX_VECTORIZATION_FACTOR; 2082 poly_uint64 min_vf = 2; 2083 unsigned int n_stmts = 0; 2084 2085 /* The first group of checks is independent of the vector size. */ 2086 fatal = true; 2087 2088 /* Find all data references in the loop (which correspond to vdefs/vuses) 2089 and analyze their evolution in the loop. 
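   For instance, in a statement like 'y[i] = w[i] * z[i];' the loads from
   w[i] and z[i] and the store to y[i] are the data references to record;
   the multiplication itself only involves SSA names.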
*/ 2090 2091 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo); 2092 2093 loop_p loop = LOOP_VINFO_LOOP (loop_vinfo); 2094 if (!find_loop_nest (loop, &LOOP_VINFO_LOOP_NEST (loop_vinfo))) 2095 { 2096 if (dump_enabled_p ()) 2097 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 2098 "not vectorized: loop nest containing two " 2099 "or more consecutive inner loops cannot be " 2100 "vectorized\n"); 2101 return false; 2102 } 2103 2104 for (unsigned i = 0; i < loop->num_nodes; i++) 2105 for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]); 2106 !gsi_end_p (gsi); gsi_next (&gsi)) 2107 { 2108 gimple *stmt = gsi_stmt (gsi); 2109 if (is_gimple_debug (stmt)) 2110 continue; 2111 ++n_stmts; 2112 if (!find_data_references_in_stmt (loop, stmt, 2113 &LOOP_VINFO_DATAREFS (loop_vinfo))) 2114 { 2115 if (is_gimple_call (stmt) && loop->safelen) 2116 { 2117 tree fndecl = gimple_call_fndecl (stmt), op; 2118 if (fndecl != NULL_TREE) 2119 { 2120 cgraph_node *node = cgraph_node::get (fndecl); 2121 if (node != NULL && node->simd_clones != NULL) 2122 { 2123 unsigned int j, n = gimple_call_num_args (stmt); 2124 for (j = 0; j < n; j++) 2125 { 2126 op = gimple_call_arg (stmt, j); 2127 if (DECL_P (op) 2128 || (REFERENCE_CLASS_P (op) 2129 && get_base_address (op))) 2130 break; 2131 } 2132 op = gimple_call_lhs (stmt); 2133 /* Ignore #pragma omp declare simd functions 2134 if they don't have data references in the 2135 call stmt itself. */ 2136 if (j == n 2137 && !(op 2138 && (DECL_P (op) 2139 || (REFERENCE_CLASS_P (op) 2140 && get_base_address (op))))) 2141 continue; 2142 } 2143 } 2144 } 2145 if (dump_enabled_p ()) 2146 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 2147 "not vectorized: loop contains function " 2148 "calls or data references that cannot " 2149 "be analyzed\n"); 2150 return false; 2151 } 2152 } 2153 2154 /* Analyze the data references and also adjust the minimal 2155 vectorization factor according to the loads and stores. */ 2156 2157 ok = vect_analyze_data_refs (loop_vinfo, &min_vf); 2158 if (!ok) 2159 { 2160 if (dump_enabled_p ()) 2161 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 2162 "bad data references.\n"); 2163 return false; 2164 } 2165 2166 /* Classify all cross-iteration scalar data-flow cycles. 2167 Cross-iteration cycles caused by virtual phis are analyzed separately. */ 2168 vect_analyze_scalar_cycles (loop_vinfo); 2169 2170 vect_pattern_recog (loop_vinfo); 2171 2172 vect_fixup_scalar_cycles_with_patterns (loop_vinfo); 2173 2174 /* Analyze the access patterns of the data-refs in the loop (consecutive, 2175 complex, etc.). FORNOW: Only handle consecutive access pattern. */ 2176 2177 ok = vect_analyze_data_ref_accesses (loop_vinfo); 2178 if (!ok) 2179 { 2180 if (dump_enabled_p ()) 2181 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 2182 "bad data access.\n"); 2183 return false; 2184 } 2185 2186 /* Data-flow analysis to detect stmts that do not need to be vectorized. */ 2187 2188 ok = vect_mark_stmts_to_be_vectorized (loop_vinfo); 2189 if (!ok) 2190 { 2191 if (dump_enabled_p ()) 2192 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 2193 "unexpected pattern.\n"); 2194 return false; 2195 } 2196 2197 /* While the rest of the analysis below depends on it in some way. */ 2198 fatal = false; 2199 2200 /* Analyze data dependences between the data-refs in the loop 2201 and adjust the maximum vectorization factor according to 2202 the dependences. 2203 FORNOW: fail at the first data dependence that we encounter. 
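   As an illustrative sketch: in

     for (i = 0; i < n; i++)
       x[i + 4] = x[i] + 1;

   the value stored in one iteration is read four iterations later, so
   only vectorization factors up to 4 preserve the scalar semantics, and
   the dependence analysis would lower max_vf accordingly.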
*/
2204
2205 ok = vect_analyze_data_ref_dependences (loop_vinfo, &max_vf);
2206 if (!ok
2207 || (max_vf != MAX_VECTORIZATION_FACTOR
2208 && maybe_lt (max_vf, min_vf)))
2209 {
2210 if (dump_enabled_p ())
2211 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2212 "bad data dependence.\n");
2213 return false;
2214 }
2215 LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo) = max_vf;
2216
2217 ok = vect_determine_vectorization_factor (loop_vinfo);
2218 if (!ok)
2219 {
2220 if (dump_enabled_p ())
2221 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2222 "can't determine vectorization factor.\n");
2223 return false;
2224 }
2225 if (max_vf != MAX_VECTORIZATION_FACTOR
2226 && maybe_lt (max_vf, LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
2227 {
2228 if (dump_enabled_p ())
2229 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2230 "bad data dependence.\n");
2231 return false;
2232 }
2233
2234 /* Compute the scalar iteration cost. */
2235 vect_compute_single_scalar_iteration_cost (loop_vinfo);
2236
2237 poly_uint64 saved_vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2238 unsigned th;
2239
2240 /* Check the SLP opportunities in the loop, analyze and build SLP trees. */
2241 ok = vect_analyze_slp (loop_vinfo, n_stmts);
2242 if (!ok)
2243 return false;
2244
2245 /* If there are any SLP instances mark them as pure_slp. */
2246 bool slp = vect_make_slp_decision (loop_vinfo);
2247 if (slp)
2248 {
2249 /* Find stmts that need to be both vectorized and SLPed. */
2250 vect_detect_hybrid_slp (loop_vinfo);
2251
2252 /* Update the vectorization factor based on the SLP decision. */
2253 vect_update_vf_for_slp (loop_vinfo);
2254 }
2255
2256 bool saved_can_fully_mask_p = LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo);
2257
2258 /* We don't expect to have to roll back to anything other than an empty
2259 set of rgroups. */
2260 gcc_assert (LOOP_VINFO_MASKS (loop_vinfo).is_empty ());
2261
2262 /* This is the point where we can re-start analysis with SLP forced off. */
2263 start_over:
2264
2265 /* Now the vectorization factor is final. */
2266 poly_uint64 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2267 gcc_assert (known_ne (vectorization_factor, 0U));
2268
2269 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && dump_enabled_p ())
2270 {
2271 dump_printf_loc (MSG_NOTE, vect_location,
2272 "vectorization_factor = ");
2273 dump_dec (MSG_NOTE, vectorization_factor);
2274 dump_printf (MSG_NOTE, ", niters = " HOST_WIDE_INT_PRINT_DEC "\n",
2275 LOOP_VINFO_INT_NITERS (loop_vinfo));
2276 }
2277
2278 HOST_WIDE_INT max_niter
2279 = likely_max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
2280
2281 /* Analyze the alignment of the data-refs in the loop.
2282 Fail if a data reference is found that cannot be vectorized. */
2283
2284 ok = vect_analyze_data_refs_alignment (loop_vinfo);
2285 if (!ok)
2286 {
2287 if (dump_enabled_p ())
2288 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2289 "bad data alignment.\n");
2290 return false;
2291 }
2292
2293 /* Prune the list of ddrs to be tested at run-time by versioning for alias.
2294 It is important to call pruning after vect_analyze_data_ref_accesses,
2295 since we use grouping information gathered by interleaving analysis. */
2296 ok = vect_prune_runtime_alias_test_list (loop_vinfo);
2297 if (!ok)
2298 return false;
2299
2300 /* Do not invoke vect_enhance_data_refs_alignment for epilogue
2301 vectorization.
*/
2302 if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2303 {
2304 /* This pass will decide on using loop versioning and/or loop peeling in
2305 order to enhance the alignment of data references in the loop. */
2306 ok = vect_enhance_data_refs_alignment (loop_vinfo);
2307 if (!ok)
2308 {
2309 if (dump_enabled_p ())
2310 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2311 "bad data alignment.\n");
2312 return false;
2313 }
2314 }
2315
2316 if (slp)
2317 {
2318 /* Analyze operations in the SLP instances. Note this may
2319 remove unsupported SLP instances which makes the above
2320 SLP kind detection invalid. */
2321 unsigned old_size = LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length ();
2322 vect_slp_analyze_operations (loop_vinfo);
2323 if (LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length () != old_size)
2324 goto again;
2325 }
2326
2327 /* Scan all the remaining operations in the loop that are not subject
2328 to SLP and make sure they are vectorizable. */
2329 ok = vect_analyze_loop_operations (loop_vinfo);
2330 if (!ok)
2331 {
2332 if (dump_enabled_p ())
2333 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2334 "bad operation or unsupported loop bound.\n");
2335 return false;
2336 }
2337
2338 /* Decide whether to use a fully-masked loop for this vectorization
2339 factor. */
2340 LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
2341 = (LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo)
2342 && vect_verify_full_masking (loop_vinfo));
2343 if (dump_enabled_p ())
2344 {
2345 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
2346 dump_printf_loc (MSG_NOTE, vect_location,
2347 "using a fully-masked loop.\n");
2348 else
2349 dump_printf_loc (MSG_NOTE, vect_location,
2350 "not using a fully-masked loop.\n");
2351 }
2352
2353 /* If an epilogue loop is required because of data accesses with gaps,
2354 one additional iteration needs to be peeled. Check if there are
2355 enough iterations for vectorization. */
2356 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2357 && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2358 && !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
2359 {
2360 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2361 tree scalar_niters = LOOP_VINFO_NITERSM1 (loop_vinfo);
2362
2363 if (known_lt (wi::to_widest (scalar_niters), vf))
2364 {
2365 if (dump_enabled_p ())
2366 dump_printf_loc (MSG_NOTE, vect_location,
2367 "loop does not have enough iterations to support"
2368 " peeling for gaps.\n");
2369 return false;
2370 }
2371 }
2372
2373 /* Check that the costings of the loop make vectorizing worthwhile. */
2374 res = vect_analyze_loop_costing (loop_vinfo);
2375 if (res < 0)
2376 goto again;
2377 if (!res)
2378 {
2379 if (dump_enabled_p ())
2380 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2381 "Loop costings not worthwhile.\n");
2382 return false;
2383 }
2384
2385 /* Decide whether we need to create an epilogue loop to handle
2386 remaining scalar iterations. */
2387 th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
2388
2389 unsigned HOST_WIDE_INT const_vf;
2390 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
2391 /* The main loop handles all iterations. */
2392 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
2393 else if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2394 && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) >= 0)
2395 {
2396 /* Work out the (constant) number of iterations that need to be
2397 peeled for reasons other than niters.
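   A small worked example with invented numbers: with VF = 4, a known
   niters of 23, one iteration peeled for alignment and one more for
   gaps, peel_niter is 2 and 23 - 2 = 21 is not a multiple of 4, so an
   epilogue loop is still needed for the leftover scalar iterations.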
*/ 2398 unsigned int peel_niter = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo); 2399 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)) 2400 peel_niter += 1; 2401 if (!multiple_p (LOOP_VINFO_INT_NITERS (loop_vinfo) - peel_niter, 2402 LOOP_VINFO_VECT_FACTOR (loop_vinfo))) 2403 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true; 2404 } 2405 else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) 2406 /* ??? When peeling for gaps but not alignment, we could 2407 try to check whether the (variable) niters is known to be 2408 VF * N + 1. That's something of a niche case though. */ 2409 || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) 2410 || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&const_vf) 2411 || ((tree_ctz (LOOP_VINFO_NITERS (loop_vinfo)) 2412 < (unsigned) exact_log2 (const_vf)) 2413 /* In case of versioning, check if the maximum number of 2414 iterations is greater than th. If they are identical, 2415 the epilogue is unnecessary. */ 2416 && (!LOOP_REQUIRES_VERSIONING (loop_vinfo) 2417 || ((unsigned HOST_WIDE_INT) max_niter 2418 > (th / const_vf) * const_vf)))) 2419 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true; 2420 2421 /* If an epilogue loop is required make sure we can create one. */ 2422 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) 2423 || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)) 2424 { 2425 if (dump_enabled_p ()) 2426 dump_printf_loc (MSG_NOTE, vect_location, "epilog loop required\n"); 2427 if (!vect_can_advance_ivs_p (loop_vinfo) 2428 || !slpeel_can_duplicate_loop_p (LOOP_VINFO_LOOP (loop_vinfo), 2429 single_exit (LOOP_VINFO_LOOP 2430 (loop_vinfo)))) 2431 { 2432 if (dump_enabled_p ()) 2433 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 2434 "not vectorized: can't create required " 2435 "epilog loop\n"); 2436 goto again; 2437 } 2438 } 2439 2440 /* During peeling, we need to check if number of loop iterations is 2441 enough for both peeled prolog loop and vector loop. This check 2442 can be merged along with threshold check of loop versioning, so 2443 increase threshold for this case if necessary. */ 2444 if (LOOP_REQUIRES_VERSIONING (loop_vinfo)) 2445 { 2446 poly_uint64 niters_th = 0; 2447 2448 if (!vect_use_loop_mask_for_alignment_p (loop_vinfo)) 2449 { 2450 /* Niters for peeled prolog loop. */ 2451 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0) 2452 { 2453 struct data_reference *dr = LOOP_VINFO_UNALIGNED_DR (loop_vinfo); 2454 tree vectype 2455 = STMT_VINFO_VECTYPE (vinfo_for_stmt (DR_STMT (dr))); 2456 niters_th += TYPE_VECTOR_SUBPARTS (vectype) - 1; 2457 } 2458 else 2459 niters_th += LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo); 2460 } 2461 2462 /* Niters for at least one iteration of vectorized loop. */ 2463 if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)) 2464 niters_th += LOOP_VINFO_VECT_FACTOR (loop_vinfo); 2465 /* One additional iteration because of peeling for gap. */ 2466 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)) 2467 niters_th += 1; 2468 LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = niters_th; 2469 } 2470 2471 gcc_assert (known_eq (vectorization_factor, 2472 LOOP_VINFO_VECT_FACTOR (loop_vinfo))); 2473 2474 /* Ok to vectorize! */ 2475 return true; 2476 2477 again: 2478 /* Try again with SLP forced off but if we didn't do any SLP there is 2479 no point in re-trying. */ 2480 if (!slp) 2481 return false; 2482 2483 /* If there are reduction chains re-trying will fail anyway. */ 2484 if (! 
LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).is_empty ()) 2485 return false; 2486 2487 /* Likewise if the grouped loads or stores in the SLP cannot be handled 2488 via interleaving or lane instructions. */ 2489 slp_instance instance; 2490 slp_tree node; 2491 unsigned i, j; 2492 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance) 2493 { 2494 stmt_vec_info vinfo; 2495 vinfo = vinfo_for_stmt 2496 (SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (instance))[0]); 2497 if (! STMT_VINFO_GROUPED_ACCESS (vinfo)) 2498 continue; 2499 vinfo = vinfo_for_stmt (STMT_VINFO_GROUP_FIRST_ELEMENT (vinfo)); 2500 unsigned int size = STMT_VINFO_GROUP_SIZE (vinfo); 2501 tree vectype = STMT_VINFO_VECTYPE (vinfo); 2502 if (! vect_store_lanes_supported (vectype, size, false) 2503 && ! known_eq (TYPE_VECTOR_SUBPARTS (vectype), 1U) 2504 && ! vect_grouped_store_supported (vectype, size)) 2505 return false; 2506 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), j, node) 2507 { 2508 vinfo = vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (node)[0]); 2509 vinfo = vinfo_for_stmt (STMT_VINFO_GROUP_FIRST_ELEMENT (vinfo)); 2510 bool single_element_p = !STMT_VINFO_GROUP_NEXT_ELEMENT (vinfo); 2511 size = STMT_VINFO_GROUP_SIZE (vinfo); 2512 vectype = STMT_VINFO_VECTYPE (vinfo); 2513 if (! vect_load_lanes_supported (vectype, size, false) 2514 && ! vect_grouped_load_supported (vectype, single_element_p, 2515 size)) 2516 return false; 2517 } 2518 } 2519 2520 if (dump_enabled_p ()) 2521 dump_printf_loc (MSG_NOTE, vect_location, 2522 "re-trying with SLP disabled\n"); 2523 2524 /* Roll back state appropriately. No SLP this time. */ 2525 slp = false; 2526 /* Restore vectorization factor as it were without SLP. */ 2527 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = saved_vectorization_factor; 2528 /* Free the SLP instances. */ 2529 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), j, instance) 2530 vect_free_slp_instance (instance); 2531 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release (); 2532 /* Reset SLP type to loop_vect on all stmts. */ 2533 for (i = 0; i < LOOP_VINFO_LOOP (loop_vinfo)->num_nodes; ++i) 2534 { 2535 basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i]; 2536 for (gimple_stmt_iterator si = gsi_start_phis (bb); 2537 !gsi_end_p (si); gsi_next (&si)) 2538 { 2539 stmt_vec_info stmt_info = vinfo_for_stmt (gsi_stmt (si)); 2540 STMT_SLP_TYPE (stmt_info) = loop_vect; 2541 } 2542 for (gimple_stmt_iterator si = gsi_start_bb (bb); 2543 !gsi_end_p (si); gsi_next (&si)) 2544 { 2545 stmt_vec_info stmt_info = vinfo_for_stmt (gsi_stmt (si)); 2546 STMT_SLP_TYPE (stmt_info) = loop_vect; 2547 if (STMT_VINFO_IN_PATTERN_P (stmt_info)) 2548 { 2549 stmt_info = vinfo_for_stmt (STMT_VINFO_RELATED_STMT (stmt_info)); 2550 STMT_SLP_TYPE (stmt_info) = loop_vect; 2551 for (gimple_stmt_iterator pi 2552 = gsi_start (STMT_VINFO_PATTERN_DEF_SEQ (stmt_info)); 2553 !gsi_end_p (pi); gsi_next (&pi)) 2554 { 2555 gimple *pstmt = gsi_stmt (pi); 2556 STMT_SLP_TYPE (vinfo_for_stmt (pstmt)) = loop_vect; 2557 } 2558 } 2559 } 2560 } 2561 /* Free optimized alias test DDRS. */ 2562 LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).truncate (0); 2563 LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).release (); 2564 LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).release (); 2565 /* Reset target cost data. */ 2566 destroy_cost_data (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo)); 2567 LOOP_VINFO_TARGET_COST_DATA (loop_vinfo) 2568 = init_cost (LOOP_VINFO_LOOP (loop_vinfo)); 2569 /* Reset accumulated rgroup information. */ 2570 release_vec_loop_masks (&LOOP_VINFO_MASKS (loop_vinfo)); 2571 /* Reset assorted flags. 
*/ 2572 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false; 2573 LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = false; 2574 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = 0; 2575 LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = 0; 2576 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = saved_can_fully_mask_p; 2577 2578 goto start_over; 2579 } 2580 2581 /* Function vect_analyze_loop. 2582 2583 Apply a set of analyses on LOOP, and create a loop_vec_info struct 2584 for it. The different analyses will record information in the 2585 loop_vec_info struct. If ORIG_LOOP_VINFO is not NULL epilogue must 2586 be vectorized. */ 2587 loop_vec_info 2588 vect_analyze_loop (struct loop *loop, loop_vec_info orig_loop_vinfo) 2589 { 2590 loop_vec_info loop_vinfo; 2591 auto_vector_sizes vector_sizes; 2592 2593 /* Autodetect first vector size we try. */ 2594 current_vector_size = 0; 2595 targetm.vectorize.autovectorize_vector_sizes (&vector_sizes); 2596 unsigned int next_size = 0; 2597 2598 if (dump_enabled_p ()) 2599 dump_printf_loc (MSG_NOTE, vect_location, 2600 "===== analyze_loop_nest =====\n"); 2601 2602 if (loop_outer (loop) 2603 && loop_vec_info_for_loop (loop_outer (loop)) 2604 && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop)))) 2605 { 2606 if (dump_enabled_p ()) 2607 dump_printf_loc (MSG_NOTE, vect_location, 2608 "outer-loop already vectorized.\n"); 2609 return NULL; 2610 } 2611 2612 poly_uint64 autodetected_vector_size = 0; 2613 while (1) 2614 { 2615 /* Check the CFG characteristics of the loop (nesting, entry/exit). */ 2616 loop_vinfo = vect_analyze_loop_form (loop); 2617 if (!loop_vinfo) 2618 { 2619 if (dump_enabled_p ()) 2620 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 2621 "bad loop form.\n"); 2622 return NULL; 2623 } 2624 2625 bool fatal = false; 2626 2627 if (orig_loop_vinfo) 2628 LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = orig_loop_vinfo; 2629 2630 if (vect_analyze_loop_2 (loop_vinfo, fatal)) 2631 { 2632 LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1; 2633 2634 return loop_vinfo; 2635 } 2636 2637 delete loop_vinfo; 2638 2639 if (next_size == 0) 2640 autodetected_vector_size = current_vector_size; 2641 2642 if (next_size < vector_sizes.length () 2643 && known_eq (vector_sizes[next_size], autodetected_vector_size)) 2644 next_size += 1; 2645 2646 if (fatal 2647 || next_size == vector_sizes.length () 2648 || known_eq (current_vector_size, 0U)) 2649 return NULL; 2650 2651 /* Try the next biggest vector size. */ 2652 current_vector_size = vector_sizes[next_size++]; 2653 if (dump_enabled_p ()) 2654 { 2655 dump_printf_loc (MSG_NOTE, vect_location, 2656 "***** Re-trying analysis with " 2657 "vector size "); 2658 dump_dec (MSG_NOTE, current_vector_size); 2659 dump_printf (MSG_NOTE, "\n"); 2660 } 2661 } 2662 } 2663 2664 /* Return true if there is an in-order reduction function for CODE, storing 2665 it in *REDUC_FN if so. */ 2666 2667 static bool 2668 fold_left_reduction_fn (tree_code code, internal_fn *reduc_fn) 2669 { 2670 switch (code) 2671 { 2672 case PLUS_EXPR: 2673 *reduc_fn = IFN_FOLD_LEFT_PLUS; 2674 return true; 2675 2676 default: 2677 return false; 2678 } 2679 } 2680 2681 /* Function reduction_fn_for_scalar_code 2682 2683 Input: 2684 CODE - tree_code of a reduction operations. 2685 2686 Output: 2687 REDUC_FN - the corresponding internal function to be used to reduce the 2688 vector of partial results into a single scalar result, or IFN_LAST 2689 if the operation is a supported reduction operation, but does not have 2690 such an internal function. 
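   For example, MAX_EXPR maps to IFN_REDUC_MAX, whereas MULT_EXPR is
   accepted as a reduction but yields IFN_LAST, since there is no
   internal function that reduces a vector by multiplication.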
2691 2692 Return FALSE if CODE currently cannot be vectorized as reduction. */ 2693 2694 static bool 2695 reduction_fn_for_scalar_code (enum tree_code code, internal_fn *reduc_fn) 2696 { 2697 switch (code) 2698 { 2699 case MAX_EXPR: 2700 *reduc_fn = IFN_REDUC_MAX; 2701 return true; 2702 2703 case MIN_EXPR: 2704 *reduc_fn = IFN_REDUC_MIN; 2705 return true; 2706 2707 case PLUS_EXPR: 2708 *reduc_fn = IFN_REDUC_PLUS; 2709 return true; 2710 2711 case BIT_AND_EXPR: 2712 *reduc_fn = IFN_REDUC_AND; 2713 return true; 2714 2715 case BIT_IOR_EXPR: 2716 *reduc_fn = IFN_REDUC_IOR; 2717 return true; 2718 2719 case BIT_XOR_EXPR: 2720 *reduc_fn = IFN_REDUC_XOR; 2721 return true; 2722 2723 case MULT_EXPR: 2724 case MINUS_EXPR: 2725 *reduc_fn = IFN_LAST; 2726 return true; 2727 2728 default: 2729 return false; 2730 } 2731 } 2732 2733 /* If there is a neutral value X such that SLP reduction NODE would not 2734 be affected by the introduction of additional X elements, return that X, 2735 otherwise return null. CODE is the code of the reduction. REDUC_CHAIN 2736 is true if the SLP statements perform a single reduction, false if each 2737 statement performs an independent reduction. */ 2738 2739 static tree 2740 neutral_op_for_slp_reduction (slp_tree slp_node, tree_code code, 2741 bool reduc_chain) 2742 { 2743 vec<gimple *> stmts = SLP_TREE_SCALAR_STMTS (slp_node); 2744 gimple *stmt = stmts[0]; 2745 stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt); 2746 tree vector_type = STMT_VINFO_VECTYPE (stmt_vinfo); 2747 tree scalar_type = TREE_TYPE (vector_type); 2748 struct loop *loop = gimple_bb (stmt)->loop_father; 2749 gcc_assert (loop); 2750 2751 switch (code) 2752 { 2753 case WIDEN_SUM_EXPR: 2754 case DOT_PROD_EXPR: 2755 case SAD_EXPR: 2756 case PLUS_EXPR: 2757 case MINUS_EXPR: 2758 case BIT_IOR_EXPR: 2759 case BIT_XOR_EXPR: 2760 return build_zero_cst (scalar_type); 2761 2762 case MULT_EXPR: 2763 return build_one_cst (scalar_type); 2764 2765 case BIT_AND_EXPR: 2766 return build_all_ones_cst (scalar_type); 2767 2768 case MAX_EXPR: 2769 case MIN_EXPR: 2770 /* For MIN/MAX the initial values are neutral. A reduction chain 2771 has only a single initial value, so that value is neutral for 2772 all statements. */ 2773 if (reduc_chain) 2774 return PHI_ARG_DEF_FROM_EDGE (stmt, loop_preheader_edge (loop)); 2775 return NULL_TREE; 2776 2777 default: 2778 return NULL_TREE; 2779 } 2780 } 2781 2782 /* Error reporting helper for vect_is_simple_reduction below. GIMPLE statement 2783 STMT is printed with a message MSG. */ 2784 2785 static void 2786 report_vect_op (dump_flags_t msg_type, gimple *stmt, const char *msg) 2787 { 2788 dump_printf_loc (msg_type, vect_location, "%s", msg); 2789 dump_gimple_stmt (msg_type, TDF_SLIM, stmt, 0); 2790 } 2791 2792 2793 /* Detect SLP reduction of the form: 2794 2795 #a1 = phi <a5, a0> 2796 a2 = operation (a1) 2797 a3 = operation (a2) 2798 a4 = operation (a3) 2799 a5 = operation (a4) 2800 2801 #a = phi <a5> 2802 2803 PHI is the reduction phi node (#a1 = phi <a5, a0> above) 2804 FIRST_STMT is the first reduction stmt in the chain 2805 (a2 = operation (a1)). 2806 2807 Return TRUE if a reduction chain was detected. 
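   As an illustration (hypothetical source), a chain like the above is
   what an unrolled accumulation such as

     for (i = 0; i < n; i += 4)
       s = s + a[i] + a[i+1] + a[i+2] + a[i+3];

   gimplifies to: each addition feeds the next and only the last result
   reaches the loop-closed PHI.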
*/ 2808 2809 static bool 2810 vect_is_slp_reduction (loop_vec_info loop_info, gimple *phi, 2811 gimple *first_stmt) 2812 { 2813 struct loop *loop = (gimple_bb (phi))->loop_father; 2814 struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info); 2815 enum tree_code code; 2816 gimple *loop_use_stmt = NULL; 2817 stmt_vec_info use_stmt_info; 2818 tree lhs; 2819 imm_use_iterator imm_iter; 2820 use_operand_p use_p; 2821 int nloop_uses, size = 0, n_out_of_loop_uses; 2822 bool found = false; 2823 2824 if (loop != vect_loop) 2825 return false; 2826 2827 auto_vec<stmt_vec_info, 8> reduc_chain; 2828 lhs = PHI_RESULT (phi); 2829 code = gimple_assign_rhs_code (first_stmt); 2830 while (1) 2831 { 2832 nloop_uses = 0; 2833 n_out_of_loop_uses = 0; 2834 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs) 2835 { 2836 gimple *use_stmt = USE_STMT (use_p); 2837 if (is_gimple_debug (use_stmt)) 2838 continue; 2839 2840 /* Check if we got back to the reduction phi. */ 2841 if (use_stmt == phi) 2842 { 2843 loop_use_stmt = use_stmt; 2844 found = true; 2845 break; 2846 } 2847 2848 if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))) 2849 { 2850 loop_use_stmt = use_stmt; 2851 nloop_uses++; 2852 } 2853 else 2854 n_out_of_loop_uses++; 2855 2856 /* There are can be either a single use in the loop or two uses in 2857 phi nodes. */ 2858 if (nloop_uses > 1 || (n_out_of_loop_uses && nloop_uses)) 2859 return false; 2860 } 2861 2862 if (found) 2863 break; 2864 2865 /* We reached a statement with no loop uses. */ 2866 if (nloop_uses == 0) 2867 return false; 2868 2869 /* This is a loop exit phi, and we haven't reached the reduction phi. */ 2870 if (gimple_code (loop_use_stmt) == GIMPLE_PHI) 2871 return false; 2872 2873 if (!is_gimple_assign (loop_use_stmt) 2874 || code != gimple_assign_rhs_code (loop_use_stmt) 2875 || !flow_bb_inside_loop_p (loop, gimple_bb (loop_use_stmt))) 2876 return false; 2877 2878 /* Insert USE_STMT into reduction chain. */ 2879 use_stmt_info = vinfo_for_stmt (loop_use_stmt); 2880 reduc_chain.safe_push (use_stmt_info); 2881 2882 lhs = gimple_assign_lhs (loop_use_stmt); 2883 size++; 2884 } 2885 2886 if (!found || loop_use_stmt != phi || size < 2) 2887 return false; 2888 2889 /* Swap the operands, if needed, to make the reduction operand be the second 2890 operand. */ 2891 lhs = PHI_RESULT (phi); 2892 for (unsigned i = 0; i < reduc_chain.length (); ++i) 2893 { 2894 gassign *next_stmt = as_a <gassign *> (reduc_chain[i]->stmt); 2895 if (gimple_assign_rhs2 (next_stmt) == lhs) 2896 { 2897 tree op = gimple_assign_rhs1 (next_stmt); 2898 gimple *def_stmt = NULL; 2899 2900 if (TREE_CODE (op) == SSA_NAME) 2901 def_stmt = SSA_NAME_DEF_STMT (op); 2902 2903 /* Check that the other def is either defined in the loop 2904 ("vect_internal_def"), or it's an induction (defined by a 2905 loop-header phi-node). 
*/ 2906 if (def_stmt 2907 && gimple_bb (def_stmt) 2908 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt)) 2909 && (is_gimple_assign (def_stmt) 2910 || is_gimple_call (def_stmt) 2911 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt)) 2912 == vect_induction_def 2913 || (gimple_code (def_stmt) == GIMPLE_PHI 2914 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt)) 2915 == vect_internal_def 2916 && !is_loop_header_bb_p (gimple_bb (def_stmt))))) 2917 { 2918 lhs = gimple_assign_lhs (next_stmt); 2919 continue; 2920 } 2921 2922 return false; 2923 } 2924 else 2925 { 2926 tree op = gimple_assign_rhs2 (next_stmt); 2927 gimple *def_stmt = NULL; 2928 2929 if (TREE_CODE (op) == SSA_NAME) 2930 def_stmt = SSA_NAME_DEF_STMT (op); 2931 2932 /* Check that the other def is either defined in the loop 2933 ("vect_internal_def"), or it's an induction (defined by a 2934 loop-header phi-node). */ 2935 if (def_stmt 2936 && gimple_bb (def_stmt) 2937 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt)) 2938 && (is_gimple_assign (def_stmt) 2939 || is_gimple_call (def_stmt) 2940 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt)) 2941 == vect_induction_def 2942 || (gimple_code (def_stmt) == GIMPLE_PHI 2943 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt)) 2944 == vect_internal_def 2945 && !is_loop_header_bb_p (gimple_bb (def_stmt))))) 2946 { 2947 if (dump_enabled_p ()) 2948 { 2949 dump_printf_loc (MSG_NOTE, vect_location, "swapping oprnds: "); 2950 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, next_stmt, 0); 2951 } 2952 2953 swap_ssa_operands (next_stmt, 2954 gimple_assign_rhs1_ptr (next_stmt), 2955 gimple_assign_rhs2_ptr (next_stmt)); 2956 update_stmt (next_stmt); 2957 2958 if (CONSTANT_CLASS_P (gimple_assign_rhs1 (next_stmt))) 2959 LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true; 2960 } 2961 else 2962 return false; 2963 } 2964 2965 lhs = gimple_assign_lhs (next_stmt); 2966 } 2967 2968 /* Build up the actual chain. */ 2969 for (unsigned i = 0; i < reduc_chain.length () - 1; ++i) 2970 { 2971 GROUP_FIRST_ELEMENT (reduc_chain[i]) = reduc_chain[0]->stmt; 2972 GROUP_NEXT_ELEMENT (reduc_chain[i]) = reduc_chain[i+1]->stmt; 2973 } 2974 GROUP_FIRST_ELEMENT (reduc_chain.last ()) = reduc_chain[0]->stmt; 2975 GROUP_NEXT_ELEMENT (reduc_chain.last ()) = NULL; 2976 2977 /* Save the chain for further analysis in SLP detection. */ 2978 LOOP_VINFO_REDUCTION_CHAINS (loop_info).safe_push (reduc_chain[0]->stmt); 2979 GROUP_SIZE (reduc_chain[0]) = size; 2980 2981 return true; 2982 } 2983 2984 /* Return true if we need an in-order reduction for operation CODE 2985 on type TYPE. NEED_WRAPPING_INTEGRAL_OVERFLOW is true if integer 2986 overflow must wrap. */ 2987 2988 static bool 2989 needs_fold_left_reduction_p (tree type, tree_code code, 2990 bool need_wrapping_integral_overflow) 2991 { 2992 /* CHECKME: check for !flag_finite_math_only too? */ 2993 if (SCALAR_FLOAT_TYPE_P (type)) 2994 switch (code) 2995 { 2996 case MIN_EXPR: 2997 case MAX_EXPR: 2998 return false; 2999 3000 default: 3001 return !flag_associative_math; 3002 } 3003 3004 if (INTEGRAL_TYPE_P (type)) 3005 { 3006 if (!operation_no_trapping_overflow (type, code)) 3007 return true; 3008 if (need_wrapping_integral_overflow 3009 && !TYPE_OVERFLOW_WRAPS (type) 3010 && operation_can_overflow (code)) 3011 return true; 3012 return false; 3013 } 3014 3015 if (SAT_FIXED_POINT_TYPE_P (type)) 3016 return true; 3017 3018 return false; 3019 } 3020 3021 /* Return true if the reduction PHI in LOOP with latch arg LOOP_ARG and 3022 reduction operation CODE has a handled computation expression. 
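   A sketch of an accepted path (illustration only), with CODE == PLUS_EXPR:

     x_1 = PHI <x_4(latch), x_0(preheader)>
     ...
     x_2 = x_1 + a_1;
     x_3 = x_2 + b_1;
     x_4 = x_3 + c_1;

   Walking from the latch argument x_4 back to the PHI result x_1 only
   meets single-use PLUS_EXPR statements, so the path is accepted; if one
   of the additions were a multiplication, the path would be rejected.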
*/ 3023 3024 bool 3025 check_reduction_path (location_t loc, loop_p loop, gphi *phi, tree loop_arg, 3026 enum tree_code code) 3027 { 3028 auto_vec<std::pair<ssa_op_iter, use_operand_p> > path; 3029 auto_bitmap visited; 3030 tree lookfor = PHI_RESULT (phi); 3031 ssa_op_iter curri; 3032 use_operand_p curr = op_iter_init_phiuse (&curri, phi, SSA_OP_USE); 3033 while (USE_FROM_PTR (curr) != loop_arg) 3034 curr = op_iter_next_use (&curri); 3035 curri.i = curri.numops; 3036 do 3037 { 3038 path.safe_push (std::make_pair (curri, curr)); 3039 tree use = USE_FROM_PTR (curr); 3040 if (use == lookfor) 3041 break; 3042 gimple *def = SSA_NAME_DEF_STMT (use); 3043 if (gimple_nop_p (def) 3044 || ! flow_bb_inside_loop_p (loop, gimple_bb (def))) 3045 { 3046 pop: 3047 do 3048 { 3049 std::pair<ssa_op_iter, use_operand_p> x = path.pop (); 3050 curri = x.first; 3051 curr = x.second; 3052 do 3053 curr = op_iter_next_use (&curri); 3054 /* Skip already visited or non-SSA operands (from iterating 3055 over PHI args). */ 3056 while (curr != NULL_USE_OPERAND_P 3057 && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME 3058 || ! bitmap_set_bit (visited, 3059 SSA_NAME_VERSION 3060 (USE_FROM_PTR (curr))))); 3061 } 3062 while (curr == NULL_USE_OPERAND_P && ! path.is_empty ()); 3063 if (curr == NULL_USE_OPERAND_P) 3064 break; 3065 } 3066 else 3067 { 3068 if (gimple_code (def) == GIMPLE_PHI) 3069 curr = op_iter_init_phiuse (&curri, as_a <gphi *>(def), SSA_OP_USE); 3070 else 3071 curr = op_iter_init_use (&curri, def, SSA_OP_USE); 3072 while (curr != NULL_USE_OPERAND_P 3073 && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME 3074 || ! bitmap_set_bit (visited, 3075 SSA_NAME_VERSION 3076 (USE_FROM_PTR (curr))))) 3077 curr = op_iter_next_use (&curri); 3078 if (curr == NULL_USE_OPERAND_P) 3079 goto pop; 3080 } 3081 } 3082 while (1); 3083 if (dump_file && (dump_flags & TDF_DETAILS)) 3084 { 3085 dump_printf_loc (MSG_NOTE, loc, "reduction path: "); 3086 unsigned i; 3087 std::pair<ssa_op_iter, use_operand_p> *x; 3088 FOR_EACH_VEC_ELT (path, i, x) 3089 { 3090 dump_generic_expr (MSG_NOTE, TDF_SLIM, USE_FROM_PTR (x->second)); 3091 dump_printf (MSG_NOTE, " "); 3092 } 3093 dump_printf (MSG_NOTE, "\n"); 3094 } 3095 3096 /* Check whether the reduction path detected is valid. */ 3097 bool fail = path.length () == 0; 3098 bool neg = false; 3099 for (unsigned i = 1; i < path.length (); ++i) 3100 { 3101 gimple *use_stmt = USE_STMT (path[i].second); 3102 tree op = USE_FROM_PTR (path[i].second); 3103 if (! has_single_use (op) 3104 || ! is_gimple_assign (use_stmt)) 3105 { 3106 fail = true; 3107 break; 3108 } 3109 if (gimple_assign_rhs_code (use_stmt) != code) 3110 { 3111 if (code == PLUS_EXPR 3112 && gimple_assign_rhs_code (use_stmt) == MINUS_EXPR) 3113 { 3114 /* Track whether we negate the reduction value each iteration. */ 3115 if (gimple_assign_rhs2 (use_stmt) == op) 3116 neg = ! neg; 3117 } 3118 else 3119 { 3120 fail = true; 3121 break; 3122 } 3123 } 3124 } 3125 return ! fail && ! neg; 3126 } 3127 3128 3129 /* Function vect_is_simple_reduction 3130 3131 (1) Detect a cross-iteration def-use cycle that represents a simple 3132 reduction computation. We look for the following pattern: 3133 3134 loop_header: 3135 a1 = phi < a0, a2 > 3136 a3 = ... 3137 a2 = operation (a3, a1) 3138 3139 or 3140 3141 a3 = ... 3142 loop_header: 3143 a1 = phi < a0, a2 > 3144 a2 = operation (a3, a1) 3145 3146 such that: 3147 1. operation is commutative and associative and it is safe to 3148 change the order of the computation 3149 2. 
no uses for a2 in the loop (a2 is used out of the loop) 3150 3. no uses of a1 in the loop besides the reduction operation 3151 4. no uses of a1 outside the loop. 3152 3153 Conditions 1,4 are tested here. 3154 Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized. 3155 3156 (2) Detect a cross-iteration def-use cycle in nested loops, i.e., 3157 nested cycles. 3158 3159 (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double 3160 reductions: 3161 3162 a1 = phi < a0, a2 > 3163 inner loop (def of a3) 3164 a2 = phi < a3 > 3165 3166 (4) Detect condition expressions, ie: 3167 for (int i = 0; i < N; i++) 3168 if (a[i] < val) 3169 ret_val = a[i]; 3170 3171 */ 3172 3173 static gimple * 3174 vect_is_simple_reduction (loop_vec_info loop_info, gimple *phi, 3175 bool *double_reduc, 3176 bool need_wrapping_integral_overflow, 3177 enum vect_reduction_type *v_reduc_type) 3178 { 3179 struct loop *loop = (gimple_bb (phi))->loop_father; 3180 struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info); 3181 gimple *def_stmt, *def1 = NULL, *def2 = NULL, *phi_use_stmt = NULL; 3182 enum tree_code orig_code, code; 3183 tree op1, op2, op3 = NULL_TREE, op4 = NULL_TREE; 3184 tree type; 3185 int nloop_uses; 3186 tree name; 3187 imm_use_iterator imm_iter; 3188 use_operand_p use_p; 3189 bool phi_def; 3190 3191 *double_reduc = false; 3192 *v_reduc_type = TREE_CODE_REDUCTION; 3193 3194 tree phi_name = PHI_RESULT (phi); 3195 /* ??? If there are no uses of the PHI result the inner loop reduction 3196 won't be detected as possibly double-reduction by vectorizable_reduction 3197 because that tries to walk the PHI arg from the preheader edge which 3198 can be constant. See PR60382. */ 3199 if (has_zero_uses (phi_name)) 3200 return NULL; 3201 nloop_uses = 0; 3202 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, phi_name) 3203 { 3204 gimple *use_stmt = USE_STMT (use_p); 3205 if (is_gimple_debug (use_stmt)) 3206 continue; 3207 3208 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))) 3209 { 3210 if (dump_enabled_p ()) 3211 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 3212 "intermediate value used outside loop.\n"); 3213 3214 return NULL; 3215 } 3216 3217 nloop_uses++; 3218 if (nloop_uses > 1) 3219 { 3220 if (dump_enabled_p ()) 3221 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 3222 "reduction value used in loop.\n"); 3223 return NULL; 3224 } 3225 3226 phi_use_stmt = use_stmt; 3227 } 3228 3229 edge latch_e = loop_latch_edge (loop); 3230 tree loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e); 3231 if (TREE_CODE (loop_arg) != SSA_NAME) 3232 { 3233 if (dump_enabled_p ()) 3234 { 3235 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 3236 "reduction: not ssa_name: "); 3237 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, loop_arg); 3238 dump_printf (MSG_MISSED_OPTIMIZATION, "\n"); 3239 } 3240 return NULL; 3241 } 3242 3243 def_stmt = SSA_NAME_DEF_STMT (loop_arg); 3244 if (is_gimple_assign (def_stmt)) 3245 { 3246 name = gimple_assign_lhs (def_stmt); 3247 phi_def = false; 3248 } 3249 else if (gimple_code (def_stmt) == GIMPLE_PHI) 3250 { 3251 name = PHI_RESULT (def_stmt); 3252 phi_def = true; 3253 } 3254 else 3255 { 3256 if (dump_enabled_p ()) 3257 { 3258 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 3259 "reduction: unhandled reduction operation: "); 3260 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, def_stmt, 0); 3261 } 3262 return NULL; 3263 } 3264 3265 if (! 
flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))) 3266 return NULL; 3267 3268 nloop_uses = 0; 3269 auto_vec<gphi *, 3> lcphis; 3270 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name) 3271 { 3272 gimple *use_stmt = USE_STMT (use_p); 3273 if (is_gimple_debug (use_stmt)) 3274 continue; 3275 if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))) 3276 nloop_uses++; 3277 else 3278 /* We can have more than one loop-closed PHI. */ 3279 lcphis.safe_push (as_a <gphi *> (use_stmt)); 3280 if (nloop_uses > 1) 3281 { 3282 if (dump_enabled_p ()) 3283 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 3284 "reduction used in loop.\n"); 3285 return NULL; 3286 } 3287 } 3288 3289 /* If DEF_STMT is a phi node itself, we expect it to have a single argument 3290 defined in the inner loop. */ 3291 if (phi_def) 3292 { 3293 op1 = PHI_ARG_DEF (def_stmt, 0); 3294 3295 if (gimple_phi_num_args (def_stmt) != 1 3296 || TREE_CODE (op1) != SSA_NAME) 3297 { 3298 if (dump_enabled_p ()) 3299 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 3300 "unsupported phi node definition.\n"); 3301 3302 return NULL; 3303 } 3304 3305 def1 = SSA_NAME_DEF_STMT (op1); 3306 if (gimple_bb (def1) 3307 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt)) 3308 && loop->inner 3309 && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1)) 3310 && is_gimple_assign (def1) 3311 && flow_bb_inside_loop_p (loop->inner, gimple_bb (phi_use_stmt))) 3312 { 3313 if (dump_enabled_p ()) 3314 report_vect_op (MSG_NOTE, def_stmt, 3315 "detected double reduction: "); 3316 3317 *double_reduc = true; 3318 return def_stmt; 3319 } 3320 3321 return NULL; 3322 } 3323 3324 /* If we are vectorizing an inner reduction we are executing that 3325 in the original order only in case we are not dealing with a 3326 double reduction. */ 3327 bool check_reduction = true; 3328 if (flow_loop_nested_p (vect_loop, loop)) 3329 { 3330 gphi *lcphi; 3331 unsigned i; 3332 check_reduction = false; 3333 FOR_EACH_VEC_ELT (lcphis, i, lcphi) 3334 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, gimple_phi_result (lcphi)) 3335 { 3336 gimple *use_stmt = USE_STMT (use_p); 3337 if (is_gimple_debug (use_stmt)) 3338 continue; 3339 if (! flow_bb_inside_loop_p (vect_loop, gimple_bb (use_stmt))) 3340 check_reduction = true; 3341 } 3342 } 3343 3344 bool nested_in_vect_loop = flow_loop_nested_p (vect_loop, loop); 3345 code = orig_code = gimple_assign_rhs_code (def_stmt); 3346 3347 /* We can handle "res -= x[i]", which is non-associative by 3348 simply rewriting this into "res += -x[i]". Avoid changing 3349 gimple instruction for the first simple tests and only do this 3350 if we're allowed to change code at all. */ 3351 if (code == MINUS_EXPR && gimple_assign_rhs2 (def_stmt) != phi_name) 3352 code = PLUS_EXPR; 3353 3354 if (code == COND_EXPR) 3355 { 3356 if (! 
nested_in_vect_loop) 3357 *v_reduc_type = COND_REDUCTION; 3358 3359 op3 = gimple_assign_rhs1 (def_stmt); 3360 if (COMPARISON_CLASS_P (op3)) 3361 { 3362 op4 = TREE_OPERAND (op3, 1); 3363 op3 = TREE_OPERAND (op3, 0); 3364 } 3365 if (op3 == phi_name || op4 == phi_name) 3366 { 3367 if (dump_enabled_p ()) 3368 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt, 3369 "reduction: condition depends on previous" 3370 " iteration: "); 3371 return NULL; 3372 } 3373 3374 op1 = gimple_assign_rhs2 (def_stmt); 3375 op2 = gimple_assign_rhs3 (def_stmt); 3376 } 3377 else if (!commutative_tree_code (code) || !associative_tree_code (code)) 3378 { 3379 if (dump_enabled_p ()) 3380 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt, 3381 "reduction: not commutative/associative: "); 3382 return NULL; 3383 } 3384 else if (get_gimple_rhs_class (code) == GIMPLE_BINARY_RHS) 3385 { 3386 op1 = gimple_assign_rhs1 (def_stmt); 3387 op2 = gimple_assign_rhs2 (def_stmt); 3388 } 3389 else 3390 { 3391 if (dump_enabled_p ()) 3392 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt, 3393 "reduction: not handled operation: "); 3394 return NULL; 3395 } 3396 3397 if (TREE_CODE (op1) != SSA_NAME && TREE_CODE (op2) != SSA_NAME) 3398 { 3399 if (dump_enabled_p ()) 3400 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt, 3401 "reduction: both uses not ssa_names: "); 3402 3403 return NULL; 3404 } 3405 3406 type = TREE_TYPE (gimple_assign_lhs (def_stmt)); 3407 if ((TREE_CODE (op1) == SSA_NAME 3408 && !types_compatible_p (type,TREE_TYPE (op1))) 3409 || (TREE_CODE (op2) == SSA_NAME 3410 && !types_compatible_p (type, TREE_TYPE (op2))) 3411 || (op3 && TREE_CODE (op3) == SSA_NAME 3412 && !types_compatible_p (type, TREE_TYPE (op3))) 3413 || (op4 && TREE_CODE (op4) == SSA_NAME 3414 && !types_compatible_p (type, TREE_TYPE (op4)))) 3415 { 3416 if (dump_enabled_p ()) 3417 { 3418 dump_printf_loc (MSG_NOTE, vect_location, 3419 "reduction: multiple types: operation type: "); 3420 dump_generic_expr (MSG_NOTE, TDF_SLIM, type); 3421 dump_printf (MSG_NOTE, ", operands types: "); 3422 dump_generic_expr (MSG_NOTE, TDF_SLIM, 3423 TREE_TYPE (op1)); 3424 dump_printf (MSG_NOTE, ","); 3425 dump_generic_expr (MSG_NOTE, TDF_SLIM, 3426 TREE_TYPE (op2)); 3427 if (op3) 3428 { 3429 dump_printf (MSG_NOTE, ","); 3430 dump_generic_expr (MSG_NOTE, TDF_SLIM, 3431 TREE_TYPE (op3)); 3432 } 3433 3434 if (op4) 3435 { 3436 dump_printf (MSG_NOTE, ","); 3437 dump_generic_expr (MSG_NOTE, TDF_SLIM, 3438 TREE_TYPE (op4)); 3439 } 3440 dump_printf (MSG_NOTE, "\n"); 3441 } 3442 3443 return NULL; 3444 } 3445 3446 /* Check whether it's ok to change the order of the computation. 3447 Generally, when vectorizing a reduction we change the order of the 3448 computation. This may change the behavior of the program in some 3449 cases, so we need to check that this is ok. One exception is when 3450 vectorizing an outer-loop: the inner-loop is executed sequentially, 3451 and therefore vectorizing reductions in the inner-loop during 3452 outer-loop vectorization is safe. */ 3453 if (check_reduction 3454 && *v_reduc_type == TREE_CODE_REDUCTION 3455 && needs_fold_left_reduction_p (type, code, 3456 need_wrapping_integral_overflow)) 3457 *v_reduc_type = FOLD_LEFT_REDUCTION; 3458 3459 /* Reduction is safe. We're dealing with one of the following: 3460 1) integer arithmetic and no trapv 3461 2) floating point arithmetic, and special flags permit this optimization 3462 3) nested cycle (i.e., outer loop vectorization). 
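   For example, a float accumulation like 's += a[i]' is only treated as
   a reorderable reduction when -fassociative-math (e.g. via -ffast-math)
   is in effect; otherwise needs_fold_left_reduction_p above makes it a
   FOLD_LEFT_REDUCTION, i.e. an in-order reduction.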
*/ 3463 if (TREE_CODE (op1) == SSA_NAME) 3464 def1 = SSA_NAME_DEF_STMT (op1); 3465 3466 if (TREE_CODE (op2) == SSA_NAME) 3467 def2 = SSA_NAME_DEF_STMT (op2); 3468 3469 if (code != COND_EXPR 3470 && ((!def1 || gimple_nop_p (def1)) && (!def2 || gimple_nop_p (def2)))) 3471 { 3472 if (dump_enabled_p ()) 3473 report_vect_op (MSG_NOTE, def_stmt, "reduction: no defs for operands: "); 3474 return NULL; 3475 } 3476 3477 /* Check that one def is the reduction def, defined by PHI, 3478 the other def is either defined in the loop ("vect_internal_def"), 3479 or it's an induction (defined by a loop-header phi-node). */ 3480 3481 if (def2 && def2 == phi 3482 && (code == COND_EXPR 3483 || !def1 || gimple_nop_p (def1) 3484 || !flow_bb_inside_loop_p (loop, gimple_bb (def1)) 3485 || (def1 && flow_bb_inside_loop_p (loop, gimple_bb (def1)) 3486 && (is_gimple_assign (def1) 3487 || is_gimple_call (def1) 3488 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1)) 3489 == vect_induction_def 3490 || (gimple_code (def1) == GIMPLE_PHI 3491 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1)) 3492 == vect_internal_def 3493 && !is_loop_header_bb_p (gimple_bb (def1))))))) 3494 { 3495 if (dump_enabled_p ()) 3496 report_vect_op (MSG_NOTE, def_stmt, "detected reduction: "); 3497 return def_stmt; 3498 } 3499 3500 if (def1 && def1 == phi 3501 && (code == COND_EXPR 3502 || !def2 || gimple_nop_p (def2) 3503 || !flow_bb_inside_loop_p (loop, gimple_bb (def2)) 3504 || (def2 && flow_bb_inside_loop_p (loop, gimple_bb (def2)) 3505 && (is_gimple_assign (def2) 3506 || is_gimple_call (def2) 3507 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2)) 3508 == vect_induction_def 3509 || (gimple_code (def2) == GIMPLE_PHI 3510 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2)) 3511 == vect_internal_def 3512 && !is_loop_header_bb_p (gimple_bb (def2))))))) 3513 { 3514 if (! nested_in_vect_loop && orig_code != MINUS_EXPR) 3515 { 3516 /* Check if we can swap operands (just for simplicity - so that 3517 the rest of the code can assume that the reduction variable 3518 is always the last (second) argument). */ 3519 if (code == COND_EXPR) 3520 { 3521 /* Swap cond_expr by inverting the condition. */ 3522 tree cond_expr = gimple_assign_rhs1 (def_stmt); 3523 enum tree_code invert_code = ERROR_MARK; 3524 enum tree_code cond_code = TREE_CODE (cond_expr); 3525 3526 if (TREE_CODE_CLASS (cond_code) == tcc_comparison) 3527 { 3528 bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr, 0)); 3529 invert_code = invert_tree_comparison (cond_code, honor_nans); 3530 } 3531 if (invert_code != ERROR_MARK) 3532 { 3533 TREE_SET_CODE (cond_expr, invert_code); 3534 swap_ssa_operands (def_stmt, 3535 gimple_assign_rhs2_ptr (def_stmt), 3536 gimple_assign_rhs3_ptr (def_stmt)); 3537 } 3538 else 3539 { 3540 if (dump_enabled_p ()) 3541 report_vect_op (MSG_NOTE, def_stmt, 3542 "detected reduction: cannot swap operands " 3543 "for cond_expr"); 3544 return NULL; 3545 } 3546 } 3547 else 3548 swap_ssa_operands (def_stmt, gimple_assign_rhs1_ptr (def_stmt), 3549 gimple_assign_rhs2_ptr (def_stmt)); 3550 3551 if (dump_enabled_p ()) 3552 report_vect_op (MSG_NOTE, def_stmt, 3553 "detected reduction: need to swap operands: "); 3554 3555 if (CONSTANT_CLASS_P (gimple_assign_rhs1 (def_stmt))) 3556 LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true; 3557 } 3558 else 3559 { 3560 if (dump_enabled_p ()) 3561 report_vect_op (MSG_NOTE, def_stmt, "detected reduction: "); 3562 } 3563 3564 return def_stmt; 3565 } 3566 3567 /* Try to find SLP reduction chain. */ 3568 if (! 
nested_in_vect_loop
3569 && code != COND_EXPR
3570 && orig_code != MINUS_EXPR
3571 && vect_is_slp_reduction (loop_info, phi, def_stmt))
3572 {
3573 if (dump_enabled_p ())
3574 report_vect_op (MSG_NOTE, def_stmt,
3575 "reduction: detected reduction chain: ");
3576
3577 return def_stmt;
3578 }
3579
3580 /* Dissolve a group that may have been half-built by vect_is_slp_reduction. */
3581 gimple *first = GROUP_FIRST_ELEMENT (vinfo_for_stmt (def_stmt));
3582 while (first)
3583 {
3584 gimple *next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (first));
3585 GROUP_FIRST_ELEMENT (vinfo_for_stmt (first)) = NULL;
3586 GROUP_NEXT_ELEMENT (vinfo_for_stmt (first)) = NULL;
3587 first = next;
3588 }
3589
3590 /* Look for the expression computing loop_arg from the loop PHI result. */
3591 if (check_reduction_path (vect_location, loop, as_a <gphi *> (phi), loop_arg,
3592 code))
3593 return def_stmt;
3594
3595 if (dump_enabled_p ())
3596 {
3597 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3598 "reduction: unknown pattern: ");
3599 }
3600
3601 return NULL;
3602 }
3603
3604 /* Wrapper around vect_is_simple_reduction, which will modify code
3605 in-place if it enables detection of more reductions. The arguments
3606 are the same as for that function. */
3607
3608 gimple *
3609 vect_force_simple_reduction (loop_vec_info loop_info, gimple *phi,
3610 bool *double_reduc,
3611 bool need_wrapping_integral_overflow)
3612 {
3613 enum vect_reduction_type v_reduc_type;
3614 gimple *def = vect_is_simple_reduction (loop_info, phi, double_reduc,
3615 need_wrapping_integral_overflow,
3616 &v_reduc_type);
3617 if (def)
3618 {
3619 stmt_vec_info reduc_def_info = vinfo_for_stmt (phi);
3620 STMT_VINFO_REDUC_TYPE (reduc_def_info) = v_reduc_type;
3621 STMT_VINFO_REDUC_DEF (reduc_def_info) = def;
3622 reduc_def_info = vinfo_for_stmt (def);
3623 STMT_VINFO_REDUC_TYPE (reduc_def_info) = v_reduc_type;
3624 STMT_VINFO_REDUC_DEF (reduc_def_info) = phi;
3625 }
3626 return def;
3627 }
3628
3629 /* Calculate the cost of peeling the loop PEEL_ITERS_PROLOGUE times. */
3630 int
3631 vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
3632 int *peel_iters_epilogue,
3633 stmt_vector_for_cost *scalar_cost_vec,
3634 stmt_vector_for_cost *prologue_cost_vec,
3635 stmt_vector_for_cost *epilogue_cost_vec)
3636 {
3637 int retval = 0;
3638 int assumed_vf = vect_vf_for_cost (loop_vinfo);
3639
3640 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
3641 {
3642 *peel_iters_epilogue = assumed_vf / 2;
3643 if (dump_enabled_p ())
3644 dump_printf_loc (MSG_NOTE, vect_location,
3645 "cost model: epilogue peel iters set to vf/2 "
3646 "because loop iterations are unknown.\n");
3647
3648 /* If peeled iterations are known but the number of scalar loop
3649 iterations is unknown, count a taken branch per peeled loop. */
3650 retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
3651 NULL, 0, vect_prologue);
3652 retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
3653 NULL, 0, vect_epilogue);
3654 }
3655 else
3656 {
3657 int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
3658 peel_iters_prologue = niters < peel_iters_prologue ?
3659 niters : peel_iters_prologue;
3660 *peel_iters_epilogue = (niters - peel_iters_prologue) % assumed_vf;
3661 /* If we need to peel for gaps but the computed epilogue peel count is
3662 zero, we have to peel VF iterations instead.
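   A small worked example with invented numbers: niters = 10, VF = 4 and
   one prologue iteration peeled for alignment give an epilogue of
   (10 - 1) % 4 = 1 iterations; had that remainder been 0 while peeling
   for gaps was required, the epilogue count would be bumped to VF = 4.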
*/ 3663 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && !*peel_iters_epilogue) 3664 *peel_iters_epilogue = assumed_vf; 3665 } 3666 3667 stmt_info_for_cost *si; 3668 int j; 3669 if (peel_iters_prologue) 3670 FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si) 3671 { 3672 stmt_vec_info stmt_info 3673 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL; 3674 retval += record_stmt_cost (prologue_cost_vec, 3675 si->count * peel_iters_prologue, 3676 si->kind, stmt_info, si->misalign, 3677 vect_prologue); 3678 } 3679 if (*peel_iters_epilogue) 3680 FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si) 3681 { 3682 stmt_vec_info stmt_info 3683 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL; 3684 retval += record_stmt_cost (epilogue_cost_vec, 3685 si->count * *peel_iters_epilogue, 3686 si->kind, stmt_info, si->misalign, 3687 vect_epilogue); 3688 } 3689 3690 return retval; 3691 } 3692 3693 /* Function vect_estimate_min_profitable_iters 3694 3695 Return the number of iterations required for the vector version of the 3696 loop to be profitable relative to the cost of the scalar version of the 3697 loop. 3698 3699 *RET_MIN_PROFITABLE_NITERS is a cost model profitability threshold 3700 of iterations for vectorization. -1 value means loop vectorization 3701 is not profitable. This returned value may be used for dynamic 3702 profitability check. 3703 3704 *RET_MIN_PROFITABLE_ESTIMATE is a profitability threshold to be used 3705 for static check against estimated number of iterations. */ 3706 3707 static void 3708 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo, 3709 int *ret_min_profitable_niters, 3710 int *ret_min_profitable_estimate) 3711 { 3712 int min_profitable_iters; 3713 int min_profitable_estimate; 3714 int peel_iters_prologue; 3715 int peel_iters_epilogue; 3716 unsigned vec_inside_cost = 0; 3717 int vec_outside_cost = 0; 3718 unsigned vec_prologue_cost = 0; 3719 unsigned vec_epilogue_cost = 0; 3720 int scalar_single_iter_cost = 0; 3721 int scalar_outside_cost = 0; 3722 int assumed_vf = vect_vf_for_cost (loop_vinfo); 3723 int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo); 3724 void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo); 3725 3726 /* Cost model disabled. */ 3727 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo))) 3728 { 3729 dump_printf_loc (MSG_NOTE, vect_location, "cost model disabled.\n"); 3730 *ret_min_profitable_niters = 0; 3731 *ret_min_profitable_estimate = 0; 3732 return; 3733 } 3734 3735 /* Requires loop versioning tests to handle misalignment. */ 3736 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)) 3737 { 3738 /* FIXME: Make cost depend on complexity of individual check. */ 3739 unsigned len = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length (); 3740 (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0, 3741 vect_prologue); 3742 dump_printf (MSG_NOTE, 3743 "cost model: Adding cost of checks for loop " 3744 "versioning to treat misalignment.\n"); 3745 } 3746 3747 /* Requires loop versioning with alias checks. */ 3748 if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo)) 3749 { 3750 /* FIXME: Make cost depend on complexity of individual check. */ 3751 unsigned len = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).length (); 3752 (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0, 3753 vect_prologue); 3754 len = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).length (); 3755 if (len) 3756 /* Count LEN - 1 ANDs and LEN comparisons. 
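That is, each of the LEN address pairs recorded in LOOP_VINFO_CHECK_UNEQUAL_ADDRS needs one comparison, and the LEN results are combined with LEN - 1 logical ANDs, giving the LEN * 2 - 1 scalar stmts charged below.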
*/ 3757 (void) add_stmt_cost (target_cost_data, len * 2 - 1, scalar_stmt, 3758 NULL, 0, vect_prologue); 3759 len = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).length (); 3760 if (len) 3761 { 3762 /* Count LEN - 1 ANDs and LEN comparisons. */ 3763 unsigned int nstmts = len * 2 - 1; 3764 /* +1 for each bias that needs adding. */ 3765 for (unsigned int i = 0; i < len; ++i) 3766 if (!LOOP_VINFO_LOWER_BOUNDS (loop_vinfo)[i].unsigned_p) 3767 nstmts += 1; 3768 (void) add_stmt_cost (target_cost_data, nstmts, scalar_stmt, 3769 NULL, 0, vect_prologue); 3770 } 3771 dump_printf (MSG_NOTE, 3772 "cost model: Adding cost of checks for loop " 3773 "versioning aliasing.\n"); 3774 } 3775 3776 /* Requires loop versioning with niter checks. */ 3777 if (LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo)) 3778 { 3779 /* FIXME: Make cost depend on complexity of individual check. */ 3780 (void) add_stmt_cost (target_cost_data, 1, vector_stmt, NULL, 0, 3781 vect_prologue); 3782 dump_printf (MSG_NOTE, 3783 "cost model: Adding cost of checks for loop " 3784 "versioning niters.\n"); 3785 } 3786 3787 if (LOOP_REQUIRES_VERSIONING (loop_vinfo)) 3788 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken, NULL, 0, 3789 vect_prologue); 3790 3791 /* Count statements in scalar loop. Using this as scalar cost for a single 3792 iteration for now. 3793 3794 TODO: Add outer loop support. 3795 3796 TODO: Consider assigning different costs to different scalar 3797 statements. */ 3798 3799 scalar_single_iter_cost 3800 = LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo); 3801 3802 /* Add additional cost for the peeled instructions in prologue and epilogue 3803 loop. (For fully-masked loops there will be no peeling.) 3804 3805 FORNOW: If we don't know the value of peel_iters for prologue or epilogue 3806 at compile-time - we assume it's vf/2 (the worst would be vf-1). 3807 3808 TODO: Build an expression that represents peel_iters for prologue and 3809 epilogue to be used in a run-time test. */ 3810 3811 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)) 3812 { 3813 peel_iters_prologue = 0; 3814 peel_iters_epilogue = 0; 3815 3816 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)) 3817 { 3818 /* We need to peel exactly one iteration. */ 3819 peel_iters_epilogue += 1; 3820 stmt_info_for_cost *si; 3821 int j; 3822 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), 3823 j, si) 3824 { 3825 struct _stmt_vec_info *stmt_info 3826 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL; 3827 (void) add_stmt_cost (target_cost_data, si->count, 3828 si->kind, stmt_info, si->misalign, 3829 vect_epilogue); 3830 } 3831 } 3832 } 3833 else if (npeel < 0) 3834 { 3835 peel_iters_prologue = assumed_vf / 2; 3836 dump_printf (MSG_NOTE, "cost model: " 3837 "prologue peel iters set to vf/2.\n"); 3838 3839 /* If peeling for alignment is unknown, loop bound of main loop becomes 3840 unknown. */ 3841 peel_iters_epilogue = assumed_vf / 2; 3842 dump_printf (MSG_NOTE, "cost model: " 3843 "epilogue peel iters set to vf/2 because " 3844 "peeling for alignment is unknown.\n"); 3845 3846 /* If peeled iterations are unknown, count a taken branch and a not taken 3847 branch per peeled loop. Even if scalar loop iterations are known, 3848 vector iterations are not known since peeled prologue iterations are 3849 not known. Hence guards remain the same. 
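Concretely, that is one taken plus one not-taken branch guarding the prologue loop and another such pair guarding the epilogue loop, which is what the four add_stmt_cost calls below record.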
*/ 3850 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken, 3851 NULL, 0, vect_prologue); 3852 (void) add_stmt_cost (target_cost_data, 1, cond_branch_not_taken, 3853 NULL, 0, vect_prologue); 3854 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken, 3855 NULL, 0, vect_epilogue); 3856 (void) add_stmt_cost (target_cost_data, 1, cond_branch_not_taken, 3857 NULL, 0, vect_epilogue); 3858 stmt_info_for_cost *si; 3859 int j; 3860 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si) 3861 { 3862 struct _stmt_vec_info *stmt_info 3863 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL; 3864 (void) add_stmt_cost (target_cost_data, 3865 si->count * peel_iters_prologue, 3866 si->kind, stmt_info, si->misalign, 3867 vect_prologue); 3868 (void) add_stmt_cost (target_cost_data, 3869 si->count * peel_iters_epilogue, 3870 si->kind, stmt_info, si->misalign, 3871 vect_epilogue); 3872 } 3873 } 3874 else 3875 { 3876 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec; 3877 stmt_info_for_cost *si; 3878 int j; 3879 void *data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo); 3880 3881 prologue_cost_vec.create (2); 3882 epilogue_cost_vec.create (2); 3883 peel_iters_prologue = npeel; 3884 3885 (void) vect_get_known_peeling_cost (loop_vinfo, peel_iters_prologue, 3886 &peel_iters_epilogue, 3887 &LOOP_VINFO_SCALAR_ITERATION_COST 3888 (loop_vinfo), 3889 &prologue_cost_vec, 3890 &epilogue_cost_vec); 3891 3892 FOR_EACH_VEC_ELT (prologue_cost_vec, j, si) 3893 { 3894 struct _stmt_vec_info *stmt_info 3895 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL; 3896 (void) add_stmt_cost (data, si->count, si->kind, stmt_info, 3897 si->misalign, vect_prologue); 3898 } 3899 3900 FOR_EACH_VEC_ELT (epilogue_cost_vec, j, si) 3901 { 3902 struct _stmt_vec_info *stmt_info 3903 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL; 3904 (void) add_stmt_cost (data, si->count, si->kind, stmt_info, 3905 si->misalign, vect_epilogue); 3906 } 3907 3908 prologue_cost_vec.release (); 3909 epilogue_cost_vec.release (); 3910 } 3911 3912 /* FORNOW: The scalar outside cost is incremented in one of the 3913 following ways: 3914 3915 1. The vectorizer checks for alignment and aliasing and generates 3916 a condition that allows dynamic vectorization. A cost model 3917 check is ANDED with the versioning condition. Hence scalar code 3918 path now has the added cost of the versioning check. 3919 3920 if (cost > th & versioning_check) 3921 jmp to vector code 3922 3923 Hence run-time scalar is incremented by not-taken branch cost. 3924 3925 2. The vectorizer then checks if a prologue is required. If the 3926 cost model check was not done before during versioning, it has to 3927 be done before the prologue check. 3928 3929 if (cost <= th) 3930 prologue = scalar_iters 3931 if (prologue == 0) 3932 jmp to vector code 3933 else 3934 execute prologue 3935 if (prologue == num_iters) 3936 go to exit 3937 3938 Hence the run-time scalar cost is incremented by a taken branch, 3939 plus a not-taken branch, plus a taken branch cost. 3940 3941 3. The vectorizer then checks if an epilogue is required. If the 3942 cost model check was not done before during prologue check, it 3943 has to be done with the epilogue check. 3944 3945 if (prologue == 0) 3946 jmp to vector code 3947 else 3948 execute prologue 3949 if (prologue == num_iters) 3950 go to exit 3951 vector code: 3952 if ((cost <= th) | (scalar_iters-prologue-epilogue == 0)) 3953 jmp to epilogue 3954 3955 Hence the run-time scalar cost should be incremented by 2 taken 3956 branches. 
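As an illustration only, on a target whose cost hooks return 3 for a taken and 1 for a not-taken conditional branch, the three cases above would add 1, 2 * 3 + 1 = 7 and 2 * 3 = 6 respectively to the scalar outside cost.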
3957 3958 TODO: The back end may reorder the BBS's differently and reverse 3959 conditions/branch directions. Change the estimates below to 3960 something more reasonable. */ 3961 3962 /* If the number of iterations is known and we do not do versioning, we can 3963 decide whether to vectorize at compile time. Hence the scalar version 3964 do not carry cost model guard costs. */ 3965 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) 3966 || LOOP_REQUIRES_VERSIONING (loop_vinfo)) 3967 { 3968 /* Cost model check occurs at versioning. */ 3969 if (LOOP_REQUIRES_VERSIONING (loop_vinfo)) 3970 scalar_outside_cost += vect_get_stmt_cost (cond_branch_not_taken); 3971 else 3972 { 3973 /* Cost model check occurs at prologue generation. */ 3974 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0) 3975 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken) 3976 + vect_get_stmt_cost (cond_branch_not_taken); 3977 /* Cost model check occurs at epilogue generation. */ 3978 else 3979 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken); 3980 } 3981 } 3982 3983 /* Complete the target-specific cost calculations. */ 3984 finish_cost (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo), &vec_prologue_cost, 3985 &vec_inside_cost, &vec_epilogue_cost); 3986 3987 vec_outside_cost = (int)(vec_prologue_cost + vec_epilogue_cost); 3988 3989 if (dump_enabled_p ()) 3990 { 3991 dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n"); 3992 dump_printf (MSG_NOTE, " Vector inside of loop cost: %d\n", 3993 vec_inside_cost); 3994 dump_printf (MSG_NOTE, " Vector prologue cost: %d\n", 3995 vec_prologue_cost); 3996 dump_printf (MSG_NOTE, " Vector epilogue cost: %d\n", 3997 vec_epilogue_cost); 3998 dump_printf (MSG_NOTE, " Scalar iteration cost: %d\n", 3999 scalar_single_iter_cost); 4000 dump_printf (MSG_NOTE, " Scalar outside cost: %d\n", 4001 scalar_outside_cost); 4002 dump_printf (MSG_NOTE, " Vector outside cost: %d\n", 4003 vec_outside_cost); 4004 dump_printf (MSG_NOTE, " prologue iterations: %d\n", 4005 peel_iters_prologue); 4006 dump_printf (MSG_NOTE, " epilogue iterations: %d\n", 4007 peel_iters_epilogue); 4008 } 4009 4010 /* Calculate number of iterations required to make the vector version 4011 profitable, relative to the loop bodies only. The following condition 4012 must hold true: 4013 SIC * niters + SOC > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC 4014 where 4015 SIC = scalar iteration cost, VIC = vector iteration cost, 4016 VOC = vector outside cost, VF = vectorization factor, 4017 PL_ITERS = prologue iterations, EP_ITERS= epilogue iterations 4018 SOC = scalar outside cost for run time cost model check. */ 4019 4020 if ((scalar_single_iter_cost * assumed_vf) > (int) vec_inside_cost) 4021 { 4022 min_profitable_iters = ((vec_outside_cost - scalar_outside_cost) 4023 * assumed_vf 4024 - vec_inside_cost * peel_iters_prologue 4025 - vec_inside_cost * peel_iters_epilogue); 4026 if (min_profitable_iters <= 0) 4027 min_profitable_iters = 0; 4028 else 4029 { 4030 min_profitable_iters /= ((scalar_single_iter_cost * assumed_vf) 4031 - vec_inside_cost); 4032 4033 if ((scalar_single_iter_cost * assumed_vf * min_profitable_iters) 4034 <= (((int) vec_inside_cost * min_profitable_iters) 4035 + (((int) vec_outside_cost - scalar_outside_cost) 4036 * assumed_vf))) 4037 min_profitable_iters++; 4038 } 4039 } 4040 /* vector version will never be profitable. 
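This is the case when SIC * VF does not exceed VIC, so the division above would be by a non-positive value.  For the profitable case handled above, a made-up example: with SIC = 4, VIC = 10, VF = 4, VOC = 40, SOC = 6 and no peeling, the numerator is (40 - 6) * 4 = 136, dividing by 4 * 4 - 10 = 6 gives 22, and the final adjustment bumps the result to 23, the first iteration count for which the scalar cost 4 * niters exceeds the vector cost 10 * niters / 4 + 34.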
*/ 4041 else 4042 { 4043 if (LOOP_VINFO_LOOP (loop_vinfo)->force_vectorize) 4044 warning_at (vect_location, OPT_Wopenmp_simd, "vectorization " 4045 "did not happen for a simd loop"); 4046 4047 if (dump_enabled_p ()) 4048 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 4049 "cost model: the vector iteration cost = %d " 4050 "divided by the scalar iteration cost = %d " 4051 "is greater or equal to the vectorization factor = %d" 4052 ".\n", 4053 vec_inside_cost, scalar_single_iter_cost, assumed_vf); 4054 *ret_min_profitable_niters = -1; 4055 *ret_min_profitable_estimate = -1; 4056 return; 4057 } 4058 4059 dump_printf (MSG_NOTE, 4060 " Calculated minimum iters for profitability: %d\n", 4061 min_profitable_iters); 4062 4063 if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo) 4064 && min_profitable_iters < (assumed_vf + peel_iters_prologue)) 4065 /* We want the vectorized loop to execute at least once. */ 4066 min_profitable_iters = assumed_vf + peel_iters_prologue; 4067 4068 if (dump_enabled_p ()) 4069 dump_printf_loc (MSG_NOTE, vect_location, 4070 " Runtime profitability threshold = %d\n", 4071 min_profitable_iters); 4072 4073 *ret_min_profitable_niters = min_profitable_iters; 4074 4075 /* Calculate number of iterations required to make the vector version 4076 profitable, relative to the loop bodies only. 4077 4078 Non-vectorized variant is SIC * niters and it must win over vector 4079 variant on the expected loop trip count. The following condition must hold true: 4080 SIC * niters > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC + SOC */ 4081 4082 if (vec_outside_cost <= 0) 4083 min_profitable_estimate = 0; 4084 else 4085 { 4086 min_profitable_estimate = ((vec_outside_cost + scalar_outside_cost) 4087 * assumed_vf 4088 - vec_inside_cost * peel_iters_prologue 4089 - vec_inside_cost * peel_iters_epilogue) 4090 / ((scalar_single_iter_cost * assumed_vf) 4091 - vec_inside_cost); 4092 } 4093 min_profitable_estimate = MAX (min_profitable_estimate, min_profitable_iters); 4094 if (dump_enabled_p ()) 4095 dump_printf_loc (MSG_NOTE, vect_location, 4096 " Static estimate profitability threshold = %d\n", 4097 min_profitable_estimate); 4098 4099 *ret_min_profitable_estimate = min_profitable_estimate; 4100 } 4101 4102 /* Writes into SEL a mask for a vec_perm, equivalent to a vec_shr by OFFSET 4103 vector elements (not bits) for a vector with NELT elements. */ 4104 static void 4105 calc_vec_perm_mask_for_shift (unsigned int offset, unsigned int nelt, 4106 vec_perm_builder *sel) 4107 { 4108 /* The encoding is a single stepped pattern. Any wrap-around is handled 4109 by vec_perm_indices. */ 4110 sel->new_vector (nelt, 1, 3); 4111 for (unsigned int i = 0; i < 3; i++) 4112 sel->quick_push (i + offset); 4113 } 4114 4115 /* Checks whether the target supports whole-vector shifts for vectors of mode 4116 MODE. This is the case if _either_ the platform handles vec_shr_optab, _or_ 4117 it supports vec_perm_const with masks for all necessary shift amounts. */ 4118 static bool 4119 have_whole_vector_shift (machine_mode mode) 4120 { 4121 if (optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing) 4122 return true; 4123 4124 /* Variable-length vectors should be handled via the optab. 
*/ 4125 unsigned int nelt; 4126 if (!GET_MODE_NUNITS (mode).is_constant (&nelt)) 4127 return false; 4128 4129 vec_perm_builder sel; 4130 vec_perm_indices indices; 4131 for (unsigned int i = nelt / 2; i >= 1; i /= 2) 4132 { 4133 calc_vec_perm_mask_for_shift (i, nelt, &sel); 4134 indices.new_vector (sel, 2, nelt); 4135 if (!can_vec_perm_const_p (mode, indices, false)) 4136 return false; 4137 } 4138 return true; 4139 } 4140 4141 /* TODO: Close dependency between vect_model_*_cost and vectorizable_* 4142 functions. Design better to avoid maintenance issues. */ 4143 4144 /* Function vect_model_reduction_cost. 4145 4146 Models cost for a reduction operation, including the vector ops 4147 generated within the strip-mine loop, the initial definition before 4148 the loop, and the epilogue code that must be generated. */ 4149 4150 static void 4151 vect_model_reduction_cost (stmt_vec_info stmt_info, internal_fn reduc_fn, 4152 int ncopies) 4153 { 4154 int prologue_cost = 0, epilogue_cost = 0, inside_cost; 4155 enum tree_code code; 4156 optab optab; 4157 tree vectype; 4158 gimple *orig_stmt; 4159 machine_mode mode; 4160 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info); 4161 struct loop *loop = NULL; 4162 void *target_cost_data; 4163 4164 if (loop_vinfo) 4165 { 4166 loop = LOOP_VINFO_LOOP (loop_vinfo); 4167 target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo); 4168 } 4169 else 4170 target_cost_data = BB_VINFO_TARGET_COST_DATA (STMT_VINFO_BB_VINFO (stmt_info)); 4171 4172 /* Condition reductions generate two reductions in the loop. */ 4173 vect_reduction_type reduction_type 4174 = STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info); 4175 if (reduction_type == COND_REDUCTION) 4176 ncopies *= 2; 4177 4178 vectype = STMT_VINFO_VECTYPE (stmt_info); 4179 mode = TYPE_MODE (vectype); 4180 orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info); 4181 4182 if (!orig_stmt) 4183 orig_stmt = STMT_VINFO_STMT (stmt_info); 4184 4185 code = gimple_assign_rhs_code (orig_stmt); 4186 4187 if (reduction_type == EXTRACT_LAST_REDUCTION 4188 || reduction_type == FOLD_LEFT_REDUCTION) 4189 { 4190 /* No extra instructions needed in the prologue. */ 4191 prologue_cost = 0; 4192 4193 if (reduction_type == EXTRACT_LAST_REDUCTION || reduc_fn != IFN_LAST) 4194 /* Count one reduction-like operation per vector. */ 4195 inside_cost = add_stmt_cost (target_cost_data, ncopies, vec_to_scalar, 4196 stmt_info, 0, vect_body); 4197 else 4198 { 4199 /* Use NELEMENTS extracts and NELEMENTS scalar ops. */ 4200 unsigned int nelements = ncopies * vect_nunits_for_cost (vectype); 4201 inside_cost = add_stmt_cost (target_cost_data, nelements, 4202 vec_to_scalar, stmt_info, 0, 4203 vect_body); 4204 inside_cost += add_stmt_cost (target_cost_data, nelements, 4205 scalar_stmt, stmt_info, 0, 4206 vect_body); 4207 } 4208 } 4209 else 4210 { 4211 /* Add in cost for initial definition. 4212 For cond reduction we have four vectors: initial index, step, 4213 initial result of the data reduction, initial value of the index 4214 reduction. */ 4215 int prologue_stmts = reduction_type == COND_REDUCTION ? 4 : 1; 4216 prologue_cost += add_stmt_cost (target_cost_data, prologue_stmts, 4217 scalar_to_vec, stmt_info, 0, 4218 vect_prologue); 4219 4220 /* Cost of reduction op inside loop. */ 4221 inside_cost = add_stmt_cost (target_cost_data, ncopies, vector_stmt, 4222 stmt_info, 0, vect_body); 4223 } 4224 4225 /* Determine cost of epilogue code. 4226 4227 We have a reduction operator that will reduce the vector in one statement. 4228 Also requires scalar extract. 
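For illustration: with a direct REDUC_FN a plain (non-COND) reduction is charged one vector_stmt plus one vec_to_scalar below; without one, an 8-element vector on a target with whole-vector shifts is charged exact_log2 (8) * 2 = 6 vector stmts plus the final extract, and otherwise 8 extracts plus 7 scalar reduction ops.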
*/ 4229 4230 if (!loop || !nested_in_vect_loop_p (loop, orig_stmt)) 4231 { 4232 if (reduc_fn != IFN_LAST) 4233 { 4234 if (reduction_type == COND_REDUCTION) 4235 { 4236 /* An EQ stmt and an COND_EXPR stmt. */ 4237 epilogue_cost += add_stmt_cost (target_cost_data, 2, 4238 vector_stmt, stmt_info, 0, 4239 vect_epilogue); 4240 /* Reduction of the max index and a reduction of the found 4241 values. */ 4242 epilogue_cost += add_stmt_cost (target_cost_data, 2, 4243 vec_to_scalar, stmt_info, 0, 4244 vect_epilogue); 4245 /* A broadcast of the max value. */ 4246 epilogue_cost += add_stmt_cost (target_cost_data, 1, 4247 scalar_to_vec, stmt_info, 0, 4248 vect_epilogue); 4249 } 4250 else 4251 { 4252 epilogue_cost += add_stmt_cost (target_cost_data, 1, vector_stmt, 4253 stmt_info, 0, vect_epilogue); 4254 epilogue_cost += add_stmt_cost (target_cost_data, 1, 4255 vec_to_scalar, stmt_info, 0, 4256 vect_epilogue); 4257 } 4258 } 4259 else if (reduction_type == COND_REDUCTION) 4260 { 4261 unsigned estimated_nunits = vect_nunits_for_cost (vectype); 4262 /* Extraction of scalar elements. */ 4263 epilogue_cost += add_stmt_cost (target_cost_data, 4264 2 * estimated_nunits, 4265 vec_to_scalar, stmt_info, 0, 4266 vect_epilogue); 4267 /* Scalar max reductions via COND_EXPR / MAX_EXPR. */ 4268 epilogue_cost += add_stmt_cost (target_cost_data, 4269 2 * estimated_nunits - 3, 4270 scalar_stmt, stmt_info, 0, 4271 vect_epilogue); 4272 } 4273 else if (reduction_type == EXTRACT_LAST_REDUCTION 4274 || reduction_type == FOLD_LEFT_REDUCTION) 4275 /* No extra instructions need in the epilogue. */ 4276 ; 4277 else 4278 { 4279 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype)); 4280 tree bitsize = 4281 TYPE_SIZE (TREE_TYPE (gimple_assign_lhs (orig_stmt))); 4282 int element_bitsize = tree_to_uhwi (bitsize); 4283 int nelements = vec_size_in_bits / element_bitsize; 4284 4285 if (code == COND_EXPR) 4286 code = MAX_EXPR; 4287 4288 optab = optab_for_tree_code (code, vectype, optab_default); 4289 4290 /* We have a whole vector shift available. */ 4291 if (optab != unknown_optab 4292 && VECTOR_MODE_P (mode) 4293 && optab_handler (optab, mode) != CODE_FOR_nothing 4294 && have_whole_vector_shift (mode)) 4295 { 4296 /* Final reduction via vector shifts and the reduction operator. 4297 Also requires scalar extract. */ 4298 epilogue_cost += add_stmt_cost (target_cost_data, 4299 exact_log2 (nelements) * 2, 4300 vector_stmt, stmt_info, 0, 4301 vect_epilogue); 4302 epilogue_cost += add_stmt_cost (target_cost_data, 1, 4303 vec_to_scalar, stmt_info, 0, 4304 vect_epilogue); 4305 } 4306 else 4307 /* Use extracts and reduction op for final reduction. For N 4308 elements, we have N extracts and N-1 reduction ops. */ 4309 epilogue_cost += add_stmt_cost (target_cost_data, 4310 nelements + nelements - 1, 4311 vector_stmt, stmt_info, 0, 4312 vect_epilogue); 4313 } 4314 } 4315 4316 if (dump_enabled_p ()) 4317 dump_printf (MSG_NOTE, 4318 "vect_model_reduction_cost: inside_cost = %d, " 4319 "prologue_cost = %d, epilogue_cost = %d .\n", inside_cost, 4320 prologue_cost, epilogue_cost); 4321 } 4322 4323 4324 /* Function vect_model_induction_cost. 4325 4326 Models cost for induction operations. 
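This amounts to NCOPIES vector stmts inside the loop plus two scalar_to_vec prologue stmts (one for the initial vector, one for the step vector); pure SLP stmts are costed elsewhere, hence the early return.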
*/ 4327 4328 static void 4329 vect_model_induction_cost (stmt_vec_info stmt_info, int ncopies) 4330 { 4331 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info); 4332 void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo); 4333 unsigned inside_cost, prologue_cost; 4334 4335 if (PURE_SLP_STMT (stmt_info)) 4336 return; 4337 4338 /* loop cost for vec_loop. */ 4339 inside_cost = add_stmt_cost (target_cost_data, ncopies, vector_stmt, 4340 stmt_info, 0, vect_body); 4341 4342 /* prologue cost for vec_init and vec_step. */ 4343 prologue_cost = add_stmt_cost (target_cost_data, 2, scalar_to_vec, 4344 stmt_info, 0, vect_prologue); 4345 4346 if (dump_enabled_p ()) 4347 dump_printf_loc (MSG_NOTE, vect_location, 4348 "vect_model_induction_cost: inside_cost = %d, " 4349 "prologue_cost = %d .\n", inside_cost, prologue_cost); 4350 } 4351 4352 4353 4354 /* Function get_initial_def_for_reduction 4355 4356 Input: 4357 STMT - a stmt that performs a reduction operation in the loop. 4358 INIT_VAL - the initial value of the reduction variable 4359 4360 Output: 4361 ADJUSTMENT_DEF - a tree that holds a value to be added to the final result 4362 of the reduction (used for adjusting the epilog - see below). 4363 Return a vector variable, initialized according to the operation that STMT 4364 performs. This vector will be used as the initial value of the 4365 vector of partial results. 4366 4367 Option1 (adjust in epilog): Initialize the vector as follows: 4368 add/bit or/xor: [0,0,...,0,0] 4369 mult/bit and: [1,1,...,1,1] 4370 min/max/cond_expr: [init_val,init_val,..,init_val,init_val] 4371 and when necessary (e.g. add/mult case) let the caller know 4372 that it needs to adjust the result by init_val. 4373 4374 Option2: Initialize the vector as follows: 4375 add/bit or/xor: [init_val,0,0,...,0] 4376 mult/bit and: [init_val,1,1,...,1] 4377 min/max/cond_expr: [init_val,init_val,...,init_val] 4378 and no adjustments are needed. 4379 4380 For example, for the following code: 4381 4382 s = init_val; 4383 for (i=0;i<n;i++) 4384 s = s + a[i]; 4385 4386 STMT is 's = s + a[i]', and the reduction variable is 's'. 4387 For a vector of 4 units, we want to return either [0,0,0,init_val], 4388 or [0,0,0,0] and let the caller know that it needs to adjust 4389 the result at the end by 'init_val'. 4390 4391 FORNOW, we are using the 'adjust in epilog' scheme, because this way the 4392 initialization vector is simpler (same element in all entries), if 4393 ADJUSTMENT_DEF is not NULL, and Option2 otherwise. 4394 4395 A cost model should help decide between these two schemes. 
*/ 4396 4397 tree 4398 get_initial_def_for_reduction (gimple *stmt, tree init_val, 4399 tree *adjustment_def) 4400 { 4401 stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt); 4402 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo); 4403 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo); 4404 tree scalar_type = TREE_TYPE (init_val); 4405 tree vectype = get_vectype_for_scalar_type (scalar_type); 4406 enum tree_code code = gimple_assign_rhs_code (stmt); 4407 tree def_for_init; 4408 tree init_def; 4409 bool nested_in_vect_loop = false; 4410 REAL_VALUE_TYPE real_init_val = dconst0; 4411 int int_init_val = 0; 4412 gimple *def_stmt = NULL; 4413 gimple_seq stmts = NULL; 4414 4415 gcc_assert (vectype); 4416 4417 gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type) 4418 || SCALAR_FLOAT_TYPE_P (scalar_type)); 4419 4420 if (nested_in_vect_loop_p (loop, stmt)) 4421 nested_in_vect_loop = true; 4422 else 4423 gcc_assert (loop == (gimple_bb (stmt))->loop_father); 4424 4425 /* In case of double reduction we only create a vector variable to be put 4426 in the reduction phi node. The actual statement creation is done in 4427 vect_create_epilog_for_reduction. */ 4428 if (adjustment_def && nested_in_vect_loop 4429 && TREE_CODE (init_val) == SSA_NAME 4430 && (def_stmt = SSA_NAME_DEF_STMT (init_val)) 4431 && gimple_code (def_stmt) == GIMPLE_PHI 4432 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt)) 4433 && vinfo_for_stmt (def_stmt) 4434 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt)) 4435 == vect_double_reduction_def) 4436 { 4437 *adjustment_def = NULL; 4438 return vect_create_destination_var (init_val, vectype); 4439 } 4440 4441 vect_reduction_type reduction_type 4442 = STMT_VINFO_VEC_REDUCTION_TYPE (stmt_vinfo); 4443 4444 /* In case of a nested reduction do not use an adjustment def as 4445 that case is not supported by the epilogue generation correctly 4446 if ncopies is not one. */ 4447 if (adjustment_def && nested_in_vect_loop) 4448 { 4449 *adjustment_def = NULL; 4450 return vect_get_vec_def_for_operand (init_val, stmt); 4451 } 4452 4453 switch (code) 4454 { 4455 case WIDEN_SUM_EXPR: 4456 case DOT_PROD_EXPR: 4457 case SAD_EXPR: 4458 case PLUS_EXPR: 4459 case MINUS_EXPR: 4460 case BIT_IOR_EXPR: 4461 case BIT_XOR_EXPR: 4462 case MULT_EXPR: 4463 case BIT_AND_EXPR: 4464 { 4465 /* ADJUSTMENT_DEF is NULL when called from 4466 vect_create_epilog_for_reduction to vectorize double reduction. */ 4467 if (adjustment_def) 4468 *adjustment_def = init_val; 4469 4470 if (code == MULT_EXPR) 4471 { 4472 real_init_val = dconst1; 4473 int_init_val = 1; 4474 } 4475 4476 if (code == BIT_AND_EXPR) 4477 int_init_val = -1; 4478 4479 if (SCALAR_FLOAT_TYPE_P (scalar_type)) 4480 def_for_init = build_real (scalar_type, real_init_val); 4481 else 4482 def_for_init = build_int_cst (scalar_type, int_init_val); 4483 4484 if (adjustment_def) 4485 /* Option1: the first element is '0' or '1' as well. */ 4486 init_def = gimple_build_vector_from_val (&stmts, vectype, 4487 def_for_init); 4488 else if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant ()) 4489 { 4490 /* Option2 (variable length): the first element is INIT_VAL. */ 4491 init_def = build_vector_from_val (vectype, def_for_init); 4492 gcall *call = gimple_build_call_internal (IFN_VEC_SHL_INSERT, 4493 2, init_def, init_val); 4494 init_def = make_ssa_name (vectype); 4495 gimple_call_set_lhs (call, init_def); 4496 gimple_seq_add_stmt (&stmts, call); 4497 } 4498 else 4499 { 4500 /* Option2: the first element is INIT_VAL. 
*/ 4501 tree_vector_builder elts (vectype, 1, 2); 4502 elts.quick_push (init_val); 4503 elts.quick_push (def_for_init); 4504 init_def = gimple_build_vector (&stmts, &elts); 4505 } 4506 } 4507 break; 4508 4509 case MIN_EXPR: 4510 case MAX_EXPR: 4511 case COND_EXPR: 4512 { 4513 if (adjustment_def) 4514 { 4515 *adjustment_def = NULL_TREE; 4516 if (reduction_type != COND_REDUCTION 4517 && reduction_type != EXTRACT_LAST_REDUCTION) 4518 { 4519 init_def = vect_get_vec_def_for_operand (init_val, stmt); 4520 break; 4521 } 4522 } 4523 init_val = gimple_convert (&stmts, TREE_TYPE (vectype), init_val); 4524 init_def = gimple_build_vector_from_val (&stmts, vectype, init_val); 4525 } 4526 break; 4527 4528 default: 4529 gcc_unreachable (); 4530 } 4531 4532 if (stmts) 4533 gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), stmts); 4534 return init_def; 4535 } 4536 4537 /* Get at the initial defs for the reduction PHIs in SLP_NODE. 4538 NUMBER_OF_VECTORS is the number of vector defs to create. 4539 If NEUTRAL_OP is nonnull, introducing extra elements of that 4540 value will not change the result. */ 4541 4542 static void 4543 get_initial_defs_for_reduction (slp_tree slp_node, 4544 vec<tree> *vec_oprnds, 4545 unsigned int number_of_vectors, 4546 bool reduc_chain, tree neutral_op) 4547 { 4548 vec<gimple *> stmts = SLP_TREE_SCALAR_STMTS (slp_node); 4549 gimple *stmt = stmts[0]; 4550 stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt); 4551 unsigned HOST_WIDE_INT nunits; 4552 unsigned j, number_of_places_left_in_vector; 4553 tree vector_type; 4554 tree vop; 4555 int group_size = stmts.length (); 4556 unsigned int vec_num, i; 4557 unsigned number_of_copies = 1; 4558 vec<tree> voprnds; 4559 voprnds.create (number_of_vectors); 4560 struct loop *loop; 4561 auto_vec<tree, 16> permute_results; 4562 4563 vector_type = STMT_VINFO_VECTYPE (stmt_vinfo); 4564 4565 gcc_assert (STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_reduction_def); 4566 4567 loop = (gimple_bb (stmt))->loop_father; 4568 gcc_assert (loop); 4569 edge pe = loop_preheader_edge (loop); 4570 4571 gcc_assert (!reduc_chain || neutral_op); 4572 4573 /* NUMBER_OF_COPIES is the number of times we need to use the same values in 4574 created vectors. It is greater than 1 if unrolling is performed. 4575 4576 For example, we have two scalar operands, s1 and s2 (e.g., group of 4577 strided accesses of size two), while NUNITS is four (i.e., four scalars 4578 of this type can be packed in a vector). The output vector will contain 4579 two copies of each scalar operand: {s1, s2, s1, s2}. (NUMBER_OF_COPIES 4580 will be 2). 4581 4582 If GROUP_SIZE > NUNITS, the scalars will be split into several vectors 4583 containing the operands. 4584 4585 For example, NUNITS is four as before, and the group size is 8 4586 (s1, s2, ..., s8). We will create two vectors {s1, s2, s3, s4} and 4587 {s5, s6, s7, s8}. */ 4588 4589 if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits)) 4590 nunits = group_size; 4591 4592 number_of_copies = nunits * number_of_vectors / group_size; 4593 4594 number_of_places_left_in_vector = nunits; 4595 bool constant_p = true; 4596 tree_vector_builder elts (vector_type, nunits, 1); 4597 elts.quick_grow (nunits); 4598 for (j = 0; j < number_of_copies; j++) 4599 { 4600 for (i = group_size - 1; stmts.iterate (i, &stmt); i--) 4601 { 4602 tree op; 4603 /* Get the def before the loop. In reduction chain we have only 4604 one initial value. 
*/ 4605 if ((j != (number_of_copies - 1) 4606 || (reduc_chain && i != 0)) 4607 && neutral_op) 4608 op = neutral_op; 4609 else 4610 op = PHI_ARG_DEF_FROM_EDGE (stmt, pe); 4611 4612 /* Create 'vect_ = {op0,op1,...,opn}'. */ 4613 number_of_places_left_in_vector--; 4614 elts[number_of_places_left_in_vector] = op; 4615 if (!CONSTANT_CLASS_P (op)) 4616 constant_p = false; 4617 4618 if (number_of_places_left_in_vector == 0) 4619 { 4620 gimple_seq ctor_seq = NULL; 4621 tree init; 4622 if (constant_p && !neutral_op 4623 ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type), nunits) 4624 : known_eq (TYPE_VECTOR_SUBPARTS (vector_type), nunits)) 4625 /* Build the vector directly from ELTS. */ 4626 init = gimple_build_vector (&ctor_seq, &elts); 4627 else if (neutral_op) 4628 { 4629 /* Build a vector of the neutral value and shift the 4630 other elements into place. */ 4631 init = gimple_build_vector_from_val (&ctor_seq, vector_type, 4632 neutral_op); 4633 int k = nunits; 4634 while (k > 0 && elts[k - 1] == neutral_op) 4635 k -= 1; 4636 while (k > 0) 4637 { 4638 k -= 1; 4639 gcall *call = gimple_build_call_internal 4640 (IFN_VEC_SHL_INSERT, 2, init, elts[k]); 4641 init = make_ssa_name (vector_type); 4642 gimple_call_set_lhs (call, init); 4643 gimple_seq_add_stmt (&ctor_seq, call); 4644 } 4645 } 4646 else 4647 { 4648 /* First time round, duplicate ELTS to fill the 4649 required number of vectors, then cherry pick the 4650 appropriate result for each iteration. */ 4651 if (vec_oprnds->is_empty ()) 4652 duplicate_and_interleave (&ctor_seq, vector_type, elts, 4653 number_of_vectors, 4654 permute_results); 4655 init = permute_results[number_of_vectors - j - 1]; 4656 } 4657 if (ctor_seq != NULL) 4658 gsi_insert_seq_on_edge_immediate (pe, ctor_seq); 4659 voprnds.quick_push (init); 4660 4661 number_of_places_left_in_vector = nunits; 4662 elts.new_vector (vector_type, nunits, 1); 4663 elts.quick_grow (nunits); 4664 constant_p = true; 4665 } 4666 } 4667 } 4668 4669 /* Since the vectors are created in the reverse order, we should invert 4670 them. */ 4671 vec_num = voprnds.length (); 4672 for (j = vec_num; j != 0; j--) 4673 { 4674 vop = voprnds[j - 1]; 4675 vec_oprnds->quick_push (vop); 4676 } 4677 4678 voprnds.release (); 4679 4680 /* In case that VF is greater than the unrolling factor needed for the SLP 4681 group of stmts, NUMBER_OF_VECTORS to be created is greater than 4682 NUMBER_OF_SCALARS/NUNITS or NUNITS/NUMBER_OF_SCALARS, and hence we have 4683 to replicate the vectors. */ 4684 tree neutral_vec = NULL; 4685 while (number_of_vectors > vec_oprnds->length ()) 4686 { 4687 if (neutral_op) 4688 { 4689 if (!neutral_vec) 4690 { 4691 gimple_seq ctor_seq = NULL; 4692 neutral_vec = gimple_build_vector_from_val 4693 (&ctor_seq, vector_type, neutral_op); 4694 if (ctor_seq != NULL) 4695 gsi_insert_seq_on_edge_immediate (pe, ctor_seq); 4696 } 4697 vec_oprnds->quick_push (neutral_vec); 4698 } 4699 else 4700 { 4701 for (i = 0; vec_oprnds->iterate (i, &vop) && i < vec_num; i++) 4702 vec_oprnds->quick_push (vop); 4703 } 4704 } 4705 } 4706 4707 4708 /* Function vect_create_epilog_for_reduction 4709 4710 Create code at the loop-epilog to finalize the result of a reduction 4711 computation. 4712 4713 VECT_DEFS is list of vector of partial results, i.e., the lhs's of vector 4714 reduction statements. 4715 STMT is the scalar reduction stmt that is being vectorized. 4716 NCOPIES is > 1 in case the vectorization factor (VF) is bigger than the 4717 number of elements that we can fit in a vectype (nunits). 
In this case 4718 we have to generate more than one vector stmt - i.e - we need to "unroll" 4719 the vector stmt by a factor VF/nunits. For more details see documentation 4720 in vectorizable_operation. 4721 REDUC_FN is the internal function for the epilog reduction. 4722 REDUCTION_PHIS is a list of the phi-nodes that carry the reduction 4723 computation. 4724 REDUC_INDEX is the index of the operand in the right hand side of the 4725 statement that is defined by REDUCTION_PHI. 4726 DOUBLE_REDUC is TRUE if double reduction phi nodes should be handled. 4727 SLP_NODE is an SLP node containing a group of reduction statements. The 4728 first one in this group is STMT. 4729 INDUC_VAL is for INTEGER_INDUC_COND_REDUCTION the value to use for the case 4730 when the COND_EXPR is never true in the loop. For MAX_EXPR, it needs to 4731 be smaller than any value of the IV in the loop, for MIN_EXPR larger than 4732 any value of the IV in the loop. 4733 INDUC_CODE is the code for epilog reduction if INTEGER_INDUC_COND_REDUCTION. 4734 NEUTRAL_OP is the value given by neutral_op_for_slp_reduction; it is 4735 null if this is not an SLP reduction 4736 4737 This function: 4738 1. Creates the reduction def-use cycles: sets the arguments for 4739 REDUCTION_PHIS: 4740 The loop-entry argument is the vectorized initial-value of the reduction. 4741 The loop-latch argument is taken from VECT_DEFS - the vector of partial 4742 sums. 4743 2. "Reduces" each vector of partial results VECT_DEFS into a single result, 4744 by calling the function specified by REDUC_FN if available, or by 4745 other means (whole-vector shifts or a scalar loop). 4746 The function also creates a new phi node at the loop exit to preserve 4747 loop-closed form, as illustrated below. 4748 4749 The flow at the entry to this function: 4750 4751 loop: 4752 vec_def = phi <null, null> # REDUCTION_PHI 4753 VECT_DEF = vector_stmt # vectorized form of STMT 4754 s_loop = scalar_stmt # (scalar) STMT 4755 loop_exit: 4756 s_out0 = phi <s_loop> # (scalar) EXIT_PHI 4757 use <s_out0> 4758 use <s_out0> 4759 4760 The above is transformed by this function into: 4761 4762 loop: 4763 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI 4764 VECT_DEF = vector_stmt # vectorized form of STMT 4765 s_loop = scalar_stmt # (scalar) STMT 4766 loop_exit: 4767 s_out0 = phi <s_loop> # (scalar) EXIT_PHI 4768 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI 4769 v_out2 = reduce <v_out1> 4770 s_out3 = extract_field <v_out2, 0> 4771 s_out4 = adjust_result <s_out3> 4772 use <s_out4> 4773 use <s_out4> 4774 */ 4775 4776 static void 4777 vect_create_epilog_for_reduction (vec<tree> vect_defs, gimple *stmt, 4778 gimple *reduc_def_stmt, 4779 int ncopies, internal_fn reduc_fn, 4780 vec<gimple *> reduction_phis, 4781 bool double_reduc, 4782 slp_tree slp_node, 4783 slp_instance slp_node_instance, 4784 tree induc_val, enum tree_code induc_code, 4785 tree neutral_op) 4786 { 4787 stmt_vec_info stmt_info = vinfo_for_stmt (stmt); 4788 stmt_vec_info prev_phi_info; 4789 tree vectype; 4790 machine_mode mode; 4791 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info); 4792 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL; 4793 basic_block exit_bb; 4794 tree scalar_dest; 4795 tree scalar_type; 4796 gimple *new_phi = NULL, *phi; 4797 gimple_stmt_iterator exit_gsi; 4798 tree vec_dest; 4799 tree new_temp = NULL_TREE, new_dest, new_name, new_scalar_dest; 4800 gimple *epilog_stmt = NULL; 4801 enum tree_code code = gimple_assign_rhs_code (stmt); 4802 gimple *exit_phi; 4803 tree bitsize; 4804 
tree adjustment_def = NULL; 4805 tree vec_initial_def = NULL; 4806 tree expr, def, initial_def = NULL; 4807 tree orig_name, scalar_result; 4808 imm_use_iterator imm_iter, phi_imm_iter; 4809 use_operand_p use_p, phi_use_p; 4810 gimple *use_stmt, *orig_stmt, *reduction_phi = NULL; 4811 bool nested_in_vect_loop = false; 4812 auto_vec<gimple *> new_phis; 4813 auto_vec<gimple *> inner_phis; 4814 enum vect_def_type dt = vect_unknown_def_type; 4815 int j, i; 4816 auto_vec<tree> scalar_results; 4817 unsigned int group_size = 1, k, ratio; 4818 auto_vec<tree> vec_initial_defs; 4819 auto_vec<gimple *> phis; 4820 bool slp_reduc = false; 4821 bool direct_slp_reduc; 4822 tree new_phi_result; 4823 gimple *inner_phi = NULL; 4824 tree induction_index = NULL_TREE; 4825 4826 if (slp_node) 4827 group_size = SLP_TREE_SCALAR_STMTS (slp_node).length (); 4828 4829 if (nested_in_vect_loop_p (loop, stmt)) 4830 { 4831 outer_loop = loop; 4832 loop = loop->inner; 4833 nested_in_vect_loop = true; 4834 gcc_assert (!slp_node); 4835 } 4836 4837 vectype = STMT_VINFO_VECTYPE (stmt_info); 4838 gcc_assert (vectype); 4839 mode = TYPE_MODE (vectype); 4840 4841 /* 1. Create the reduction def-use cycle: 4842 Set the arguments of REDUCTION_PHIS, i.e., transform 4843 4844 loop: 4845 vec_def = phi <null, null> # REDUCTION_PHI 4846 VECT_DEF = vector_stmt # vectorized form of STMT 4847 ... 4848 4849 into: 4850 4851 loop: 4852 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI 4853 VECT_DEF = vector_stmt # vectorized form of STMT 4854 ... 4855 4856 (in case of SLP, do it for all the phis). */ 4857 4858 /* Get the loop-entry arguments. */ 4859 enum vect_def_type initial_def_dt = vect_unknown_def_type; 4860 if (slp_node) 4861 { 4862 unsigned vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node); 4863 vec_initial_defs.reserve (vec_num); 4864 get_initial_defs_for_reduction (slp_node_instance->reduc_phis, 4865 &vec_initial_defs, vec_num, 4866 GROUP_FIRST_ELEMENT (stmt_info), 4867 neutral_op); 4868 } 4869 else 4870 { 4871 /* Get at the scalar def before the loop, that defines the initial value 4872 of the reduction variable. */ 4873 gimple *def_stmt; 4874 initial_def = PHI_ARG_DEF_FROM_EDGE (reduc_def_stmt, 4875 loop_preheader_edge (loop)); 4876 /* Optimize: if initial_def is for REDUC_MAX smaller than the base 4877 and we can't use zero for induc_val, use initial_def. Similarly 4878 for REDUC_MIN and initial_def larger than the base. */ 4879 if (TREE_CODE (initial_def) == INTEGER_CST 4880 && (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) 4881 == INTEGER_INDUC_COND_REDUCTION) 4882 && !integer_zerop (induc_val) 4883 && ((induc_code == MAX_EXPR 4884 && tree_int_cst_lt (initial_def, induc_val)) 4885 || (induc_code == MIN_EXPR 4886 && tree_int_cst_lt (induc_val, initial_def)))) 4887 induc_val = initial_def; 4888 vect_is_simple_use (initial_def, loop_vinfo, &def_stmt, &initial_def_dt); 4889 vec_initial_def = get_initial_def_for_reduction (stmt, initial_def, 4890 &adjustment_def); 4891 vec_initial_defs.create (1); 4892 vec_initial_defs.quick_push (vec_initial_def); 4893 } 4894 4895 /* Set phi nodes arguments. */ 4896 FOR_EACH_VEC_ELT (reduction_phis, i, phi) 4897 { 4898 tree vec_init_def = vec_initial_defs[i]; 4899 tree def = vect_defs[i]; 4900 for (j = 0; j < ncopies; j++) 4901 { 4902 if (j != 0) 4903 { 4904 phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi)); 4905 if (nested_in_vect_loop) 4906 vec_init_def 4907 = vect_get_vec_def_for_stmt_copy (initial_def_dt, 4908 vec_init_def); 4909 } 4910 4911 /* Set the loop-entry arg of the reduction-phi. 
*/ 4912 4913 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) 4914 == INTEGER_INDUC_COND_REDUCTION) 4915 { 4916 /* Initialise the reduction phi to zero. This prevents initial 4917 values of non-zero interferring with the reduction op. */ 4918 gcc_assert (ncopies == 1); 4919 gcc_assert (i == 0); 4920 4921 tree vec_init_def_type = TREE_TYPE (vec_init_def); 4922 tree induc_val_vec 4923 = build_vector_from_val (vec_init_def_type, induc_val); 4924 4925 add_phi_arg (as_a <gphi *> (phi), induc_val_vec, 4926 loop_preheader_edge (loop), UNKNOWN_LOCATION); 4927 } 4928 else 4929 add_phi_arg (as_a <gphi *> (phi), vec_init_def, 4930 loop_preheader_edge (loop), UNKNOWN_LOCATION); 4931 4932 /* Set the loop-latch arg for the reduction-phi. */ 4933 if (j > 0) 4934 def = vect_get_vec_def_for_stmt_copy (vect_unknown_def_type, def); 4935 4936 add_phi_arg (as_a <gphi *> (phi), def, loop_latch_edge (loop), 4937 UNKNOWN_LOCATION); 4938 4939 if (dump_enabled_p ()) 4940 { 4941 dump_printf_loc (MSG_NOTE, vect_location, 4942 "transform reduction: created def-use cycle: "); 4943 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0); 4944 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, SSA_NAME_DEF_STMT (def), 0); 4945 } 4946 } 4947 } 4948 4949 /* For cond reductions we want to create a new vector (INDEX_COND_EXPR) 4950 which is updated with the current index of the loop for every match of 4951 the original loop's cond_expr (VEC_STMT). This results in a vector 4952 containing the last time the condition passed for that vector lane. 4953 The first match will be a 1 to allow 0 to be used for non-matching 4954 indexes. If there are no matches at all then the vector will be all 4955 zeroes. */ 4956 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION) 4957 { 4958 tree indx_before_incr, indx_after_incr; 4959 poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype); 4960 4961 gimple *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info); 4962 gcc_assert (gimple_assign_rhs_code (vec_stmt) == VEC_COND_EXPR); 4963 4964 int scalar_precision 4965 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (TREE_TYPE (vectype))); 4966 tree cr_index_scalar_type = make_unsigned_type (scalar_precision); 4967 tree cr_index_vector_type = build_vector_type 4968 (cr_index_scalar_type, TYPE_VECTOR_SUBPARTS (vectype)); 4969 4970 /* First we create a simple vector induction variable which starts 4971 with the values {1,2,3,...} (SERIES_VECT) and increments by the 4972 vector size (STEP). */ 4973 4974 /* Create a {1,2,3,...} vector. */ 4975 tree series_vect = build_index_vector (cr_index_vector_type, 1, 1); 4976 4977 /* Create a vector of the step value. */ 4978 tree step = build_int_cst (cr_index_scalar_type, nunits_out); 4979 tree vec_step = build_vector_from_val (cr_index_vector_type, step); 4980 4981 /* Create an induction variable. */ 4982 gimple_stmt_iterator incr_gsi; 4983 bool insert_after; 4984 standard_iv_increment_position (loop, &incr_gsi, &insert_after); 4985 create_iv (series_vect, vec_step, NULL_TREE, loop, &incr_gsi, 4986 insert_after, &indx_before_incr, &indx_after_incr); 4987 4988 /* Next create a new phi node vector (NEW_PHI_TREE) which starts 4989 filled with zeros (VEC_ZERO). */ 4990 4991 /* Create a vector of 0s. */ 4992 tree zero = build_zero_cst (cr_index_scalar_type); 4993 tree vec_zero = build_vector_from_val (cr_index_vector_type, zero); 4994 4995 /* Create a vector phi node. 
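To illustrate with made-up numbers: for VF = 4 the IV created above takes the values {1,2,3,4}, {5,6,7,8}, ..., and if, say, only lane 2 of the condition matches in the second vector iteration, the cond_expr built below keeps the phi's old values in lanes 0, 1 and 3 and records 7 in lane 2.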
*/ 4996 tree new_phi_tree = make_ssa_name (cr_index_vector_type); 4997 new_phi = create_phi_node (new_phi_tree, loop->header); 4998 set_vinfo_for_stmt (new_phi, 4999 new_stmt_vec_info (new_phi, loop_vinfo)); 5000 add_phi_arg (as_a <gphi *> (new_phi), vec_zero, 5001 loop_preheader_edge (loop), UNKNOWN_LOCATION); 5002 5003 /* Now take the condition from the loops original cond_expr 5004 (VEC_STMT) and produce a new cond_expr (INDEX_COND_EXPR) which for 5005 every match uses values from the induction variable 5006 (INDEX_BEFORE_INCR) otherwise uses values from the phi node 5007 (NEW_PHI_TREE). 5008 Finally, we update the phi (NEW_PHI_TREE) to take the value of 5009 the new cond_expr (INDEX_COND_EXPR). */ 5010 5011 /* Duplicate the condition from vec_stmt. */ 5012 tree ccompare = unshare_expr (gimple_assign_rhs1 (vec_stmt)); 5013 5014 /* Create a conditional, where the condition is taken from vec_stmt 5015 (CCOMPARE), then is the induction index (INDEX_BEFORE_INCR) and 5016 else is the phi (NEW_PHI_TREE). */ 5017 tree index_cond_expr = build3 (VEC_COND_EXPR, cr_index_vector_type, 5018 ccompare, indx_before_incr, 5019 new_phi_tree); 5020 induction_index = make_ssa_name (cr_index_vector_type); 5021 gimple *index_condition = gimple_build_assign (induction_index, 5022 index_cond_expr); 5023 gsi_insert_before (&incr_gsi, index_condition, GSI_SAME_STMT); 5024 stmt_vec_info index_vec_info = new_stmt_vec_info (index_condition, 5025 loop_vinfo); 5026 STMT_VINFO_VECTYPE (index_vec_info) = cr_index_vector_type; 5027 set_vinfo_for_stmt (index_condition, index_vec_info); 5028 5029 /* Update the phi with the vec cond. */ 5030 add_phi_arg (as_a <gphi *> (new_phi), induction_index, 5031 loop_latch_edge (loop), UNKNOWN_LOCATION); 5032 } 5033 5034 /* 2. Create epilog code. 5035 The reduction epilog code operates across the elements of the vector 5036 of partial results computed by the vectorized loop. 5037 The reduction epilog code consists of: 5038 5039 step 1: compute the scalar result in a vector (v_out2) 5040 step 2: extract the scalar result (s_out3) from the vector (v_out2) 5041 step 3: adjust the scalar result (s_out3) if needed. 5042 5043 Step 1 can be accomplished using one the following three schemes: 5044 (scheme 1) using reduc_fn, if available. 5045 (scheme 2) using whole-vector shifts, if available. 5046 (scheme 3) using a scalar loop. In this case steps 1+2 above are 5047 combined. 5048 5049 The overall epilog code looks like this: 5050 5051 s_out0 = phi <s_loop> # original EXIT_PHI 5052 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI 5053 v_out2 = reduce <v_out1> # step 1 5054 s_out3 = extract_field <v_out2, 0> # step 2 5055 s_out4 = adjust_result <s_out3> # step 3 5056 5057 (step 3 is optional, and steps 1 and 2 may be combined). 5058 Lastly, the uses of s_out0 are replaced by s_out4. */ 5059 5060 5061 /* 2.1 Create new loop-exit-phis to preserve loop-closed form: 5062 v_out1 = phi <VECT_DEF> 5063 Store them in NEW_PHIS. 
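(Loop-closed SSA form requires every SSA name that is defined in the loop and used after it to be routed through a phi in the exit block, so each vector def gets its own exit phi even though the scalar EXIT_PHI already exists.)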
*/ 5064 5065 exit_bb = single_exit (loop)->dest; 5066 prev_phi_info = NULL; 5067 new_phis.create (vect_defs.length ()); 5068 FOR_EACH_VEC_ELT (vect_defs, i, def) 5069 { 5070 for (j = 0; j < ncopies; j++) 5071 { 5072 tree new_def = copy_ssa_name (def); 5073 phi = create_phi_node (new_def, exit_bb); 5074 set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, loop_vinfo)); 5075 if (j == 0) 5076 new_phis.quick_push (phi); 5077 else 5078 { 5079 def = vect_get_vec_def_for_stmt_copy (dt, def); 5080 STMT_VINFO_RELATED_STMT (prev_phi_info) = phi; 5081 } 5082 5083 SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def); 5084 prev_phi_info = vinfo_for_stmt (phi); 5085 } 5086 } 5087 5088 /* The epilogue is created for the outer-loop, i.e., for the loop being 5089 vectorized. Create exit phis for the outer loop. */ 5090 if (double_reduc) 5091 { 5092 loop = outer_loop; 5093 exit_bb = single_exit (loop)->dest; 5094 inner_phis.create (vect_defs.length ()); 5095 FOR_EACH_VEC_ELT (new_phis, i, phi) 5096 { 5097 tree new_result = copy_ssa_name (PHI_RESULT (phi)); 5098 gphi *outer_phi = create_phi_node (new_result, exit_bb); 5099 SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx, 5100 PHI_RESULT (phi)); 5101 set_vinfo_for_stmt (outer_phi, new_stmt_vec_info (outer_phi, 5102 loop_vinfo)); 5103 inner_phis.quick_push (phi); 5104 new_phis[i] = outer_phi; 5105 prev_phi_info = vinfo_for_stmt (outer_phi); 5106 while (STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi))) 5107 { 5108 phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi)); 5109 new_result = copy_ssa_name (PHI_RESULT (phi)); 5110 outer_phi = create_phi_node (new_result, exit_bb); 5111 SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx, 5112 PHI_RESULT (phi)); 5113 set_vinfo_for_stmt (outer_phi, new_stmt_vec_info (outer_phi, 5114 loop_vinfo)); 5115 STMT_VINFO_RELATED_STMT (prev_phi_info) = outer_phi; 5116 prev_phi_info = vinfo_for_stmt (outer_phi); 5117 } 5118 } 5119 } 5120 5121 exit_gsi = gsi_after_labels (exit_bb); 5122 5123 /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3 5124 (i.e. when reduc_fn is not available) and in the final adjustment 5125 code (if needed). Also get the original scalar reduction variable as 5126 defined in the loop. In case STMT is a "pattern-stmt" (i.e. - it 5127 represents a reduction pattern), the tree-code and scalar-def are 5128 taken from the original stmt that the pattern-stmt (STMT) replaces. 5129 Otherwise (it is a regular reduction) - the tree-code and scalar-def 5130 are taken from STMT. */ 5131 5132 orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info); 5133 if (!orig_stmt) 5134 { 5135 /* Regular reduction */ 5136 orig_stmt = stmt; 5137 } 5138 else 5139 { 5140 /* Reduction pattern */ 5141 stmt_vec_info stmt_vinfo = vinfo_for_stmt (orig_stmt); 5142 gcc_assert (STMT_VINFO_IN_PATTERN_P (stmt_vinfo)); 5143 gcc_assert (STMT_VINFO_RELATED_STMT (stmt_vinfo) == stmt); 5144 } 5145 5146 code = gimple_assign_rhs_code (orig_stmt); 5147 /* For MINUS_EXPR the initial vector is [init_val,0,...,0], therefore, 5148 partial results are added and not subtracted. 
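A made-up example: for s -= a[i] with init_val = 10, four lanes and a[] = {1, 2, 3, 4}, the vector of partial results is {10 - 1, 0 - 2, 0 - 3, 0 - 4} = {9, -2, -3, -4}, and summing the lanes gives 0 = 10 - (1 + 2 + 3 + 4).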
*/ 5149 if (code == MINUS_EXPR) 5150 code = PLUS_EXPR; 5151 5152 scalar_dest = gimple_assign_lhs (orig_stmt); 5153 scalar_type = TREE_TYPE (scalar_dest); 5154 scalar_results.create (group_size); 5155 new_scalar_dest = vect_create_destination_var (scalar_dest, NULL); 5156 bitsize = TYPE_SIZE (scalar_type); 5157 5158 /* In case this is a reduction in an inner-loop while vectorizing an outer 5159 loop - we don't need to extract a single scalar result at the end of the 5160 inner-loop (unless it is double reduction, i.e., the use of reduction is 5161 outside the outer-loop). The final vector of partial results will be used 5162 in the vectorized outer-loop, or reduced to a scalar result at the end of 5163 the outer-loop. */ 5164 if (nested_in_vect_loop && !double_reduc) 5165 goto vect_finalize_reduction; 5166 5167 /* SLP reduction without reduction chain, e.g., 5168 # a1 = phi <a2, a0> 5169 # b1 = phi <b2, b0> 5170 a2 = operation (a1) 5171 b2 = operation (b1) */ 5172 slp_reduc = (slp_node && !GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt))); 5173 5174 /* True if we should implement SLP_REDUC using native reduction operations 5175 instead of scalar operations. */ 5176 direct_slp_reduc = (reduc_fn != IFN_LAST 5177 && slp_reduc 5178 && !TYPE_VECTOR_SUBPARTS (vectype).is_constant ()); 5179 5180 /* In case of reduction chain, e.g., 5181 # a1 = phi <a3, a0> 5182 a2 = operation (a1) 5183 a3 = operation (a2), 5184 5185 we may end up with more than one vector result. Here we reduce them to 5186 one vector. */ 5187 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) || direct_slp_reduc) 5188 { 5189 tree first_vect = PHI_RESULT (new_phis[0]); 5190 gassign *new_vec_stmt = NULL; 5191 vec_dest = vect_create_destination_var (scalar_dest, vectype); 5192 for (k = 1; k < new_phis.length (); k++) 5193 { 5194 gimple *next_phi = new_phis[k]; 5195 tree second_vect = PHI_RESULT (next_phi); 5196 tree tem = make_ssa_name (vec_dest, new_vec_stmt); 5197 new_vec_stmt = gimple_build_assign (tem, code, 5198 first_vect, second_vect); 5199 gsi_insert_before (&exit_gsi, new_vec_stmt, GSI_SAME_STMT); 5200 first_vect = tem; 5201 } 5202 5203 new_phi_result = first_vect; 5204 if (new_vec_stmt) 5205 { 5206 new_phis.truncate (0); 5207 new_phis.safe_push (new_vec_stmt); 5208 } 5209 } 5210 /* Likewise if we couldn't use a single defuse cycle. */ 5211 else if (ncopies > 1) 5212 { 5213 gcc_assert (new_phis.length () == 1); 5214 tree first_vect = PHI_RESULT (new_phis[0]); 5215 gassign *new_vec_stmt = NULL; 5216 vec_dest = vect_create_destination_var (scalar_dest, vectype); 5217 gimple *next_phi = new_phis[0]; 5218 for (int k = 1; k < ncopies; ++k) 5219 { 5220 next_phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (next_phi)); 5221 tree second_vect = PHI_RESULT (next_phi); 5222 tree tem = make_ssa_name (vec_dest, new_vec_stmt); 5223 new_vec_stmt = gimple_build_assign (tem, code, 5224 first_vect, second_vect); 5225 gsi_insert_before (&exit_gsi, new_vec_stmt, GSI_SAME_STMT); 5226 first_vect = tem; 5227 } 5228 new_phi_result = first_vect; 5229 new_phis.truncate (0); 5230 new_phis.safe_push (new_vec_stmt); 5231 } 5232 else 5233 new_phi_result = PHI_RESULT (new_phis[0]); 5234 5235 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION 5236 && reduc_fn != IFN_LAST) 5237 { 5238 /* For condition reductions, we have a vector (NEW_PHI_RESULT) containing 5239 various data values where the condition matched and another vector 5240 (INDUCTION_INDEX) containing all the indexes of those matches. 
We 5241 need to extract the last matching index (which will be the index with 5242 highest value) and use this to index into the data vector. 5243 For the case where there were no matches, the data vector will contain 5244 all default values and the index vector will be all zeros. */ 5245 5246 /* Get various versions of the type of the vector of indexes. */ 5247 tree index_vec_type = TREE_TYPE (induction_index); 5248 gcc_checking_assert (TYPE_UNSIGNED (index_vec_type)); 5249 tree index_scalar_type = TREE_TYPE (index_vec_type); 5250 tree index_vec_cmp_type = build_same_sized_truth_vector_type 5251 (index_vec_type); 5252 5253 /* Get an unsigned integer version of the type of the data vector. */ 5254 int scalar_precision 5255 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type)); 5256 tree scalar_type_unsigned = make_unsigned_type (scalar_precision); 5257 tree vectype_unsigned = build_vector_type 5258 (scalar_type_unsigned, TYPE_VECTOR_SUBPARTS (vectype)); 5259 5260 /* First we need to create a vector (ZERO_VEC) of zeros and another 5261 vector (MAX_INDEX_VEC) filled with the last matching index, which we 5262 can create using a MAX reduction and then expanding. 5263 In the case where the loop never made any matches, the max index will 5264 be zero. */ 5265 5266 /* Vector of {0, 0, 0,...}. */ 5267 tree zero_vec = make_ssa_name (vectype); 5268 tree zero_vec_rhs = build_zero_cst (vectype); 5269 gimple *zero_vec_stmt = gimple_build_assign (zero_vec, zero_vec_rhs); 5270 gsi_insert_before (&exit_gsi, zero_vec_stmt, GSI_SAME_STMT); 5271 5272 /* Find maximum value from the vector of found indexes. */ 5273 tree max_index = make_ssa_name (index_scalar_type); 5274 gcall *max_index_stmt = gimple_build_call_internal (IFN_REDUC_MAX, 5275 1, induction_index); 5276 gimple_call_set_lhs (max_index_stmt, max_index); 5277 gsi_insert_before (&exit_gsi, max_index_stmt, GSI_SAME_STMT); 5278 5279 /* Vector of {max_index, max_index, max_index,...}. */ 5280 tree max_index_vec = make_ssa_name (index_vec_type); 5281 tree max_index_vec_rhs = build_vector_from_val (index_vec_type, 5282 max_index); 5283 gimple *max_index_vec_stmt = gimple_build_assign (max_index_vec, 5284 max_index_vec_rhs); 5285 gsi_insert_before (&exit_gsi, max_index_vec_stmt, GSI_SAME_STMT); 5286 5287 /* Next we compare the new vector (MAX_INDEX_VEC) full of max indexes 5288 with the vector (INDUCTION_INDEX) of found indexes, choosing values 5289 from the data vector (NEW_PHI_RESULT) for matches, 0 (ZERO_VEC) 5290 otherwise. Only one value should match, resulting in a vector 5291 (VEC_COND) with one data value and the rest zeros. 5292 In the case where the loop never made any matches, every index will 5293 match, resulting in a vector with all data values (which will all be 5294 the default value). */ 5295 5296 /* Compare the max index vector to the vector of found indexes to find 5297 the position of the max value. */ 5298 tree vec_compare = make_ssa_name (index_vec_cmp_type); 5299 gimple *vec_compare_stmt = gimple_build_assign (vec_compare, EQ_EXPR, 5300 induction_index, 5301 max_index_vec); 5302 gsi_insert_before (&exit_gsi, vec_compare_stmt, GSI_SAME_STMT); 5303 5304 /* Use the compare to choose either values from the data vector or 5305 zero. 
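Illustration with invented values: for a data vector {11, 22, 33, 44} and INDUCTION_INDEX {2, 0, 5, 0}, the max index is 5, the comparison yields {false, false, true, false}, the VEC_COND built below is {0, 0, 33, 0}, and the final unsigned MAX reduction produces 33.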
*/
5306 tree vec_cond = make_ssa_name (vectype);
5307 gimple *vec_cond_stmt = gimple_build_assign (vec_cond, VEC_COND_EXPR,
5308 vec_compare, new_phi_result,
5309 zero_vec);
5310 gsi_insert_before (&exit_gsi, vec_cond_stmt, GSI_SAME_STMT);
5311
5312 /* Finally we need to extract the data value from the vector (VEC_COND)
5313 into a scalar (MATCHED_DATA_REDUC). Logically we want to do an OR
5314 reduction, but because this doesn't exist, we can use a MAX reduction
5315 instead. The data value might be signed or a float so we need to cast
5316 it first.
5317 In the case where the loop never made any matches, the data values are
5318 all identical, and so will reduce down correctly. */
5319
5320 /* Make the matched data values unsigned. */
5321 tree vec_cond_cast = make_ssa_name (vectype_unsigned);
5322 tree vec_cond_cast_rhs = build1 (VIEW_CONVERT_EXPR, vectype_unsigned,
5323 vec_cond);
5324 gimple *vec_cond_cast_stmt = gimple_build_assign (vec_cond_cast,
5325 VIEW_CONVERT_EXPR,
5326 vec_cond_cast_rhs);
5327 gsi_insert_before (&exit_gsi, vec_cond_cast_stmt, GSI_SAME_STMT);
5328
5329 /* Reduce down to a scalar value. */
5330 tree data_reduc = make_ssa_name (scalar_type_unsigned);
5331 gcall *data_reduc_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
5332 1, vec_cond_cast);
5333 gimple_call_set_lhs (data_reduc_stmt, data_reduc);
5334 gsi_insert_before (&exit_gsi, data_reduc_stmt, GSI_SAME_STMT);
5335
5336 /* Convert the reduced value back to the result type and set as the
5337 result. */
5338 gimple_seq stmts = NULL;
5339 new_temp = gimple_build (&stmts, VIEW_CONVERT_EXPR, scalar_type,
5340 data_reduc);
5341 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5342 scalar_results.safe_push (new_temp);
5343 }
5344 else if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION
5345 && reduc_fn == IFN_LAST)
5346 {
5347 /* Condition reduction without supported IFN_REDUC_MAX. Generate
5348 the equivalent of:
5349 idx_val = induction_index[0];
5350 val = data_reduc[0];
5351 for (i = 1; i < nelts; ++i)
5352 if (induction_index[i] > idx_val)
5353 val = data_reduc[i], idx_val = induction_index[i];
5354 return val; */
5355
5356 tree data_eltype = TREE_TYPE (TREE_TYPE (new_phi_result));
5357 tree idx_eltype = TREE_TYPE (TREE_TYPE (induction_index));
5358 unsigned HOST_WIDE_INT el_size = tree_to_uhwi (TYPE_SIZE (idx_eltype));
5359 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (TREE_TYPE (induction_index));
5360 /* Enforced by vectorizable_reduction, which ensures we have target
5361 support before allowing a conditional reduction on variable-length
5362 vectors.
*/ 5363 unsigned HOST_WIDE_INT v_size = el_size * nunits.to_constant (); 5364 tree idx_val = NULL_TREE, val = NULL_TREE; 5365 for (unsigned HOST_WIDE_INT off = 0; off < v_size; off += el_size) 5366 { 5367 tree old_idx_val = idx_val; 5368 tree old_val = val; 5369 idx_val = make_ssa_name (idx_eltype); 5370 epilog_stmt = gimple_build_assign (idx_val, BIT_FIELD_REF, 5371 build3 (BIT_FIELD_REF, idx_eltype, 5372 induction_index, 5373 bitsize_int (el_size), 5374 bitsize_int (off))); 5375 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT); 5376 val = make_ssa_name (data_eltype); 5377 epilog_stmt = gimple_build_assign (val, BIT_FIELD_REF, 5378 build3 (BIT_FIELD_REF, 5379 data_eltype, 5380 new_phi_result, 5381 bitsize_int (el_size), 5382 bitsize_int (off))); 5383 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT); 5384 if (off != 0) 5385 { 5386 tree new_idx_val = idx_val; 5387 tree new_val = val; 5388 if (off != v_size - el_size) 5389 { 5390 new_idx_val = make_ssa_name (idx_eltype); 5391 epilog_stmt = gimple_build_assign (new_idx_val, 5392 MAX_EXPR, idx_val, 5393 old_idx_val); 5394 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT); 5395 } 5396 new_val = make_ssa_name (data_eltype); 5397 epilog_stmt = gimple_build_assign (new_val, 5398 COND_EXPR, 5399 build2 (GT_EXPR, 5400 boolean_type_node, 5401 idx_val, 5402 old_idx_val), 5403 val, old_val); 5404 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT); 5405 idx_val = new_idx_val; 5406 val = new_val; 5407 } 5408 } 5409 /* Convert the reduced value back to the result type and set as the 5410 result. */ 5411 gimple_seq stmts = NULL; 5412 val = gimple_convert (&stmts, scalar_type, val); 5413 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT); 5414 scalar_results.safe_push (val); 5415 } 5416 5417 /* 2.3 Create the reduction code, using one of the three schemes described 5418 above. In SLP we simply need to extract all the elements from the 5419 vector (without reducing them), so we use scalar shifts. */ 5420 else if (reduc_fn != IFN_LAST && !slp_reduc) 5421 { 5422 tree tmp; 5423 tree vec_elem_type; 5424 5425 /* Case 1: Create: 5426 v_out2 = reduc_expr <v_out1> */ 5427 5428 if (dump_enabled_p ()) 5429 dump_printf_loc (MSG_NOTE, vect_location, 5430 "Reduce using direct vector reduction.\n"); 5431 5432 vec_elem_type = TREE_TYPE (TREE_TYPE (new_phi_result)); 5433 if (!useless_type_conversion_p (scalar_type, vec_elem_type)) 5434 { 5435 tree tmp_dest 5436 = vect_create_destination_var (scalar_dest, vec_elem_type); 5437 epilog_stmt = gimple_build_call_internal (reduc_fn, 1, 5438 new_phi_result); 5439 gimple_set_lhs (epilog_stmt, tmp_dest); 5440 new_temp = make_ssa_name (tmp_dest, epilog_stmt); 5441 gimple_set_lhs (epilog_stmt, new_temp); 5442 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT); 5443 5444 epilog_stmt = gimple_build_assign (new_scalar_dest, NOP_EXPR, 5445 new_temp); 5446 } 5447 else 5448 { 5449 epilog_stmt = gimple_build_call_internal (reduc_fn, 1, 5450 new_phi_result); 5451 gimple_set_lhs (epilog_stmt, new_scalar_dest); 5452 } 5453 5454 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt); 5455 gimple_set_lhs (epilog_stmt, new_temp); 5456 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT); 5457 5458 if ((STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) 5459 == INTEGER_INDUC_COND_REDUCTION) 5460 && !operand_equal_p (initial_def, induc_val, 0)) 5461 { 5462 /* Earlier we set the initial value to be a vector if induc_val 5463 values. 
Check the result and if it is induc_val then replace
5464 with the original initial value, unless induc_val is
5465 the same as initial_def already. */
5466 tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp,
5467 induc_val);
5468
5469 tmp = make_ssa_name (new_scalar_dest);
5470 epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
5471 initial_def, new_temp);
5472 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5473 new_temp = tmp;
5474 }
5475
5476 scalar_results.safe_push (new_temp);
5477 }
5478 else if (direct_slp_reduc)
5479 {
5480 /* Here we create one vector for each of the GROUP_SIZE results,
5481 with the elements for other SLP statements replaced with the
5482 neutral value. We can then do a normal reduction on each vector. */
5483
5484 /* Enforced by vectorizable_reduction. */
5485 gcc_assert (new_phis.length () == 1);
5486 gcc_assert (pow2p_hwi (group_size));
5487
5488 slp_tree orig_phis_slp_node = slp_node_instance->reduc_phis;
5489 vec<gimple *> orig_phis = SLP_TREE_SCALAR_STMTS (orig_phis_slp_node);
5490 gimple_seq seq = NULL;
5491
5492 /* Build a vector {0, 1, 2, ...}, with the same number of elements
5493 and the same element size as VECTYPE. */
5494 tree index = build_index_vector (vectype, 0, 1);
5495 tree index_type = TREE_TYPE (index);
5496 tree index_elt_type = TREE_TYPE (index_type);
5497 tree mask_type = build_same_sized_truth_vector_type (index_type);
5498
5499 /* Create a vector that, for each element, identifies which of
5500 the GROUP_SIZE results should use it. */
5501 tree index_mask = build_int_cst (index_elt_type, group_size - 1);
5502 index = gimple_build (&seq, BIT_AND_EXPR, index_type, index,
5503 build_vector_from_val (index_type, index_mask));
5504
5505 /* Get a neutral vector value. This is simply a splat of the neutral
5506 scalar value if we have one, otherwise the initial scalar value
5507 is itself a neutral value. */
5508 tree vector_identity = NULL_TREE;
5509 if (neutral_op)
5510 vector_identity = gimple_build_vector_from_val (&seq, vectype,
5511 neutral_op);
5512 for (unsigned int i = 0; i < group_size; ++i)
5513 {
5514 /* If there's no universal neutral value, we can use the
5515 initial scalar value from the original PHI. This is used
5516 for MIN and MAX reduction, for example. */
5517 if (!neutral_op)
5518 {
5519 tree scalar_value
5520 = PHI_ARG_DEF_FROM_EDGE (orig_phis[i],
5521 loop_preheader_edge (loop));
5522 vector_identity = gimple_build_vector_from_val (&seq, vectype,
5523 scalar_value);
5524 }
5525
5526 /* Calculate the equivalent of:
5527
5528 sel[j] = (index[j] == i);
5529
5530 which selects the elements of NEW_PHI_RESULT that should
5531 be included in the result. */
5532 tree compare_val = build_int_cst (index_elt_type, i);
5533 compare_val = build_vector_from_val (index_type, compare_val);
5534 tree sel = gimple_build (&seq, EQ_EXPR, mask_type,
5535 index, compare_val);
5536
5537 /* Calculate the equivalent of:
5538
5539 vec = sel ? new_phi_result : vector_identity;
5540
5541 VEC is now suitable for a full vector reduction. */
5542 tree vec = gimple_build (&seq, VEC_COND_EXPR, vectype,
5543 sel, new_phi_result, vector_identity);
5544
5545 /* Do the reduction and convert it to the appropriate type.
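That is, compute scalar_results[i] = reduc_fn <vec>, converting the
     result back to SCALAR_TYPE if the reduction was carried out in a
     different type.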
*/ 5546 gcall *call = gimple_build_call_internal (reduc_fn, 1, vec); 5547 tree scalar = make_ssa_name (TREE_TYPE (vectype)); 5548 gimple_call_set_lhs (call, scalar); 5549 gimple_seq_add_stmt (&seq, call); 5550 scalar = gimple_convert (&seq, scalar_type, scalar); 5551 scalar_results.safe_push (scalar); 5552 } 5553 gsi_insert_seq_before (&exit_gsi, seq, GSI_SAME_STMT); 5554 } 5555 else 5556 { 5557 bool reduce_with_shift; 5558 tree vec_temp; 5559 5560 /* COND reductions all do the final reduction with MAX_EXPR 5561 or MIN_EXPR. */ 5562 if (code == COND_EXPR) 5563 { 5564 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) 5565 == INTEGER_INDUC_COND_REDUCTION) 5566 code = induc_code; 5567 else if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) 5568 == CONST_COND_REDUCTION) 5569 code = STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info); 5570 else 5571 code = MAX_EXPR; 5572 } 5573 5574 /* See if the target wants to do the final (shift) reduction 5575 in a vector mode of smaller size and first reduce upper/lower 5576 halves against each other. */ 5577 enum machine_mode mode1 = mode; 5578 tree vectype1 = vectype; 5579 unsigned sz = tree_to_uhwi (TYPE_SIZE_UNIT (vectype)); 5580 unsigned sz1 = sz; 5581 if (!slp_reduc 5582 && (mode1 = targetm.vectorize.split_reduction (mode)) != mode) 5583 sz1 = GET_MODE_SIZE (mode1).to_constant (); 5584 5585 vectype1 = get_vectype_for_scalar_type_and_size (scalar_type, sz1); 5586 reduce_with_shift = have_whole_vector_shift (mode1); 5587 if (!VECTOR_MODE_P (mode1)) 5588 reduce_with_shift = false; 5589 else 5590 { 5591 optab optab = optab_for_tree_code (code, vectype1, optab_default); 5592 if (optab_handler (optab, mode1) == CODE_FOR_nothing) 5593 reduce_with_shift = false; 5594 } 5595 5596 /* First reduce the vector to the desired vector size we should 5597 do shift reduction on by combining upper and lower halves. */ 5598 new_temp = new_phi_result; 5599 while (sz > sz1) 5600 { 5601 gcc_assert (!slp_reduc); 5602 sz /= 2; 5603 vectype1 = get_vectype_for_scalar_type_and_size (scalar_type, sz); 5604 5605 /* The target has to make sure we support lowpart/highpart 5606 extraction, either via direct vector extract or through 5607 an integer mode punning. */ 5608 tree dst1, dst2; 5609 if (convert_optab_handler (vec_extract_optab, 5610 TYPE_MODE (TREE_TYPE (new_temp)), 5611 TYPE_MODE (vectype1)) 5612 != CODE_FOR_nothing) 5613 { 5614 /* Extract sub-vectors directly once vec_extract becomes 5615 a conversion optab. */ 5616 dst1 = make_ssa_name (vectype1); 5617 epilog_stmt 5618 = gimple_build_assign (dst1, BIT_FIELD_REF, 5619 build3 (BIT_FIELD_REF, vectype1, 5620 new_temp, TYPE_SIZE (vectype1), 5621 bitsize_int (0))); 5622 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT); 5623 dst2 = make_ssa_name (vectype1); 5624 epilog_stmt 5625 = gimple_build_assign (dst2, BIT_FIELD_REF, 5626 build3 (BIT_FIELD_REF, vectype1, 5627 new_temp, TYPE_SIZE (vectype1), 5628 bitsize_int (sz * BITS_PER_UNIT))); 5629 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT); 5630 } 5631 else 5632 { 5633 /* Extract via punning to appropriately sized integer mode 5634 vector. 
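For example (purely for illustration), to split a V4SI accumulator
     into two V2SI halves on a target without a direct V4SI -> V2SI
     vec_extract pattern, we view the V4SI value as a two-element vector
     of 64-bit integers, extract the two 64-bit halves with
     BIT_FIELD_REF, and view-convert each half back to V2SI.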
*/ 5635 tree eltype = build_nonstandard_integer_type (sz * BITS_PER_UNIT, 5636 1); 5637 tree etype = build_vector_type (eltype, 2); 5638 gcc_assert (convert_optab_handler (vec_extract_optab, 5639 TYPE_MODE (etype), 5640 TYPE_MODE (eltype)) 5641 != CODE_FOR_nothing); 5642 tree tem = make_ssa_name (etype); 5643 epilog_stmt = gimple_build_assign (tem, VIEW_CONVERT_EXPR, 5644 build1 (VIEW_CONVERT_EXPR, 5645 etype, new_temp)); 5646 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT); 5647 new_temp = tem; 5648 tem = make_ssa_name (eltype); 5649 epilog_stmt 5650 = gimple_build_assign (tem, BIT_FIELD_REF, 5651 build3 (BIT_FIELD_REF, eltype, 5652 new_temp, TYPE_SIZE (eltype), 5653 bitsize_int (0))); 5654 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT); 5655 dst1 = make_ssa_name (vectype1); 5656 epilog_stmt = gimple_build_assign (dst1, VIEW_CONVERT_EXPR, 5657 build1 (VIEW_CONVERT_EXPR, 5658 vectype1, tem)); 5659 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT); 5660 tem = make_ssa_name (eltype); 5661 epilog_stmt 5662 = gimple_build_assign (tem, BIT_FIELD_REF, 5663 build3 (BIT_FIELD_REF, eltype, 5664 new_temp, TYPE_SIZE (eltype), 5665 bitsize_int (sz * BITS_PER_UNIT))); 5666 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT); 5667 dst2 = make_ssa_name (vectype1); 5668 epilog_stmt = gimple_build_assign (dst2, VIEW_CONVERT_EXPR, 5669 build1 (VIEW_CONVERT_EXPR, 5670 vectype1, tem)); 5671 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT); 5672 } 5673 5674 new_temp = make_ssa_name (vectype1); 5675 epilog_stmt = gimple_build_assign (new_temp, code, dst1, dst2); 5676 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT); 5677 } 5678 5679 if (reduce_with_shift && !slp_reduc) 5680 { 5681 int element_bitsize = tree_to_uhwi (bitsize); 5682 /* Enforced by vectorizable_reduction, which disallows SLP reductions 5683 for variable-length vectors and also requires direct target support 5684 for loop reductions. */ 5685 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1)); 5686 int nelements = vec_size_in_bits / element_bitsize; 5687 vec_perm_builder sel; 5688 vec_perm_indices indices; 5689 5690 int elt_offset; 5691 5692 tree zero_vec = build_zero_cst (vectype1); 5693 /* Case 2: Create: 5694 for (offset = nelements/2; offset >= 1; offset/=2) 5695 { 5696 Create: va' = vec_shift <va, offset> 5697 Create: va = vop <va, va'> 5698 } */ 5699 5700 tree rhs; 5701 5702 if (dump_enabled_p ()) 5703 dump_printf_loc (MSG_NOTE, vect_location, 5704 "Reduce using vector shifts\n"); 5705 5706 mode1 = TYPE_MODE (vectype1); 5707 vec_dest = vect_create_destination_var (scalar_dest, vectype1); 5708 for (elt_offset = nelements / 2; 5709 elt_offset >= 1; 5710 elt_offset /= 2) 5711 { 5712 calc_vec_perm_mask_for_shift (elt_offset, nelements, &sel); 5713 indices.new_vector (sel, 2, nelements); 5714 tree mask = vect_gen_perm_mask_any (vectype1, indices); 5715 epilog_stmt = gimple_build_assign (vec_dest, VEC_PERM_EXPR, 5716 new_temp, zero_vec, mask); 5717 new_name = make_ssa_name (vec_dest, epilog_stmt); 5718 gimple_assign_set_lhs (epilog_stmt, new_name); 5719 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT); 5720 5721 epilog_stmt = gimple_build_assign (vec_dest, code, new_name, 5722 new_temp); 5723 new_temp = make_ssa_name (vec_dest, epilog_stmt); 5724 gimple_assign_set_lhs (epilog_stmt, new_temp); 5725 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT); 5726 } 5727 5728 /* 2.4 Extract the final scalar result. 
Create: 5729 s_out3 = extract_field <v_out2, bitpos> */ 5730 5731 if (dump_enabled_p ()) 5732 dump_printf_loc (MSG_NOTE, vect_location, 5733 "extract scalar result\n"); 5734 5735 rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp, 5736 bitsize, bitsize_zero_node); 5737 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs); 5738 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt); 5739 gimple_assign_set_lhs (epilog_stmt, new_temp); 5740 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT); 5741 scalar_results.safe_push (new_temp); 5742 } 5743 else 5744 { 5745 /* Case 3: Create: 5746 s = extract_field <v_out2, 0> 5747 for (offset = element_size; 5748 offset < vector_size; 5749 offset += element_size;) 5750 { 5751 Create: s' = extract_field <v_out2, offset> 5752 Create: s = op <s, s'> // For non SLP cases 5753 } */ 5754 5755 if (dump_enabled_p ()) 5756 dump_printf_loc (MSG_NOTE, vect_location, 5757 "Reduce using scalar code.\n"); 5758 5759 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1)); 5760 int element_bitsize = tree_to_uhwi (bitsize); 5761 FOR_EACH_VEC_ELT (new_phis, i, new_phi) 5762 { 5763 int bit_offset; 5764 if (gimple_code (new_phi) == GIMPLE_PHI) 5765 vec_temp = PHI_RESULT (new_phi); 5766 else 5767 vec_temp = gimple_assign_lhs (new_phi); 5768 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize, 5769 bitsize_zero_node); 5770 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs); 5771 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt); 5772 gimple_assign_set_lhs (epilog_stmt, new_temp); 5773 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT); 5774 5775 /* In SLP we don't need to apply reduction operation, so we just 5776 collect s' values in SCALAR_RESULTS. */ 5777 if (slp_reduc) 5778 scalar_results.safe_push (new_temp); 5779 5780 for (bit_offset = element_bitsize; 5781 bit_offset < vec_size_in_bits; 5782 bit_offset += element_bitsize) 5783 { 5784 tree bitpos = bitsize_int (bit_offset); 5785 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, 5786 bitsize, bitpos); 5787 5788 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs); 5789 new_name = make_ssa_name (new_scalar_dest, epilog_stmt); 5790 gimple_assign_set_lhs (epilog_stmt, new_name); 5791 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT); 5792 5793 if (slp_reduc) 5794 { 5795 /* In SLP we don't need to apply reduction operation, so 5796 we just collect s' values in SCALAR_RESULTS. */ 5797 new_temp = new_name; 5798 scalar_results.safe_push (new_name); 5799 } 5800 else 5801 { 5802 epilog_stmt = gimple_build_assign (new_scalar_dest, code, 5803 new_name, new_temp); 5804 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt); 5805 gimple_assign_set_lhs (epilog_stmt, new_temp); 5806 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT); 5807 } 5808 } 5809 } 5810 5811 /* The only case where we need to reduce scalar results in SLP, is 5812 unrolling. If the size of SCALAR_RESULTS is greater than 5813 GROUP_SIZE, we reduce them combining elements modulo 5814 GROUP_SIZE. */ 5815 if (slp_reduc) 5816 { 5817 tree res, first_res, new_res; 5818 gimple *new_stmt; 5819 5820 /* Reduce multiple scalar results in case of SLP unrolling. 
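For instance (illustrative only), with GROUP_SIZE == 2 and four
     extracted scalars s0, s1, s2, s3 the loop below produces
     scalar_results[0] = s0 OP s2 and scalar_results[1] = s1 OP s3.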
*/ 5821 for (j = group_size; scalar_results.iterate (j, &res); 5822 j++) 5823 { 5824 first_res = scalar_results[j % group_size]; 5825 new_stmt = gimple_build_assign (new_scalar_dest, code, 5826 first_res, res); 5827 new_res = make_ssa_name (new_scalar_dest, new_stmt); 5828 gimple_assign_set_lhs (new_stmt, new_res); 5829 gsi_insert_before (&exit_gsi, new_stmt, GSI_SAME_STMT); 5830 scalar_results[j % group_size] = new_res; 5831 } 5832 } 5833 else 5834 /* Not SLP - we have one scalar to keep in SCALAR_RESULTS. */ 5835 scalar_results.safe_push (new_temp); 5836 } 5837 5838 if ((STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) 5839 == INTEGER_INDUC_COND_REDUCTION) 5840 && !operand_equal_p (initial_def, induc_val, 0)) 5841 { 5842 /* Earlier we set the initial value to be a vector if induc_val 5843 values. Check the result and if it is induc_val then replace 5844 with the original initial value, unless induc_val is 5845 the same as initial_def already. */ 5846 tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp, 5847 induc_val); 5848 5849 tree tmp = make_ssa_name (new_scalar_dest); 5850 epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare, 5851 initial_def, new_temp); 5852 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT); 5853 scalar_results[0] = tmp; 5854 } 5855 } 5856 5857 vect_finalize_reduction: 5858 5859 if (double_reduc) 5860 loop = loop->inner; 5861 5862 /* 2.5 Adjust the final result by the initial value of the reduction 5863 variable. (When such adjustment is not needed, then 5864 'adjustment_def' is zero). For example, if code is PLUS we create: 5865 new_temp = loop_exit_def + adjustment_def */ 5866 5867 if (adjustment_def) 5868 { 5869 gcc_assert (!slp_reduc); 5870 if (nested_in_vect_loop) 5871 { 5872 new_phi = new_phis[0]; 5873 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) == VECTOR_TYPE); 5874 expr = build2 (code, vectype, PHI_RESULT (new_phi), adjustment_def); 5875 new_dest = vect_create_destination_var (scalar_dest, vectype); 5876 } 5877 else 5878 { 5879 new_temp = scalar_results[0]; 5880 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE); 5881 expr = build2 (code, scalar_type, new_temp, adjustment_def); 5882 new_dest = vect_create_destination_var (scalar_dest, scalar_type); 5883 } 5884 5885 epilog_stmt = gimple_build_assign (new_dest, expr); 5886 new_temp = make_ssa_name (new_dest, epilog_stmt); 5887 gimple_assign_set_lhs (epilog_stmt, new_temp); 5888 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT); 5889 if (nested_in_vect_loop) 5890 { 5891 set_vinfo_for_stmt (epilog_stmt, 5892 new_stmt_vec_info (epilog_stmt, loop_vinfo)); 5893 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (epilog_stmt)) = 5894 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (new_phi)); 5895 5896 if (!double_reduc) 5897 scalar_results.quick_push (new_temp); 5898 else 5899 scalar_results[0] = new_temp; 5900 } 5901 else 5902 scalar_results[0] = new_temp; 5903 5904 new_phis[0] = epilog_stmt; 5905 } 5906 5907 /* 2.6 Handle the loop-exit phis. Replace the uses of scalar loop-exit 5908 phis with new adjusted scalar results, i.e., replace use <s_out0> 5909 with use <s_out4>. 
5910 5911 Transform: 5912 loop_exit: 5913 s_out0 = phi <s_loop> # (scalar) EXIT_PHI 5914 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI 5915 v_out2 = reduce <v_out1> 5916 s_out3 = extract_field <v_out2, 0> 5917 s_out4 = adjust_result <s_out3> 5918 use <s_out0> 5919 use <s_out0> 5920 5921 into: 5922 5923 loop_exit: 5924 s_out0 = phi <s_loop> # (scalar) EXIT_PHI 5925 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI 5926 v_out2 = reduce <v_out1> 5927 s_out3 = extract_field <v_out2, 0> 5928 s_out4 = adjust_result <s_out3> 5929 use <s_out4> 5930 use <s_out4> */ 5931 5932 5933 /* In SLP reduction chain we reduce vector results into one vector if 5934 necessary, hence we set here GROUP_SIZE to 1. SCALAR_DEST is the LHS of 5935 the last stmt in the reduction chain, since we are looking for the loop 5936 exit phi node. */ 5937 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt))) 5938 { 5939 gimple *dest_stmt = SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1]; 5940 /* Handle reduction patterns. */ 5941 if (STMT_VINFO_RELATED_STMT (vinfo_for_stmt (dest_stmt))) 5942 dest_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (dest_stmt)); 5943 5944 scalar_dest = gimple_assign_lhs (dest_stmt); 5945 group_size = 1; 5946 } 5947 5948 /* In SLP we may have several statements in NEW_PHIS and REDUCTION_PHIS (in 5949 case that GROUP_SIZE is greater than vectorization factor). Therefore, we 5950 need to match SCALAR_RESULTS with corresponding statements. The first 5951 (GROUP_SIZE / number of new vector stmts) scalar results correspond to 5952 the first vector stmt, etc. 5953 (RATIO is equal to (GROUP_SIZE / number of new vector stmts)). */ 5954 if (group_size > new_phis.length ()) 5955 { 5956 ratio = group_size / new_phis.length (); 5957 gcc_assert (!(group_size % new_phis.length ())); 5958 } 5959 else 5960 ratio = 1; 5961 5962 for (k = 0; k < group_size; k++) 5963 { 5964 if (k % ratio == 0) 5965 { 5966 epilog_stmt = new_phis[k / ratio]; 5967 reduction_phi = reduction_phis[k / ratio]; 5968 if (double_reduc) 5969 inner_phi = inner_phis[k / ratio]; 5970 } 5971 5972 if (slp_reduc) 5973 { 5974 gimple *current_stmt = SLP_TREE_SCALAR_STMTS (slp_node)[k]; 5975 5976 orig_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (current_stmt)); 5977 /* SLP statements can't participate in patterns. */ 5978 gcc_assert (!orig_stmt); 5979 scalar_dest = gimple_assign_lhs (current_stmt); 5980 } 5981 5982 phis.create (3); 5983 /* Find the loop-closed-use at the loop exit of the original scalar 5984 result. (The reduction result is expected to have two immediate uses - 5985 one at the latch block, and one at the loop exit). */ 5986 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest) 5987 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))) 5988 && !is_gimple_debug (USE_STMT (use_p))) 5989 phis.safe_push (USE_STMT (use_p)); 5990 5991 /* While we expect to have found an exit_phi because of loop-closed-ssa 5992 form we can end up without one if the scalar cycle is dead. */ 5993 5994 FOR_EACH_VEC_ELT (phis, i, exit_phi) 5995 { 5996 if (outer_loop) 5997 { 5998 stmt_vec_info exit_phi_vinfo = vinfo_for_stmt (exit_phi); 5999 gphi *vect_phi; 6000 6001 /* FORNOW. Currently not supporting the case that an inner-loop 6002 reduction is not used in the outer-loop (but only outside the 6003 outer-loop), unless it is double reduction. 
*/ 6004 gcc_assert ((STMT_VINFO_RELEVANT_P (exit_phi_vinfo) 6005 && !STMT_VINFO_LIVE_P (exit_phi_vinfo)) 6006 || double_reduc); 6007 6008 if (double_reduc) 6009 STMT_VINFO_VEC_STMT (exit_phi_vinfo) = inner_phi; 6010 else 6011 STMT_VINFO_VEC_STMT (exit_phi_vinfo) = epilog_stmt; 6012 if (!double_reduc 6013 || STMT_VINFO_DEF_TYPE (exit_phi_vinfo) 6014 != vect_double_reduction_def) 6015 continue; 6016 6017 /* Handle double reduction: 6018 6019 stmt1: s1 = phi <s0, s2> - double reduction phi (outer loop) 6020 stmt2: s3 = phi <s1, s4> - (regular) reduc phi (inner loop) 6021 stmt3: s4 = use (s3) - (regular) reduc stmt (inner loop) 6022 stmt4: s2 = phi <s4> - double reduction stmt (outer loop) 6023 6024 At that point the regular reduction (stmt2 and stmt3) is 6025 already vectorized, as well as the exit phi node, stmt4. 6026 Here we vectorize the phi node of double reduction, stmt1, and 6027 update all relevant statements. */ 6028 6029 /* Go through all the uses of s2 to find double reduction phi 6030 node, i.e., stmt1 above. */ 6031 orig_name = PHI_RESULT (exit_phi); 6032 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name) 6033 { 6034 stmt_vec_info use_stmt_vinfo; 6035 stmt_vec_info new_phi_vinfo; 6036 tree vect_phi_init, preheader_arg, vect_phi_res; 6037 basic_block bb = gimple_bb (use_stmt); 6038 gimple *use; 6039 6040 /* Check that USE_STMT is really double reduction phi 6041 node. */ 6042 if (gimple_code (use_stmt) != GIMPLE_PHI 6043 || gimple_phi_num_args (use_stmt) != 2 6044 || bb->loop_father != outer_loop) 6045 continue; 6046 use_stmt_vinfo = vinfo_for_stmt (use_stmt); 6047 if (!use_stmt_vinfo 6048 || STMT_VINFO_DEF_TYPE (use_stmt_vinfo) 6049 != vect_double_reduction_def) 6050 continue; 6051 6052 /* Create vector phi node for double reduction: 6053 vs1 = phi <vs0, vs2> 6054 vs1 was created previously in this function by a call to 6055 vect_get_vec_def_for_operand and is stored in 6056 vec_initial_def; 6057 vs2 is defined by INNER_PHI, the vectorized EXIT_PHI; 6058 vs0 is created here. */ 6059 6060 /* Create vector phi node. */ 6061 vect_phi = create_phi_node (vec_initial_def, bb); 6062 new_phi_vinfo = new_stmt_vec_info (vect_phi, 6063 loop_vec_info_for_loop (outer_loop)); 6064 set_vinfo_for_stmt (vect_phi, new_phi_vinfo); 6065 6066 /* Create vs0 - initial def of the double reduction phi. */ 6067 preheader_arg = PHI_ARG_DEF_FROM_EDGE (use_stmt, 6068 loop_preheader_edge (outer_loop)); 6069 vect_phi_init = get_initial_def_for_reduction 6070 (stmt, preheader_arg, NULL); 6071 6072 /* Update phi node arguments with vs0 and vs2. */ 6073 add_phi_arg (vect_phi, vect_phi_init, 6074 loop_preheader_edge (outer_loop), 6075 UNKNOWN_LOCATION); 6076 add_phi_arg (vect_phi, PHI_RESULT (inner_phi), 6077 loop_latch_edge (outer_loop), UNKNOWN_LOCATION); 6078 if (dump_enabled_p ()) 6079 { 6080 dump_printf_loc (MSG_NOTE, vect_location, 6081 "created double reduction phi node: "); 6082 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, vect_phi, 0); 6083 } 6084 6085 vect_phi_res = PHI_RESULT (vect_phi); 6086 6087 /* Replace the use, i.e., set the correct vs1 in the regular 6088 reduction phi node. FORNOW, NCOPIES is always 1, so the 6089 loop is redundant. 
*/ 6090 use = reduction_phi; 6091 for (j = 0; j < ncopies; j++) 6092 { 6093 edge pr_edge = loop_preheader_edge (loop); 6094 SET_PHI_ARG_DEF (use, pr_edge->dest_idx, vect_phi_res); 6095 use = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (use)); 6096 } 6097 } 6098 } 6099 } 6100 6101 phis.release (); 6102 if (nested_in_vect_loop) 6103 { 6104 if (double_reduc) 6105 loop = outer_loop; 6106 else 6107 continue; 6108 } 6109 6110 phis.create (3); 6111 /* Find the loop-closed-use at the loop exit of the original scalar 6112 result. (The reduction result is expected to have two immediate uses, 6113 one at the latch block, and one at the loop exit). For double 6114 reductions we are looking for exit phis of the outer loop. */ 6115 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest) 6116 { 6117 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p)))) 6118 { 6119 if (!is_gimple_debug (USE_STMT (use_p))) 6120 phis.safe_push (USE_STMT (use_p)); 6121 } 6122 else 6123 { 6124 if (double_reduc && gimple_code (USE_STMT (use_p)) == GIMPLE_PHI) 6125 { 6126 tree phi_res = PHI_RESULT (USE_STMT (use_p)); 6127 6128 FOR_EACH_IMM_USE_FAST (phi_use_p, phi_imm_iter, phi_res) 6129 { 6130 if (!flow_bb_inside_loop_p (loop, 6131 gimple_bb (USE_STMT (phi_use_p))) 6132 && !is_gimple_debug (USE_STMT (phi_use_p))) 6133 phis.safe_push (USE_STMT (phi_use_p)); 6134 } 6135 } 6136 } 6137 } 6138 6139 FOR_EACH_VEC_ELT (phis, i, exit_phi) 6140 { 6141 /* Replace the uses: */ 6142 orig_name = PHI_RESULT (exit_phi); 6143 scalar_result = scalar_results[k]; 6144 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name) 6145 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter) 6146 SET_USE (use_p, scalar_result); 6147 } 6148 6149 phis.release (); 6150 } 6151 } 6152 6153 /* Return a vector of type VECTYPE that is equal to the vector select 6154 operation "MASK ? VEC : IDENTITY". Insert the select statements 6155 before GSI. */ 6156 6157 static tree 6158 merge_with_identity (gimple_stmt_iterator *gsi, tree mask, tree vectype, 6159 tree vec, tree identity) 6160 { 6161 tree cond = make_temp_ssa_name (vectype, NULL, "cond"); 6162 gimple *new_stmt = gimple_build_assign (cond, VEC_COND_EXPR, 6163 mask, vec, identity); 6164 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT); 6165 return cond; 6166 } 6167 6168 /* Successively apply CODE to each element of VECTOR_RHS, in left-to-right 6169 order, starting with LHS. Insert the extraction statements before GSI and 6170 associate the new scalar SSA names with variable SCALAR_DEST. 6171 Return the SSA name for the result. 
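For example (a sketch, assuming a 4-element VECTOR_RHS and
   CODE == PLUS_EXPR), the generated sequence is roughly:

     e0 = BIT_FIELD_REF <vector_rhs, 0>;  t0 = lhs + e0;
     e1 = BIT_FIELD_REF <vector_rhs, 1>;  t1 = t0 + e1;
     e2 = BIT_FIELD_REF <vector_rhs, 2>;  t2 = t1 + e2;
     e3 = BIT_FIELD_REF <vector_rhs, 3>;  t3 = t2 + e3;

   and t3 is returned.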
*/ 6172 6173 static tree 6174 vect_expand_fold_left (gimple_stmt_iterator *gsi, tree scalar_dest, 6175 tree_code code, tree lhs, tree vector_rhs) 6176 { 6177 tree vectype = TREE_TYPE (vector_rhs); 6178 tree scalar_type = TREE_TYPE (vectype); 6179 tree bitsize = TYPE_SIZE (scalar_type); 6180 unsigned HOST_WIDE_INT vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype)); 6181 unsigned HOST_WIDE_INT element_bitsize = tree_to_uhwi (bitsize); 6182 6183 for (unsigned HOST_WIDE_INT bit_offset = 0; 6184 bit_offset < vec_size_in_bits; 6185 bit_offset += element_bitsize) 6186 { 6187 tree bitpos = bitsize_int (bit_offset); 6188 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vector_rhs, 6189 bitsize, bitpos); 6190 6191 gassign *stmt = gimple_build_assign (scalar_dest, rhs); 6192 rhs = make_ssa_name (scalar_dest, stmt); 6193 gimple_assign_set_lhs (stmt, rhs); 6194 gsi_insert_before (gsi, stmt, GSI_SAME_STMT); 6195 6196 stmt = gimple_build_assign (scalar_dest, code, lhs, rhs); 6197 tree new_name = make_ssa_name (scalar_dest, stmt); 6198 gimple_assign_set_lhs (stmt, new_name); 6199 gsi_insert_before (gsi, stmt, GSI_SAME_STMT); 6200 lhs = new_name; 6201 } 6202 return lhs; 6203 } 6204 6205 /* Perform an in-order reduction (FOLD_LEFT_REDUCTION). STMT is the 6206 statement that sets the live-out value. REDUC_DEF_STMT is the phi 6207 statement. CODE is the operation performed by STMT and OPS are 6208 its scalar operands. REDUC_INDEX is the index of the operand in 6209 OPS that is set by REDUC_DEF_STMT. REDUC_FN is the function that 6210 implements in-order reduction, or IFN_LAST if we should open-code it. 6211 VECTYPE_IN is the type of the vector input. MASKS specifies the masks 6212 that should be used to control the operation in a fully-masked loop. */ 6213 6214 static bool 6215 vectorize_fold_left_reduction (gimple *stmt, gimple_stmt_iterator *gsi, 6216 gimple **vec_stmt, slp_tree slp_node, 6217 gimple *reduc_def_stmt, 6218 tree_code code, internal_fn reduc_fn, 6219 tree ops[3], tree vectype_in, 6220 int reduc_index, vec_loop_masks *masks) 6221 { 6222 stmt_vec_info stmt_info = vinfo_for_stmt (stmt); 6223 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info); 6224 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo); 6225 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info); 6226 gimple *new_stmt = NULL; 6227 6228 int ncopies; 6229 if (slp_node) 6230 ncopies = 1; 6231 else 6232 ncopies = vect_get_num_copies (loop_vinfo, vectype_in); 6233 6234 gcc_assert (!nested_in_vect_loop_p (loop, stmt)); 6235 gcc_assert (ncopies == 1); 6236 gcc_assert (TREE_CODE_LENGTH (code) == binary_op); 6237 gcc_assert (reduc_index == (code == MINUS_EXPR ? 
0 : 1)); 6238 gcc_assert (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) 6239 == FOLD_LEFT_REDUCTION); 6240 6241 if (slp_node) 6242 gcc_assert (known_eq (TYPE_VECTOR_SUBPARTS (vectype_out), 6243 TYPE_VECTOR_SUBPARTS (vectype_in))); 6244 6245 tree op0 = ops[1 - reduc_index]; 6246 6247 int group_size = 1; 6248 gimple *scalar_dest_def; 6249 auto_vec<tree> vec_oprnds0; 6250 if (slp_node) 6251 { 6252 vect_get_vec_defs (op0, NULL_TREE, stmt, &vec_oprnds0, NULL, slp_node); 6253 group_size = SLP_TREE_SCALAR_STMTS (slp_node).length (); 6254 scalar_dest_def = SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1]; 6255 } 6256 else 6257 { 6258 tree loop_vec_def0 = vect_get_vec_def_for_operand (op0, stmt); 6259 vec_oprnds0.create (1); 6260 vec_oprnds0.quick_push (loop_vec_def0); 6261 scalar_dest_def = stmt; 6262 } 6263 6264 tree scalar_dest = gimple_assign_lhs (scalar_dest_def); 6265 tree scalar_type = TREE_TYPE (scalar_dest); 6266 tree reduc_var = gimple_phi_result (reduc_def_stmt); 6267 6268 int vec_num = vec_oprnds0.length (); 6269 gcc_assert (vec_num == 1 || slp_node); 6270 tree vec_elem_type = TREE_TYPE (vectype_out); 6271 gcc_checking_assert (useless_type_conversion_p (scalar_type, vec_elem_type)); 6272 6273 tree vector_identity = NULL_TREE; 6274 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)) 6275 vector_identity = build_zero_cst (vectype_out); 6276 6277 tree scalar_dest_var = vect_create_destination_var (scalar_dest, NULL); 6278 int i; 6279 tree def0; 6280 FOR_EACH_VEC_ELT (vec_oprnds0, i, def0) 6281 { 6282 tree mask = NULL_TREE; 6283 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)) 6284 mask = vect_get_loop_mask (gsi, masks, vec_num, vectype_in, i); 6285 6286 /* Handle MINUS by adding the negative. */ 6287 if (reduc_fn != IFN_LAST && code == MINUS_EXPR) 6288 { 6289 tree negated = make_ssa_name (vectype_out); 6290 new_stmt = gimple_build_assign (negated, NEGATE_EXPR, def0); 6291 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT); 6292 def0 = negated; 6293 } 6294 6295 if (mask) 6296 def0 = merge_with_identity (gsi, mask, vectype_out, def0, 6297 vector_identity); 6298 6299 /* On the first iteration the input is simply the scalar phi 6300 result, and for subsequent iterations it is the output of 6301 the preceding operation. */ 6302 if (reduc_fn != IFN_LAST) 6303 { 6304 new_stmt = gimple_build_call_internal (reduc_fn, 2, reduc_var, def0); 6305 /* For chained SLP reductions the output of the previous reduction 6306 operation serves as the input of the next. For the final statement 6307 the output cannot be a temporary - we reuse the original 6308 scalar destination of the last statement. */ 6309 if (i != vec_num - 1) 6310 { 6311 gimple_set_lhs (new_stmt, scalar_dest_var); 6312 reduc_var = make_ssa_name (scalar_dest_var, new_stmt); 6313 gimple_set_lhs (new_stmt, reduc_var); 6314 } 6315 } 6316 else 6317 { 6318 reduc_var = vect_expand_fold_left (gsi, scalar_dest_var, code, 6319 reduc_var, def0); 6320 new_stmt = SSA_NAME_DEF_STMT (reduc_var); 6321 /* Remove the statement, so that we can use the same code paths 6322 as for statements that we've just created. 
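(vect_finish_replace_stmt or vect_finish_stmt_generation below will
	     insert it again in the appropriate place.)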
*/
6323 gimple_stmt_iterator tmp_gsi = gsi_for_stmt (new_stmt);
6324 gsi_remove (&tmp_gsi, true);
6325 }
6326
6327 if (i == vec_num - 1)
6328 {
6329 gimple_set_lhs (new_stmt, scalar_dest);
6330 vect_finish_replace_stmt (scalar_dest_def, new_stmt);
6331 }
6332 else
6333 vect_finish_stmt_generation (scalar_dest_def, new_stmt, gsi);
6334
6335 if (slp_node)
6336 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
6337 }
6338
6339 if (!slp_node)
6340 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
6341
6342 return true;
6343 }
6344
6345 /* Function is_nonwrapping_integer_induction.
6346
6347 Check if STMT (which is part of loop LOOP) both increments and
6348 does not cause overflow. */
6349
6350 static bool
6351 is_nonwrapping_integer_induction (gimple *stmt, struct loop *loop)
6352 {
6353 stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
6354 tree base = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo);
6355 tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo);
6356 tree lhs_type = TREE_TYPE (gimple_phi_result (stmt));
6357 widest_int ni, max_loop_value, lhs_max;
6358 bool overflow = false;
6359
6360 /* Make sure the loop is integer based. */
6361 if (TREE_CODE (base) != INTEGER_CST
6362 || TREE_CODE (step) != INTEGER_CST)
6363 return false;
6364
6365 /* Check that the max size of the loop will not wrap. */
6366
6367 if (TYPE_OVERFLOW_UNDEFINED (lhs_type))
6368 return true;
6369
6370 if (! max_stmt_executions (loop, &ni))
6371 return false;
6372
6373 max_loop_value = wi::mul (wi::to_widest (step), ni, TYPE_SIGN (lhs_type),
6374 &overflow);
6375 if (overflow)
6376 return false;
6377
6378 max_loop_value = wi::add (wi::to_widest (base), max_loop_value,
6379 TYPE_SIGN (lhs_type), &overflow);
6380 if (overflow)
6381 return false;
6382
6383 return (wi::min_precision (max_loop_value, TYPE_SIGN (lhs_type))
6384 <= TYPE_PRECISION (lhs_type));
6385 }
6386
6387 /* Function vectorizable_reduction.
6388
6389 Check if STMT performs a reduction operation that can be vectorized.
6390 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
6391 stmt to replace it, put it in VEC_STMT, and insert it at GSI.
6392 Return FALSE if not a vectorizable STMT, TRUE otherwise.
6393
6394 This function also handles reduction idioms (patterns) that have been
6395 recognized in advance during vect_pattern_recog. In this case, STMT may be
6396 of this form:
6397 X = pattern_expr (arg0, arg1, ..., X)
6398 and its STMT_VINFO_RELATED_STMT points to the last stmt in the original
6399 sequence that had been detected and replaced by the pattern-stmt (STMT).
6400
6401 This function also handles reduction of condition expressions, for example:
6402 for (int i = 0; i < N; i++)
6403 if (a[i] < value)
6404 last = a[i];
6405 This is handled by vectorizing the loop and creating an additional vector
6406 containing the loop indexes for which "a[i] < value" was true. In the
6407 function epilogue this is reduced to a single max value and then used to
6408 index into the vector of results.
6409
6410 In some cases of reduction patterns, the type of the reduction variable X is
6411 different than the type of the other arguments of STMT.
6412 In such cases, the vectype that is used when transforming STMT into a vector
6413 stmt is different than the vectype that is used to determine the
6414 vectorization factor, because it consists of a different number of elements
6415 than the actual number of elements that are being operated upon in parallel.
6416 6417 For example, consider an accumulation of shorts into an int accumulator. 6418 On some targets it's possible to vectorize this pattern operating on 8 6419 shorts at a time (hence, the vectype for purposes of determining the 6420 vectorization factor should be V8HI); on the other hand, the vectype that 6421 is used to create the vector form is actually V4SI (the type of the result). 6422 6423 Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that 6424 indicates what is the actual level of parallelism (V8HI in the example), so 6425 that the right vectorization factor would be derived. This vectype 6426 corresponds to the type of arguments to the reduction stmt, and should *NOT* 6427 be used to create the vectorized stmt. The right vectype for the vectorized 6428 stmt is obtained from the type of the result X: 6429 get_vectype_for_scalar_type (TREE_TYPE (X)) 6430 6431 This means that, contrary to "regular" reductions (or "regular" stmts in 6432 general), the following equation: 6433 STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (TREE_TYPE (X)) 6434 does *NOT* necessarily hold for reduction patterns. */ 6435 6436 bool 6437 vectorizable_reduction (gimple *stmt, gimple_stmt_iterator *gsi, 6438 gimple **vec_stmt, slp_tree slp_node, 6439 slp_instance slp_node_instance) 6440 { 6441 tree vec_dest; 6442 tree scalar_dest; 6443 stmt_vec_info stmt_info = vinfo_for_stmt (stmt); 6444 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info); 6445 tree vectype_in = NULL_TREE; 6446 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info); 6447 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo); 6448 enum tree_code code, orig_code; 6449 internal_fn reduc_fn; 6450 machine_mode vec_mode; 6451 int op_type; 6452 optab optab; 6453 tree new_temp = NULL_TREE; 6454 gimple *def_stmt; 6455 enum vect_def_type dt, cond_reduc_dt = vect_unknown_def_type; 6456 gimple *cond_reduc_def_stmt = NULL; 6457 enum tree_code cond_reduc_op_code = ERROR_MARK; 6458 tree scalar_type; 6459 bool is_simple_use; 6460 gimple *orig_stmt; 6461 stmt_vec_info orig_stmt_info = NULL; 6462 int i; 6463 int ncopies; 6464 int epilog_copies; 6465 stmt_vec_info prev_stmt_info, prev_phi_info; 6466 bool single_defuse_cycle = false; 6467 gimple *new_stmt = NULL; 6468 int j; 6469 tree ops[3]; 6470 enum vect_def_type dts[3]; 6471 bool nested_cycle = false, found_nested_cycle_def = false; 6472 bool double_reduc = false; 6473 basic_block def_bb; 6474 struct loop * def_stmt_loop, *outer_loop = NULL; 6475 tree def_arg; 6476 gimple *def_arg_stmt; 6477 auto_vec<tree> vec_oprnds0; 6478 auto_vec<tree> vec_oprnds1; 6479 auto_vec<tree> vec_oprnds2; 6480 auto_vec<tree> vect_defs; 6481 auto_vec<gimple *> phis; 6482 int vec_num; 6483 tree def0, tem; 6484 bool first_p = true; 6485 tree cr_index_scalar_type = NULL_TREE, cr_index_vector_type = NULL_TREE; 6486 tree cond_reduc_val = NULL_TREE; 6487 6488 /* Make sure it was already recognized as a reduction computation. */ 6489 if (STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmt)) != vect_reduction_def 6490 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmt)) != vect_nested_cycle) 6491 return false; 6492 6493 if (nested_in_vect_loop_p (loop, stmt)) 6494 { 6495 outer_loop = loop; 6496 loop = loop->inner; 6497 nested_cycle = true; 6498 } 6499 6500 /* In case of reduction chain we switch to the first stmt in the chain, but 6501 we don't update STMT_INFO, since only the last stmt is marked as reduction 6502 and has reduction properties. 
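E.g. in the chain

       a2 = a1 + b[i];
       a3 = a2 + c[i];

     only the last statement (a3) carries the reduction information,
     while GROUP_FIRST_ELEMENT is a2, the statement we switch to here.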
*/ 6503 if (GROUP_FIRST_ELEMENT (stmt_info) 6504 && GROUP_FIRST_ELEMENT (stmt_info) != stmt) 6505 { 6506 stmt = GROUP_FIRST_ELEMENT (stmt_info); 6507 first_p = false; 6508 } 6509 6510 if (gimple_code (stmt) == GIMPLE_PHI) 6511 { 6512 /* Analysis is fully done on the reduction stmt invocation. */ 6513 if (! vec_stmt) 6514 { 6515 if (slp_node) 6516 slp_node_instance->reduc_phis = slp_node; 6517 6518 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type; 6519 return true; 6520 } 6521 6522 if (STMT_VINFO_REDUC_TYPE (stmt_info) == FOLD_LEFT_REDUCTION) 6523 /* Leave the scalar phi in place. Note that checking 6524 STMT_VINFO_VEC_REDUCTION_TYPE (as below) only works 6525 for reductions involving a single statement. */ 6526 return true; 6527 6528 gimple *reduc_stmt = STMT_VINFO_REDUC_DEF (stmt_info); 6529 if (STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (reduc_stmt))) 6530 reduc_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (reduc_stmt)); 6531 6532 if (STMT_VINFO_VEC_REDUCTION_TYPE (vinfo_for_stmt (reduc_stmt)) 6533 == EXTRACT_LAST_REDUCTION) 6534 /* Leave the scalar phi in place. */ 6535 return true; 6536 6537 gcc_assert (is_gimple_assign (reduc_stmt)); 6538 for (unsigned k = 1; k < gimple_num_ops (reduc_stmt); ++k) 6539 { 6540 tree op = gimple_op (reduc_stmt, k); 6541 if (op == gimple_phi_result (stmt)) 6542 continue; 6543 if (k == 1 6544 && gimple_assign_rhs_code (reduc_stmt) == COND_EXPR) 6545 continue; 6546 if (!vectype_in 6547 || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in))) 6548 < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (op))))) 6549 vectype_in = get_vectype_for_scalar_type (TREE_TYPE (op)); 6550 break; 6551 } 6552 gcc_assert (vectype_in); 6553 6554 if (slp_node) 6555 ncopies = 1; 6556 else 6557 ncopies = vect_get_num_copies (loop_vinfo, vectype_in); 6558 6559 use_operand_p use_p; 6560 gimple *use_stmt; 6561 if (ncopies > 1 6562 && (STMT_VINFO_RELEVANT (vinfo_for_stmt (reduc_stmt)) 6563 <= vect_used_only_live) 6564 && single_imm_use (gimple_phi_result (stmt), &use_p, &use_stmt) 6565 && (use_stmt == reduc_stmt 6566 || (STMT_VINFO_RELATED_STMT (vinfo_for_stmt (use_stmt)) 6567 == reduc_stmt))) 6568 single_defuse_cycle = true; 6569 6570 /* Create the destination vector */ 6571 scalar_dest = gimple_assign_lhs (reduc_stmt); 6572 vec_dest = vect_create_destination_var (scalar_dest, vectype_out); 6573 6574 if (slp_node) 6575 /* The size vect_schedule_slp_instance computes is off for us. */ 6576 vec_num = vect_get_num_vectors 6577 (LOOP_VINFO_VECT_FACTOR (loop_vinfo) 6578 * SLP_TREE_SCALAR_STMTS (slp_node).length (), 6579 vectype_in); 6580 else 6581 vec_num = 1; 6582 6583 /* Generate the reduction PHIs upfront. */ 6584 prev_phi_info = NULL; 6585 for (j = 0; j < ncopies; j++) 6586 { 6587 if (j == 0 || !single_defuse_cycle) 6588 { 6589 for (i = 0; i < vec_num; i++) 6590 { 6591 /* Create the reduction-phi that defines the reduction 6592 operand. */ 6593 gimple *new_phi = create_phi_node (vec_dest, loop->header); 6594 set_vinfo_for_stmt (new_phi, 6595 new_stmt_vec_info (new_phi, loop_vinfo)); 6596 6597 if (slp_node) 6598 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_phi); 6599 else 6600 { 6601 if (j == 0) 6602 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_phi; 6603 else 6604 STMT_VINFO_RELATED_STMT (prev_phi_info) = new_phi; 6605 prev_phi_info = vinfo_for_stmt (new_phi); 6606 } 6607 } 6608 } 6609 } 6610 6611 return true; 6612 } 6613 6614 /* 1. Is vectorizable reduction? */ 6615 /* Not supportable if the reduction variable is used in the loop, unless 6616 it's a reduction chain. 
*/ 6617 if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer 6618 && !GROUP_FIRST_ELEMENT (stmt_info)) 6619 return false; 6620 6621 /* Reductions that are not used even in an enclosing outer-loop, 6622 are expected to be "live" (used out of the loop). */ 6623 if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope 6624 && !STMT_VINFO_LIVE_P (stmt_info)) 6625 return false; 6626 6627 /* 2. Has this been recognized as a reduction pattern? 6628 6629 Check if STMT represents a pattern that has been recognized 6630 in earlier analysis stages. For stmts that represent a pattern, 6631 the STMT_VINFO_RELATED_STMT field records the last stmt in 6632 the original sequence that constitutes the pattern. */ 6633 6634 orig_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt)); 6635 if (orig_stmt) 6636 { 6637 orig_stmt_info = vinfo_for_stmt (orig_stmt); 6638 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info)); 6639 gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info)); 6640 } 6641 6642 /* 3. Check the operands of the operation. The first operands are defined 6643 inside the loop body. The last operand is the reduction variable, 6644 which is defined by the loop-header-phi. */ 6645 6646 gcc_assert (is_gimple_assign (stmt)); 6647 6648 /* Flatten RHS. */ 6649 switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt))) 6650 { 6651 case GIMPLE_BINARY_RHS: 6652 code = gimple_assign_rhs_code (stmt); 6653 op_type = TREE_CODE_LENGTH (code); 6654 gcc_assert (op_type == binary_op); 6655 ops[0] = gimple_assign_rhs1 (stmt); 6656 ops[1] = gimple_assign_rhs2 (stmt); 6657 break; 6658 6659 case GIMPLE_TERNARY_RHS: 6660 code = gimple_assign_rhs_code (stmt); 6661 op_type = TREE_CODE_LENGTH (code); 6662 gcc_assert (op_type == ternary_op); 6663 ops[0] = gimple_assign_rhs1 (stmt); 6664 ops[1] = gimple_assign_rhs2 (stmt); 6665 ops[2] = gimple_assign_rhs3 (stmt); 6666 break; 6667 6668 case GIMPLE_UNARY_RHS: 6669 return false; 6670 6671 default: 6672 gcc_unreachable (); 6673 } 6674 6675 if (code == COND_EXPR && slp_node) 6676 return false; 6677 6678 scalar_dest = gimple_assign_lhs (stmt); 6679 scalar_type = TREE_TYPE (scalar_dest); 6680 if (!POINTER_TYPE_P (scalar_type) && !INTEGRAL_TYPE_P (scalar_type) 6681 && !SCALAR_FLOAT_TYPE_P (scalar_type)) 6682 return false; 6683 6684 /* Do not try to vectorize bit-precision reductions. */ 6685 if (!type_has_mode_precision_p (scalar_type)) 6686 return false; 6687 6688 /* All uses but the last are expected to be defined in the loop. 6689 The last use is the reduction variable. In case of nested cycle this 6690 assumption is not true: we use reduc_index to record the index of the 6691 reduction variable. */ 6692 gimple *reduc_def_stmt = NULL; 6693 int reduc_index = -1; 6694 for (i = 0; i < op_type; i++) 6695 { 6696 /* The condition of COND_EXPR is checked in vectorizable_condition(). */ 6697 if (i == 0 && code == COND_EXPR) 6698 continue; 6699 6700 is_simple_use = vect_is_simple_use (ops[i], loop_vinfo, 6701 &def_stmt, &dts[i], &tem); 6702 dt = dts[i]; 6703 gcc_assert (is_simple_use); 6704 if (dt == vect_reduction_def) 6705 { 6706 reduc_def_stmt = def_stmt; 6707 reduc_index = i; 6708 continue; 6709 } 6710 else if (tem) 6711 { 6712 /* To properly compute ncopies we are interested in the widest 6713 input type in case we're looking at a widening accumulation. 
*/ 6714 if (!vectype_in 6715 || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in))) 6716 < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (tem))))) 6717 vectype_in = tem; 6718 } 6719 6720 if (dt != vect_internal_def 6721 && dt != vect_external_def 6722 && dt != vect_constant_def 6723 && dt != vect_induction_def 6724 && !(dt == vect_nested_cycle && nested_cycle)) 6725 return false; 6726 6727 if (dt == vect_nested_cycle) 6728 { 6729 found_nested_cycle_def = true; 6730 reduc_def_stmt = def_stmt; 6731 reduc_index = i; 6732 } 6733 6734 if (i == 1 && code == COND_EXPR) 6735 { 6736 /* Record how value of COND_EXPR is defined. */ 6737 if (dt == vect_constant_def) 6738 { 6739 cond_reduc_dt = dt; 6740 cond_reduc_val = ops[i]; 6741 } 6742 if (dt == vect_induction_def 6743 && def_stmt != NULL 6744 && is_nonwrapping_integer_induction (def_stmt, loop)) 6745 { 6746 cond_reduc_dt = dt; 6747 cond_reduc_def_stmt = def_stmt; 6748 } 6749 } 6750 } 6751 6752 if (!vectype_in) 6753 vectype_in = vectype_out; 6754 6755 /* When vectorizing a reduction chain w/o SLP the reduction PHI is not 6756 directy used in stmt. */ 6757 if (reduc_index == -1) 6758 { 6759 if (STMT_VINFO_REDUC_TYPE (stmt_info) == FOLD_LEFT_REDUCTION) 6760 { 6761 if (dump_enabled_p ()) 6762 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 6763 "in-order reduction chain without SLP.\n"); 6764 return false; 6765 } 6766 6767 if (orig_stmt) 6768 reduc_def_stmt = STMT_VINFO_REDUC_DEF (orig_stmt_info); 6769 else 6770 reduc_def_stmt = STMT_VINFO_REDUC_DEF (stmt_info); 6771 } 6772 6773 if (! reduc_def_stmt || gimple_code (reduc_def_stmt) != GIMPLE_PHI) 6774 return false; 6775 6776 if (!(reduc_index == -1 6777 || dts[reduc_index] == vect_reduction_def 6778 || dts[reduc_index] == vect_nested_cycle 6779 || ((dts[reduc_index] == vect_internal_def 6780 || dts[reduc_index] == vect_external_def 6781 || dts[reduc_index] == vect_constant_def 6782 || dts[reduc_index] == vect_induction_def) 6783 && nested_cycle && found_nested_cycle_def))) 6784 { 6785 /* For pattern recognized stmts, orig_stmt might be a reduction, 6786 but some helper statements for the pattern might not, or 6787 might be COND_EXPRs with reduction uses in the condition. */ 6788 gcc_assert (orig_stmt); 6789 return false; 6790 } 6791 6792 stmt_vec_info reduc_def_info = vinfo_for_stmt (reduc_def_stmt); 6793 enum vect_reduction_type v_reduc_type 6794 = STMT_VINFO_REDUC_TYPE (reduc_def_info); 6795 gimple *tmp = STMT_VINFO_REDUC_DEF (reduc_def_info); 6796 6797 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) = v_reduc_type; 6798 /* If we have a condition reduction, see if we can simplify it further. */ 6799 if (v_reduc_type == COND_REDUCTION) 6800 { 6801 /* TODO: We can't yet handle reduction chains, since we need to treat 6802 each COND_EXPR in the chain specially, not just the last one. 6803 E.g. for: 6804 6805 x_1 = PHI <x_3, ...> 6806 x_2 = a_2 ? ... : x_1; 6807 x_3 = a_3 ? ... : x_2; 6808 6809 we're interested in the last element in x_3 for which a_2 || a_3 6810 is true, whereas the current reduction chain handling would 6811 vectorize x_2 as a normal VEC_COND_EXPR and only treat x_3 6812 as a reduction operation. */ 6813 if (reduc_index == -1) 6814 { 6815 if (dump_enabled_p ()) 6816 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 6817 "conditional reduction chains not supported\n"); 6818 return false; 6819 } 6820 6821 /* vect_is_simple_reduction ensured that operand 2 is the 6822 loop-carried operand. 
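E.g. for the condition reduction

	 last_2 = a[i] < val ? a[i] : last_1

	 the loop-carried value last_1 is operand 2 of the COND_EXPR
	 (operand 0 being the comparison and operand 1 the "then" value),
	 hence REDUC_INDEX == 2.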
*/ 6823 gcc_assert (reduc_index == 2); 6824 6825 /* Loop peeling modifies initial value of reduction PHI, which 6826 makes the reduction stmt to be transformed different to the 6827 original stmt analyzed. We need to record reduction code for 6828 CONST_COND_REDUCTION type reduction at analyzing stage, thus 6829 it can be used directly at transform stage. */ 6830 if (STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info) == MAX_EXPR 6831 || STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info) == MIN_EXPR) 6832 { 6833 /* Also set the reduction type to CONST_COND_REDUCTION. */ 6834 gcc_assert (cond_reduc_dt == vect_constant_def); 6835 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) = CONST_COND_REDUCTION; 6836 } 6837 else if (direct_internal_fn_supported_p (IFN_FOLD_EXTRACT_LAST, 6838 vectype_in, OPTIMIZE_FOR_SPEED)) 6839 { 6840 if (dump_enabled_p ()) 6841 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 6842 "optimizing condition reduction with" 6843 " FOLD_EXTRACT_LAST.\n"); 6844 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) = EXTRACT_LAST_REDUCTION; 6845 } 6846 else if (cond_reduc_dt == vect_induction_def) 6847 { 6848 stmt_vec_info cond_stmt_vinfo = vinfo_for_stmt (cond_reduc_def_stmt); 6849 tree base 6850 = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (cond_stmt_vinfo); 6851 tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (cond_stmt_vinfo); 6852 6853 gcc_assert (TREE_CODE (base) == INTEGER_CST 6854 && TREE_CODE (step) == INTEGER_CST); 6855 cond_reduc_val = NULL_TREE; 6856 /* Find a suitable value, for MAX_EXPR below base, for MIN_EXPR 6857 above base; punt if base is the minimum value of the type for 6858 MAX_EXPR or maximum value of the type for MIN_EXPR for now. */ 6859 if (tree_int_cst_sgn (step) == -1) 6860 { 6861 cond_reduc_op_code = MIN_EXPR; 6862 if (tree_int_cst_sgn (base) == -1) 6863 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0); 6864 else if (tree_int_cst_lt (base, 6865 TYPE_MAX_VALUE (TREE_TYPE (base)))) 6866 cond_reduc_val 6867 = int_const_binop (PLUS_EXPR, base, integer_one_node); 6868 } 6869 else 6870 { 6871 cond_reduc_op_code = MAX_EXPR; 6872 if (tree_int_cst_sgn (base) == 1) 6873 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0); 6874 else if (tree_int_cst_lt (TYPE_MIN_VALUE (TREE_TYPE (base)), 6875 base)) 6876 cond_reduc_val 6877 = int_const_binop (MINUS_EXPR, base, integer_one_node); 6878 } 6879 if (cond_reduc_val) 6880 { 6881 if (dump_enabled_p ()) 6882 dump_printf_loc (MSG_NOTE, vect_location, 6883 "condition expression based on " 6884 "integer induction.\n"); 6885 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) 6886 = INTEGER_INDUC_COND_REDUCTION; 6887 } 6888 } 6889 else if (cond_reduc_dt == vect_constant_def) 6890 { 6891 enum vect_def_type cond_initial_dt; 6892 gimple *def_stmt = SSA_NAME_DEF_STMT (ops[reduc_index]); 6893 tree cond_initial_val 6894 = PHI_ARG_DEF_FROM_EDGE (def_stmt, loop_preheader_edge (loop)); 6895 6896 gcc_assert (cond_reduc_val != NULL_TREE); 6897 vect_is_simple_use (cond_initial_val, loop_vinfo, 6898 &def_stmt, &cond_initial_dt); 6899 if (cond_initial_dt == vect_constant_def 6900 && types_compatible_p (TREE_TYPE (cond_initial_val), 6901 TREE_TYPE (cond_reduc_val))) 6902 { 6903 tree e = fold_binary (LE_EXPR, boolean_type_node, 6904 cond_initial_val, cond_reduc_val); 6905 if (e && (integer_onep (e) || integer_zerop (e))) 6906 { 6907 if (dump_enabled_p ()) 6908 dump_printf_loc (MSG_NOTE, vect_location, 6909 "condition expression based on " 6910 "compile time constant.\n"); 6911 /* Record reduction code at analysis stage. 
*/ 6912 STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info) 6913 = integer_onep (e) ? MAX_EXPR : MIN_EXPR; 6914 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) 6915 = CONST_COND_REDUCTION; 6916 } 6917 } 6918 } 6919 } 6920 6921 if (orig_stmt) 6922 gcc_assert (tmp == orig_stmt 6923 || GROUP_FIRST_ELEMENT (vinfo_for_stmt (tmp)) == orig_stmt); 6924 else 6925 /* We changed STMT to be the first stmt in reduction chain, hence we 6926 check that in this case the first element in the chain is STMT. */ 6927 gcc_assert (stmt == tmp 6928 || GROUP_FIRST_ELEMENT (vinfo_for_stmt (tmp)) == stmt); 6929 6930 if (STMT_VINFO_LIVE_P (vinfo_for_stmt (reduc_def_stmt))) 6931 return false; 6932 6933 if (slp_node) 6934 ncopies = 1; 6935 else 6936 ncopies = vect_get_num_copies (loop_vinfo, vectype_in); 6937 6938 gcc_assert (ncopies >= 1); 6939 6940 vec_mode = TYPE_MODE (vectype_in); 6941 poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out); 6942 6943 if (code == COND_EXPR) 6944 { 6945 /* Only call during the analysis stage, otherwise we'll lose 6946 STMT_VINFO_TYPE. */ 6947 if (!vec_stmt && !vectorizable_condition (stmt, gsi, NULL, 6948 ops[reduc_index], 0, NULL)) 6949 { 6950 if (dump_enabled_p ()) 6951 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 6952 "unsupported condition in reduction\n"); 6953 return false; 6954 } 6955 } 6956 else 6957 { 6958 /* 4. Supportable by target? */ 6959 6960 if (code == LSHIFT_EXPR || code == RSHIFT_EXPR 6961 || code == LROTATE_EXPR || code == RROTATE_EXPR) 6962 { 6963 /* Shifts and rotates are only supported by vectorizable_shifts, 6964 not vectorizable_reduction. */ 6965 if (dump_enabled_p ()) 6966 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 6967 "unsupported shift or rotation.\n"); 6968 return false; 6969 } 6970 6971 /* 4.1. check support for the operation in the loop */ 6972 optab = optab_for_tree_code (code, vectype_in, optab_default); 6973 if (!optab) 6974 { 6975 if (dump_enabled_p ()) 6976 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 6977 "no optab.\n"); 6978 6979 return false; 6980 } 6981 6982 if (optab_handler (optab, vec_mode) == CODE_FOR_nothing) 6983 { 6984 if (dump_enabled_p ()) 6985 dump_printf (MSG_NOTE, "op not supported by target.\n"); 6986 6987 if (maybe_ne (GET_MODE_SIZE (vec_mode), UNITS_PER_WORD) 6988 || !vect_worthwhile_without_simd_p (loop_vinfo, code)) 6989 return false; 6990 6991 if (dump_enabled_p ()) 6992 dump_printf (MSG_NOTE, "proceeding using word mode.\n"); 6993 } 6994 6995 /* Worthwhile without SIMD support? */ 6996 if (!VECTOR_MODE_P (TYPE_MODE (vectype_in)) 6997 && !vect_worthwhile_without_simd_p (loop_vinfo, code)) 6998 { 6999 if (dump_enabled_p ()) 7000 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 7001 "not worthwhile without SIMD support.\n"); 7002 7003 return false; 7004 } 7005 } 7006 7007 /* 4.2. Check support for the epilog operation. 7008 7009 If STMT represents a reduction pattern, then the type of the 7010 reduction variable may be different than the type of the rest 7011 of the arguments. For example, consider the case of accumulation 7012 of shorts into an int accumulator; The original code: 7013 S1: int_a = (int) short_a; 7014 orig_stmt-> S2: int_acc = plus <int_a ,int_acc>; 7015 7016 was replaced with: 7017 STMT: int_acc = widen_sum <short_a, int_acc> 7018 7019 This means that: 7020 1. 
The tree-code that is used to create the vector operation in the 7021 epilog code (that reduces the partial results) is not the 7022 tree-code of STMT, but is rather the tree-code of the original 7023 stmt from the pattern that STMT is replacing. I.e., in the example 7024 above we want to use 'widen_sum' in the loop, but 'plus' in the 7025 epilog. 7026 2. The type (mode) we use to check available target support 7027 for the vector operation to be created in the *epilog*, is 7028 determined by the type of the reduction variable (in the example 7029 above we'd check this: optab_handler (plus_optab, vect_int_mode)). 7030 However the type (mode) we use to check available target support 7031 for the vector operation to be created *inside the loop*, is 7032 determined by the type of the other arguments to STMT (in the 7033 example we'd check this: optab_handler (widen_sum_optab, 7034 vect_short_mode)). 7035 7036 This is contrary to "regular" reductions, in which the types of all 7037 the arguments are the same as the type of the reduction variable. 7038 For "regular" reductions we can therefore use the same vector type 7039 (and also the same tree-code) when generating the epilog code and 7040 when generating the code inside the loop. */ 7041 7042 vect_reduction_type reduction_type 7043 = STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info); 7044 if (orig_stmt 7045 && (reduction_type == TREE_CODE_REDUCTION 7046 || reduction_type == FOLD_LEFT_REDUCTION)) 7047 { 7048 /* This is a reduction pattern: get the vectype from the type of the 7049 reduction variable, and get the tree-code from orig_stmt. */ 7050 orig_code = gimple_assign_rhs_code (orig_stmt); 7051 gcc_assert (vectype_out); 7052 vec_mode = TYPE_MODE (vectype_out); 7053 } 7054 else 7055 { 7056 /* Regular reduction: the same vectype and tree-code used for 7057 the vector code inside the loop can also be used for the epilog code. */ 7058 orig_code = code; 7059 7060 if (code == MINUS_EXPR) 7061 orig_code = PLUS_EXPR; 7062 7063 /* For simple condition reductions, replace with the actual expression 7064 we want to base our reduction around. */ 7065 if (reduction_type == CONST_COND_REDUCTION) 7066 { 7067 orig_code = STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info); 7068 gcc_assert (orig_code == MAX_EXPR || orig_code == MIN_EXPR); 7069 } 7070 else if (reduction_type == INTEGER_INDUC_COND_REDUCTION) 7071 orig_code = cond_reduc_op_code; 7072 } 7073 7074 if (nested_cycle) 7075 { 7076 def_bb = gimple_bb (reduc_def_stmt); 7077 def_stmt_loop = def_bb->loop_father; 7078 def_arg = PHI_ARG_DEF_FROM_EDGE (reduc_def_stmt, 7079 loop_preheader_edge (def_stmt_loop)); 7080 if (TREE_CODE (def_arg) == SSA_NAME 7081 && (def_arg_stmt = SSA_NAME_DEF_STMT (def_arg)) 7082 && gimple_code (def_arg_stmt) == GIMPLE_PHI 7083 && flow_bb_inside_loop_p (outer_loop, gimple_bb (def_arg_stmt)) 7084 && vinfo_for_stmt (def_arg_stmt) 7085 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_arg_stmt)) 7086 == vect_double_reduction_def) 7087 double_reduc = true; 7088 } 7089 7090 reduc_fn = IFN_LAST; 7091 7092 if (reduction_type == TREE_CODE_REDUCTION 7093 || reduction_type == FOLD_LEFT_REDUCTION 7094 || reduction_type == INTEGER_INDUC_COND_REDUCTION 7095 || reduction_type == CONST_COND_REDUCTION) 7096 { 7097 if (reduction_type == FOLD_LEFT_REDUCTION 7098 ?
fold_left_reduction_fn (orig_code, &reduc_fn) 7099 : reduction_fn_for_scalar_code (orig_code, &reduc_fn)) 7100 { 7101 if (reduc_fn != IFN_LAST 7102 && !direct_internal_fn_supported_p (reduc_fn, vectype_out, 7103 OPTIMIZE_FOR_SPEED)) 7104 { 7105 if (dump_enabled_p ()) 7106 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 7107 "reduc op not supported by target.\n"); 7108 7109 reduc_fn = IFN_LAST; 7110 } 7111 } 7112 else 7113 { 7114 if (!nested_cycle || double_reduc) 7115 { 7116 if (dump_enabled_p ()) 7117 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 7118 "no reduc code for scalar code.\n"); 7119 7120 return false; 7121 } 7122 } 7123 } 7124 else if (reduction_type == COND_REDUCTION) 7125 { 7126 int scalar_precision 7127 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type)); 7128 cr_index_scalar_type = make_unsigned_type (scalar_precision); 7129 cr_index_vector_type = build_vector_type (cr_index_scalar_type, 7130 nunits_out); 7131 7132 if (direct_internal_fn_supported_p (IFN_REDUC_MAX, cr_index_vector_type, 7133 OPTIMIZE_FOR_SPEED)) 7134 reduc_fn = IFN_REDUC_MAX; 7135 } 7136 7137 if (reduction_type != EXTRACT_LAST_REDUCTION 7138 && reduc_fn == IFN_LAST 7139 && !nunits_out.is_constant ()) 7140 { 7141 if (dump_enabled_p ()) 7142 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 7143 "missing target support for reduction on" 7144 " variable-length vectors.\n"); 7145 return false; 7146 } 7147 7148 if ((double_reduc || reduction_type != TREE_CODE_REDUCTION) 7149 && ncopies > 1) 7150 { 7151 if (dump_enabled_p ()) 7152 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 7153 "multiple types in double reduction or condition " 7154 "reduction.\n"); 7155 return false; 7156 } 7157 7158 /* For SLP reductions, see if there is a neutral value we can use. */ 7159 tree neutral_op = NULL_TREE; 7160 if (slp_node) 7161 neutral_op 7162 = neutral_op_for_slp_reduction (slp_node_instance->reduc_phis, code, 7163 GROUP_FIRST_ELEMENT (stmt_info) != NULL); 7164 7165 if (double_reduc && reduction_type == FOLD_LEFT_REDUCTION) 7166 { 7167 /* We can't support in-order reductions of code such as this: 7168 7169 for (int i = 0; i < n1; ++i) 7170 for (int j = 0; j < n2; ++j) 7171 l += a[j]; 7172 7173 since GCC effectively transforms the loop when vectorizing: 7174 7175 for (int i = 0; i < n1 / VF; ++i) 7176 for (int j = 0; j < n2; ++j) 7177 for (int k = 0; k < VF; ++k) 7178 l += a[j]; 7179 7180 which is a reassociation of the original operation. */ 7181 if (dump_enabled_p ()) 7182 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 7183 "in-order double reduction not supported.\n"); 7184 7185 return false; 7186 } 7187 7188 if (reduction_type == FOLD_LEFT_REDUCTION 7189 && slp_node 7190 && !GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt))) 7191 { 7192 /* We cannot use in-order reductions in this case because there is 7193 an implicit reassociation of the operations involved. */ 7194 if (dump_enabled_p ()) 7195 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 7196 "in-order unchained SLP reductions not supported.\n"); 7197 return false; 7198 } 7199 7200 /* For double reductions, and for SLP reductions with a neutral value, 7201 we construct a variable-length initial vector by loading a vector 7202 full of the neutral value and then shift-and-inserting the start 7203 values into the low-numbered elements. 
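For example (illustrative sketch): for a single sum reduction with start value S and neutral value 0 this loads { 0, 0, ..., 0 } and shift-and-inserts S to give an initial vector of { S, 0, ..., 0 }; for an SLP group each start value is shift-and-inserted in turn.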
*/ 7204 if ((double_reduc || neutral_op) 7205 && !nunits_out.is_constant () 7206 && !direct_internal_fn_supported_p (IFN_VEC_SHL_INSERT, 7207 vectype_out, OPTIMIZE_FOR_SPEED)) 7208 { 7209 if (dump_enabled_p ()) 7210 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 7211 "reduction on variable-length vectors requires" 7212 " target support for a vector-shift-and-insert" 7213 " operation.\n"); 7214 return false; 7215 } 7216 7217 /* Check extra constraints for variable-length unchained SLP reductions. */ 7218 if (STMT_SLP_TYPE (stmt_info) 7219 && !GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) 7220 && !nunits_out.is_constant ()) 7221 { 7222 /* We checked above that we could build the initial vector when 7223 there's a neutral element value. Check here for the case in 7224 which each SLP statement has its own initial value and in which 7225 that value needs to be repeated for every instance of the 7226 statement within the initial vector. */ 7227 unsigned int group_size = SLP_TREE_SCALAR_STMTS (slp_node).length (); 7228 scalar_mode elt_mode = SCALAR_TYPE_MODE (TREE_TYPE (vectype_out)); 7229 if (!neutral_op 7230 && !can_duplicate_and_interleave_p (group_size, elt_mode)) 7231 { 7232 if (dump_enabled_p ()) 7233 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 7234 "unsupported form of SLP reduction for" 7235 " variable-length vectors: cannot build" 7236 " initial vector.\n"); 7237 return false; 7238 } 7239 /* The epilogue code relies on the number of elements being a multiple 7240 of the group size. The duplicate-and-interleave approach to setting 7241 up the initial vector does too. */ 7242 if (!multiple_p (nunits_out, group_size)) 7243 { 7244 if (dump_enabled_p ()) 7245 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 7246 "unsupported form of SLP reduction for" 7247 " variable-length vectors: the vector size" 7248 " is not a multiple of the number of results.\n"); 7249 return false; 7250 } 7251 } 7252 7253 /* In case of widening multiplication by a constant, we update the type 7254 of the constant to be the type of the other operand. We check that the 7255 constant fits the type in the pattern recognition pass. */ 7256 if (code == DOT_PROD_EXPR 7257 && !types_compatible_p (TREE_TYPE (ops[0]), TREE_TYPE (ops[1]))) 7258 { 7259 if (TREE_CODE (ops[0]) == INTEGER_CST) 7260 ops[0] = fold_convert (TREE_TYPE (ops[1]), ops[0]); 7261 else if (TREE_CODE (ops[1]) == INTEGER_CST) 7262 ops[1] = fold_convert (TREE_TYPE (ops[0]), ops[1]); 7263 else 7264 { 7265 if (dump_enabled_p ()) 7266 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 7267 "invalid types in dot-prod\n"); 7268 7269 return false; 7270 } 7271 } 7272 7273 if (reduction_type == COND_REDUCTION) 7274 { 7275 widest_int ni; 7276 7277 if (! max_loop_iterations (loop, &ni)) 7278 { 7279 if (dump_enabled_p ()) 7280 dump_printf_loc (MSG_NOTE, vect_location, 7281 "loop count not known, cannot create cond " 7282 "reduction.\n"); 7283 return false; 7284 } 7285 /* Convert backedges to iterations. */ 7286 ni += 1; 7287 7288 /* The additional index will be the same type as the condition. Check 7289 that the loop can fit into this type less one (because we'll use up the 7290 zero slot for when there are no matches).
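For example, with an unsigned 16-bit index type the iteration count must be less than 65535, since index value 0 is reserved for the case in which no element matched.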
*/ 7291 tree max_index = TYPE_MAX_VALUE (cr_index_scalar_type); 7292 if (wi::geu_p (ni, wi::to_widest (max_index))) 7293 { 7294 if (dump_enabled_p ()) 7295 dump_printf_loc (MSG_NOTE, vect_location, 7296 "loop size is greater than data size.\n"); 7297 return false; 7298 } 7299 } 7300 7301 /* In case the vectorization factor (VF) is bigger than the number 7302 of elements that we can fit in a vectype (nunits), we have to generate 7303 more than one vector stmt - i.e - we need to "unroll" the 7304 vector stmt by a factor VF/nunits. For more details see documentation 7305 in vectorizable_operation. */ 7306 7307 /* If the reduction is used in an outer loop we need to generate 7308 VF intermediate results, like so (e.g. for ncopies=2): 7309 r0 = phi (init, r0) 7310 r1 = phi (init, r1) 7311 r0 = x0 + r0; 7312 r1 = x1 + r1; 7313 (i.e. we generate VF results in 2 registers). 7314 In this case we have a separate def-use cycle for each copy, and therefore 7315 for each copy we get the vector def for the reduction variable from the 7316 respective phi node created for this copy. 7317 7318 Otherwise (the reduction is unused in the loop nest), we can combine 7319 together intermediate results, like so (e.g. for ncopies=2): 7320 r = phi (init, r) 7321 r = x0 + r; 7322 r = x1 + r; 7323 (i.e. we generate VF/2 results in a single register). 7324 In this case for each copy we get the vector def for the reduction variable 7325 from the vectorized reduction operation generated in the previous iteration. 7326 7327 This only works when we see both the reduction PHI and its only consumer 7328 in vectorizable_reduction and there are no intermediate stmts 7329 participating. */ 7330 use_operand_p use_p; 7331 gimple *use_stmt; 7332 if (ncopies > 1 7333 && (STMT_VINFO_RELEVANT (stmt_info) <= vect_used_only_live) 7334 && single_imm_use (gimple_phi_result (reduc_def_stmt), &use_p, &use_stmt) 7335 && (use_stmt == stmt 7336 || STMT_VINFO_RELATED_STMT (vinfo_for_stmt (use_stmt)) == stmt)) 7337 { 7338 single_defuse_cycle = true; 7339 epilog_copies = 1; 7340 } 7341 else 7342 epilog_copies = ncopies; 7343 7344 /* If the reduction stmt is one of the patterns that have lane 7345 reduction embedded we cannot handle the case of ! single_defuse_cycle. */ 7346 if ((ncopies > 1 7347 && ! single_defuse_cycle) 7348 && (code == DOT_PROD_EXPR 7349 || code == WIDEN_SUM_EXPR 7350 || code == SAD_EXPR)) 7351 { 7352 if (dump_enabled_p ()) 7353 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 7354 "multi def-use cycle not possible for lane-reducing " 7355 "reduction operation\n"); 7356 return false; 7357 } 7358 7359 if (slp_node) 7360 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node); 7361 else 7362 vec_num = 1; 7363 7364 internal_fn cond_fn = get_conditional_internal_fn (code); 7365 vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo); 7366 7367 if (!vec_stmt) /* transformation not required. 
*/ 7368 { 7369 if (first_p) 7370 vect_model_reduction_cost (stmt_info, reduc_fn, ncopies); 7371 if (loop_vinfo && LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo)) 7372 { 7373 if (reduction_type != FOLD_LEFT_REDUCTION 7374 && (cond_fn == IFN_LAST 7375 || !direct_internal_fn_supported_p (cond_fn, vectype_in, 7376 OPTIMIZE_FOR_SPEED))) 7377 { 7378 if (dump_enabled_p ()) 7379 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 7380 "can't use a fully-masked loop because no" 7381 " conditional operation is available.\n"); 7382 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false; 7383 } 7384 else if (reduc_index == -1) 7385 { 7386 if (dump_enabled_p ()) 7387 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 7388 "can't use a fully-masked loop for chained" 7389 " reductions.\n"); 7390 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false; 7391 } 7392 else 7393 vect_record_loop_mask (loop_vinfo, masks, ncopies * vec_num, 7394 vectype_in); 7395 } 7396 if (dump_enabled_p () 7397 && reduction_type == FOLD_LEFT_REDUCTION) 7398 dump_printf_loc (MSG_NOTE, vect_location, 7399 "using an in-order (fold-left) reduction.\n"); 7400 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type; 7401 return true; 7402 } 7403 7404 /* Transform. */ 7405 7406 if (dump_enabled_p ()) 7407 dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.\n"); 7408 7409 /* FORNOW: Multiple types are not supported for condition. */ 7410 if (code == COND_EXPR) 7411 gcc_assert (ncopies == 1); 7412 7413 bool masked_loop_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo); 7414 7415 if (reduction_type == FOLD_LEFT_REDUCTION) 7416 return vectorize_fold_left_reduction 7417 (stmt, gsi, vec_stmt, slp_node, reduc_def_stmt, code, 7418 reduc_fn, ops, vectype_in, reduc_index, masks); 7419 7420 if (reduction_type == EXTRACT_LAST_REDUCTION) 7421 { 7422 gcc_assert (!slp_node); 7423 return vectorizable_condition (stmt, gsi, vec_stmt, 7424 NULL, reduc_index, NULL); 7425 } 7426 7427 /* Create the destination vector */ 7428 vec_dest = vect_create_destination_var (scalar_dest, vectype_out); 7429 7430 prev_stmt_info = NULL; 7431 prev_phi_info = NULL; 7432 if (!slp_node) 7433 { 7434 vec_oprnds0.create (1); 7435 vec_oprnds1.create (1); 7436 if (op_type == ternary_op) 7437 vec_oprnds2.create (1); 7438 } 7439 7440 phis.create (vec_num); 7441 vect_defs.create (vec_num); 7442 if (!slp_node) 7443 vect_defs.quick_push (NULL_TREE); 7444 7445 if (slp_node) 7446 phis.splice (SLP_TREE_VEC_STMTS (slp_node_instance->reduc_phis)); 7447 else 7448 phis.quick_push (STMT_VINFO_VEC_STMT (vinfo_for_stmt (reduc_def_stmt))); 7449 7450 for (j = 0; j < ncopies; j++) 7451 { 7452 if (code == COND_EXPR) 7453 { 7454 gcc_assert (!slp_node); 7455 vectorizable_condition (stmt, gsi, vec_stmt, 7456 PHI_RESULT (phis[0]), 7457 reduc_index, NULL); 7458 /* Multiple types are not supported for condition. */ 7459 break; 7460 } 7461 7462 /* Handle uses. */ 7463 if (j == 0) 7464 { 7465 if (slp_node) 7466 { 7467 /* Get vec defs for all the operands except the reduction index, 7468 ensuring the ordering of the ops in the vector is kept. 
*/ 7469 auto_vec<tree, 3> slp_ops; 7470 auto_vec<vec<tree>, 3> vec_defs; 7471 7472 slp_ops.quick_push (ops[0]); 7473 slp_ops.quick_push (ops[1]); 7474 if (op_type == ternary_op) 7475 slp_ops.quick_push (ops[2]); 7476 7477 vect_get_slp_defs (slp_ops, slp_node, &vec_defs); 7478 7479 vec_oprnds0.safe_splice (vec_defs[0]); 7480 vec_defs[0].release (); 7481 vec_oprnds1.safe_splice (vec_defs[1]); 7482 vec_defs[1].release (); 7483 if (op_type == ternary_op) 7484 { 7485 vec_oprnds2.safe_splice (vec_defs[2]); 7486 vec_defs[2].release (); 7487 } 7488 } 7489 else 7490 { 7491 vec_oprnds0.quick_push 7492 (vect_get_vec_def_for_operand (ops[0], stmt)); 7493 vec_oprnds1.quick_push 7494 (vect_get_vec_def_for_operand (ops[1], stmt)); 7495 if (op_type == ternary_op) 7496 vec_oprnds2.quick_push 7497 (vect_get_vec_def_for_operand (ops[2], stmt)); 7498 } 7499 } 7500 else 7501 { 7502 if (!slp_node) 7503 { 7504 gcc_assert (reduc_index != -1 || ! single_defuse_cycle); 7505 7506 if (single_defuse_cycle && reduc_index == 0) 7507 vec_oprnds0[0] = gimple_get_lhs (new_stmt); 7508 else 7509 vec_oprnds0[0] 7510 = vect_get_vec_def_for_stmt_copy (dts[0], vec_oprnds0[0]); 7511 if (single_defuse_cycle && reduc_index == 1) 7512 vec_oprnds1[0] = gimple_get_lhs (new_stmt); 7513 else 7514 vec_oprnds1[0] 7515 = vect_get_vec_def_for_stmt_copy (dts[1], vec_oprnds1[0]); 7516 if (op_type == ternary_op) 7517 { 7518 if (single_defuse_cycle && reduc_index == 2) 7519 vec_oprnds2[0] = gimple_get_lhs (new_stmt); 7520 else 7521 vec_oprnds2[0] 7522 = vect_get_vec_def_for_stmt_copy (dts[2], vec_oprnds2[0]); 7523 } 7524 } 7525 } 7526 7527 FOR_EACH_VEC_ELT (vec_oprnds0, i, def0) 7528 { 7529 tree vop[3] = { def0, vec_oprnds1[i], NULL_TREE }; 7530 if (masked_loop_p) 7531 { 7532 /* Make sure that the reduction accumulator is vop[0]. */ 7533 if (reduc_index == 1) 7534 { 7535 gcc_assert (commutative_tree_code (code)); 7536 std::swap (vop[0], vop[1]); 7537 } 7538 tree mask = vect_get_loop_mask (gsi, masks, vec_num * ncopies, 7539 vectype_in, i * ncopies + j); 7540 gcall *call = gimple_build_call_internal (cond_fn, 3, mask, 7541 vop[0], vop[1]); 7542 new_temp = make_ssa_name (vec_dest, call); 7543 gimple_call_set_lhs (call, new_temp); 7544 gimple_call_set_nothrow (call, true); 7545 new_stmt = call; 7546 } 7547 else 7548 { 7549 if (op_type == ternary_op) 7550 vop[2] = vec_oprnds2[i]; 7551 7552 new_temp = make_ssa_name (vec_dest, new_stmt); 7553 new_stmt = gimple_build_assign (new_temp, code, 7554 vop[0], vop[1], vop[2]); 7555 } 7556 vect_finish_stmt_generation (stmt, new_stmt, gsi); 7557 7558 if (slp_node) 7559 { 7560 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt); 7561 vect_defs.quick_push (new_temp); 7562 } 7563 else 7564 vect_defs[0] = new_temp; 7565 } 7566 7567 if (slp_node) 7568 continue; 7569 7570 if (j == 0) 7571 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt; 7572 else 7573 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt; 7574 7575 prev_stmt_info = vinfo_for_stmt (new_stmt); 7576 } 7577 7578 /* Finalize the reduction-phi (set its arguments) and create the 7579 epilog reduction code. */ 7580 if ((!single_defuse_cycle || code == COND_EXPR) && !slp_node) 7581 vect_defs[0] = gimple_get_lhs (*vec_stmt); 7582 7583 vect_create_epilog_for_reduction (vect_defs, stmt, reduc_def_stmt, 7584 epilog_copies, reduc_fn, phis, 7585 double_reduc, slp_node, slp_node_instance, 7586 cond_reduc_val, cond_reduc_op_code, 7587 neutral_op); 7588 7589 return true; 7590 } 7591 7592 /* Function vect_min_worthwhile_factor. 
7593 7594 For a loop where we could vectorize the operation indicated by CODE, 7595 return the minimum vectorization factor that makes it worthwhile 7596 to use generic vectors. */ 7597 static unsigned int 7598 vect_min_worthwhile_factor (enum tree_code code) 7599 { 7600 switch (code) 7601 { 7602 case PLUS_EXPR: 7603 case MINUS_EXPR: 7604 case NEGATE_EXPR: 7605 return 4; 7606 7607 case BIT_AND_EXPR: 7608 case BIT_IOR_EXPR: 7609 case BIT_XOR_EXPR: 7610 case BIT_NOT_EXPR: 7611 return 2; 7612 7613 default: 7614 return INT_MAX; 7615 } 7616 } 7617 7618 /* Return true if VINFO indicates we are doing loop vectorization and if 7619 it is worth decomposing CODE operations into scalar operations for 7620 that loop's vectorization factor. */ 7621 7622 bool 7623 vect_worthwhile_without_simd_p (vec_info *vinfo, tree_code code) 7624 { 7625 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo); 7626 unsigned HOST_WIDE_INT value; 7627 return (loop_vinfo 7628 && LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&value) 7629 && value >= vect_min_worthwhile_factor (code)); 7630 } 7631 7632 /* Function vectorizable_induction 7633 7634 Check if PHI performs an induction computation that can be vectorized. 7635 If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized 7636 phi to replace it, put it in VEC_STMT, and add it to the same basic block. 7637 Return FALSE if not a vectorizable STMT, TRUE otherwise. */ 7638 7639 bool 7640 vectorizable_induction (gimple *phi, 7641 gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED, 7642 gimple **vec_stmt, slp_tree slp_node) 7643 { 7644 stmt_vec_info stmt_info = vinfo_for_stmt (phi); 7645 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info); 7646 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo); 7647 unsigned ncopies; 7648 bool nested_in_vect_loop = false; 7649 struct loop *iv_loop; 7650 tree vec_def; 7651 edge pe = loop_preheader_edge (loop); 7652 basic_block new_bb; 7653 tree new_vec, vec_init, vec_step, t; 7654 tree new_name; 7655 gimple *new_stmt; 7656 gphi *induction_phi; 7657 tree induc_def, vec_dest; 7658 tree init_expr, step_expr; 7659 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo); 7660 unsigned i; 7661 tree expr; 7662 gimple_seq stmts; 7663 imm_use_iterator imm_iter; 7664 use_operand_p use_p; 7665 gimple *exit_phi; 7666 edge latch_e; 7667 tree loop_arg; 7668 gimple_stmt_iterator si; 7669 basic_block bb = gimple_bb (phi); 7670 7671 if (gimple_code (phi) != GIMPLE_PHI) 7672 return false; 7673 7674 if (!STMT_VINFO_RELEVANT_P (stmt_info)) 7675 return false; 7676 7677 /* Make sure it was recognized as induction computation. */ 7678 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def) 7679 return false; 7680 7681 tree vectype = STMT_VINFO_VECTYPE (stmt_info); 7682 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype); 7683 7684 if (slp_node) 7685 ncopies = 1; 7686 else 7687 ncopies = vect_get_num_copies (loop_vinfo, vectype); 7688 gcc_assert (ncopies >= 1); 7689 7690 /* FORNOW. These restrictions should be relaxed. */ 7691 if (nested_in_vect_loop_p (loop, phi)) 7692 { 7693 imm_use_iterator imm_iter; 7694 use_operand_p use_p; 7695 gimple *exit_phi; 7696 edge latch_e; 7697 tree loop_arg; 7698 7699 if (ncopies > 1) 7700 { 7701 if (dump_enabled_p ()) 7702 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 7703 "multiple types in nested loop.\n"); 7704 return false; 7705 } 7706 7707 /* FORNOW: outer loop induction with SLP not supported. 
*/ 7708 if (STMT_SLP_TYPE (stmt_info)) 7709 return false; 7710 7711 exit_phi = NULL; 7712 latch_e = loop_latch_edge (loop->inner); 7713 loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e); 7714 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg) 7715 { 7716 gimple *use_stmt = USE_STMT (use_p); 7717 if (is_gimple_debug (use_stmt)) 7718 continue; 7719 7720 if (!flow_bb_inside_loop_p (loop->inner, gimple_bb (use_stmt))) 7721 { 7722 exit_phi = use_stmt; 7723 break; 7724 } 7725 } 7726 if (exit_phi) 7727 { 7728 stmt_vec_info exit_phi_vinfo = vinfo_for_stmt (exit_phi); 7729 if (!(STMT_VINFO_RELEVANT_P (exit_phi_vinfo) 7730 && !STMT_VINFO_LIVE_P (exit_phi_vinfo))) 7731 { 7732 if (dump_enabled_p ()) 7733 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 7734 "inner-loop induction only used outside " 7735 "of the outer vectorized loop.\n"); 7736 return false; 7737 } 7738 } 7739 7740 nested_in_vect_loop = true; 7741 iv_loop = loop->inner; 7742 } 7743 else 7744 iv_loop = loop; 7745 gcc_assert (iv_loop == (gimple_bb (phi))->loop_father); 7746 7747 if (slp_node && !nunits.is_constant ()) 7748 { 7749 /* The current SLP code creates the initial value element-by-element. */ 7750 if (dump_enabled_p ()) 7751 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 7752 "SLP induction not supported for variable-length" 7753 " vectors.\n"); 7754 return false; 7755 } 7756 7757 if (!vec_stmt) /* transformation not required. */ 7758 { 7759 STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type; 7760 if (dump_enabled_p ()) 7761 dump_printf_loc (MSG_NOTE, vect_location, 7762 "=== vectorizable_induction ===\n"); 7763 vect_model_induction_cost (stmt_info, ncopies); 7764 return true; 7765 } 7766 7767 /* Transform. */ 7768 7769 /* Compute a vector variable, initialized with the first VF values of 7770 the induction variable. E.g., for an iv with IV_PHI='X' and 7771 evolution S, for a vector of 4 units, we want to compute: 7772 [X, X + S, X + 2*S, X + 3*S]. */ 7773 7774 if (dump_enabled_p ()) 7775 dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n"); 7776 7777 latch_e = loop_latch_edge (iv_loop); 7778 loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e); 7779 7780 step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info); 7781 gcc_assert (step_expr != NULL_TREE); 7782 7783 pe = loop_preheader_edge (iv_loop); 7784 init_expr = PHI_ARG_DEF_FROM_EDGE (phi, 7785 loop_preheader_edge (iv_loop)); 7786 7787 stmts = NULL; 7788 if (!nested_in_vect_loop) 7789 { 7790 /* Convert the initial value to the desired type. */ 7791 tree new_type = TREE_TYPE (vectype); 7792 init_expr = gimple_convert (&stmts, new_type, init_expr); 7793 7794 /* If we are using the loop mask to "peel" for alignment then we need 7795 to adjust the start value here. */ 7796 tree skip_niters = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo); 7797 if (skip_niters != NULL_TREE) 7798 { 7799 if (FLOAT_TYPE_P (vectype)) 7800 skip_niters = gimple_build (&stmts, FLOAT_EXPR, new_type, 7801 skip_niters); 7802 else 7803 skip_niters = gimple_convert (&stmts, new_type, skip_niters); 7804 tree skip_step = gimple_build (&stmts, MULT_EXPR, new_type, 7805 skip_niters, step_expr); 7806 init_expr = gimple_build (&stmts, MINUS_EXPR, new_type, 7807 init_expr, skip_step); 7808 } 7809 } 7810 7811 /* Convert the step to the desired type. 
*/ 7812 step_expr = gimple_convert (&stmts, TREE_TYPE (vectype), step_expr); 7813 7814 if (stmts) 7815 { 7816 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts); 7817 gcc_assert (!new_bb); 7818 } 7819 7820 /* Find the first insertion point in the BB. */ 7821 si = gsi_after_labels (bb); 7822 7823 /* For SLP induction we have to generate several IVs as for example 7824 with group size 3 we need [i, i, i, i + S] [i + S, i + S, i + 2*S, i + 2*S] 7825 [i + 2*S, i + 3*S, i + 3*S, i + 3*S]. The step is the same uniform 7826 [VF*S, VF*S, VF*S, VF*S] for all. */ 7827 if (slp_node) 7828 { 7829 /* Enforced above. */ 7830 unsigned int const_nunits = nunits.to_constant (); 7831 7832 /* Generate [VF*S, VF*S, ... ]. */ 7833 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr))) 7834 { 7835 expr = build_int_cst (integer_type_node, vf); 7836 expr = fold_convert (TREE_TYPE (step_expr), expr); 7837 } 7838 else 7839 expr = build_int_cst (TREE_TYPE (step_expr), vf); 7840 new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr), 7841 expr, step_expr); 7842 if (! CONSTANT_CLASS_P (new_name)) 7843 new_name = vect_init_vector (phi, new_name, 7844 TREE_TYPE (step_expr), NULL); 7845 new_vec = build_vector_from_val (vectype, new_name); 7846 vec_step = vect_init_vector (phi, new_vec, vectype, NULL); 7847 7848 /* Now generate the IVs. */ 7849 unsigned group_size = SLP_TREE_SCALAR_STMTS (slp_node).length (); 7850 unsigned nvects = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node); 7851 unsigned elts = const_nunits * nvects; 7852 unsigned nivs = least_common_multiple (group_size, 7853 const_nunits) / const_nunits; 7854 gcc_assert (elts % group_size == 0); 7855 tree elt = init_expr; 7856 unsigned ivn; 7857 for (ivn = 0; ivn < nivs; ++ivn) 7858 { 7859 tree_vector_builder elts (vectype, const_nunits, 1); 7860 stmts = NULL; 7861 for (unsigned eltn = 0; eltn < const_nunits; ++eltn) 7862 { 7863 if (ivn*const_nunits + eltn >= group_size 7864 && (ivn * const_nunits + eltn) % group_size == 0) 7865 elt = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (elt), 7866 elt, step_expr); 7867 elts.quick_push (elt); 7868 } 7869 vec_init = gimple_build_vector (&stmts, &elts); 7870 if (stmts) 7871 { 7872 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts); 7873 gcc_assert (!new_bb); 7874 } 7875 7876 /* Create the induction-phi that defines the induction-operand. */ 7877 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_"); 7878 induction_phi = create_phi_node (vec_dest, iv_loop->header); 7879 set_vinfo_for_stmt (induction_phi, 7880 new_stmt_vec_info (induction_phi, loop_vinfo)); 7881 induc_def = PHI_RESULT (induction_phi); 7882 7883 /* Create the iv update inside the loop */ 7884 vec_def = make_ssa_name (vec_dest); 7885 new_stmt = gimple_build_assign (vec_def, PLUS_EXPR, induc_def, vec_step); 7886 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT); 7887 set_vinfo_for_stmt (new_stmt, new_stmt_vec_info (new_stmt, loop_vinfo)); 7888 7889 /* Set the arguments of the phi node: */ 7890 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION); 7891 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop), 7892 UNKNOWN_LOCATION); 7893 7894 SLP_TREE_VEC_STMTS (slp_node).quick_push (induction_phi); 7895 } 7896 7897 /* Re-use IVs when we can. */ 7898 if (ivn < nvects) 7899 { 7900 unsigned vfp 7901 = least_common_multiple (group_size, const_nunits) / group_size; 7902 /* Generate [VF'*S, VF'*S, ... ]. 
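For example, with group_size 3 and 4-element vectors we build lcm (3, 4) / 4 = 3 initial IVs and VF' = lcm (3, 4) / 3 = 4, so each reused IV is simply the IV created three vectors earlier with 4*S added to every element.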
*/ 7903 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr))) 7904 { 7905 expr = build_int_cst (integer_type_node, vfp); 7906 expr = fold_convert (TREE_TYPE (step_expr), expr); 7907 } 7908 else 7909 expr = build_int_cst (TREE_TYPE (step_expr), vfp); 7910 new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr), 7911 expr, step_expr); 7912 if (! CONSTANT_CLASS_P (new_name)) 7913 new_name = vect_init_vector (phi, new_name, 7914 TREE_TYPE (step_expr), NULL); 7915 new_vec = build_vector_from_val (vectype, new_name); 7916 vec_step = vect_init_vector (phi, new_vec, vectype, NULL); 7917 for (; ivn < nvects; ++ivn) 7918 { 7919 gimple *iv = SLP_TREE_VEC_STMTS (slp_node)[ivn - nivs]; 7920 tree def; 7921 if (gimple_code (iv) == GIMPLE_PHI) 7922 def = gimple_phi_result (iv); 7923 else 7924 def = gimple_assign_lhs (iv); 7925 new_stmt = gimple_build_assign (make_ssa_name (vectype), 7926 PLUS_EXPR, 7927 def, vec_step); 7928 if (gimple_code (iv) == GIMPLE_PHI) 7929 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT); 7930 else 7931 { 7932 gimple_stmt_iterator tgsi = gsi_for_stmt (iv); 7933 gsi_insert_after (&tgsi, new_stmt, GSI_CONTINUE_LINKING); 7934 } 7935 set_vinfo_for_stmt (new_stmt, 7936 new_stmt_vec_info (new_stmt, loop_vinfo)); 7937 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt); 7938 } 7939 } 7940 7941 return true; 7942 } 7943 7944 /* Create the vector that holds the initial_value of the induction. */ 7945 if (nested_in_vect_loop) 7946 { 7947 /* iv_loop is nested in the loop to be vectorized. init_expr had already 7948 been created during vectorization of previous stmts. We obtain it 7949 from the STMT_VINFO_VEC_STMT of the defining stmt. */ 7950 vec_init = vect_get_vec_def_for_operand (init_expr, phi); 7951 /* If the initial value is not of proper type, convert it. */ 7952 if (!useless_type_conversion_p (vectype, TREE_TYPE (vec_init))) 7953 { 7954 new_stmt 7955 = gimple_build_assign (vect_get_new_ssa_name (vectype, 7956 vect_simple_var, 7957 "vec_iv_"), 7958 VIEW_CONVERT_EXPR, 7959 build1 (VIEW_CONVERT_EXPR, vectype, 7960 vec_init)); 7961 vec_init = gimple_assign_lhs (new_stmt); 7962 new_bb = gsi_insert_on_edge_immediate (loop_preheader_edge (iv_loop), 7963 new_stmt); 7964 gcc_assert (!new_bb); 7965 set_vinfo_for_stmt (new_stmt, 7966 new_stmt_vec_info (new_stmt, loop_vinfo)); 7967 } 7968 } 7969 else 7970 { 7971 /* iv_loop is the loop to be vectorized. Create: 7972 vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr) */ 7973 stmts = NULL; 7974 new_name = gimple_convert (&stmts, TREE_TYPE (vectype), init_expr); 7975 7976 unsigned HOST_WIDE_INT const_nunits; 7977 if (nunits.is_constant (&const_nunits)) 7978 { 7979 tree_vector_builder elts (vectype, const_nunits, 1); 7980 elts.quick_push (new_name); 7981 for (i = 1; i < const_nunits; i++) 7982 { 7983 /* Create: new_name_i = new_name + step_expr */ 7984 new_name = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (new_name), 7985 new_name, step_expr); 7986 elts.quick_push (new_name); 7987 } 7988 /* Create a vector from [new_name_0, new_name_1, ..., 7989 new_name_nunits-1] */ 7990 vec_init = gimple_build_vector (&stmts, &elts); 7991 } 7992 else if (INTEGRAL_TYPE_P (TREE_TYPE (step_expr))) 7993 /* Build the initial value directly from a VEC_SERIES_EXPR. */ 7994 vec_init = gimple_build (&stmts, VEC_SERIES_EXPR, vectype, 7995 new_name, step_expr); 7996 else 7997 { 7998 /* Build: 7999 [base, base, base, ...] 8000 + (vectype) [0, 1, 2, ...] * [step, step, step, ...]. 
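This form is used for floating-point steps with variable-length vectors; it relies on -fassociative-math (asserted just below), since evaluating base + i * step reassociates the scalar additions.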
*/ 8001 gcc_assert (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr))); 8002 gcc_assert (flag_associative_math); 8003 tree index = build_index_vector (vectype, 0, 1); 8004 tree base_vec = gimple_build_vector_from_val (&stmts, vectype, 8005 new_name); 8006 tree step_vec = gimple_build_vector_from_val (&stmts, vectype, 8007 step_expr); 8008 vec_init = gimple_build (&stmts, FLOAT_EXPR, vectype, index); 8009 vec_init = gimple_build (&stmts, MULT_EXPR, vectype, 8010 vec_init, step_vec); 8011 vec_init = gimple_build (&stmts, PLUS_EXPR, vectype, 8012 vec_init, base_vec); 8013 } 8014 8015 if (stmts) 8016 { 8017 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts); 8018 gcc_assert (!new_bb); 8019 } 8020 } 8021 8022 8023 /* Create the vector that holds the step of the induction. */ 8024 if (nested_in_vect_loop) 8025 /* iv_loop is nested in the loop to be vectorized. Generate: 8026 vec_step = [S, S, S, S] */ 8027 new_name = step_expr; 8028 else 8029 { 8030 /* iv_loop is the loop to be vectorized. Generate: 8031 vec_step = [VF*S, VF*S, VF*S, VF*S] */ 8032 gimple_seq seq = NULL; 8033 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr))) 8034 { 8035 expr = build_int_cst (integer_type_node, vf); 8036 expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr); 8037 } 8038 else 8039 expr = build_int_cst (TREE_TYPE (step_expr), vf); 8040 new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr), 8041 expr, step_expr); 8042 if (seq) 8043 { 8044 new_bb = gsi_insert_seq_on_edge_immediate (pe, seq); 8045 gcc_assert (!new_bb); 8046 } 8047 } 8048 8049 t = unshare_expr (new_name); 8050 gcc_assert (CONSTANT_CLASS_P (new_name) 8051 || TREE_CODE (new_name) == SSA_NAME); 8052 new_vec = build_vector_from_val (vectype, t); 8053 vec_step = vect_init_vector (phi, new_vec, vectype, NULL); 8054 8055 8056 /* Create the following def-use cycle: 8057 loop prolog: 8058 vec_init = ... 8059 vec_step = ... 8060 loop: 8061 vec_iv = PHI <vec_init, vec_loop> 8062 ... 8063 STMT 8064 ... 8065 vec_loop = vec_iv + vec_step; */ 8066 8067 /* Create the induction-phi that defines the induction-operand. */ 8068 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_"); 8069 induction_phi = create_phi_node (vec_dest, iv_loop->header); 8070 set_vinfo_for_stmt (induction_phi, 8071 new_stmt_vec_info (induction_phi, loop_vinfo)); 8072 induc_def = PHI_RESULT (induction_phi); 8073 8074 /* Create the iv update inside the loop */ 8075 vec_def = make_ssa_name (vec_dest); 8076 new_stmt = gimple_build_assign (vec_def, PLUS_EXPR, induc_def, vec_step); 8077 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT); 8078 set_vinfo_for_stmt (new_stmt, new_stmt_vec_info (new_stmt, loop_vinfo)); 8079 8080 /* Set the arguments of the phi node: */ 8081 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION); 8082 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop), 8083 UNKNOWN_LOCATION); 8084 8085 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = induction_phi; 8086 8087 /* In case that vectorization factor (VF) is bigger than the number 8088 of elements that we can fit in a vectype (nunits), we have to generate 8089 more than one vector stmt - i.e - we need to "unroll" the 8090 vector stmt by a factor VF/nunits. For more details see documentation 8091 in vectorizable_operation. */ 8092 8093 if (ncopies > 1) 8094 { 8095 gimple_seq seq = NULL; 8096 stmt_vec_info prev_stmt_vinfo; 8097 /* FORNOW. This restriction should be relaxed. */ 8098 gcc_assert (!nested_in_vect_loop); 8099 8100 /* Create the vector that holds the step of the induction. 
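Within one vector iteration each extra copy advances by one vector's worth of elements; e.g. for 4-element vectors the per-copy step is [4*S, 4*S, 4*S, 4*S], so the second copy holds [X + 4*S, X + 5*S, X + 6*S, X + 7*S].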
*/ 8101 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr))) 8102 { 8103 expr = build_int_cst (integer_type_node, nunits); 8104 expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr); 8105 } 8106 else 8107 expr = build_int_cst (TREE_TYPE (step_expr), nunits); 8108 new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr), 8109 expr, step_expr); 8110 if (seq) 8111 { 8112 new_bb = gsi_insert_seq_on_edge_immediate (pe, seq); 8113 gcc_assert (!new_bb); 8114 } 8115 8116 t = unshare_expr (new_name); 8117 gcc_assert (CONSTANT_CLASS_P (new_name) 8118 || TREE_CODE (new_name) == SSA_NAME); 8119 new_vec = build_vector_from_val (vectype, t); 8120 vec_step = vect_init_vector (phi, new_vec, vectype, NULL); 8121 8122 vec_def = induc_def; 8123 prev_stmt_vinfo = vinfo_for_stmt (induction_phi); 8124 for (i = 1; i < ncopies; i++) 8125 { 8126 /* vec_i = vec_prev + vec_step */ 8127 new_stmt = gimple_build_assign (vec_dest, PLUS_EXPR, 8128 vec_def, vec_step); 8129 vec_def = make_ssa_name (vec_dest, new_stmt); 8130 gimple_assign_set_lhs (new_stmt, vec_def); 8131 8132 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT); 8133 set_vinfo_for_stmt (new_stmt, 8134 new_stmt_vec_info (new_stmt, loop_vinfo)); 8135 STMT_VINFO_RELATED_STMT (prev_stmt_vinfo) = new_stmt; 8136 prev_stmt_vinfo = vinfo_for_stmt (new_stmt); 8137 } 8138 } 8139 8140 if (nested_in_vect_loop) 8141 { 8142 /* Find the loop-closed exit-phi of the induction, and record 8143 the final vector of induction results: */ 8144 exit_phi = NULL; 8145 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg) 8146 { 8147 gimple *use_stmt = USE_STMT (use_p); 8148 if (is_gimple_debug (use_stmt)) 8149 continue; 8150 8151 if (!flow_bb_inside_loop_p (iv_loop, gimple_bb (use_stmt))) 8152 { 8153 exit_phi = use_stmt; 8154 break; 8155 } 8156 } 8157 if (exit_phi) 8158 { 8159 stmt_vec_info stmt_vinfo = vinfo_for_stmt (exit_phi); 8160 /* FORNOW. Currently not supporting the case that an inner-loop induction 8161 is not used in the outer-loop (i.e. only outside the outer-loop). */ 8162 gcc_assert (STMT_VINFO_RELEVANT_P (stmt_vinfo) 8163 && !STMT_VINFO_LIVE_P (stmt_vinfo)); 8164 8165 STMT_VINFO_VEC_STMT (stmt_vinfo) = new_stmt; 8166 if (dump_enabled_p ()) 8167 { 8168 dump_printf_loc (MSG_NOTE, vect_location, 8169 "vector of inductions after inner-loop:"); 8170 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, new_stmt, 0); 8171 } 8172 } 8173 } 8174 8175 8176 if (dump_enabled_p ()) 8177 { 8178 dump_printf_loc (MSG_NOTE, vect_location, 8179 "transform induction: created def-use cycle: "); 8180 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, induction_phi, 0); 8181 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, 8182 SSA_NAME_DEF_STMT (vec_def), 0); 8183 } 8184 8185 return true; 8186 } 8187 8188 /* Function vectorizable_live_operation. 8189 8190 STMT computes a value that is used outside the loop. Check if 8191 it can be supported. 
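When it can, the live value is obtained after the loop by extracting a single lane of the final vector result: the lane holding the last scalar occurrence (normally the last lane), or the last active lane when the loop is fully masked.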
*/ 8192 8193 bool 8194 vectorizable_live_operation (gimple *stmt, 8195 gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED, 8196 slp_tree slp_node, int slp_index, 8197 gimple **vec_stmt) 8198 { 8199 stmt_vec_info stmt_info = vinfo_for_stmt (stmt); 8200 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info); 8201 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo); 8202 imm_use_iterator imm_iter; 8203 tree lhs, lhs_type, bitsize, vec_bitsize; 8204 tree vectype = STMT_VINFO_VECTYPE (stmt_info); 8205 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype); 8206 int ncopies; 8207 gimple *use_stmt; 8208 auto_vec<tree> vec_oprnds; 8209 int vec_entry = 0; 8210 poly_uint64 vec_index = 0; 8211 8212 gcc_assert (STMT_VINFO_LIVE_P (stmt_info)); 8213 8214 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def) 8215 return false; 8216 8217 /* FORNOW. CHECKME. */ 8218 if (nested_in_vect_loop_p (loop, stmt)) 8219 return false; 8220 8221 /* If STMT is not relevant and it is a simple assignment and its inputs are 8222 invariant then it can remain in place, unvectorized. The original last 8223 scalar value that it computes will be used. */ 8224 if (!STMT_VINFO_RELEVANT_P (stmt_info)) 8225 { 8226 gcc_assert (is_simple_and_all_uses_invariant (stmt, loop_vinfo)); 8227 if (dump_enabled_p ()) 8228 dump_printf_loc (MSG_NOTE, vect_location, 8229 "statement is simple and uses invariant. Leaving in " 8230 "place.\n"); 8231 return true; 8232 } 8233 8234 if (slp_node) 8235 ncopies = 1; 8236 else 8237 ncopies = vect_get_num_copies (loop_vinfo, vectype); 8238 8239 if (slp_node) 8240 { 8241 gcc_assert (slp_index >= 0); 8242 8243 int num_scalar = SLP_TREE_SCALAR_STMTS (slp_node).length (); 8244 int num_vec = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node); 8245 8246 /* Get the last occurrence of the scalar index from the concatenation of 8247 all the slp vectors. Calculate which slp vector it is and the index 8248 within. */ 8249 poly_uint64 pos = (num_vec * nunits) - num_scalar + slp_index; 8250 8251 /* Calculate which vector contains the result, and which lane of 8252 that vector we need. */ 8253 if (!can_div_trunc_p (pos, nunits, &vec_entry, &vec_index)) 8254 { 8255 if (dump_enabled_p ()) 8256 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 8257 "Cannot determine which vector holds the" 8258 " final result.\n"); 8259 return false; 8260 } 8261 } 8262 8263 if (!vec_stmt) 8264 { 8265 /* No transformation required. 
*/ 8266 if (LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo)) 8267 { 8268 if (!direct_internal_fn_supported_p (IFN_EXTRACT_LAST, vectype, 8269 OPTIMIZE_FOR_SPEED)) 8270 { 8271 if (dump_enabled_p ()) 8272 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 8273 "can't use a fully-masked loop because " 8274 "the target doesn't support extract last " 8275 "reduction.\n"); 8276 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false; 8277 } 8278 else if (slp_node) 8279 { 8280 if (dump_enabled_p ()) 8281 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 8282 "can't use a fully-masked loop because an " 8283 "SLP statement is live after the loop.\n"); 8284 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false; 8285 } 8286 else if (ncopies > 1) 8287 { 8288 if (dump_enabled_p ()) 8289 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 8290 "can't use a fully-masked loop because" 8291 " ncopies is greater than 1.\n"); 8292 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false; 8293 } 8294 else 8295 { 8296 gcc_assert (ncopies == 1 && !slp_node); 8297 vect_record_loop_mask (loop_vinfo, 8298 &LOOP_VINFO_MASKS (loop_vinfo), 8299 1, vectype); 8300 } 8301 } 8302 return true; 8303 } 8304 8305 /* If stmt has a related stmt, then use that for getting the lhs. */ 8306 if (is_pattern_stmt_p (stmt_info)) 8307 stmt = STMT_VINFO_RELATED_STMT (stmt_info); 8308 8309 lhs = (is_a <gphi *> (stmt)) ? gimple_phi_result (stmt) 8310 : gimple_get_lhs (stmt); 8311 lhs_type = TREE_TYPE (lhs); 8312 8313 bitsize = (VECTOR_BOOLEAN_TYPE_P (vectype) 8314 ? bitsize_int (TYPE_PRECISION (TREE_TYPE (vectype))) 8315 : TYPE_SIZE (TREE_TYPE (vectype))); 8316 vec_bitsize = TYPE_SIZE (vectype); 8317 8318 /* Get the vectorized lhs of STMT and the lane to use (counted in bits). */ 8319 tree vec_lhs, bitstart; 8320 if (slp_node) 8321 { 8322 gcc_assert (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)); 8323 8324 /* Get the correct slp vectorized stmt. */ 8325 gimple *vec_stmt = SLP_TREE_VEC_STMTS (slp_node)[vec_entry]; 8326 if (gphi *phi = dyn_cast <gphi *> (vec_stmt)) 8327 vec_lhs = gimple_phi_result (phi); 8328 else 8329 vec_lhs = gimple_get_lhs (vec_stmt); 8330 8331 /* Get entry to use. */ 8332 bitstart = bitsize_int (vec_index); 8333 bitstart = int_const_binop (MULT_EXPR, bitsize, bitstart); 8334 } 8335 else 8336 { 8337 enum vect_def_type dt = STMT_VINFO_DEF_TYPE (stmt_info); 8338 vec_lhs = vect_get_vec_def_for_operand_1 (stmt, dt); 8339 gcc_checking_assert (ncopies == 1 8340 || !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)); 8341 8342 /* For multiple copies, get the last copy. */ 8343 for (int i = 1; i < ncopies; ++i) 8344 vec_lhs = vect_get_vec_def_for_stmt_copy (vect_unknown_def_type, 8345 vec_lhs); 8346 8347 /* Get the last lane in the vector. */ 8348 bitstart = int_const_binop (MINUS_EXPR, vec_bitsize, bitsize); 8349 } 8350 8351 gimple_seq stmts = NULL; 8352 tree new_tree; 8353 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)) 8354 { 8355 /* Emit: 8356 8357 SCALAR_RES = EXTRACT_LAST <VEC_LHS, MASK> 8358 8359 where VEC_LHS is the vectorized live-out result and MASK is 8360 the loop mask for the final iteration. 
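For example, if only three of four lanes are active in the final iteration, MASK is { 1, 1, 1, 0 } and the call yields lane 2 of VEC_LHS.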
*/ 8361 gcc_assert (ncopies == 1 && !slp_node); 8362 tree scalar_type = TREE_TYPE (STMT_VINFO_VECTYPE (stmt_info)); 8363 tree scalar_res = make_ssa_name (scalar_type); 8364 tree mask = vect_get_loop_mask (gsi, &LOOP_VINFO_MASKS (loop_vinfo), 8365 1, vectype, 0); 8366 gcall *new_stmt = gimple_build_call_internal (IFN_EXTRACT_LAST, 8367 2, mask, vec_lhs); 8368 gimple_call_set_lhs (new_stmt, scalar_res); 8369 gimple_seq_add_stmt (&stmts, new_stmt); 8370 8371 /* Convert the extracted vector element to the required scalar type. */ 8372 new_tree = gimple_convert (&stmts, lhs_type, scalar_res); 8373 } 8374 else 8375 { 8376 tree bftype = TREE_TYPE (vectype); 8377 if (VECTOR_BOOLEAN_TYPE_P (vectype)) 8378 bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1); 8379 new_tree = build3 (BIT_FIELD_REF, bftype, vec_lhs, bitsize, bitstart); 8380 new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree), 8381 &stmts, true, NULL_TREE); 8382 } 8383 8384 if (stmts) 8385 gsi_insert_seq_on_edge_immediate (single_exit (loop), stmts); 8386 8387 /* Replace use of lhs with newly computed result. If the use stmt is a 8388 single arg PHI, just replace all uses of PHI result. It's necessary 8389 because lcssa PHI defining lhs may be before newly inserted stmt. */ 8390 use_operand_p use_p; 8391 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs) 8392 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)) 8393 && !is_gimple_debug (use_stmt)) 8394 { 8395 if (gimple_code (use_stmt) == GIMPLE_PHI 8396 && gimple_phi_num_args (use_stmt) == 1) 8397 { 8398 replace_uses_by (gimple_phi_result (use_stmt), new_tree); 8399 } 8400 else 8401 { 8402 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter) 8403 SET_USE (use_p, new_tree); 8404 } 8405 update_stmt (use_stmt); 8406 } 8407 8408 return true; 8409 } 8410 8411 /* Kill any debug uses outside LOOP of SSA names defined in STMT. */ 8412 8413 static void 8414 vect_loop_kill_debug_uses (struct loop *loop, gimple *stmt) 8415 { 8416 ssa_op_iter op_iter; 8417 imm_use_iterator imm_iter; 8418 def_operand_p def_p; 8419 gimple *ustmt; 8420 8421 FOR_EACH_PHI_OR_STMT_DEF (def_p, stmt, op_iter, SSA_OP_DEF) 8422 { 8423 FOR_EACH_IMM_USE_STMT (ustmt, imm_iter, DEF_FROM_PTR (def_p)) 8424 { 8425 basic_block bb; 8426 8427 if (!is_gimple_debug (ustmt)) 8428 continue; 8429 8430 bb = gimple_bb (ustmt); 8431 8432 if (!flow_bb_inside_loop_p (loop, bb)) 8433 { 8434 if (gimple_debug_bind_p (ustmt)) 8435 { 8436 if (dump_enabled_p ()) 8437 dump_printf_loc (MSG_NOTE, vect_location, 8438 "killing debug use\n"); 8439 8440 gimple_debug_bind_reset_value (ustmt); 8441 update_stmt (ustmt); 8442 } 8443 else 8444 gcc_unreachable (); 8445 } 8446 } 8447 } 8448 } 8449 8450 /* Given loop represented by LOOP_VINFO, return true if computation of 8451 LOOP_VINFO_NITERS (= LOOP_VINFO_NITERSM1 + 1) doesn't overflow, false 8452 otherwise. */ 8453 8454 static bool 8455 loop_niters_no_overflow (loop_vec_info loop_vinfo) 8456 { 8457 /* Constant case. */ 8458 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)) 8459 { 8460 tree cst_niters = LOOP_VINFO_NITERS (loop_vinfo); 8461 tree cst_nitersm1 = LOOP_VINFO_NITERSM1 (loop_vinfo); 8462 8463 gcc_assert (TREE_CODE (cst_niters) == INTEGER_CST); 8464 gcc_assert (TREE_CODE (cst_nitersm1) == INTEGER_CST); 8465 if (wi::to_widest (cst_nitersm1) < wi::to_widest (cst_niters)) 8466 return true; 8467 } 8468 8469 widest_int max; 8470 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo); 8471 /* Check the upper bound of loop niters. 
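NITERS = NITERSM1 + 1 can only wrap around if NITERSM1 is the maximum value of its type, so it is enough to know that the bound on the latch execution count is strictly below that maximum.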
*/ 8472 if (get_max_loop_iterations (loop, &max)) 8473 { 8474 tree type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)); 8475 signop sgn = TYPE_SIGN (type); 8476 widest_int type_max = widest_int::from (wi::max_value (type), sgn); 8477 if (max < type_max) 8478 return true; 8479 } 8480 return false; 8481 } 8482 8483 /* Return a mask type with half the number of elements as TYPE. */ 8484 8485 tree 8486 vect_halve_mask_nunits (tree type) 8487 { 8488 poly_uint64 nunits = exact_div (TYPE_VECTOR_SUBPARTS (type), 2); 8489 return build_truth_vector_type (nunits, current_vector_size); 8490 } 8491 8492 /* Return a mask type with twice as many elements as TYPE. */ 8493 8494 tree 8495 vect_double_mask_nunits (tree type) 8496 { 8497 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (type) * 2; 8498 return build_truth_vector_type (nunits, current_vector_size); 8499 } 8500 8501 /* Record that a fully-masked version of LOOP_VINFO would need MASKS to 8502 contain a sequence of NVECTORS masks that each control a vector of type 8503 VECTYPE. */ 8504 8505 void 8506 vect_record_loop_mask (loop_vec_info loop_vinfo, vec_loop_masks *masks, 8507 unsigned int nvectors, tree vectype) 8508 { 8509 gcc_assert (nvectors != 0); 8510 if (masks->length () < nvectors) 8511 masks->safe_grow_cleared (nvectors); 8512 rgroup_masks *rgm = &(*masks)[nvectors - 1]; 8513 /* The number of scalars per iteration and the number of vectors are 8514 both compile-time constants. */ 8515 unsigned int nscalars_per_iter 8516 = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype), 8517 LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant (); 8518 if (rgm->max_nscalars_per_iter < nscalars_per_iter) 8519 { 8520 rgm->max_nscalars_per_iter = nscalars_per_iter; 8521 rgm->mask_type = build_same_sized_truth_vector_type (vectype); 8522 } 8523 } 8524 8525 /* Given a complete set of masks MASKS, extract mask number INDEX 8526 for an rgroup that operates on NVECTORS vectors of type VECTYPE, 8527 where 0 <= INDEX < NVECTORS. Insert any set-up statements before GSI. 8528 8529 See the comment above vec_loop_masks for more details about the mask 8530 arrangement. */ 8531 8532 tree 8533 vect_get_loop_mask (gimple_stmt_iterator *gsi, vec_loop_masks *masks, 8534 unsigned int nvectors, tree vectype, unsigned int index) 8535 { 8536 rgroup_masks *rgm = &(*masks)[nvectors - 1]; 8537 tree mask_type = rgm->mask_type; 8538 8539 /* Populate the rgroup's mask array, if this is the first time we've 8540 used it. */ 8541 if (rgm->masks.is_empty ()) 8542 { 8543 rgm->masks.safe_grow_cleared (nvectors); 8544 for (unsigned int i = 0; i < nvectors; ++i) 8545 { 8546 tree mask = make_temp_ssa_name (mask_type, NULL, "loop_mask"); 8547 /* Provide a dummy definition until the real one is available. */ 8548 SSA_NAME_DEF_STMT (mask) = gimple_build_nop (); 8549 rgm->masks[i] = mask; 8550 } 8551 } 8552 8553 tree mask = rgm->masks[index]; 8554 if (maybe_ne (TYPE_VECTOR_SUBPARTS (mask_type), 8555 TYPE_VECTOR_SUBPARTS (vectype))) 8556 { 8557 /* A loop mask for data type X can be reused for data type Y 8558 if X has N times more elements than Y and if Y's elements 8559 are N times bigger than X's. In this case each sequence 8560 of N elements in the loop mask will be all-zero or all-one. 8561 We can then view-convert the mask so that each sequence of 8562 N elements is replaced by a single element. 
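For example, a mask computed for eight 16-bit elements can also control four 32-bit elements: the eight mask elements come in pairs that are either both zero or both one, and the view-convert turns each pair into a single element of the four-element mask type.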
*/ 8563 gcc_assert (multiple_p (TYPE_VECTOR_SUBPARTS (mask_type), 8564 TYPE_VECTOR_SUBPARTS (vectype))); 8565 gimple_seq seq = NULL; 8566 mask_type = build_same_sized_truth_vector_type (vectype); 8567 mask = gimple_build (&seq, VIEW_CONVERT_EXPR, mask_type, mask); 8568 if (seq) 8569 gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT); 8570 } 8571 return mask; 8572 } 8573 8574 /* Scale profiling counters by estimation for LOOP which is vectorized 8575 by factor VF. */ 8576 8577 static void 8578 scale_profile_for_vect_loop (struct loop *loop, unsigned vf) 8579 { 8580 edge preheader = loop_preheader_edge (loop); 8581 /* Reduce loop iterations by the vectorization factor. */ 8582 gcov_type new_est_niter = niter_for_unrolled_loop (loop, vf); 8583 profile_count freq_h = loop->header->count, freq_e = preheader->count (); 8584 8585 if (freq_h.nonzero_p ()) 8586 { 8587 profile_probability p; 8588 8589 /* Avoid dropping loop body profile counter to 0 because of zero count 8590 in loop's preheader. */ 8591 if (!(freq_e == profile_count::zero ())) 8592 freq_e = freq_e.force_nonzero (); 8593 p = freq_e.apply_scale (new_est_niter + 1, 1).probability_in (freq_h); 8594 scale_loop_frequencies (loop, p); 8595 } 8596 8597 edge exit_e = single_exit (loop); 8598 exit_e->probability = profile_probability::always () 8599 .apply_scale (1, new_est_niter + 1); 8600 8601 edge exit_l = single_pred_edge (loop->latch); 8602 profile_probability prob = exit_l->probability; 8603 exit_l->probability = exit_e->probability.invert (); 8604 if (prob.initialized_p () && exit_l->probability.initialized_p ()) 8605 scale_bbs_frequencies (&loop->latch, 1, exit_l->probability / prob); 8606 } 8607 8608 /* Function vect_transform_loop. 8609 8610 The analysis phase has determined that the loop is vectorizable. 8611 Vectorize the loop - create vectorized stmts to replace the scalar 8612 stmts in the loop, and update the loop exit condition. 8613 Returns the scalar epilogue loop, if any. */ 8614 8615 struct loop * 8616 vect_transform_loop (loop_vec_info loop_vinfo) 8617 { 8618 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo); 8619 struct loop *epilogue = NULL; 8620 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo); 8621 int nbbs = loop->num_nodes; 8622 int i; 8623 tree niters_vector = NULL_TREE; 8624 tree step_vector = NULL_TREE; 8625 tree niters_vector_mult_vf = NULL_TREE; 8626 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo); 8627 unsigned int lowest_vf = constant_lower_bound (vf); 8628 bool grouped_store; 8629 bool slp_scheduled = false; 8630 gimple *stmt, *pattern_stmt; 8631 gimple_seq pattern_def_seq = NULL; 8632 gimple_stmt_iterator pattern_def_si = gsi_none (); 8633 bool transform_pattern_stmt = false; 8634 bool check_profitability = false; 8635 unsigned int th; 8636 8637 if (dump_enabled_p ()) 8638 dump_printf_loc (MSG_NOTE, vect_location, "=== vec_transform_loop ===\n"); 8639 8640 /* Use the more conservative vectorization threshold. If the number 8641 of iterations is constant assume the cost check has been performed 8642 by our caller. If the threshold makes all loops profitable that 8643 run at least the (estimated) vectorization factor number of times, 8644 checking is pointless, too.
*/ 8645 th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo); 8646 if (th >= vect_vf_for_cost (loop_vinfo) 8647 && !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)) 8648 { 8649 if (dump_enabled_p ()) 8650 dump_printf_loc (MSG_NOTE, vect_location, 8651 "Profitability threshold is %d loop iterations.\n", 8652 th); 8653 check_profitability = true; 8654 } 8655 8656 /* Make sure there exists a single-predecessor exit bb. Do this before 8657 versioning. */ 8658 edge e = single_exit (loop); 8659 if (! single_pred_p (e->dest)) 8660 { 8661 split_loop_exit_edge (e); 8662 if (dump_enabled_p ()) 8663 dump_printf (MSG_NOTE, "split exit edge\n"); 8664 } 8665 8666 /* Version the loop first, if required, so the profitability check 8667 comes first. */ 8668 8669 if (LOOP_REQUIRES_VERSIONING (loop_vinfo)) 8670 { 8671 poly_uint64 versioning_threshold 8672 = LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo); 8673 if (check_profitability 8674 && ordered_p (poly_uint64 (th), versioning_threshold)) 8675 { 8676 versioning_threshold = ordered_max (poly_uint64 (th), 8677 versioning_threshold); 8678 check_profitability = false; 8679 } 8680 vect_loop_versioning (loop_vinfo, th, check_profitability, 8681 versioning_threshold); 8682 check_profitability = false; 8683 } 8684 8685 /* Make sure there exists a single-predecessor exit bb also on the 8686 scalar loop copy. Do this after versioning but before peeling 8687 so CFG structure is fine for both scalar and if-converted loop 8688 to make slpeel_duplicate_current_defs_from_edges face matched 8689 loop closed PHI nodes on the exit. */ 8690 if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo)) 8691 { 8692 e = single_exit (LOOP_VINFO_SCALAR_LOOP (loop_vinfo)); 8693 if (! single_pred_p (e->dest)) 8694 { 8695 split_loop_exit_edge (e); 8696 if (dump_enabled_p ()) 8697 dump_printf (MSG_NOTE, "split exit edge of scalar loop\n"); 8698 } 8699 } 8700 8701 tree niters = vect_build_loop_niters (loop_vinfo); 8702 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = niters; 8703 tree nitersm1 = unshare_expr (LOOP_VINFO_NITERSM1 (loop_vinfo)); 8704 bool niters_no_overflow = loop_niters_no_overflow (loop_vinfo); 8705 epilogue = vect_do_peeling (loop_vinfo, niters, nitersm1, &niters_vector, 8706 &step_vector, &niters_vector_mult_vf, th, 8707 check_profitability, niters_no_overflow); 8708 8709 if (niters_vector == NULL_TREE) 8710 { 8711 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) 8712 && !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo) 8713 && known_eq (lowest_vf, vf)) 8714 { 8715 niters_vector 8716 = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)), 8717 LOOP_VINFO_INT_NITERS (loop_vinfo) / lowest_vf); 8718 step_vector = build_one_cst (TREE_TYPE (niters)); 8719 } 8720 else 8721 vect_gen_vector_loop_niters (loop_vinfo, niters, &niters_vector, 8722 &step_vector, niters_no_overflow); 8723 } 8724 8725 /* 1) Make sure the loop header has exactly two entries 8726 2) Make sure we have a preheader basic block. */ 8727 8728 gcc_assert (EDGE_COUNT (loop->header->preds) == 2); 8729 8730 split_edge (loop_preheader_edge (loop)); 8731 8732 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo) 8733 && vect_use_loop_mask_for_alignment_p (loop_vinfo)) 8734 /* This will deal with any possible peeling. */ 8735 vect_prepare_for_masked_peels (loop_vinfo); 8736 8737 /* FORNOW: the vectorizer supports only loops whose body consists 8738 of one basic block (header + empty latch). When the vectorizer 8739 supports more involved loop forms, the order in which the BBs are 8740 traversed will need to be reconsidered.
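An illustrative example of the supported shape: a loop such as for (i = 0; i < n; i++) a[i] = c[i] ? b[i] : 0; is handled only after if-conversion has turned the conditional into straight-line code, so that the loop body consists of a single basic block followed by an empty latch.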
*/ 8741 8742 for (i = 0; i < nbbs; i++) 8743 { 8744 basic_block bb = bbs[i]; 8745 stmt_vec_info stmt_info; 8746 8747 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si); 8748 gsi_next (&si)) 8749 { 8750 gphi *phi = si.phi (); 8751 if (dump_enabled_p ()) 8752 { 8753 dump_printf_loc (MSG_NOTE, vect_location, 8754 "------>vectorizing phi: "); 8755 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0); 8756 } 8757 stmt_info = vinfo_for_stmt (phi); 8758 if (!stmt_info) 8759 continue; 8760 8761 if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info)) 8762 vect_loop_kill_debug_uses (loop, phi); 8763 8764 if (!STMT_VINFO_RELEVANT_P (stmt_info) 8765 && !STMT_VINFO_LIVE_P (stmt_info)) 8766 continue; 8767 8768 if (STMT_VINFO_VECTYPE (stmt_info) 8769 && (maybe_ne 8770 (TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info)), vf)) 8771 && dump_enabled_p ()) 8772 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n"); 8773 8774 if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def 8775 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def 8776 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle) 8777 && ! PURE_SLP_STMT (stmt_info)) 8778 { 8779 if (dump_enabled_p ()) 8780 dump_printf_loc (MSG_NOTE, vect_location, "transform phi.\n"); 8781 vect_transform_stmt (phi, NULL, NULL, NULL, NULL); 8782 } 8783 } 8784 8785 pattern_stmt = NULL; 8786 for (gimple_stmt_iterator si = gsi_start_bb (bb); 8787 !gsi_end_p (si) || transform_pattern_stmt;) 8788 { 8789 bool is_store; 8790 8791 if (transform_pattern_stmt) 8792 stmt = pattern_stmt; 8793 else 8794 { 8795 stmt = gsi_stmt (si); 8796 /* During vectorization remove existing clobber stmts. */ 8797 if (gimple_clobber_p (stmt)) 8798 { 8799 unlink_stmt_vdef (stmt); 8800 gsi_remove (&si, true); 8801 release_defs (stmt); 8802 continue; 8803 } 8804 } 8805 8806 if (dump_enabled_p ()) 8807 { 8808 dump_printf_loc (MSG_NOTE, vect_location, 8809 "------>vectorizing statement: "); 8810 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0); 8811 } 8812 8813 stmt_info = vinfo_for_stmt (stmt); 8814 8815 /* vector stmts created in the outer-loop during vectorization of 8816 stmts in an inner-loop may not have a stmt_info, and do not 8817 need to be vectorized. */ 8818 if (!stmt_info) 8819 { 8820 gsi_next (&si); 8821 continue; 8822 } 8823 8824 if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info)) 8825 vect_loop_kill_debug_uses (loop, stmt); 8826 8827 if (!STMT_VINFO_RELEVANT_P (stmt_info) 8828 && !STMT_VINFO_LIVE_P (stmt_info)) 8829 { 8830 if (STMT_VINFO_IN_PATTERN_P (stmt_info) 8831 && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info)) 8832 && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt)) 8833 || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt)))) 8834 { 8835 stmt = pattern_stmt; 8836 stmt_info = vinfo_for_stmt (stmt); 8837 } 8838 else 8839 { 8840 gsi_next (&si); 8841 continue; 8842 } 8843 } 8844 else if (STMT_VINFO_IN_PATTERN_P (stmt_info) 8845 && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info)) 8846 && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt)) 8847 || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt)))) 8848 transform_pattern_stmt = true; 8849 8850 /* If pattern statement has def stmts, vectorize them too. 
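For instance (an illustrative case; the details depend on the pattern recognizer), a widening computation such as int_c = (int) short_a * (int) short_b may be replaced by a pattern statement whose STMT_VINFO_PATTERN_DEF_SEQ holds auxiliary definitions such as intermediate conversions; the code below walks that sequence and vectorizes each relevant definition before the pattern statement itself.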
*/ 8851 if (is_pattern_stmt_p (stmt_info)) 8852 { 8853 if (pattern_def_seq == NULL) 8854 { 8855 pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info); 8856 pattern_def_si = gsi_start (pattern_def_seq); 8857 } 8858 else if (!gsi_end_p (pattern_def_si)) 8859 gsi_next (&pattern_def_si); 8860 if (pattern_def_seq != NULL) 8861 { 8862 gimple *pattern_def_stmt = NULL; 8863 stmt_vec_info pattern_def_stmt_info = NULL; 8864 8865 while (!gsi_end_p (pattern_def_si)) 8866 { 8867 pattern_def_stmt = gsi_stmt (pattern_def_si); 8868 pattern_def_stmt_info 8869 = vinfo_for_stmt (pattern_def_stmt); 8870 if (STMT_VINFO_RELEVANT_P (pattern_def_stmt_info) 8871 || STMT_VINFO_LIVE_P (pattern_def_stmt_info)) 8872 break; 8873 gsi_next (&pattern_def_si); 8874 } 8875 8876 if (!gsi_end_p (pattern_def_si)) 8877 { 8878 if (dump_enabled_p ()) 8879 { 8880 dump_printf_loc (MSG_NOTE, vect_location, 8881 "==> vectorizing pattern def " 8882 "stmt: "); 8883 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, 8884 pattern_def_stmt, 0); 8885 } 8886 8887 stmt = pattern_def_stmt; 8888 stmt_info = pattern_def_stmt_info; 8889 } 8890 else 8891 { 8892 pattern_def_si = gsi_none (); 8893 transform_pattern_stmt = false; 8894 } 8895 } 8896 else 8897 transform_pattern_stmt = false; 8898 } 8899 8900 if (STMT_VINFO_VECTYPE (stmt_info)) 8901 { 8902 poly_uint64 nunits 8903 = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info)); 8904 if (!STMT_SLP_TYPE (stmt_info) 8905 && maybe_ne (nunits, vf) 8906 && dump_enabled_p ()) 8907 /* For SLP VF is set according to unrolling factor, and not 8908 to vector size, hence for SLP this print is not valid. */ 8909 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n"); 8910 } 8911 8912 /* SLP. Schedule all the SLP instances when the first SLP stmt is 8913 reached. */ 8914 if (STMT_SLP_TYPE (stmt_info)) 8915 { 8916 if (!slp_scheduled) 8917 { 8918 slp_scheduled = true; 8919 8920 if (dump_enabled_p ()) 8921 dump_printf_loc (MSG_NOTE, vect_location, 8922 "=== scheduling SLP instances ===\n"); 8923 8924 vect_schedule_slp (loop_vinfo); 8925 } 8926 8927 /* Hybrid SLP stmts must be vectorized in addition to SLP. */ 8928 if (!vinfo_for_stmt (stmt) || PURE_SLP_STMT (stmt_info)) 8929 { 8930 if (!transform_pattern_stmt && gsi_end_p (pattern_def_si)) 8931 { 8932 pattern_def_seq = NULL; 8933 gsi_next (&si); 8934 } 8935 continue; 8936 } 8937 } 8938 8939 /* -------- vectorize statement ------------ */ 8940 if (dump_enabled_p ()) 8941 dump_printf_loc (MSG_NOTE, vect_location, "transform statement.\n"); 8942 8943 grouped_store = false; 8944 is_store = vect_transform_stmt (stmt, &si, &grouped_store, NULL, NULL); 8945 if (is_store) 8946 { 8947 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)) 8948 { 8949 /* Interleaving. If IS_STORE is TRUE, the vectorization of the 8950 interleaving chain was completed - free all the stores in 8951 the chain. */ 8952 gsi_next (&si); 8953 vect_remove_stores (GROUP_FIRST_ELEMENT (stmt_info)); 8954 } 8955 else 8956 { 8957 /* Free the attached stmt_vec_info and remove the stmt. */ 8958 gimple *store = gsi_stmt (si); 8959 free_stmt_vec_info (store); 8960 unlink_stmt_vdef (store); 8961 gsi_remove (&si, true); 8962 release_defs (store); 8963 } 8964 8965 /* Stores can only appear at the end of pattern statements. 
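Hence, once a store has been vectorized there is no further pattern statement left to transform for this scalar statement, which the assertion below relies on, and the pattern definition sequence can be reset.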
*/ 8966 gcc_assert (!transform_pattern_stmt); 8967 pattern_def_seq = NULL; 8968 } 8969 else if (!transform_pattern_stmt && gsi_end_p (pattern_def_si)) 8970 { 8971 pattern_def_seq = NULL; 8972 gsi_next (&si); 8973 } 8974 } /* stmts in BB */ 8975 8976 /* Stub out scalar statements that must not survive vectorization. 8977 Doing this here helps with grouped statements, or statements that 8978 are involved in patterns. */ 8979 for (gimple_stmt_iterator gsi = gsi_start_bb (bb); 8980 !gsi_end_p (gsi); gsi_next (&gsi)) 8981 { 8982 gcall *call = dyn_cast <gcall *> (gsi_stmt (gsi)); 8983 if (call && gimple_call_internal_p (call, IFN_MASK_LOAD)) 8984 { 8985 tree lhs = gimple_get_lhs (call); 8986 if (!VECTOR_TYPE_P (TREE_TYPE (lhs))) 8987 { 8988 tree zero = build_zero_cst (TREE_TYPE (lhs)); 8989 gimple *new_stmt = gimple_build_assign (lhs, zero); 8990 gsi_replace (&gsi, new_stmt, true); 8991 } 8992 } 8993 } 8994 } /* BBs in loop */ 8995 8996 /* The vectorization factor is always > 1, so if we use an IV increment of 1. 8997 a zero NITERS becomes a nonzero NITERS_VECTOR. */ 8998 if (integer_onep (step_vector)) 8999 niters_no_overflow = true; 9000 vect_set_loop_condition (loop, loop_vinfo, niters_vector, step_vector, 9001 niters_vector_mult_vf, !niters_no_overflow); 9002 9003 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo); 9004 scale_profile_for_vect_loop (loop, assumed_vf); 9005 9006 /* True if the final iteration might not handle a full vector's 9007 worth of scalar iterations. */ 9008 bool final_iter_may_be_partial = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo); 9009 /* The minimum number of iterations performed by the epilogue. This 9010 is 1 when peeling for gaps because we always need a final scalar 9011 iteration. */ 9012 int min_epilogue_iters = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0; 9013 /* +1 to convert latch counts to loop iteration counts, 9014 -min_epilogue_iters to remove iterations that cannot be performed 9015 by the vector code. */ 9016 int bias_for_lowest = 1 - min_epilogue_iters; 9017 int bias_for_assumed = bias_for_lowest; 9018 int alignment_npeels = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo); 9019 if (alignment_npeels && LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)) 9020 { 9021 /* When the amount of peeling is known at compile time, the first 9022 iteration will have exactly alignment_npeels active elements. 9023 In the worst case it will have at least one. */ 9024 int min_first_active = (alignment_npeels > 0 ? alignment_npeels : 1); 9025 bias_for_lowest += lowest_vf - min_first_active; 9026 bias_for_assumed += assumed_vf - min_first_active; 9027 } 9028 /* In these calculations the "- 1" converts loop iteration counts 9029 back to latch counts. */ 9030 if (loop->any_upper_bound) 9031 loop->nb_iterations_upper_bound 9032 = (final_iter_may_be_partial 9033 ? wi::udiv_ceil (loop->nb_iterations_upper_bound + bias_for_lowest, 9034 lowest_vf) - 1 9035 : wi::udiv_floor (loop->nb_iterations_upper_bound + bias_for_lowest, 9036 lowest_vf) - 1); 9037 if (loop->any_likely_upper_bound) 9038 loop->nb_iterations_likely_upper_bound 9039 = (final_iter_may_be_partial 9040 ? wi::udiv_ceil (loop->nb_iterations_likely_upper_bound 9041 + bias_for_lowest, lowest_vf) - 1 9042 : wi::udiv_floor (loop->nb_iterations_likely_upper_bound 9043 + bias_for_lowest, lowest_vf) - 1); 9044 if (loop->any_estimate) 9045 loop->nb_iterations_estimate 9046 = (final_iter_may_be_partial 9047 ? 
wi::udiv_ceil (loop->nb_iterations_estimate + bias_for_assumed, 9048 assumed_vf) - 1 9049 : wi::udiv_floor (loop->nb_iterations_estimate + bias_for_assumed, 9050 assumed_vf) - 1); 9051 9052 if (dump_enabled_p ()) 9053 { 9054 if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo)) 9055 { 9056 dump_printf_loc (MSG_NOTE, vect_location, 9057 "LOOP VECTORIZED\n"); 9058 if (loop->inner) 9059 dump_printf_loc (MSG_NOTE, vect_location, 9060 "OUTER LOOP VECTORIZED\n"); 9061 dump_printf (MSG_NOTE, "\n"); 9062 } 9063 else 9064 { 9065 dump_printf_loc (MSG_NOTE, vect_location, 9066 "LOOP EPILOGUE VECTORIZED (VS="); 9067 dump_dec (MSG_NOTE, current_vector_size); 9068 dump_printf (MSG_NOTE, ")\n"); 9069 } 9070 } 9071 9072 /* Free SLP instances here because otherwise stmt reference counting 9073 won't work. */ 9074 slp_instance instance; 9075 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance) 9076 vect_free_slp_instance (instance); 9077 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release (); 9078 /* Clear-up safelen field since its value is invalid after vectorization 9079 since vectorized loop can have loop-carried dependencies. */ 9080 loop->safelen = 0; 9081 9082 /* Don't vectorize epilogue for epilogue. */ 9083 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo)) 9084 epilogue = NULL; 9085 9086 if (!PARAM_VALUE (PARAM_VECT_EPILOGUES_NOMASK)) 9087 epilogue = NULL; 9088 9089 if (epilogue) 9090 { 9091 auto_vector_sizes vector_sizes; 9092 targetm.vectorize.autovectorize_vector_sizes (&vector_sizes); 9093 unsigned int next_size = 0; 9094 9095 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) 9096 && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) >= 0 9097 && known_eq (vf, lowest_vf)) 9098 { 9099 unsigned int eiters 9100 = (LOOP_VINFO_INT_NITERS (loop_vinfo) 9101 - LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)); 9102 eiters = eiters % lowest_vf; 9103 epilogue->nb_iterations_upper_bound = eiters - 1; 9104 9105 unsigned int ratio; 9106 while (next_size < vector_sizes.length () 9107 && !(constant_multiple_p (current_vector_size, 9108 vector_sizes[next_size], &ratio) 9109 && eiters >= lowest_vf / ratio)) 9110 next_size += 1; 9111 } 9112 else 9113 while (next_size < vector_sizes.length () 9114 && maybe_lt (current_vector_size, vector_sizes[next_size])) 9115 next_size += 1; 9116 9117 if (next_size == vector_sizes.length ()) 9118 epilogue = NULL; 9119 } 9120 9121 if (epilogue) 9122 { 9123 epilogue->force_vectorize = loop->force_vectorize; 9124 epilogue->safelen = loop->safelen; 9125 epilogue->dont_vectorize = false; 9126 9127 /* We may need to if-convert epilogue to vectorize it. */ 9128 if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo)) 9129 tree_if_conversion (epilogue); 9130 } 9131 9132 return epilogue; 9133 } 9134 9135 /* The code below is trying to perform simple optimization - revert 9136 if-conversion for masked stores, i.e. if the mask of a store is zero 9137 do not perform it and all stored value producers also if possible. 
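In other words, the masked store, and where possible the statements producing the stored values, are sunk into a new block that is entered only when the mask is not all-zero.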
9138 For example, 9139 for (i=0; i<n; i++) 9140 if (c[i]) 9141 { 9142 p1[i] += 1; 9143 p2[i] = p3[i] +2; 9144 } 9145 this transformation will produce the following semi-hammock: 9146 9147 if (!mask__ifc__42.18_165 == { 0, 0, 0, 0, 0, 0, 0, 0 }) 9148 { 9149 vect__11.19_170 = MASK_LOAD (vectp_p1.20_168, 0B, mask__ifc__42.18_165); 9150 vect__12.22_172 = vect__11.19_170 + vect_cst__171; 9151 MASK_STORE (vectp_p1.23_175, 0B, mask__ifc__42.18_165, vect__12.22_172); 9152 vect__18.25_182 = MASK_LOAD (vectp_p3.26_180, 0B, mask__ifc__42.18_165); 9153 vect__19.28_184 = vect__18.25_182 + vect_cst__183; 9154 MASK_STORE (vectp_p2.29_187, 0B, mask__ifc__42.18_165, vect__19.28_184); 9155 } 9156 */ 9157 9158 void 9159 optimize_mask_stores (struct loop *loop) 9160 { 9161 basic_block *bbs = get_loop_body (loop); 9162 unsigned nbbs = loop->num_nodes; 9163 unsigned i; 9164 basic_block bb; 9165 struct loop *bb_loop; 9166 gimple_stmt_iterator gsi; 9167 gimple *stmt; 9168 auto_vec<gimple *> worklist; 9169 9170 vect_location = find_loop_location (loop); 9171 /* Pick up all masked stores in loop if any. */ 9172 for (i = 0; i < nbbs; i++) 9173 { 9174 bb = bbs[i]; 9175 for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); 9176 gsi_next (&gsi)) 9177 { 9178 stmt = gsi_stmt (gsi); 9179 if (gimple_call_internal_p (stmt, IFN_MASK_STORE)) 9180 worklist.safe_push (stmt); 9181 } 9182 } 9183 9184 free (bbs); 9185 if (worklist.is_empty ()) 9186 return; 9187 9188 /* Loop has masked stores. */ 9189 while (!worklist.is_empty ()) 9190 { 9191 gimple *last, *last_store; 9192 edge e, efalse; 9193 tree mask; 9194 basic_block store_bb, join_bb; 9195 gimple_stmt_iterator gsi_to; 9196 tree vdef, new_vdef; 9197 gphi *phi; 9198 tree vectype; 9199 tree zero; 9200 9201 last = worklist.pop (); 9202 mask = gimple_call_arg (last, 2); 9203 bb = gimple_bb (last); 9204 /* Create then_bb and if-then structure in CFG, then_bb belongs to 9205 the same loop as if_bb. It could be different to LOOP when two 9206 level loop-nest is vectorized and mask_store belongs to the inner 9207 one. */ 9208 e = split_block (bb, last); 9209 bb_loop = bb->loop_father; 9210 gcc_assert (loop == bb_loop || flow_loop_nested_p (loop, bb_loop)); 9211 join_bb = e->dest; 9212 store_bb = create_empty_bb (bb); 9213 add_bb_to_loop (store_bb, bb_loop); 9214 e->flags = EDGE_TRUE_VALUE; 9215 efalse = make_edge (bb, store_bb, EDGE_FALSE_VALUE); 9216 /* Put STORE_BB to likely part. */ 9217 efalse->probability = profile_probability::unlikely (); 9218 store_bb->count = efalse->count (); 9219 make_single_succ_edge (store_bb, join_bb, EDGE_FALLTHRU); 9220 if (dom_info_available_p (CDI_DOMINATORS)) 9221 set_immediate_dominator (CDI_DOMINATORS, store_bb, bb); 9222 if (dump_enabled_p ()) 9223 dump_printf_loc (MSG_NOTE, vect_location, 9224 "Create new block %d to sink mask stores.", 9225 store_bb->index); 9226 /* Create vector comparison with boolean result. 
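The generated guard has the form (a sketch of the emitted GIMPLE): if (mask == { 0, ..., 0 }) goto JOIN_BB; else goto STORE_BB; so the sunk masked stores are skipped whenever the whole mask is zero.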
*/ 9227 vectype = TREE_TYPE (mask); 9228 zero = build_zero_cst (vectype); 9229 stmt = gimple_build_cond (EQ_EXPR, mask, zero, NULL_TREE, NULL_TREE); 9230 gsi = gsi_last_bb (bb); 9231 gsi_insert_after (&gsi, stmt, GSI_SAME_STMT); 9232 /* Create new PHI node for vdef of the last masked store: 9233 .MEM_2 = VDEF <.MEM_1> 9234 will be converted to 9235 .MEM.3 = VDEF <.MEM_1> 9236 and new PHI node will be created in join bb 9237 .MEM_2 = PHI <.MEM_1, .MEM_3> 9238 */ 9239 vdef = gimple_vdef (last); 9240 new_vdef = make_ssa_name (gimple_vop (cfun), last); 9241 gimple_set_vdef (last, new_vdef); 9242 phi = create_phi_node (vdef, join_bb); 9243 add_phi_arg (phi, new_vdef, EDGE_SUCC (store_bb, 0), UNKNOWN_LOCATION); 9244 9245 /* Put all masked stores with the same mask to STORE_BB if possible. */ 9246 while (true) 9247 { 9248 gimple_stmt_iterator gsi_from; 9249 gimple *stmt1 = NULL; 9250 9251 /* Move masked store to STORE_BB. */ 9252 last_store = last; 9253 gsi = gsi_for_stmt (last); 9254 gsi_from = gsi; 9255 /* Shift GSI to the previous stmt for further traversal. */ 9256 gsi_prev (&gsi); 9257 gsi_to = gsi_start_bb (store_bb); 9258 gsi_move_before (&gsi_from, &gsi_to); 9259 /* Setup GSI_TO to the non-empty block start. */ 9260 gsi_to = gsi_start_bb (store_bb); 9261 if (dump_enabled_p ()) 9262 { 9263 dump_printf_loc (MSG_NOTE, vect_location, 9264 "Move stmt to created bb\n"); 9265 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, last, 0); 9266 } 9267 /* Move all stored value producers if possible. */ 9268 while (!gsi_end_p (gsi)) 9269 { 9270 tree lhs; 9271 imm_use_iterator imm_iter; 9272 use_operand_p use_p; 9273 bool res; 9274 9275 /* Skip debug statements. */ 9276 if (is_gimple_debug (gsi_stmt (gsi))) 9277 { 9278 gsi_prev (&gsi); 9279 continue; 9280 } 9281 stmt1 = gsi_stmt (gsi); 9282 /* Do not consider statements writing to memory or having 9283 volatile operand. */ 9284 if (gimple_vdef (stmt1) 9285 || gimple_has_volatile_ops (stmt1)) 9286 break; 9287 gsi_from = gsi; 9288 gsi_prev (&gsi); 9289 lhs = gimple_get_lhs (stmt1); 9290 if (!lhs) 9291 break; 9292 9293 /* LHS of vectorized stmt must be SSA_NAME. */ 9294 if (TREE_CODE (lhs) != SSA_NAME) 9295 break; 9296 9297 if (!VECTOR_TYPE_P (TREE_TYPE (lhs))) 9298 { 9299 /* Remove dead scalar statement. */ 9300 if (has_zero_uses (lhs)) 9301 { 9302 gsi_remove (&gsi_from, true); 9303 continue; 9304 } 9305 } 9306 9307 /* Check that LHS does not have uses outside of STORE_BB. */ 9308 res = true; 9309 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs) 9310 { 9311 gimple *use_stmt; 9312 use_stmt = USE_STMT (use_p); 9313 if (is_gimple_debug (use_stmt)) 9314 continue; 9315 if (gimple_bb (use_stmt) != store_bb) 9316 { 9317 res = false; 9318 break; 9319 } 9320 } 9321 if (!res) 9322 break; 9323 9324 if (gimple_vuse (stmt1) 9325 && gimple_vuse (stmt1) != gimple_vuse (last_store)) 9326 break; 9327 9328 /* Can move STMT1 to STORE_BB. */ 9329 if (dump_enabled_p ()) 9330 { 9331 dump_printf_loc (MSG_NOTE, vect_location, 9332 "Move stmt to created bb\n"); 9333 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt1, 0); 9334 } 9335 gsi_move_before (&gsi_from, &gsi_to); 9336 /* Shift GSI_TO for further insertion. */ 9337 gsi_prev (&gsi_to); 9338 } 9339 /* Put other masked stores with the same mask to STORE_BB. */ 9340 if (worklist.is_empty () 9341 || gimple_call_arg (worklist.last (), 2) != mask 9342 || worklist.last () != stmt1) 9343 break; 9344 last = worklist.pop (); 9345 } 9346 add_phi_arg (phi, gimple_vuse (last_store), e, UNKNOWN_LOCATION); 9347 } 9348 } 9349