1 /* Data References Analysis and Manipulation Utilities for Vectorization. 2 Copyright (C) 2003-2018 Free Software Foundation, Inc. 3 Contributed by Dorit Naishlos <dorit@il.ibm.com> 4 and Ira Rosen <irar@il.ibm.com> 5 6 This file is part of GCC. 7 8 GCC is free software; you can redistribute it and/or modify it under 9 the terms of the GNU General Public License as published by the Free 10 Software Foundation; either version 3, or (at your option) any later 11 version. 12 13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY 14 WARRANTY; without even the implied warranty of MERCHANTABILITY or 15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 16 for more details. 17 18 You should have received a copy of the GNU General Public License 19 along with GCC; see the file COPYING3. If not see 20 <http://www.gnu.org/licenses/>. */ 21 22 #include "config.h" 23 #include "system.h" 24 #include "coretypes.h" 25 #include "backend.h" 26 #include "target.h" 27 #include "rtl.h" 28 #include "tree.h" 29 #include "gimple.h" 30 #include "predict.h" 31 #include "memmodel.h" 32 #include "tm_p.h" 33 #include "ssa.h" 34 #include "optabs-tree.h" 35 #include "cgraph.h" 36 #include "dumpfile.h" 37 #include "alias.h" 38 #include "fold-const.h" 39 #include "stor-layout.h" 40 #include "tree-eh.h" 41 #include "gimplify.h" 42 #include "gimple-iterator.h" 43 #include "gimplify-me.h" 44 #include "tree-ssa-loop-ivopts.h" 45 #include "tree-ssa-loop-manip.h" 46 #include "tree-ssa-loop.h" 47 #include "cfgloop.h" 48 #include "tree-scalar-evolution.h" 49 #include "tree-vectorizer.h" 50 #include "expr.h" 51 #include "builtins.h" 52 #include "params.h" 53 #include "tree-cfg.h" 54 #include "tree-hash-traits.h" 55 #include "vec-perm-indices.h" 56 #include "internal-fn.h" 57 58 /* Return true if load- or store-lanes optab OPTAB is implemented for 59 COUNT vectors of type VECTYPE. NAME is the name of OPTAB. 
*/ 60 61 static bool 62 vect_lanes_optab_supported_p (const char *name, convert_optab optab, 63 tree vectype, unsigned HOST_WIDE_INT count) 64 { 65 machine_mode mode, array_mode; 66 bool limit_p; 67 68 mode = TYPE_MODE (vectype); 69 if (!targetm.array_mode (mode, count).exists (&array_mode)) 70 { 71 poly_uint64 bits = count * GET_MODE_BITSIZE (mode); 72 limit_p = !targetm.array_mode_supported_p (mode, count); 73 if (!int_mode_for_size (bits, limit_p).exists (&array_mode)) 74 { 75 if (dump_enabled_p ()) 76 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 77 "no array mode for %s[" 78 HOST_WIDE_INT_PRINT_DEC "]\n", 79 GET_MODE_NAME (mode), count); 80 return false; 81 } 82 } 83 84 if (convert_optab_handler (optab, array_mode, mode) == CODE_FOR_nothing) 85 { 86 if (dump_enabled_p ()) 87 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 88 "cannot use %s<%s><%s>\n", name, 89 GET_MODE_NAME (array_mode), GET_MODE_NAME (mode)); 90 return false; 91 } 92 93 if (dump_enabled_p ()) 94 dump_printf_loc (MSG_NOTE, vect_location, 95 "can use %s<%s><%s>\n", name, GET_MODE_NAME (array_mode), 96 GET_MODE_NAME (mode)); 97 98 return true; 99 } 100 101 102 /* Return the smallest scalar part of STMT. 103 This is used to determine the vectype of the stmt. We generally set the 104 vectype according to the type of the result (lhs). For stmts whose 105 result-type is different than the type of the arguments (e.g., demotion, 106 promotion), vectype will be reset appropriately (later). Note that we have 107 to visit the smallest datatype in this function, because that determines the 108 VF. If the smallest datatype in the loop is present only as the rhs of a 109 promotion operation - we'd miss it. 110 Such a case, where a variable of this datatype does not appear in the lhs 111 anywhere in the loop, can only occur if it's an invariant: e.g.: 112 'int_x = (int) short_inv', which we'd expect to have been optimized away by 113 invariant motion. 
However, we cannot rely on invariant motion to always 114 take invariants out of the loop, and so in the case of promotion we also 115 have to check the rhs. 116 LHS_SIZE_UNIT and RHS_SIZE_UNIT contain the sizes of the corresponding 117 types. */ 118 119 tree 120 vect_get_smallest_scalar_type (gimple *stmt, HOST_WIDE_INT *lhs_size_unit, 121 HOST_WIDE_INT *rhs_size_unit) 122 { 123 tree scalar_type = gimple_expr_type (stmt); 124 HOST_WIDE_INT lhs, rhs; 125 126 /* During the analysis phase, this function is called on arbitrary 127 statements that might not have scalar results. */ 128 if (!tree_fits_uhwi_p (TYPE_SIZE_UNIT (scalar_type))) 129 return scalar_type; 130 131 lhs = rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type)); 132 133 if (is_gimple_assign (stmt) 134 && (gimple_assign_cast_p (stmt) 135 || gimple_assign_rhs_code (stmt) == DOT_PROD_EXPR 136 || gimple_assign_rhs_code (stmt) == WIDEN_SUM_EXPR 137 || gimple_assign_rhs_code (stmt) == WIDEN_MULT_EXPR 138 || gimple_assign_rhs_code (stmt) == WIDEN_LSHIFT_EXPR 139 || gimple_assign_rhs_code (stmt) == FLOAT_EXPR)) 140 { 141 tree rhs_type = TREE_TYPE (gimple_assign_rhs1 (stmt)); 142 143 rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type)); 144 if (rhs < lhs) 145 scalar_type = rhs_type; 146 } 147 148 *lhs_size_unit = lhs; 149 *rhs_size_unit = rhs; 150 return scalar_type; 151 } 152 153 154 /* Insert DDR into LOOP_VINFO list of ddrs that may alias and need to be 155 tested at run-time. Return TRUE if DDR was successfully inserted. 156 Return false if versioning is not supported. 
*/

static bool
vect_mark_for_runtime_alias_test (ddr_p ddr, loop_vec_info loop_vinfo)
{
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);

  /* A zero parameter value means the user disabled alias versioning.  */
  if ((unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS) == 0)
    return false;

  if (!runtime_alias_check_p (ddr, loop,
			      optimize_loop_nest_for_speed_p (loop)))
    return false;

  LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).safe_push (ddr);
  return true;
}

/* Record that loop LOOP_VINFO needs to check that VALUE is nonzero.  */

static void
vect_check_nonzero_value (loop_vec_info loop_vinfo, tree value)
{
  /* Avoid recording the same value twice.  */
  vec<tree> checks = LOOP_VINFO_CHECK_NONZERO (loop_vinfo);
  for (unsigned int i = 0; i < checks.length(); ++i)
    if (checks[i] == value)
      return;

  if (dump_enabled_p ())
    {
      dump_printf_loc (MSG_NOTE, vect_location, "need run-time check that ");
      dump_generic_expr (MSG_NOTE, TDF_SLIM, value);
      dump_printf (MSG_NOTE, " is nonzero\n");
    }
  LOOP_VINFO_CHECK_NONZERO (loop_vinfo).safe_push (value);
}

/* Return true if we know that the order of vectorized STMT_A and
   vectorized STMT_B will be the same as the order of STMT_A and STMT_B.
   At least one of the statements is a write.  */

static bool
vect_preserves_scalar_order_p (gimple *stmt_a, gimple *stmt_b)
{
  stmt_vec_info stmtinfo_a = vinfo_for_stmt (stmt_a);
  stmt_vec_info stmtinfo_b = vinfo_for_stmt (stmt_b);

  /* Single statements are always kept in their original order.  */
  if (!STMT_VINFO_GROUPED_ACCESS (stmtinfo_a)
      && !STMT_VINFO_GROUPED_ACCESS (stmtinfo_b))
    return true;

  /* STMT_A and STMT_B belong to overlapping groups.  All loads in a
     group are emitted at the position of the first scalar load and all
     stores in a group are emitted at the position of the last scalar store.
     Thus writes will happen no earlier than their current position
     (but could happen later) while reads will happen no later than their
     current position (but could happen earlier).  Reordering is therefore
     only possible if the first access is a write.  */
  gimple *earlier_stmt = get_earlier_stmt (stmt_a, stmt_b);
  return !DR_IS_WRITE (STMT_VINFO_DATA_REF (vinfo_for_stmt (earlier_stmt)));
}

/* A subroutine of vect_analyze_data_ref_dependence.  Handle
   DDR_COULD_BE_INDEPENDENT_P ddr DDR that has a known set of dependence
   distances.  These distances are conservatively correct but they don't
   reflect a guaranteed dependence.

   Return true if this function does all the work necessary to avoid
   an alias or false if the caller should use the dependence distances
   to limit the vectorization factor in the usual way.  LOOP_DEPTH is
   the depth of the loop described by LOOP_VINFO and the other arguments
   are as for vect_analyze_data_ref_dependence.  */

static bool
vect_analyze_possibly_independent_ddr (data_dependence_relation *ddr,
				       loop_vec_info loop_vinfo,
				       int loop_depth, unsigned int *max_vf)
{
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  lambda_vector dist_v;
  unsigned int i;
  FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
    {
      int dist = dist_v[loop_depth];
      /* Distance 0 and reversed positive distances never constrain the VF;
	 only the remaining distances need handling.  */
      if (dist != 0 && !(dist > 0 && DDR_REVERSED_P (ddr)))
	{
	  /* If the user asserted safelen >= DIST consecutive iterations
	     can be executed concurrently, assume independence.

	     ??? An alternative would be to add the alias check even
	     in this case, and vectorize the fallback loop with the
	     maximum VF set to safelen.  However, if the user has
	     explicitly given a length, it's less likely that that
	     would be a win.
*/ 251 if (loop->safelen >= 2 && abs_hwi (dist) <= loop->safelen) 252 { 253 if ((unsigned int) loop->safelen < *max_vf) 254 *max_vf = loop->safelen; 255 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false; 256 continue; 257 } 258 259 /* For dependence distances of 2 or more, we have the option 260 of limiting VF or checking for an alias at runtime. 261 Prefer to check at runtime if we can, to avoid limiting 262 the VF unnecessarily when the bases are in fact independent. 263 264 Note that the alias checks will be removed if the VF ends up 265 being small enough. */ 266 return vect_mark_for_runtime_alias_test (ddr, loop_vinfo); 267 } 268 } 269 return true; 270 } 271 272 273 /* Function vect_analyze_data_ref_dependence. 274 275 Return TRUE if there (might) exist a dependence between a memory-reference 276 DRA and a memory-reference DRB. When versioning for alias may check a 277 dependence at run-time, return FALSE. Adjust *MAX_VF according to 278 the data dependence. */ 279 280 static bool 281 vect_analyze_data_ref_dependence (struct data_dependence_relation *ddr, 282 loop_vec_info loop_vinfo, 283 unsigned int *max_vf) 284 { 285 unsigned int i; 286 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo); 287 struct data_reference *dra = DDR_A (ddr); 288 struct data_reference *drb = DDR_B (ddr); 289 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra)); 290 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb)); 291 lambda_vector dist_v; 292 unsigned int loop_depth; 293 294 /* In loop analysis all data references should be vectorizable. */ 295 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a) 296 || !STMT_VINFO_VECTORIZABLE (stmtinfo_b)) 297 gcc_unreachable (); 298 299 /* Independent data accesses. 
*/ 300 if (DDR_ARE_DEPENDENT (ddr) == chrec_known) 301 return false; 302 303 if (dra == drb 304 || (DR_IS_READ (dra) && DR_IS_READ (drb))) 305 return false; 306 307 /* We do not have to consider dependences between accesses that belong 308 to the same group, unless the stride could be smaller than the 309 group size. */ 310 if (GROUP_FIRST_ELEMENT (stmtinfo_a) 311 && GROUP_FIRST_ELEMENT (stmtinfo_a) == GROUP_FIRST_ELEMENT (stmtinfo_b) 312 && !STMT_VINFO_STRIDED_P (stmtinfo_a)) 313 return false; 314 315 /* Even if we have an anti-dependence then, as the vectorized loop covers at 316 least two scalar iterations, there is always also a true dependence. 317 As the vectorizer does not re-order loads and stores we can ignore 318 the anti-dependence if TBAA can disambiguate both DRs similar to the 319 case with known negative distance anti-dependences (positive 320 distance anti-dependences would violate TBAA constraints). */ 321 if (((DR_IS_READ (dra) && DR_IS_WRITE (drb)) 322 || (DR_IS_WRITE (dra) && DR_IS_READ (drb))) 323 && !alias_sets_conflict_p (get_alias_set (DR_REF (dra)), 324 get_alias_set (DR_REF (drb)))) 325 return false; 326 327 /* Unknown data dependence. */ 328 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know) 329 { 330 /* If user asserted safelen consecutive iterations can be 331 executed concurrently, assume independence. 
*/ 332 if (loop->safelen >= 2) 333 { 334 if ((unsigned int) loop->safelen < *max_vf) 335 *max_vf = loop->safelen; 336 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false; 337 return false; 338 } 339 340 if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a) 341 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b)) 342 { 343 if (dump_enabled_p ()) 344 { 345 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 346 "versioning for alias not supported for: " 347 "can't determine dependence between "); 348 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, 349 DR_REF (dra)); 350 dump_printf (MSG_MISSED_OPTIMIZATION, " and "); 351 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, 352 DR_REF (drb)); 353 dump_printf (MSG_MISSED_OPTIMIZATION, "\n"); 354 } 355 return true; 356 } 357 358 if (dump_enabled_p ()) 359 { 360 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 361 "versioning for alias required: " 362 "can't determine dependence between "); 363 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, 364 DR_REF (dra)); 365 dump_printf (MSG_MISSED_OPTIMIZATION, " and "); 366 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, 367 DR_REF (drb)); 368 dump_printf (MSG_MISSED_OPTIMIZATION, "\n"); 369 } 370 371 /* Add to list of ddrs that need to be tested at run-time. */ 372 return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo); 373 } 374 375 /* Known data dependence. */ 376 if (DDR_NUM_DIST_VECTS (ddr) == 0) 377 { 378 /* If user asserted safelen consecutive iterations can be 379 executed concurrently, assume independence. 
*/ 380 if (loop->safelen >= 2) 381 { 382 if ((unsigned int) loop->safelen < *max_vf) 383 *max_vf = loop->safelen; 384 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false; 385 return false; 386 } 387 388 if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a) 389 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b)) 390 { 391 if (dump_enabled_p ()) 392 { 393 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 394 "versioning for alias not supported for: " 395 "bad dist vector for "); 396 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, 397 DR_REF (dra)); 398 dump_printf (MSG_MISSED_OPTIMIZATION, " and "); 399 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, 400 DR_REF (drb)); 401 dump_printf (MSG_MISSED_OPTIMIZATION, "\n"); 402 } 403 return true; 404 } 405 406 if (dump_enabled_p ()) 407 { 408 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 409 "versioning for alias required: " 410 "bad dist vector for "); 411 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra)); 412 dump_printf (MSG_MISSED_OPTIMIZATION, " and "); 413 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb)); 414 dump_printf (MSG_MISSED_OPTIMIZATION, "\n"); 415 } 416 /* Add to list of ddrs that need to be tested at run-time. 
*/ 417 return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo); 418 } 419 420 loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr)); 421 422 if (DDR_COULD_BE_INDEPENDENT_P (ddr) 423 && vect_analyze_possibly_independent_ddr (ddr, loop_vinfo, 424 loop_depth, max_vf)) 425 return false; 426 427 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v) 428 { 429 int dist = dist_v[loop_depth]; 430 431 if (dump_enabled_p ()) 432 dump_printf_loc (MSG_NOTE, vect_location, 433 "dependence distance = %d.\n", dist); 434 435 if (dist == 0) 436 { 437 if (dump_enabled_p ()) 438 { 439 dump_printf_loc (MSG_NOTE, vect_location, 440 "dependence distance == 0 between "); 441 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra)); 442 dump_printf (MSG_NOTE, " and "); 443 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb)); 444 dump_printf (MSG_MISSED_OPTIMIZATION, "\n"); 445 } 446 447 /* When we perform grouped accesses and perform implicit CSE 448 by detecting equal accesses and doing disambiguation with 449 runtime alias tests like for 450 .. = a[i]; 451 .. = a[i+1]; 452 a[i] = ..; 453 a[i+1] = ..; 454 *p = ..; 455 .. = a[i]; 456 .. = a[i+1]; 457 where we will end up loading { a[i], a[i+1] } once, make 458 sure that inserting group loads before the first load and 459 stores after the last store will do the right thing. 460 Similar for groups like 461 a[i] = ...; 462 ... = a[i]; 463 a[i+1] = ...; 464 where loads from the group interleave with the store. 
*/ 465 if (!vect_preserves_scalar_order_p (DR_STMT (dra), DR_STMT (drb))) 466 { 467 if (dump_enabled_p ()) 468 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 469 "READ_WRITE dependence in interleaving.\n"); 470 return true; 471 } 472 473 if (loop->safelen < 2) 474 { 475 tree indicator = dr_zero_step_indicator (dra); 476 if (TREE_CODE (indicator) != INTEGER_CST) 477 vect_check_nonzero_value (loop_vinfo, indicator); 478 else if (integer_zerop (indicator)) 479 { 480 if (dump_enabled_p ()) 481 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 482 "access also has a zero step\n"); 483 return true; 484 } 485 } 486 continue; 487 } 488 489 if (dist > 0 && DDR_REVERSED_P (ddr)) 490 { 491 /* If DDR_REVERSED_P the order of the data-refs in DDR was 492 reversed (to make distance vector positive), and the actual 493 distance is negative. */ 494 if (dump_enabled_p ()) 495 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 496 "dependence distance negative.\n"); 497 /* Record a negative dependence distance to later limit the 498 amount of stmt copying / unrolling we can perform. 499 Only need to handle read-after-write dependence. */ 500 if (DR_IS_READ (drb) 501 && (STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) == 0 502 || STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) > (unsigned)dist)) 503 STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) = dist; 504 continue; 505 } 506 507 unsigned int abs_dist = abs (dist); 508 if (abs_dist >= 2 && abs_dist < *max_vf) 509 { 510 /* The dependence distance requires reduction of the maximal 511 vectorization factor. */ 512 *max_vf = abs (dist); 513 if (dump_enabled_p ()) 514 dump_printf_loc (MSG_NOTE, vect_location, 515 "adjusting maximal vectorization factor to %i\n", 516 *max_vf); 517 } 518 519 if (abs_dist >= *max_vf) 520 { 521 /* Dependence distance does not create dependence, as far as 522 vectorization is concerned, in this case. 
*/ 523 if (dump_enabled_p ()) 524 dump_printf_loc (MSG_NOTE, vect_location, 525 "dependence distance >= VF.\n"); 526 continue; 527 } 528 529 if (dump_enabled_p ()) 530 { 531 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 532 "not vectorized, possible dependence " 533 "between data-refs "); 534 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra)); 535 dump_printf (MSG_NOTE, " and "); 536 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb)); 537 dump_printf (MSG_NOTE, "\n"); 538 } 539 540 return true; 541 } 542 543 return false; 544 } 545 546 /* Function vect_analyze_data_ref_dependences. 547 548 Examine all the data references in the loop, and make sure there do not 549 exist any data dependences between them. Set *MAX_VF according to 550 the maximum vectorization factor the data dependences allow. */ 551 552 bool 553 vect_analyze_data_ref_dependences (loop_vec_info loop_vinfo, 554 unsigned int *max_vf) 555 { 556 unsigned int i; 557 struct data_dependence_relation *ddr; 558 559 if (dump_enabled_p ()) 560 dump_printf_loc (MSG_NOTE, vect_location, 561 "=== vect_analyze_data_ref_dependences ===\n"); 562 563 LOOP_VINFO_DDRS (loop_vinfo) 564 .create (LOOP_VINFO_DATAREFS (loop_vinfo).length () 565 * LOOP_VINFO_DATAREFS (loop_vinfo).length ()); 566 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = true; 567 /* We need read-read dependences to compute STMT_VINFO_SAME_ALIGN_REFS. */ 568 if (!compute_all_dependences (LOOP_VINFO_DATAREFS (loop_vinfo), 569 &LOOP_VINFO_DDRS (loop_vinfo), 570 LOOP_VINFO_LOOP_NEST (loop_vinfo), true)) 571 return false; 572 573 /* For epilogues we either have no aliases or alias versioning 574 was applied to original loop. Therefore we may just get max_vf 575 using VF of original loop. 
*/ 576 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo)) 577 *max_vf = LOOP_VINFO_ORIG_MAX_VECT_FACTOR (loop_vinfo); 578 else 579 FOR_EACH_VEC_ELT (LOOP_VINFO_DDRS (loop_vinfo), i, ddr) 580 if (vect_analyze_data_ref_dependence (ddr, loop_vinfo, max_vf)) 581 return false; 582 583 return true; 584 } 585 586 587 /* Function vect_slp_analyze_data_ref_dependence. 588 589 Return TRUE if there (might) exist a dependence between a memory-reference 590 DRA and a memory-reference DRB. When versioning for alias may check a 591 dependence at run-time, return FALSE. Adjust *MAX_VF according to 592 the data dependence. */ 593 594 static bool 595 vect_slp_analyze_data_ref_dependence (struct data_dependence_relation *ddr) 596 { 597 struct data_reference *dra = DDR_A (ddr); 598 struct data_reference *drb = DDR_B (ddr); 599 600 /* We need to check dependences of statements marked as unvectorizable 601 as well, they still can prohibit vectorization. */ 602 603 /* Independent data accesses. */ 604 if (DDR_ARE_DEPENDENT (ddr) == chrec_known) 605 return false; 606 607 if (dra == drb) 608 return false; 609 610 /* Read-read is OK. */ 611 if (DR_IS_READ (dra) && DR_IS_READ (drb)) 612 return false; 613 614 /* If dra and drb are part of the same interleaving chain consider 615 them independent. */ 616 if (STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (DR_STMT (dra))) 617 && (GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (dra))) 618 == GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (drb))))) 619 return false; 620 621 /* Unknown data dependence. 
*/
  if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
    {
      if (dump_enabled_p ())
	{
	  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			   "can't determine dependence between ");
	  dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
	  dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
	  dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
	  dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
	}
    }
  else if (dump_enabled_p ())
    {
      dump_printf_loc (MSG_NOTE, vect_location,
		       "determined dependence between ");
      dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
      dump_printf (MSG_NOTE, " and ");
      dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
      dump_printf (MSG_NOTE, "\n");
    }

  /* Both the unknown and the known-dependence cases conservatively
     report a dependence.  */
  return true;
}


/* Analyze dependences involved in the transform of SLP NODE.  STORES
   contain the vector of scalar stores of this instance if we are
   disambiguating the loads.  */

static bool
vect_slp_analyze_node_dependences (slp_instance instance, slp_tree node,
				   vec<gimple *> stores, gimple *last_store)
{
  /* This walks over all stmts involved in the SLP load/store done
     in NODE verifying we can sink them up to the last stmt in the
     group.  */
  gimple *last_access = vect_find_last_scalar_stmt_in_slp (node);
  for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
    {
      gimple *access = SLP_TREE_SCALAR_STMTS (node)[k];
      if (access == last_access)
	continue;
      data_reference *dr_a = STMT_VINFO_DATA_REF (vinfo_for_stmt (access));
      /* Walk every stmt between ACCESS and the last group access, checking
	 each memory-touching stmt for a dependence against DR_A.  */
      for (gimple_stmt_iterator gsi = gsi_for_stmt (access);
	   gsi_stmt (gsi) != last_access; gsi_next (&gsi))
	{
	  gimple *stmt = gsi_stmt (gsi);
	  /* Stmts without a virtual use don't touch memory; for a load
	     (DR_IS_READ) only stores (virtual defs) can interfere.  */
	  if (! gimple_vuse (stmt)
	      || (DR_IS_READ (dr_a) && ! gimple_vdef (stmt)))
	    continue;

	  /* If we couldn't record a (single) data reference for this
	     stmt we have to give up.  */
	  /* ???  Here and below if dependence analysis fails we can resort
	     to the alias oracle which can handle more kinds of stmts.  */
	  data_reference *dr_b = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt));
	  if (!dr_b)
	    return false;

	  bool dependent = false;
	  /* If we run into a store of this same instance (we've just
	     marked those) then delay dependence checking until we run
	     into the last store because this is where it will have
	     been sunk to (and we verify if we can do that as well).  */
	  if (gimple_visited_p (stmt))
	    {
	      if (stmt != last_store)
		continue;
	      unsigned i;
	      gimple *store;
	      FOR_EACH_VEC_ELT (stores, i, store)
		{
		  data_reference *store_dr
		    = STMT_VINFO_DATA_REF (vinfo_for_stmt (store));
		  ddr_p ddr = initialize_data_dependence_relation
				(dr_a, store_dr, vNULL);
		  dependent = vect_slp_analyze_data_ref_dependence (ddr);
		  free_dependence_relation (ddr);
		  if (dependent)
		    break;
		}
	    }
	  else
	    {
	      ddr_p ddr = initialize_data_dependence_relation (dr_a,
							       dr_b, vNULL);
	      dependent = vect_slp_analyze_data_ref_dependence (ddr);
	      free_dependence_relation (ddr);
	    }
	  if (dependent)
	    return false;
	}
    }
  return true;
}


/* Function vect_slp_analyze_instance_dependence.

   Examine all the data references in the SLP instance, and make sure there
   do not exist any data dependences that would prevent sinking the loads
   and stores to their vectorized statement insertion points.  */

bool
vect_slp_analyze_instance_dependence (slp_instance instance)
{
  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
		     "=== vect_slp_analyze_instance_dependence ===\n");

  /* The stores of this instance are at the root of the SLP tree.  */
  slp_tree store = SLP_INSTANCE_TREE (instance);
  /* A root without a data reference means the instance has no store.  */
  if (! STMT_VINFO_DATA_REF (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (store)[0])))
    store = NULL;

  /* Verify we can sink stores to the vectorized stmt insert location.  */
  gimple *last_store = NULL;
  if (store)
    {
      if (! vect_slp_analyze_node_dependences (instance, store, vNULL, NULL))
	return false;

      /* Mark stores in this instance and remember the last one.  */
      last_store = vect_find_last_scalar_stmt_in_slp (store);
      for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
	gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k], true);
    }

  bool res = true;

  /* Verify we can sink loads to the vectorized stmt insert location,
     special-casing stores of this instance.  */
  slp_tree load;
  unsigned int i;
  FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load)
    if (! vect_slp_analyze_node_dependences (instance, load,
					     store
					     ? SLP_TREE_SCALAR_STMTS (store)
					     : vNULL, last_store))
      {
	res = false;
	break;
      }

  /* Unset the visited flag.  */
  if (store)
    for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
      gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k], false);

  return res;
}

/* Record in VINFO the base alignment guarantee given by DRB.  STMT is
   the statement that contains DRB, which is useful for recording in the
   dump file.
*/

static void
vect_record_base_alignment (vec_info *vinfo, gimple *stmt,
			    innermost_loop_behavior *drb)
{
  bool existed;
  /* Keep only the strongest known guarantee for each base address.  */
  innermost_loop_behavior *&entry
    = vinfo->base_alignments.get_or_insert (drb->base_address, &existed);
  if (!existed || entry->base_alignment < drb->base_alignment)
    {
      entry = drb;
      if (dump_enabled_p ())
	{
	  dump_printf_loc (MSG_NOTE, vect_location,
			   "recording new base alignment for ");
	  dump_generic_expr (MSG_NOTE, TDF_SLIM, drb->base_address);
	  dump_printf (MSG_NOTE, "\n");
	  dump_printf_loc (MSG_NOTE, vect_location,
			   "  alignment:    %d\n", drb->base_alignment);
	  dump_printf_loc (MSG_NOTE, vect_location,
			   "  misalignment: %d\n", drb->base_misalignment);
	  dump_printf_loc (MSG_NOTE, vect_location,
			   "  based on:     ");
	  dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
	}
    }
}

/* If the region we're going to vectorize is reached, all unconditional
   data references occur at least once.  We can therefore pool the base
   alignment guarantees from each unconditional reference.  Do this by
   going through all the data references in VINFO and checking whether
   the containing statement makes the reference unconditionally.  If so,
   record the alignment of the base address in VINFO so that it can be
   used for all other references with the same base.  */

void
vect_record_base_alignments (vec_info *vinfo)
{
  loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
  struct loop *loop = loop_vinfo ? LOOP_VINFO_LOOP (loop_vinfo) : NULL;
  data_reference *dr;
  unsigned int i;
  FOR_EACH_VEC_ELT (vinfo->datarefs, i, dr)
    if (!DR_IS_CONDITIONAL_IN_STMT (dr))
      {
	gimple *stmt = DR_STMT (dr);
	vect_record_base_alignment (vinfo, stmt, &DR_INNERMOST (dr));

	/* If DR is nested in the loop that is being vectorized, we can also
	   record the alignment of the base wrt the outer loop.  */
	if (loop && nested_in_vect_loop_p (loop, stmt))
	  {
	    stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
	    vect_record_base_alignment
	      (vinfo, stmt, &STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info));
	  }
      }
}

/* Return the target alignment for the vectorized form of DR.  */

static unsigned int
vect_calculate_target_alignment (struct data_reference *dr)
{
  gimple *stmt = DR_STMT (dr);
  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
  tree vectype = STMT_VINFO_VECTYPE (stmt_info);
  return targetm.vectorize.preferred_vector_alignment (vectype);
}

/* Function vect_compute_data_ref_alignment

   Compute the misalignment of the data reference DR.

   Output:
   1. If during the misalignment computation it is found that the data reference
      cannot be vectorized then false is returned.
   2. DR_MISALIGNMENT (DR) is defined.

   FOR NOW: No analysis is actually performed.  Misalignment is calculated
   only for trivial cases.  TODO.  */

bool
vect_compute_data_ref_alignment (struct data_reference *dr)
{
  gimple *stmt = DR_STMT (dr);
  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
  vec_base_alignments *base_alignments = &stmt_info->vinfo->base_alignments;
  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
  struct loop *loop = NULL;
  tree ref = DR_REF (dr);
  tree vectype = STMT_VINFO_VECTYPE (stmt_info);

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
		     "vect_compute_data_ref_alignment:\n");

  if (loop_vinfo)
    loop = LOOP_VINFO_LOOP (loop_vinfo);

  /* Initialize misalignment to unknown.
*/
  SET_DR_MISALIGNMENT (dr, DR_MISALIGNMENT_UNKNOWN);

  innermost_loop_behavior *drb = vect_dr_behavior (dr);
  bool step_preserves_misalignment_p;

  /* Target-preferred alignment, in bytes.  */
  unsigned HOST_WIDE_INT vector_alignment
    = vect_calculate_target_alignment (dr) / BITS_PER_UNIT;
  DR_TARGET_ALIGNMENT (dr) = vector_alignment;

  /* No step for BB vectorization.  */
  if (!loop)
    {
      gcc_assert (integer_zerop (drb->step));
      step_preserves_misalignment_p = true;
    }

  /* In case the dataref is in an inner-loop of the loop that is being
     vectorized (LOOP), we use the base and misalignment information
     relative to the outer-loop (LOOP).  This is ok only if the misalignment
     stays the same throughout the execution of the inner-loop, which is why
     we have to check that the stride of the dataref in the inner-loop evenly
     divides by the vector alignment.  */
  else if (nested_in_vect_loop_p (loop, stmt))
    {
      step_preserves_misalignment_p
	= (DR_STEP_ALIGNMENT (dr) % vector_alignment) == 0;

      if (dump_enabled_p ())
	{
	  if (step_preserves_misalignment_p)
	    dump_printf_loc (MSG_NOTE, vect_location,
			     "inner step divides the vector alignment.\n");
	  else
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "inner step doesn't divide the vector"
			     " alignment.\n");
	}
    }

  /* Similarly we can only use base and misalignment information relative to
     an innermost loop if the misalignment stays the same throughout the
     execution of the loop.  As above, this is the case if the stride of
     the dataref evenly divides by the alignment.  */
  else
    {
      poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
      step_preserves_misalignment_p
	= multiple_p (DR_STEP_ALIGNMENT (dr) * vf, vector_alignment);

      if (!step_preserves_misalignment_p && dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			 "step doesn't divide the vector alignment.\n");
    }

  unsigned int base_alignment = drb->base_alignment;
  unsigned int base_misalignment = drb->base_misalignment;

  /* Calculate the maximum of the pooled base address alignment and the
     alignment that we can compute for DR itself.  */
  innermost_loop_behavior **entry = base_alignments->get (drb->base_address);
  if (entry && base_alignment < (*entry)->base_alignment)
    {
      base_alignment = (*entry)->base_alignment;
      base_misalignment = (*entry)->base_misalignment;
    }

  if (drb->offset_alignment < vector_alignment
      || !step_preserves_misalignment_p
      /* We need to know whether the step wrt the vectorized loop is
	 negative when computing the starting misalignment below.  */
      || TREE_CODE (drb->step) != INTEGER_CST)
    {
      if (dump_enabled_p ())
	{
	  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			   "Unknown alignment for access: ");
	  dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
	  dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
	}
      return true;
    }

  if (base_alignment < vector_alignment)
    {
      unsigned int max_alignment;
      tree base = get_base_for_alignment (drb->base_address, &max_alignment);
      if (max_alignment < vector_alignment
	  || !vect_can_force_dr_alignment_p (base,
					     vector_alignment * BITS_PER_UNIT))
	{
	  if (dump_enabled_p ())
	    {
	      dump_printf_loc (MSG_NOTE, vect_location,
			       "can't force alignment of ref: ");
	      dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
	      dump_printf (MSG_NOTE, "\n");
	    }
	  return true;
	}

      /* Force the alignment of the decl.
	 NOTE: This is the only change to the code we make during
	 the analysis phase, before deciding to vectorize the loop.  */
      if (dump_enabled_p ())
	{
	  dump_printf_loc (MSG_NOTE, vect_location, "force alignment of ");
	  dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
	  dump_printf (MSG_NOTE, "\n");
	}

      DR_VECT_AUX (dr)->base_decl = base;
      DR_VECT_AUX (dr)->base_misaligned = true;
      base_misalignment = 0;
    }
  poly_int64 misalignment
    = base_misalignment + wi::to_poly_offset (drb->init).force_shwi ();

  /* If this is a backward running DR then first access in the larger
     vectype actually is N-1 elements before the address in the DR.
     Adjust misalign accordingly.  */
  if (tree_int_cst_sgn (drb->step) < 0)
    /* PLUS because STEP is negative.  */
    misalignment += ((TYPE_VECTOR_SUBPARTS (vectype) - 1)
		     * TREE_INT_CST_LOW (drb->step));

  unsigned int const_misalignment;
  if (!known_misalignment (misalignment, vector_alignment,
			   &const_misalignment))
    {
      if (dump_enabled_p ())
	{
	  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			   "Non-constant misalignment for access: ");
	  dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
	  dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
	}
      return true;
    }

  SET_DR_MISALIGNMENT (dr, const_misalignment);

  if (dump_enabled_p ())
    {
      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
		       "misalign = %d bytes of ref ", DR_MISALIGNMENT (dr));
      dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
      dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
    }

  return true;
}

/* Function vect_update_misalignment_for_peel.
   Sets DR's misalignment
   - to 0 if it has the same alignment as DR_PEEL,
   - to the misalignment computed using NPEEL if DR's alignment is known,
   - to -1 (unknown) otherwise.
   DR - the data reference whose misalignment is to be adjusted.
   DR_PEEL - the data reference whose misalignment is being made
	     zero in the vector loop by the peel.
   NPEEL - the number of iterations in the peel loop if the misalignment
	   of DR_PEEL is known at compile time.  */

static void
vect_update_misalignment_for_peel (struct data_reference *dr,
                                   struct data_reference *dr_peel, int npeel)
{
  unsigned int i;
  vec<dr_p> same_aligned_drs;
  struct data_reference *current_dr;
  int dr_size = vect_get_scalar_dr_size (dr);
  int dr_peel_size = vect_get_scalar_dr_size (dr_peel);
  stmt_vec_info stmt_info = vinfo_for_stmt (DR_STMT (dr));
  stmt_vec_info peel_stmt_info = vinfo_for_stmt (DR_STMT (dr_peel));

  /* For interleaved data accesses the step in the loop must be multiplied by
     the size of the interleaving group.  */
  if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
    dr_size *= GROUP_SIZE (vinfo_for_stmt (GROUP_FIRST_ELEMENT (stmt_info)));
  if (STMT_VINFO_GROUPED_ACCESS (peel_stmt_info))
    dr_peel_size *= GROUP_SIZE (peel_stmt_info);

  /* It can be assumed that the data refs with the same alignment as dr_peel
     are aligned in the vector loop.  */
  same_aligned_drs
    = STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (DR_STMT (dr_peel)));
  FOR_EACH_VEC_ELT (same_aligned_drs, i, current_dr)
    {
      /* Scan the list for DR itself; only then is it known-aligned.  */
      if (current_dr != dr)
	continue;
      gcc_assert (!known_alignment_for_access_p (dr)
		  || !known_alignment_for_access_p (dr_peel)
		  || (DR_MISALIGNMENT (dr) / dr_size
		      == DR_MISALIGNMENT (dr_peel) / dr_peel_size));
      SET_DR_MISALIGNMENT (dr, 0);
      return;
    }

  if (known_alignment_for_access_p (dr)
      && known_alignment_for_access_p (dr_peel))
    {
      /* Adjust the known misalignment by the NPEEL scalar accesses that
	 the peel loop executes; reduce modulo the target alignment.  */
      bool negative = tree_int_cst_compare (DR_STEP (dr), size_zero_node) < 0;
      int misal = DR_MISALIGNMENT (dr);
      misal += negative ? -npeel * dr_size : npeel * dr_size;
      misal &= DR_TARGET_ALIGNMENT (dr) - 1;
      SET_DR_MISALIGNMENT (dr, misal);
      return;
    }

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location, "Setting misalignment " \
		     "to unknown (-1).\n");
  SET_DR_MISALIGNMENT (dr, DR_MISALIGNMENT_UNKNOWN);
}


/* Function verify_data_ref_alignment

   Return TRUE if DR can be handled with respect to alignment.  */

static bool
verify_data_ref_alignment (data_reference_p dr)
{
  enum dr_alignment_support supportable_dr_alignment
    = vect_supportable_dr_alignment (dr, false);
  if (!supportable_dr_alignment)
    {
      if (dump_enabled_p ())
	{
	  if (DR_IS_READ (dr))
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "not vectorized: unsupported unaligned load.");
	  else
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "not vectorized: unsupported unaligned "
			     "store.");

	  dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
			     DR_REF (dr));
	  dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
	}
      return false;
    }

  if (supportable_dr_alignment != dr_aligned && dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
		     "Vectorizing an unaligned access.\n");

  return true;
}

/* Function vect_verify_datarefs_alignment

   Return TRUE if all data references in the loop can be
   handled with respect to alignment.  */

bool
vect_verify_datarefs_alignment (loop_vec_info vinfo)
{
  vec<data_reference_p> datarefs = vinfo->datarefs;
  struct data_reference *dr;
  unsigned int i;

  FOR_EACH_VEC_ELT (datarefs, i, dr)
    {
      gimple *stmt = DR_STMT (dr);
      stmt_vec_info stmt_info = vinfo_for_stmt (stmt);

      if (!STMT_VINFO_RELEVANT_P (stmt_info))
	continue;

      /* For interleaving, only the alignment of the first access matters.  */
      if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
	  && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
	continue;

      /* Strided accesses perform only component accesses, alignment is
	 irrelevant for them.  */
      if (STMT_VINFO_STRIDED_P (stmt_info)
	  && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
	continue;

      if (! verify_data_ref_alignment (dr))
	return false;
    }

  return true;
}

/* Given a memory reference EXP return whether its alignment is less
   than its size.  */

static bool
not_size_aligned (tree exp)
{
  if (!tree_fits_uhwi_p (TYPE_SIZE (TREE_TYPE (exp))))
    return true;

  return (tree_to_uhwi (TYPE_SIZE (TREE_TYPE (exp)))
	  > get_object_alignment (exp));
}

/* Function vector_alignment_reachable_p

   Return true if vector alignment for DR is reachable by peeling
   a few loop iterations.  Return false otherwise.  */

static bool
vector_alignment_reachable_p (struct data_reference *dr)
{
  gimple *stmt = DR_STMT (dr);
  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
  tree vectype = STMT_VINFO_VECTYPE (stmt_info);

  if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
    {
      /* For interleaved access we peel only if number of iterations in
	 the prolog loop ({VF - misalignment}), is a multiple of the
	 number of the interleaved accesses.  */
      int elem_size, mis_in_elements;

      /* FORNOW: handle only known alignment.  */
      if (!known_alignment_for_access_p (dr))
	return false;

      poly_uint64 nelements = TYPE_VECTOR_SUBPARTS (vectype);
      poly_uint64 vector_size = GET_MODE_SIZE (TYPE_MODE (vectype));
      elem_size = vector_element_size (vector_size, nelements);
      mis_in_elements = DR_MISALIGNMENT (dr) / elem_size;

      if (!multiple_p (nelements - mis_in_elements, GROUP_SIZE (stmt_info)))
	return false;
    }

  /* If misalignment is known at the compile time then allow peeling
     only if natural alignment is reachable through peeling.  */
  if (known_alignment_for_access_p (dr) && !aligned_access_p (dr))
    {
      HOST_WIDE_INT elmsize =
		int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
      if (dump_enabled_p ())
	{
	  dump_printf_loc (MSG_NOTE, vect_location,
	                   "data size =" HOST_WIDE_INT_PRINT_DEC, elmsize);
	  dump_printf (MSG_NOTE,
	               ". misalignment = %d.\n", DR_MISALIGNMENT (dr));
	}
      if (DR_MISALIGNMENT (dr) % elmsize)
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
	                     "data size does not divide the misalignment.\n");
	  return false;
	}
    }

  if (!known_alignment_for_access_p (dr))
    {
      /* Unknown misalignment: let the target decide whether alignment
	 is reachable, given whether the access looks packed.  */
      tree type = TREE_TYPE (DR_REF (dr));
      bool is_packed = not_size_aligned (DR_REF (dr));
      if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
	                 "Unknown misalignment, %snaturally aligned\n",
	                 is_packed ? "not " : "");
      return targetm.vectorize.vector_alignment_reachable (type, is_packed);
    }

  return true;
}


/* Calculate the cost of the memory access represented by DR.
 */

static void
vect_get_data_access_cost (struct data_reference *dr,
                           unsigned int *inside_cost,
                           unsigned int *outside_cost,
			   stmt_vector_for_cost *body_cost_vec)
{
  gimple *stmt = DR_STMT (dr);
  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
  int ncopies;

  /* Pure SLP statements are costed once per vector statement; otherwise
     cost all vector copies implied by the vectorization factor.  */
  if (PURE_SLP_STMT (stmt_info))
    ncopies = 1;
  else
    ncopies = vect_get_num_copies (loop_vinfo, STMT_VINFO_VECTYPE (stmt_info));

  if (DR_IS_READ (dr))
    vect_get_load_cost (dr, ncopies, true, inside_cost, outside_cost,
			NULL, body_cost_vec, false);
  else
    vect_get_store_cost (dr, ncopies, inside_cost, body_cost_vec);

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
                     "vect_get_data_access_cost: inside_cost = %d, "
                     "outside_cost = %d.\n", *inside_cost, *outside_cost);
}


/* One candidate peeling amount: peeling NPEEL scalar iterations to
   align DR; COUNT records how many accesses benefit from this choice.  */
typedef struct _vect_peel_info
{
  struct data_reference *dr;
  int npeel;
  unsigned int count;
} *vect_peel_info;

/* A peeling candidate together with its estimated costs.  */
typedef struct _vect_peel_extended_info
{
  struct _vect_peel_info peel_info;
  unsigned int inside_cost;
  unsigned int outside_cost;
} *vect_peel_extended_info;


/* Peeling hashtable helpers.  Entries are keyed on NPEEL alone.  */

struct peel_info_hasher : free_ptr_hash <_vect_peel_info>
{
  static inline hashval_t hash (const _vect_peel_info *);
  static inline bool equal (const _vect_peel_info *, const _vect_peel_info *);
};

inline hashval_t
peel_info_hasher::hash (const _vect_peel_info *peel_info)
{
  return (hashval_t) peel_info->npeel;
}

inline bool
peel_info_hasher::equal (const _vect_peel_info *a, const _vect_peel_info *b)
{
  return (a->npeel == b->npeel);
}


/* Insert DR into peeling hash table with NPEEL as key.
*/ 1320 1321 static void 1322 vect_peeling_hash_insert (hash_table<peel_info_hasher> *peeling_htab, 1323 loop_vec_info loop_vinfo, struct data_reference *dr, 1324 int npeel) 1325 { 1326 struct _vect_peel_info elem, *slot; 1327 _vect_peel_info **new_slot; 1328 bool supportable_dr_alignment = vect_supportable_dr_alignment (dr, true); 1329 1330 elem.npeel = npeel; 1331 slot = peeling_htab->find (&elem); 1332 if (slot) 1333 slot->count++; 1334 else 1335 { 1336 slot = XNEW (struct _vect_peel_info); 1337 slot->npeel = npeel; 1338 slot->dr = dr; 1339 slot->count = 1; 1340 new_slot = peeling_htab->find_slot (slot, INSERT); 1341 *new_slot = slot; 1342 } 1343 1344 if (!supportable_dr_alignment 1345 && unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo))) 1346 slot->count += VECT_MAX_COST; 1347 } 1348 1349 1350 /* Traverse peeling hash table to find peeling option that aligns maximum 1351 number of data accesses. */ 1352 1353 int 1354 vect_peeling_hash_get_most_frequent (_vect_peel_info **slot, 1355 _vect_peel_extended_info *max) 1356 { 1357 vect_peel_info elem = *slot; 1358 1359 if (elem->count > max->peel_info.count 1360 || (elem->count == max->peel_info.count 1361 && max->peel_info.npeel > elem->npeel)) 1362 { 1363 max->peel_info.npeel = elem->npeel; 1364 max->peel_info.count = elem->count; 1365 max->peel_info.dr = elem->dr; 1366 } 1367 1368 return 1; 1369 } 1370 1371 /* Get the costs of peeling NPEEL iterations checking data access costs 1372 for all data refs. If UNKNOWN_MISALIGNMENT is true, we assume DR0's 1373 misalignment will be zero after peeling. 
 */

static void
vect_get_peeling_costs_all_drs (vec<data_reference_p> datarefs,
				struct data_reference *dr0,
				unsigned int *inside_cost,
				unsigned int *outside_cost,
				stmt_vector_for_cost *body_cost_vec,
				unsigned int npeel,
				bool unknown_misalignment)
{
  unsigned i;
  data_reference *dr;

  FOR_EACH_VEC_ELT (datarefs, i, dr)
    {
      gimple *stmt = DR_STMT (dr);
      stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
      if (!STMT_VINFO_RELEVANT_P (stmt_info))
	continue;

      /* For interleaving, only the alignment of the first access
	 matters.  */
      if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
	  && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
	continue;

      /* Strided accesses perform only component accesses, alignment is
	 irrelevant for them.  */
      if (STMT_VINFO_STRIDED_P (stmt_info)
	  && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
	continue;

      /* Temporarily set DR's misalignment to what it would be after
	 peeling, cost the access, then restore the saved value.  */
      int save_misalignment;
      save_misalignment = DR_MISALIGNMENT (dr);
      if (npeel == 0)
	;
      else if (unknown_misalignment && dr == dr0)
	SET_DR_MISALIGNMENT (dr, 0);
      else
	vect_update_misalignment_for_peel (dr, dr0, npeel);
      vect_get_data_access_cost (dr, inside_cost, outside_cost,
				 body_cost_vec);
      SET_DR_MISALIGNMENT (dr, save_misalignment);
    }
}

/* Traverse peeling hash table and calculate cost for each peeling option.
   Find the one with the lowest cost.
 */

int
vect_peeling_hash_get_lowest_cost (_vect_peel_info **slot,
				   _vect_peel_extended_info *min)
{
  vect_peel_info elem = *slot;
  int dummy;
  unsigned int inside_cost = 0, outside_cost = 0;
  gimple *stmt = DR_STMT (elem->dr);
  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
  stmt_vector_for_cost prologue_cost_vec, body_cost_vec,
		       epilogue_cost_vec;

  prologue_cost_vec.create (2);
  body_cost_vec.create (2);
  epilogue_cost_vec.create (2);

  /* Cost every relevant data reference as if ELEM->npeel iterations
     had been peeled to align ELEM->dr.  */
  vect_get_peeling_costs_all_drs (LOOP_VINFO_DATAREFS (loop_vinfo),
				  elem->dr, &inside_cost, &outside_cost,
				  &body_cost_vec, elem->npeel, false);

  body_cost_vec.release ();

  outside_cost += vect_get_known_peeling_cost
    (loop_vinfo, elem->npeel, &dummy,
     &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
     &prologue_cost_vec, &epilogue_cost_vec);

  /* Prologue and epilogue costs are added to the target model later.
     These costs depend only on the scalar iteration cost, the
     number of peeling iterations finally chosen, and the number of
     misaligned statements.  So discard the information found here.  */
  prologue_cost_vec.release ();
  epilogue_cost_vec.release ();

  /* Keep this option in *MIN if it is cheaper inside the loop, or
     equally cheap inside but cheaper outside.  */
  if (inside_cost < min->inside_cost
      || (inside_cost == min->inside_cost
	  && outside_cost < min->outside_cost))
    {
      min->inside_cost = inside_cost;
      min->outside_cost = outside_cost;
      min->peel_info.dr = elem->dr;
      min->peel_info.npeel = elem->npeel;
      min->peel_info.count = elem->count;
    }

  return 1;
}


/* Choose best peeling option by traversing peeling hash table and either
   choosing an option with the lowest cost (if cost model is enabled) or the
   option that aligns as many accesses as possible.
 */

static struct _vect_peel_extended_info
vect_peeling_hash_choose_best_peeling (hash_table<peel_info_hasher> *peeling_htab,
				       loop_vec_info loop_vinfo)
{
   struct _vect_peel_extended_info res;

   res.peel_info.dr = NULL;

   if (!unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
     {
       /* Cost model enabled: pick the cheapest peeling option.  */
       res.inside_cost = INT_MAX;
       res.outside_cost = INT_MAX;
       peeling_htab->traverse <_vect_peel_extended_info *,
			       vect_peeling_hash_get_lowest_cost> (&res);
     }
   else
     {
       /* Unlimited cost model: pick the option aligning most accesses.  */
       res.peel_info.count = 0;
       peeling_htab->traverse <_vect_peel_extended_info *,
			       vect_peeling_hash_get_most_frequent> (&res);
       res.inside_cost = 0;
       res.outside_cost = 0;
     }

   return res;
}

/* Return true if the new peeling NPEEL is supported.  */

static bool
vect_peeling_supportable (loop_vec_info loop_vinfo, struct data_reference *dr0,
			  unsigned npeel)
{
  unsigned i;
  struct data_reference *dr = NULL;
  vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
  gimple *stmt;
  stmt_vec_info stmt_info;
  enum dr_alignment_support supportable_dr_alignment;

  /* Ensure that all data refs can be vectorized after the peel.  */
  FOR_EACH_VEC_ELT (datarefs, i, dr)
    {
      int save_misalignment;

      if (dr == dr0)
	continue;

      stmt = DR_STMT (dr);
      stmt_info = vinfo_for_stmt (stmt);
      /* For interleaving, only the alignment of the first access
	 matters.  */
      if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
	  && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
	continue;

      /* Strided accesses perform only component accesses, alignment is
	 irrelevant for them.  */
      if (STMT_VINFO_STRIDED_P (stmt_info)
	  && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
	continue;

      /* Temporarily apply the peel's effect on DR's misalignment,
	 query supportability, then restore the saved value.  */
      save_misalignment = DR_MISALIGNMENT (dr);
      vect_update_misalignment_for_peel (dr, dr0, npeel);
      supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
      SET_DR_MISALIGNMENT (dr, save_misalignment);

      if (!supportable_dr_alignment)
	return false;
    }

  return true;
}

/* Function vect_enhance_data_refs_alignment

   This pass will use loop versioning and loop peeling in order to enhance
   the alignment of data references in the loop.

   FOR NOW: we assume that whatever versioning/peeling takes place, only the
   original loop is to be vectorized.  Any other loops that are created by
   the transformations performed in this pass - are not supposed to be
   vectorized.  This restriction will be relaxed.

   This pass will require a cost model to guide it whether to apply peeling
   or versioning or a combination of the two.  For example, the scheme that
   intel uses when given a loop with several memory accesses, is as follows:
   choose one memory access ('p') which alignment you want to force by doing
   peeling.  Then, either (1) generate a loop in which 'p' is aligned and all
   other accesses are not necessarily aligned, or (2) use loop versioning to
   generate one loop in which all accesses are aligned, and another loop in
   which only 'p' is necessarily aligned.

   ("Automatic Intra-Register Vectorization for the Intel Architecture",
   Aart J.C. Bik, Milind Girkar, Paul M. Grey and Ximmin Tian, International
   Journal of Parallel Programming, Vol. 30, No. 2, April 2002.)

   Devising a cost model is the most critical aspect of this work.  It will
   guide us on which access to peel for, whether to use loop versioning, how
   many versions to create, etc.
The cost model will probably consist of 1577 generic considerations as well as target specific considerations (on 1578 powerpc for example, misaligned stores are more painful than misaligned 1579 loads). 1580 1581 Here are the general steps involved in alignment enhancements: 1582 1583 -- original loop, before alignment analysis: 1584 for (i=0; i<N; i++){ 1585 x = q[i]; # DR_MISALIGNMENT(q) = unknown 1586 p[i] = y; # DR_MISALIGNMENT(p) = unknown 1587 } 1588 1589 -- After vect_compute_data_refs_alignment: 1590 for (i=0; i<N; i++){ 1591 x = q[i]; # DR_MISALIGNMENT(q) = 3 1592 p[i] = y; # DR_MISALIGNMENT(p) = unknown 1593 } 1594 1595 -- Possibility 1: we do loop versioning: 1596 if (p is aligned) { 1597 for (i=0; i<N; i++){ # loop 1A 1598 x = q[i]; # DR_MISALIGNMENT(q) = 3 1599 p[i] = y; # DR_MISALIGNMENT(p) = 0 1600 } 1601 } 1602 else { 1603 for (i=0; i<N; i++){ # loop 1B 1604 x = q[i]; # DR_MISALIGNMENT(q) = 3 1605 p[i] = y; # DR_MISALIGNMENT(p) = unaligned 1606 } 1607 } 1608 1609 -- Possibility 2: we do loop peeling: 1610 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized). 1611 x = q[i]; 1612 p[i] = y; 1613 } 1614 for (i = 3; i < N; i++){ # loop 2A 1615 x = q[i]; # DR_MISALIGNMENT(q) = 0 1616 p[i] = y; # DR_MISALIGNMENT(p) = unknown 1617 } 1618 1619 -- Possibility 3: combination of loop peeling and versioning: 1620 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized). 1621 x = q[i]; 1622 p[i] = y; 1623 } 1624 if (p is aligned) { 1625 for (i = 3; i<N; i++){ # loop 3A 1626 x = q[i]; # DR_MISALIGNMENT(q) = 0 1627 p[i] = y; # DR_MISALIGNMENT(p) = 0 1628 } 1629 } 1630 else { 1631 for (i = 3; i<N; i++){ # loop 3B 1632 x = q[i]; # DR_MISALIGNMENT(q) = 0 1633 p[i] = y; # DR_MISALIGNMENT(p) = unaligned 1634 } 1635 } 1636 1637 These loops are later passed to loop_transform to be vectorized. 
The 1638 vectorizer will use the alignment information to guide the transformation 1639 (whether to generate regular loads/stores, or with special handling for 1640 misalignment). */ 1641 1642 bool 1643 vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo) 1644 { 1645 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo); 1646 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo); 1647 enum dr_alignment_support supportable_dr_alignment; 1648 struct data_reference *dr0 = NULL, *first_store = NULL; 1649 struct data_reference *dr; 1650 unsigned int i, j; 1651 bool do_peeling = false; 1652 bool do_versioning = false; 1653 bool stat; 1654 gimple *stmt; 1655 stmt_vec_info stmt_info; 1656 unsigned int npeel = 0; 1657 bool one_misalignment_known = false; 1658 bool one_misalignment_unknown = false; 1659 bool one_dr_unsupportable = false; 1660 struct data_reference *unsupportable_dr = NULL; 1661 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo); 1662 unsigned possible_npeel_number = 1; 1663 tree vectype; 1664 unsigned int mis, same_align_drs_max = 0; 1665 hash_table<peel_info_hasher> peeling_htab (1); 1666 1667 if (dump_enabled_p ()) 1668 dump_printf_loc (MSG_NOTE, vect_location, 1669 "=== vect_enhance_data_refs_alignment ===\n"); 1670 1671 /* Reset data so we can safely be called multiple times. */ 1672 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0); 1673 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = 0; 1674 1675 /* While cost model enhancements are expected in the future, the high level 1676 view of the code at this time is as follows: 1677 1678 A) If there is a misaligned access then see if peeling to align 1679 this access can make all data references satisfy 1680 vect_supportable_dr_alignment. If so, update data structures 1681 as needed and return true. 
1682 1683 B) If peeling wasn't possible and there is a data reference with an 1684 unknown misalignment that does not satisfy vect_supportable_dr_alignment 1685 then see if loop versioning checks can be used to make all data 1686 references satisfy vect_supportable_dr_alignment. If so, update 1687 data structures as needed and return true. 1688 1689 C) If neither peeling nor versioning were successful then return false if 1690 any data reference does not satisfy vect_supportable_dr_alignment. 1691 1692 D) Return true (all data references satisfy vect_supportable_dr_alignment). 1693 1694 Note, Possibility 3 above (which is peeling and versioning together) is not 1695 being done at this time. */ 1696 1697 /* (1) Peeling to force alignment. */ 1698 1699 /* (1.1) Decide whether to perform peeling, and how many iterations to peel: 1700 Considerations: 1701 + How many accesses will become aligned due to the peeling 1702 - How many accesses will become unaligned due to the peeling, 1703 and the cost of misaligned accesses. 1704 - The cost of peeling (the extra runtime checks, the increase 1705 in code size). */ 1706 1707 FOR_EACH_VEC_ELT (datarefs, i, dr) 1708 { 1709 stmt = DR_STMT (dr); 1710 stmt_info = vinfo_for_stmt (stmt); 1711 1712 if (!STMT_VINFO_RELEVANT_P (stmt_info)) 1713 continue; 1714 1715 /* For interleaving, only the alignment of the first access 1716 matters. */ 1717 if (STMT_VINFO_GROUPED_ACCESS (stmt_info) 1718 && GROUP_FIRST_ELEMENT (stmt_info) != stmt) 1719 continue; 1720 1721 /* For invariant accesses there is nothing to enhance. */ 1722 if (integer_zerop (DR_STEP (dr))) 1723 continue; 1724 1725 /* Strided accesses perform only component accesses, alignment is 1726 irrelevant for them. 
*/ 1727 if (STMT_VINFO_STRIDED_P (stmt_info) 1728 && !STMT_VINFO_GROUPED_ACCESS (stmt_info)) 1729 continue; 1730 1731 supportable_dr_alignment = vect_supportable_dr_alignment (dr, true); 1732 do_peeling = vector_alignment_reachable_p (dr); 1733 if (do_peeling) 1734 { 1735 if (known_alignment_for_access_p (dr)) 1736 { 1737 unsigned int npeel_tmp = 0; 1738 bool negative = tree_int_cst_compare (DR_STEP (dr), 1739 size_zero_node) < 0; 1740 1741 vectype = STMT_VINFO_VECTYPE (stmt_info); 1742 unsigned int target_align = DR_TARGET_ALIGNMENT (dr); 1743 unsigned int dr_size = vect_get_scalar_dr_size (dr); 1744 mis = (negative ? DR_MISALIGNMENT (dr) : -DR_MISALIGNMENT (dr)); 1745 if (DR_MISALIGNMENT (dr) != 0) 1746 npeel_tmp = (mis & (target_align - 1)) / dr_size; 1747 1748 /* For multiple types, it is possible that the bigger type access 1749 will have more than one peeling option. E.g., a loop with two 1750 types: one of size (vector size / 4), and the other one of 1751 size (vector size / 8). Vectorization factor will 8. If both 1752 accesses are misaligned by 3, the first one needs one scalar 1753 iteration to be aligned, and the second one needs 5. But the 1754 first one will be aligned also by peeling 5 scalar 1755 iterations, and in that case both accesses will be aligned. 1756 Hence, except for the immediate peeling amount, we also want 1757 to try to add full vector size, while we don't exceed 1758 vectorization factor. 1759 We do this automatically for cost model, since we calculate 1760 cost for every peeling option. */ 1761 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo))) 1762 { 1763 poly_uint64 nscalars = (STMT_SLP_TYPE (stmt_info) 1764 ? vf * GROUP_SIZE (stmt_info) : vf); 1765 possible_npeel_number 1766 = vect_get_num_vectors (nscalars, vectype); 1767 1768 /* NPEEL_TMP is 0 when there is no misalignment, but also 1769 allow peeling NELEMENTS. 
*/ 1770 if (DR_MISALIGNMENT (dr) == 0) 1771 possible_npeel_number++; 1772 } 1773 1774 /* Save info about DR in the hash table. Also include peeling 1775 amounts according to the explanation above. */ 1776 for (j = 0; j < possible_npeel_number; j++) 1777 { 1778 vect_peeling_hash_insert (&peeling_htab, loop_vinfo, 1779 dr, npeel_tmp); 1780 npeel_tmp += target_align / dr_size; 1781 } 1782 1783 one_misalignment_known = true; 1784 } 1785 else 1786 { 1787 /* If we don't know any misalignment values, we prefer 1788 peeling for data-ref that has the maximum number of data-refs 1789 with the same alignment, unless the target prefers to align 1790 stores over load. */ 1791 unsigned same_align_drs 1792 = STMT_VINFO_SAME_ALIGN_REFS (stmt_info).length (); 1793 if (!dr0 1794 || same_align_drs_max < same_align_drs) 1795 { 1796 same_align_drs_max = same_align_drs; 1797 dr0 = dr; 1798 } 1799 /* For data-refs with the same number of related 1800 accesses prefer the one where the misalign 1801 computation will be invariant in the outermost loop. */ 1802 else if (same_align_drs_max == same_align_drs) 1803 { 1804 struct loop *ivloop0, *ivloop; 1805 ivloop0 = outermost_invariant_loop_for_expr 1806 (loop, DR_BASE_ADDRESS (dr0)); 1807 ivloop = outermost_invariant_loop_for_expr 1808 (loop, DR_BASE_ADDRESS (dr)); 1809 if ((ivloop && !ivloop0) 1810 || (ivloop && ivloop0 1811 && flow_loop_nested_p (ivloop, ivloop0))) 1812 dr0 = dr; 1813 } 1814 1815 one_misalignment_unknown = true; 1816 1817 /* Check for data refs with unsupportable alignment that 1818 can be peeled. 
*/ 1819 if (!supportable_dr_alignment) 1820 { 1821 one_dr_unsupportable = true; 1822 unsupportable_dr = dr; 1823 } 1824 1825 if (!first_store && DR_IS_WRITE (dr)) 1826 first_store = dr; 1827 } 1828 } 1829 else 1830 { 1831 if (!aligned_access_p (dr)) 1832 { 1833 if (dump_enabled_p ()) 1834 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 1835 "vector alignment may not be reachable\n"); 1836 break; 1837 } 1838 } 1839 } 1840 1841 /* Check if we can possibly peel the loop. */ 1842 if (!vect_can_advance_ivs_p (loop_vinfo) 1843 || !slpeel_can_duplicate_loop_p (loop, single_exit (loop)) 1844 || loop->inner) 1845 do_peeling = false; 1846 1847 struct _vect_peel_extended_info peel_for_known_alignment; 1848 struct _vect_peel_extended_info peel_for_unknown_alignment; 1849 struct _vect_peel_extended_info best_peel; 1850 1851 peel_for_unknown_alignment.inside_cost = INT_MAX; 1852 peel_for_unknown_alignment.outside_cost = INT_MAX; 1853 peel_for_unknown_alignment.peel_info.count = 0; 1854 1855 if (do_peeling 1856 && one_misalignment_unknown) 1857 { 1858 /* Check if the target requires to prefer stores over loads, i.e., if 1859 misaligned stores are more expensive than misaligned loads (taking 1860 drs with same alignment into account). 
*/ 1861 unsigned int load_inside_cost = 0; 1862 unsigned int load_outside_cost = 0; 1863 unsigned int store_inside_cost = 0; 1864 unsigned int store_outside_cost = 0; 1865 unsigned int estimated_npeels = vect_vf_for_cost (loop_vinfo) / 2; 1866 1867 stmt_vector_for_cost dummy; 1868 dummy.create (2); 1869 vect_get_peeling_costs_all_drs (datarefs, dr0, 1870 &load_inside_cost, 1871 &load_outside_cost, 1872 &dummy, estimated_npeels, true); 1873 dummy.release (); 1874 1875 if (first_store) 1876 { 1877 dummy.create (2); 1878 vect_get_peeling_costs_all_drs (datarefs, first_store, 1879 &store_inside_cost, 1880 &store_outside_cost, 1881 &dummy, estimated_npeels, true); 1882 dummy.release (); 1883 } 1884 else 1885 { 1886 store_inside_cost = INT_MAX; 1887 store_outside_cost = INT_MAX; 1888 } 1889 1890 if (load_inside_cost > store_inside_cost 1891 || (load_inside_cost == store_inside_cost 1892 && load_outside_cost > store_outside_cost)) 1893 { 1894 dr0 = first_store; 1895 peel_for_unknown_alignment.inside_cost = store_inside_cost; 1896 peel_for_unknown_alignment.outside_cost = store_outside_cost; 1897 } 1898 else 1899 { 1900 peel_for_unknown_alignment.inside_cost = load_inside_cost; 1901 peel_for_unknown_alignment.outside_cost = load_outside_cost; 1902 } 1903 1904 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec; 1905 prologue_cost_vec.create (2); 1906 epilogue_cost_vec.create (2); 1907 1908 int dummy2; 1909 peel_for_unknown_alignment.outside_cost += vect_get_known_peeling_cost 1910 (loop_vinfo, estimated_npeels, &dummy2, 1911 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), 1912 &prologue_cost_vec, &epilogue_cost_vec); 1913 1914 prologue_cost_vec.release (); 1915 epilogue_cost_vec.release (); 1916 1917 peel_for_unknown_alignment.peel_info.count = 1 1918 + STMT_VINFO_SAME_ALIGN_REFS 1919 (vinfo_for_stmt (DR_STMT (dr0))).length (); 1920 } 1921 1922 peel_for_unknown_alignment.peel_info.npeel = 0; 1923 peel_for_unknown_alignment.peel_info.dr = dr0; 1924 1925 best_peel = 
peel_for_unknown_alignment; 1926 1927 peel_for_known_alignment.inside_cost = INT_MAX; 1928 peel_for_known_alignment.outside_cost = INT_MAX; 1929 peel_for_known_alignment.peel_info.count = 0; 1930 peel_for_known_alignment.peel_info.dr = NULL; 1931 1932 if (do_peeling && one_misalignment_known) 1933 { 1934 /* Peeling is possible, but there is no data access that is not supported 1935 unless aligned. So we try to choose the best possible peeling from 1936 the hash table. */ 1937 peel_for_known_alignment = vect_peeling_hash_choose_best_peeling 1938 (&peeling_htab, loop_vinfo); 1939 } 1940 1941 /* Compare costs of peeling for known and unknown alignment. */ 1942 if (peel_for_known_alignment.peel_info.dr != NULL 1943 && peel_for_unknown_alignment.inside_cost 1944 >= peel_for_known_alignment.inside_cost) 1945 { 1946 best_peel = peel_for_known_alignment; 1947 1948 /* If the best peeling for known alignment has NPEEL == 0, perform no 1949 peeling at all except if there is an unsupportable dr that we can 1950 align. */ 1951 if (best_peel.peel_info.npeel == 0 && !one_dr_unsupportable) 1952 do_peeling = false; 1953 } 1954 1955 /* If there is an unsupportable data ref, prefer this over all choices so far 1956 since we'd have to discard a chosen peeling except when it accidentally 1957 aligned the unsupportable data ref. */ 1958 if (one_dr_unsupportable) 1959 dr0 = unsupportable_dr; 1960 else if (do_peeling) 1961 { 1962 /* Calculate the penalty for no peeling, i.e. leaving everything as-is. 1963 TODO: Use nopeel_outside_cost or get rid of it? */ 1964 unsigned nopeel_inside_cost = 0; 1965 unsigned nopeel_outside_cost = 0; 1966 1967 stmt_vector_for_cost dummy; 1968 dummy.create (2); 1969 vect_get_peeling_costs_all_drs (datarefs, NULL, &nopeel_inside_cost, 1970 &nopeel_outside_cost, &dummy, 0, false); 1971 dummy.release (); 1972 1973 /* Add epilogue costs. As we do not peel for alignment here, no prologue 1974 costs will be recorded. 
*/ 1975 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec; 1976 prologue_cost_vec.create (2); 1977 epilogue_cost_vec.create (2); 1978 1979 int dummy2; 1980 nopeel_outside_cost += vect_get_known_peeling_cost 1981 (loop_vinfo, 0, &dummy2, 1982 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), 1983 &prologue_cost_vec, &epilogue_cost_vec); 1984 1985 prologue_cost_vec.release (); 1986 epilogue_cost_vec.release (); 1987 1988 npeel = best_peel.peel_info.npeel; 1989 dr0 = best_peel.peel_info.dr; 1990 1991 /* If no peeling is not more expensive than the best peeling we 1992 have so far, don't perform any peeling. */ 1993 if (nopeel_inside_cost <= best_peel.inside_cost) 1994 do_peeling = false; 1995 } 1996 1997 if (do_peeling) 1998 { 1999 stmt = DR_STMT (dr0); 2000 stmt_info = vinfo_for_stmt (stmt); 2001 vectype = STMT_VINFO_VECTYPE (stmt_info); 2002 2003 if (known_alignment_for_access_p (dr0)) 2004 { 2005 bool negative = tree_int_cst_compare (DR_STEP (dr0), 2006 size_zero_node) < 0; 2007 if (!npeel) 2008 { 2009 /* Since it's known at compile time, compute the number of 2010 iterations in the peeled loop (the peeling factor) for use in 2011 updating DR_MISALIGNMENT values. The peeling factor is the 2012 vectorization factor minus the misalignment as an element 2013 count. */ 2014 mis = negative ? DR_MISALIGNMENT (dr0) : -DR_MISALIGNMENT (dr0); 2015 unsigned int target_align = DR_TARGET_ALIGNMENT (dr0); 2016 npeel = ((mis & (target_align - 1)) 2017 / vect_get_scalar_dr_size (dr0)); 2018 } 2019 2020 /* For interleaved data access every iteration accesses all the 2021 members of the group, therefore we divide the number of iterations 2022 by the group size. 
*/ 2023 stmt_info = vinfo_for_stmt (DR_STMT (dr0)); 2024 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)) 2025 npeel /= GROUP_SIZE (stmt_info); 2026 2027 if (dump_enabled_p ()) 2028 dump_printf_loc (MSG_NOTE, vect_location, 2029 "Try peeling by %d\n", npeel); 2030 } 2031 2032 /* Ensure that all datarefs can be vectorized after the peel. */ 2033 if (!vect_peeling_supportable (loop_vinfo, dr0, npeel)) 2034 do_peeling = false; 2035 2036 /* Check if all datarefs are supportable and log. */ 2037 if (do_peeling && known_alignment_for_access_p (dr0) && npeel == 0) 2038 { 2039 stat = vect_verify_datarefs_alignment (loop_vinfo); 2040 if (!stat) 2041 do_peeling = false; 2042 else 2043 return stat; 2044 } 2045 2046 /* Cost model #1 - honor --param vect-max-peeling-for-alignment. */ 2047 if (do_peeling) 2048 { 2049 unsigned max_allowed_peel 2050 = PARAM_VALUE (PARAM_VECT_MAX_PEELING_FOR_ALIGNMENT); 2051 if (max_allowed_peel != (unsigned)-1) 2052 { 2053 unsigned max_peel = npeel; 2054 if (max_peel == 0) 2055 { 2056 unsigned int target_align = DR_TARGET_ALIGNMENT (dr0); 2057 max_peel = target_align / vect_get_scalar_dr_size (dr0) - 1; 2058 } 2059 if (max_peel > max_allowed_peel) 2060 { 2061 do_peeling = false; 2062 if (dump_enabled_p ()) 2063 dump_printf_loc (MSG_NOTE, vect_location, 2064 "Disable peeling, max peels reached: %d\n", max_peel); 2065 } 2066 } 2067 } 2068 2069 /* Cost model #2 - if peeling may result in a remaining loop not 2070 iterating enough to be vectorized then do not peel. Since this 2071 is a cost heuristic rather than a correctness decision, use the 2072 most likely runtime value for variable vectorization factors. */ 2073 if (do_peeling 2074 && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)) 2075 { 2076 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo); 2077 unsigned int max_peel = npeel == 0 ? 
assumed_vf - 1 : npeel; 2078 if ((unsigned HOST_WIDE_INT) LOOP_VINFO_INT_NITERS (loop_vinfo) 2079 < assumed_vf + max_peel) 2080 do_peeling = false; 2081 } 2082 2083 if (do_peeling) 2084 { 2085 /* (1.2) Update the DR_MISALIGNMENT of each data reference DR_i. 2086 If the misalignment of DR_i is identical to that of dr0 then set 2087 DR_MISALIGNMENT (DR_i) to zero. If the misalignment of DR_i and 2088 dr0 are known at compile time then increment DR_MISALIGNMENT (DR_i) 2089 by the peeling factor times the element size of DR_i (MOD the 2090 vectorization factor times the size). Otherwise, the 2091 misalignment of DR_i must be set to unknown. */ 2092 FOR_EACH_VEC_ELT (datarefs, i, dr) 2093 if (dr != dr0) 2094 { 2095 /* Strided accesses perform only component accesses, alignment 2096 is irrelevant for them. */ 2097 stmt_info = vinfo_for_stmt (DR_STMT (dr)); 2098 if (STMT_VINFO_STRIDED_P (stmt_info) 2099 && !STMT_VINFO_GROUPED_ACCESS (stmt_info)) 2100 continue; 2101 2102 vect_update_misalignment_for_peel (dr, dr0, npeel); 2103 } 2104 2105 LOOP_VINFO_UNALIGNED_DR (loop_vinfo) = dr0; 2106 if (npeel) 2107 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = npeel; 2108 else 2109 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) 2110 = DR_MISALIGNMENT (dr0); 2111 SET_DR_MISALIGNMENT (dr0, 0); 2112 if (dump_enabled_p ()) 2113 { 2114 dump_printf_loc (MSG_NOTE, vect_location, 2115 "Alignment of access forced using peeling.\n"); 2116 dump_printf_loc (MSG_NOTE, vect_location, 2117 "Peeling for alignment will be applied.\n"); 2118 } 2119 2120 /* The inside-loop cost will be accounted for in vectorizable_load 2121 and vectorizable_store correctly with adjusted alignments. 2122 Drop the body_cst_vec on the floor here. */ 2123 stat = vect_verify_datarefs_alignment (loop_vinfo); 2124 gcc_assert (stat); 2125 return stat; 2126 } 2127 } 2128 2129 /* (2) Versioning to force alignment. 
*/ 2130 2131 /* Try versioning if: 2132 1) optimize loop for speed 2133 2) there is at least one unsupported misaligned data ref with an unknown 2134 misalignment, and 2135 3) all misaligned data refs with a known misalignment are supported, and 2136 4) the number of runtime alignment checks is within reason. */ 2137 2138 do_versioning = 2139 optimize_loop_nest_for_speed_p (loop) 2140 && (!loop->inner); /* FORNOW */ 2141 2142 if (do_versioning) 2143 { 2144 FOR_EACH_VEC_ELT (datarefs, i, dr) 2145 { 2146 stmt = DR_STMT (dr); 2147 stmt_info = vinfo_for_stmt (stmt); 2148 2149 /* For interleaving, only the alignment of the first access 2150 matters. */ 2151 if (aligned_access_p (dr) 2152 || (STMT_VINFO_GROUPED_ACCESS (stmt_info) 2153 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)) 2154 continue; 2155 2156 if (STMT_VINFO_STRIDED_P (stmt_info)) 2157 { 2158 /* Strided loads perform only component accesses, alignment is 2159 irrelevant for them. */ 2160 if (!STMT_VINFO_GROUPED_ACCESS (stmt_info)) 2161 continue; 2162 do_versioning = false; 2163 break; 2164 } 2165 2166 supportable_dr_alignment = vect_supportable_dr_alignment (dr, false); 2167 2168 if (!supportable_dr_alignment) 2169 { 2170 gimple *stmt; 2171 int mask; 2172 tree vectype; 2173 2174 if (known_alignment_for_access_p (dr) 2175 || LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length () 2176 >= (unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIGNMENT_CHECKS)) 2177 { 2178 do_versioning = false; 2179 break; 2180 } 2181 2182 stmt = DR_STMT (dr); 2183 vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt)); 2184 gcc_assert (vectype); 2185 2186 /* At present we don't support versioning for alignment 2187 with variable VF, since there's no guarantee that the 2188 VF is a power of two. We could relax this if we added 2189 a way of enforcing a power-of-two size. 
*/ 2190 unsigned HOST_WIDE_INT size; 2191 if (!GET_MODE_SIZE (TYPE_MODE (vectype)).is_constant (&size)) 2192 { 2193 do_versioning = false; 2194 break; 2195 } 2196 2197 /* The rightmost bits of an aligned address must be zeros. 2198 Construct the mask needed for this test. For example, 2199 GET_MODE_SIZE for the vector mode V4SI is 16 bytes so the 2200 mask must be 15 = 0xf. */ 2201 mask = size - 1; 2202 2203 /* FORNOW: use the same mask to test all potentially unaligned 2204 references in the loop. The vectorizer currently supports 2205 a single vector size, see the reference to 2206 GET_MODE_NUNITS (TYPE_MODE (vectype)) where the 2207 vectorization factor is computed. */ 2208 gcc_assert (!LOOP_VINFO_PTR_MASK (loop_vinfo) 2209 || LOOP_VINFO_PTR_MASK (loop_vinfo) == mask); 2210 LOOP_VINFO_PTR_MASK (loop_vinfo) = mask; 2211 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).safe_push ( 2212 DR_STMT (dr)); 2213 } 2214 } 2215 2216 /* Versioning requires at least one misaligned data reference. */ 2217 if (!LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)) 2218 do_versioning = false; 2219 else if (!do_versioning) 2220 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0); 2221 } 2222 2223 if (do_versioning) 2224 { 2225 vec<gimple *> may_misalign_stmts 2226 = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo); 2227 gimple *stmt; 2228 2229 /* It can now be assumed that the data references in the statements 2230 in LOOP_VINFO_MAY_MISALIGN_STMTS will be aligned in the version 2231 of the loop being vectorized. 
*/ 2232 FOR_EACH_VEC_ELT (may_misalign_stmts, i, stmt) 2233 { 2234 stmt_vec_info stmt_info = vinfo_for_stmt (stmt); 2235 dr = STMT_VINFO_DATA_REF (stmt_info); 2236 SET_DR_MISALIGNMENT (dr, 0); 2237 if (dump_enabled_p ()) 2238 dump_printf_loc (MSG_NOTE, vect_location, 2239 "Alignment of access forced using versioning.\n"); 2240 } 2241 2242 if (dump_enabled_p ()) 2243 dump_printf_loc (MSG_NOTE, vect_location, 2244 "Versioning for alignment will be applied.\n"); 2245 2246 /* Peeling and versioning can't be done together at this time. */ 2247 gcc_assert (! (do_peeling && do_versioning)); 2248 2249 stat = vect_verify_datarefs_alignment (loop_vinfo); 2250 gcc_assert (stat); 2251 return stat; 2252 } 2253 2254 /* This point is reached if neither peeling nor versioning is being done. */ 2255 gcc_assert (! (do_peeling || do_versioning)); 2256 2257 stat = vect_verify_datarefs_alignment (loop_vinfo); 2258 return stat; 2259 } 2260 2261 2262 /* Function vect_find_same_alignment_drs. 2263 2264 Update group and alignment relations according to the chosen 2265 vectorization factor. */ 2266 2267 static void 2268 vect_find_same_alignment_drs (struct data_dependence_relation *ddr) 2269 { 2270 struct data_reference *dra = DDR_A (ddr); 2271 struct data_reference *drb = DDR_B (ddr); 2272 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra)); 2273 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb)); 2274 2275 if (DDR_ARE_DEPENDENT (ddr) == chrec_known) 2276 return; 2277 2278 if (dra == drb) 2279 return; 2280 2281 if (!operand_equal_p (DR_BASE_ADDRESS (dra), DR_BASE_ADDRESS (drb), 0) 2282 || !operand_equal_p (DR_OFFSET (dra), DR_OFFSET (drb), 0) 2283 || !operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0)) 2284 return; 2285 2286 /* Two references with distance zero have the same alignment. */ 2287 poly_offset_int diff = (wi::to_poly_offset (DR_INIT (dra)) 2288 - wi::to_poly_offset (DR_INIT (drb))); 2289 if (maybe_ne (diff, 0)) 2290 { 2291 /* Get the wider of the two alignments. 
*/ 2292 unsigned int align_a = (vect_calculate_target_alignment (dra) 2293 / BITS_PER_UNIT); 2294 unsigned int align_b = (vect_calculate_target_alignment (drb) 2295 / BITS_PER_UNIT); 2296 unsigned int max_align = MAX (align_a, align_b); 2297 2298 /* Require the gap to be a multiple of the larger vector alignment. */ 2299 if (!multiple_p (diff, max_align)) 2300 return; 2301 } 2302 2303 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_a).safe_push (drb); 2304 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_b).safe_push (dra); 2305 if (dump_enabled_p ()) 2306 { 2307 dump_printf_loc (MSG_NOTE, vect_location, 2308 "accesses have the same alignment: "); 2309 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra)); 2310 dump_printf (MSG_NOTE, " and "); 2311 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb)); 2312 dump_printf (MSG_NOTE, "\n"); 2313 } 2314 } 2315 2316 2317 /* Function vect_analyze_data_refs_alignment 2318 2319 Analyze the alignment of the data-references in the loop. 2320 Return FALSE if a data reference is found that cannot be vectorized. */ 2321 2322 bool 2323 vect_analyze_data_refs_alignment (loop_vec_info vinfo) 2324 { 2325 if (dump_enabled_p ()) 2326 dump_printf_loc (MSG_NOTE, vect_location, 2327 "=== vect_analyze_data_refs_alignment ===\n"); 2328 2329 /* Mark groups of data references with same alignment using 2330 data dependence information. 
*/ 2331 vec<ddr_p> ddrs = vinfo->ddrs; 2332 struct data_dependence_relation *ddr; 2333 unsigned int i; 2334 2335 FOR_EACH_VEC_ELT (ddrs, i, ddr) 2336 vect_find_same_alignment_drs (ddr); 2337 2338 vec<data_reference_p> datarefs = vinfo->datarefs; 2339 struct data_reference *dr; 2340 2341 vect_record_base_alignments (vinfo); 2342 FOR_EACH_VEC_ELT (datarefs, i, dr) 2343 { 2344 stmt_vec_info stmt_info = vinfo_for_stmt (DR_STMT (dr)); 2345 if (STMT_VINFO_VECTORIZABLE (stmt_info) 2346 && !vect_compute_data_ref_alignment (dr)) 2347 { 2348 /* Strided accesses perform only component accesses, misalignment 2349 information is irrelevant for them. */ 2350 if (STMT_VINFO_STRIDED_P (stmt_info) 2351 && !STMT_VINFO_GROUPED_ACCESS (stmt_info)) 2352 continue; 2353 2354 if (dump_enabled_p ()) 2355 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 2356 "not vectorized: can't calculate alignment " 2357 "for data ref.\n"); 2358 2359 return false; 2360 } 2361 } 2362 2363 return true; 2364 } 2365 2366 2367 /* Analyze alignment of DRs of stmts in NODE. */ 2368 2369 static bool 2370 vect_slp_analyze_and_verify_node_alignment (slp_tree node) 2371 { 2372 /* We vectorize from the first scalar stmt in the node unless 2373 the node is permuted in which case we start from the first 2374 element in the group. */ 2375 gimple *first_stmt = SLP_TREE_SCALAR_STMTS (node)[0]; 2376 data_reference_p first_dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt)); 2377 if (SLP_TREE_LOAD_PERMUTATION (node).exists ()) 2378 first_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (first_stmt)); 2379 2380 data_reference_p dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt)); 2381 if (! vect_compute_data_ref_alignment (dr) 2382 /* For creating the data-ref pointer we need alignment of the 2383 first element anyway. */ 2384 || (dr != first_dr 2385 && ! vect_compute_data_ref_alignment (first_dr)) 2386 || ! 
verify_data_ref_alignment (dr)) 2387 { 2388 if (dump_enabled_p ()) 2389 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 2390 "not vectorized: bad data alignment in basic " 2391 "block.\n"); 2392 return false; 2393 } 2394 2395 return true; 2396 } 2397 2398 /* Function vect_slp_analyze_instance_alignment 2399 2400 Analyze the alignment of the data-references in the SLP instance. 2401 Return FALSE if a data reference is found that cannot be vectorized. */ 2402 2403 bool 2404 vect_slp_analyze_and_verify_instance_alignment (slp_instance instance) 2405 { 2406 if (dump_enabled_p ()) 2407 dump_printf_loc (MSG_NOTE, vect_location, 2408 "=== vect_slp_analyze_and_verify_instance_alignment ===\n"); 2409 2410 slp_tree node; 2411 unsigned i; 2412 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, node) 2413 if (! vect_slp_analyze_and_verify_node_alignment (node)) 2414 return false; 2415 2416 node = SLP_INSTANCE_TREE (instance); 2417 if (STMT_VINFO_DATA_REF (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (node)[0])) 2418 && ! vect_slp_analyze_and_verify_node_alignment 2419 (SLP_INSTANCE_TREE (instance))) 2420 return false; 2421 2422 return true; 2423 } 2424 2425 2426 /* Analyze groups of accesses: check that DR belongs to a group of 2427 accesses of legal size, step, etc. Detect gaps, single element 2428 interleaving, and other special cases. Set grouped access info. 2429 Collect groups of strided stores for further use in SLP analysis. 2430 Worker for vect_analyze_group_access. 
*/ 2431 2432 static bool 2433 vect_analyze_group_access_1 (struct data_reference *dr) 2434 { 2435 tree step = DR_STEP (dr); 2436 tree scalar_type = TREE_TYPE (DR_REF (dr)); 2437 HOST_WIDE_INT type_size = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type)); 2438 gimple *stmt = DR_STMT (dr); 2439 stmt_vec_info stmt_info = vinfo_for_stmt (stmt); 2440 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info); 2441 bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info); 2442 HOST_WIDE_INT dr_step = -1; 2443 HOST_WIDE_INT groupsize, last_accessed_element = 1; 2444 bool slp_impossible = false; 2445 2446 /* For interleaving, GROUPSIZE is STEP counted in elements, i.e., the 2447 size of the interleaving group (including gaps). */ 2448 if (tree_fits_shwi_p (step)) 2449 { 2450 dr_step = tree_to_shwi (step); 2451 /* Check that STEP is a multiple of type size. Otherwise there is 2452 a non-element-sized gap at the end of the group which we 2453 cannot represent in GROUP_GAP or GROUP_SIZE. 2454 ??? As we can handle non-constant step fine here we should 2455 simply remove uses of GROUP_GAP between the last and first 2456 element and instead rely on DR_STEP. GROUP_SIZE then would 2457 simply not include that gap. */ 2458 if ((dr_step % type_size) != 0) 2459 { 2460 if (dump_enabled_p ()) 2461 { 2462 dump_printf_loc (MSG_NOTE, vect_location, 2463 "Step "); 2464 dump_generic_expr (MSG_NOTE, TDF_SLIM, step); 2465 dump_printf (MSG_NOTE, 2466 " is not a multiple of the element size for "); 2467 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr)); 2468 dump_printf (MSG_NOTE, "\n"); 2469 } 2470 return false; 2471 } 2472 groupsize = absu_hwi (dr_step) / type_size; 2473 } 2474 else 2475 groupsize = 0; 2476 2477 /* Not consecutive access is possible only if it is a part of interleaving. */ 2478 if (!GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt))) 2479 { 2480 /* Check if it this DR is a part of interleaving, and is a single 2481 element of the group that is accessed in the loop. 
*/ 2482 2483 /* Gaps are supported only for loads. STEP must be a multiple of the type 2484 size. */ 2485 if (DR_IS_READ (dr) 2486 && (dr_step % type_size) == 0 2487 && groupsize > 0) 2488 { 2489 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = stmt; 2490 GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize; 2491 GROUP_GAP (stmt_info) = groupsize - 1; 2492 if (dump_enabled_p ()) 2493 { 2494 dump_printf_loc (MSG_NOTE, vect_location, 2495 "Detected single element interleaving "); 2496 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr)); 2497 dump_printf (MSG_NOTE, " step "); 2498 dump_generic_expr (MSG_NOTE, TDF_SLIM, step); 2499 dump_printf (MSG_NOTE, "\n"); 2500 } 2501 2502 return true; 2503 } 2504 2505 if (dump_enabled_p ()) 2506 { 2507 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 2508 "not consecutive access "); 2509 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0); 2510 } 2511 2512 if (bb_vinfo) 2513 { 2514 /* Mark the statement as unvectorizable. */ 2515 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false; 2516 return true; 2517 } 2518 2519 dump_printf_loc (MSG_NOTE, vect_location, "using strided accesses\n"); 2520 STMT_VINFO_STRIDED_P (stmt_info) = true; 2521 return true; 2522 } 2523 2524 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) == stmt) 2525 { 2526 /* First stmt in the interleaving chain. Check the chain. */ 2527 gimple *next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt)); 2528 struct data_reference *data_ref = dr; 2529 unsigned int count = 1; 2530 tree prev_init = DR_INIT (data_ref); 2531 gimple *prev = stmt; 2532 HOST_WIDE_INT diff, gaps = 0; 2533 2534 /* By construction, all group members have INTEGER_CST DR_INITs. */ 2535 while (next) 2536 { 2537 /* Skip same data-refs. In case that two or more stmts share 2538 data-ref (supported only for loads), we vectorize only the first 2539 stmt, and the rest get their vectorized loads from the first 2540 one. 
*/ 2541 if (!tree_int_cst_compare (DR_INIT (data_ref), 2542 DR_INIT (STMT_VINFO_DATA_REF ( 2543 vinfo_for_stmt (next))))) 2544 { 2545 if (DR_IS_WRITE (data_ref)) 2546 { 2547 if (dump_enabled_p ()) 2548 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 2549 "Two store stmts share the same dr.\n"); 2550 return false; 2551 } 2552 2553 if (dump_enabled_p ()) 2554 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 2555 "Two or more load stmts share the same dr.\n"); 2556 2557 /* For load use the same data-ref load. */ 2558 GROUP_SAME_DR_STMT (vinfo_for_stmt (next)) = prev; 2559 2560 prev = next; 2561 next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next)); 2562 continue; 2563 } 2564 2565 prev = next; 2566 data_ref = STMT_VINFO_DATA_REF (vinfo_for_stmt (next)); 2567 2568 /* All group members have the same STEP by construction. */ 2569 gcc_checking_assert (operand_equal_p (DR_STEP (data_ref), step, 0)); 2570 2571 /* Check that the distance between two accesses is equal to the type 2572 size. Otherwise, we have gaps. */ 2573 diff = (TREE_INT_CST_LOW (DR_INIT (data_ref)) 2574 - TREE_INT_CST_LOW (prev_init)) / type_size; 2575 if (diff != 1) 2576 { 2577 /* FORNOW: SLP of accesses with gaps is not supported. */ 2578 slp_impossible = true; 2579 if (DR_IS_WRITE (data_ref)) 2580 { 2581 if (dump_enabled_p ()) 2582 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 2583 "interleaved store with gaps\n"); 2584 return false; 2585 } 2586 2587 gaps += diff - 1; 2588 } 2589 2590 last_accessed_element += diff; 2591 2592 /* Store the gap from the previous member of the group. If there is no 2593 gap in the access, GROUP_GAP is always 1. */ 2594 GROUP_GAP (vinfo_for_stmt (next)) = diff; 2595 2596 prev_init = DR_INIT (data_ref); 2597 next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next)); 2598 /* Count the number of data-refs in the chain. 
*/ 2599 count++; 2600 } 2601 2602 if (groupsize == 0) 2603 groupsize = count + gaps; 2604 2605 /* This could be UINT_MAX but as we are generating code in a very 2606 inefficient way we have to cap earlier. See PR78699 for example. */ 2607 if (groupsize > 4096) 2608 { 2609 if (dump_enabled_p ()) 2610 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 2611 "group is too large\n"); 2612 return false; 2613 } 2614 2615 /* Check that the size of the interleaving is equal to count for stores, 2616 i.e., that there are no gaps. */ 2617 if (groupsize != count 2618 && !DR_IS_READ (dr)) 2619 { 2620 if (dump_enabled_p ()) 2621 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 2622 "interleaved store with gaps\n"); 2623 return false; 2624 } 2625 2626 /* If there is a gap after the last load in the group it is the 2627 difference between the groupsize and the last accessed 2628 element. 2629 When there is no gap, this difference should be 0. */ 2630 GROUP_GAP (vinfo_for_stmt (stmt)) = groupsize - last_accessed_element; 2631 2632 GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize; 2633 if (dump_enabled_p ()) 2634 { 2635 dump_printf_loc (MSG_NOTE, vect_location, 2636 "Detected interleaving "); 2637 if (DR_IS_READ (dr)) 2638 dump_printf (MSG_NOTE, "load "); 2639 else 2640 dump_printf (MSG_NOTE, "store "); 2641 dump_printf (MSG_NOTE, "of size %u starting with ", 2642 (unsigned)groupsize); 2643 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0); 2644 if (GROUP_GAP (vinfo_for_stmt (stmt)) != 0) 2645 dump_printf_loc (MSG_NOTE, vect_location, 2646 "There is a gap of %u elements after the group\n", 2647 GROUP_GAP (vinfo_for_stmt (stmt))); 2648 } 2649 2650 /* SLP: create an SLP data structure for every interleaving group of 2651 stores for further analysis in vect_analyse_slp. 
*/ 2652 if (DR_IS_WRITE (dr) && !slp_impossible) 2653 { 2654 if (loop_vinfo) 2655 LOOP_VINFO_GROUPED_STORES (loop_vinfo).safe_push (stmt); 2656 if (bb_vinfo) 2657 BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (stmt); 2658 } 2659 } 2660 2661 return true; 2662 } 2663 2664 /* Analyze groups of accesses: check that DR belongs to a group of 2665 accesses of legal size, step, etc. Detect gaps, single element 2666 interleaving, and other special cases. Set grouped access info. 2667 Collect groups of strided stores for further use in SLP analysis. */ 2668 2669 static bool 2670 vect_analyze_group_access (struct data_reference *dr) 2671 { 2672 if (!vect_analyze_group_access_1 (dr)) 2673 { 2674 /* Dissolve the group if present. */ 2675 gimple *next; 2676 gimple *stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (dr))); 2677 while (stmt) 2678 { 2679 stmt_vec_info vinfo = vinfo_for_stmt (stmt); 2680 next = GROUP_NEXT_ELEMENT (vinfo); 2681 GROUP_FIRST_ELEMENT (vinfo) = NULL; 2682 GROUP_NEXT_ELEMENT (vinfo) = NULL; 2683 stmt = next; 2684 } 2685 return false; 2686 } 2687 return true; 2688 } 2689 2690 /* Analyze the access pattern of the data-reference DR. 2691 In case of non-consecutive accesses call vect_analyze_group_access() to 2692 analyze groups of accesses. 
*/ 2693 2694 static bool 2695 vect_analyze_data_ref_access (struct data_reference *dr) 2696 { 2697 tree step = DR_STEP (dr); 2698 tree scalar_type = TREE_TYPE (DR_REF (dr)); 2699 gimple *stmt = DR_STMT (dr); 2700 stmt_vec_info stmt_info = vinfo_for_stmt (stmt); 2701 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info); 2702 struct loop *loop = NULL; 2703 2704 if (STMT_VINFO_GATHER_SCATTER_P (stmt_info)) 2705 return true; 2706 2707 if (loop_vinfo) 2708 loop = LOOP_VINFO_LOOP (loop_vinfo); 2709 2710 if (loop_vinfo && !step) 2711 { 2712 if (dump_enabled_p ()) 2713 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 2714 "bad data-ref access in loop\n"); 2715 return false; 2716 } 2717 2718 /* Allow loads with zero step in inner-loop vectorization. */ 2719 if (loop_vinfo && integer_zerop (step)) 2720 { 2721 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL; 2722 if (!nested_in_vect_loop_p (loop, stmt)) 2723 return DR_IS_READ (dr); 2724 /* Allow references with zero step for outer loops marked 2725 with pragma omp simd only - it guarantees absence of 2726 loop-carried dependencies between inner loop iterations. */ 2727 if (loop->safelen < 2) 2728 { 2729 if (dump_enabled_p ()) 2730 dump_printf_loc (MSG_NOTE, vect_location, 2731 "zero step in inner loop of nest\n"); 2732 return false; 2733 } 2734 } 2735 2736 if (loop && nested_in_vect_loop_p (loop, stmt)) 2737 { 2738 /* Interleaved accesses are not yet supported within outer-loop 2739 vectorization for references in the inner-loop. */ 2740 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL; 2741 2742 /* For the rest of the analysis we use the outer-loop step. */ 2743 step = STMT_VINFO_DR_STEP (stmt_info); 2744 if (integer_zerop (step)) 2745 { 2746 if (dump_enabled_p ()) 2747 dump_printf_loc (MSG_NOTE, vect_location, 2748 "zero step in outer loop.\n"); 2749 return DR_IS_READ (dr); 2750 } 2751 } 2752 2753 /* Consecutive? 
*/ 2754 if (TREE_CODE (step) == INTEGER_CST) 2755 { 2756 HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step); 2757 if (!tree_int_cst_compare (step, TYPE_SIZE_UNIT (scalar_type)) 2758 || (dr_step < 0 2759 && !compare_tree_int (TYPE_SIZE_UNIT (scalar_type), -dr_step))) 2760 { 2761 /* Mark that it is not interleaving. */ 2762 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL; 2763 return true; 2764 } 2765 } 2766 2767 if (loop && nested_in_vect_loop_p (loop, stmt)) 2768 { 2769 if (dump_enabled_p ()) 2770 dump_printf_loc (MSG_NOTE, vect_location, 2771 "grouped access in outer loop.\n"); 2772 return false; 2773 } 2774 2775 2776 /* Assume this is a DR handled by non-constant strided load case. */ 2777 if (TREE_CODE (step) != INTEGER_CST) 2778 return (STMT_VINFO_STRIDED_P (stmt_info) 2779 && (!STMT_VINFO_GROUPED_ACCESS (stmt_info) 2780 || vect_analyze_group_access (dr))); 2781 2782 /* Not consecutive access - check if it's a part of interleaving group. */ 2783 return vect_analyze_group_access (dr); 2784 } 2785 2786 /* Compare two data-references DRA and DRB to group them into chunks 2787 suitable for grouping. */ 2788 2789 static int 2790 dr_group_sort_cmp (const void *dra_, const void *drb_) 2791 { 2792 data_reference_p dra = *(data_reference_p *)const_cast<void *>(dra_); 2793 data_reference_p drb = *(data_reference_p *)const_cast<void *>(drb_); 2794 int cmp; 2795 2796 /* Stabilize sort. */ 2797 if (dra == drb) 2798 return 0; 2799 2800 /* DRs in different loops never belong to the same group. */ 2801 loop_p loopa = gimple_bb (DR_STMT (dra))->loop_father; 2802 loop_p loopb = gimple_bb (DR_STMT (drb))->loop_father; 2803 if (loopa != loopb) 2804 return loopa->num < loopb->num ? -1 : 1; 2805 2806 /* Ordering of DRs according to base. */ 2807 cmp = data_ref_compare_tree (DR_BASE_ADDRESS (dra), 2808 DR_BASE_ADDRESS (drb)); 2809 if (cmp != 0) 2810 return cmp; 2811 2812 /* And according to DR_OFFSET. 
*/
  cmp = data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb));
  if (cmp != 0)
    return cmp;

  /* Put reads before writes.  */
  if (DR_IS_READ (dra) != DR_IS_READ (drb))
    return DR_IS_READ (dra) ? -1 : 1;

  /* Then sort after access size.  */
  cmp = data_ref_compare_tree (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))),
			       TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))));
  if (cmp != 0)
    return cmp;

  /* And after step.  */
  cmp = data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb));
  if (cmp != 0)
    return cmp;

  /* Then sort after DR_INIT.  In case of identical DRs sort after stmt UID.
     Using the UID as the final tie-breaker makes the comparison a total
     order, which qsort requires for deterministic results.  */
  cmp = data_ref_compare_tree (DR_INIT (dra), DR_INIT (drb));
  if (cmp == 0)
    return gimple_uid (DR_STMT (dra)) < gimple_uid (DR_STMT (drb)) ? -1 : 1;
  return cmp;
}

/* If OP is the result of a conversion, return the unconverted value,
   otherwise return null.  */

static tree
strip_conversion (tree op)
{
  if (TREE_CODE (op) != SSA_NAME)
    return NULL_TREE;
  gimple *stmt = SSA_NAME_DEF_STMT (op);
  if (!is_gimple_assign (stmt)
      || !CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (stmt)))
    return NULL_TREE;
  return gimple_assign_rhs1 (stmt);
}

/* Return true if vectorizable_* routines can handle statements STMT1
   and STMT2 being in a single group.  */

static bool
can_group_stmts_p (gimple *stmt1, gimple *stmt2)
{
  /* Two plain (unmasked) loads or stores can always be grouped.  */
  if (gimple_assign_single_p (stmt1))
    return gimple_assign_single_p (stmt2);

  if (is_gimple_call (stmt1) && gimple_call_internal_p (stmt1))
    {
      /* Check for two masked loads or two masked stores.  */
      if (!is_gimple_call (stmt2) || !gimple_call_internal_p (stmt2))
	return false;
      internal_fn ifn = gimple_call_internal_fn (stmt1);
      if (ifn != IFN_MASK_LOAD && ifn != IFN_MASK_STORE)
	return false;
      if (ifn != gimple_call_internal_fn (stmt2))
	return false;

      /* Check that the masks are the same.  Cope with casts of masks,
	 like those created by build_mask_conversion.  */
      tree mask1 = gimple_call_arg (stmt1, 2);
      tree mask2 = gimple_call_arg (stmt2, 2);
      if (!operand_equal_p (mask1, mask2, 0))
	{
	  mask1 = strip_conversion (mask1);
	  if (!mask1)
	    return false;
	  mask2 = strip_conversion (mask2);
	  if (!mask2)
	    return false;
	  if (!operand_equal_p (mask1, mask2, 0))
	    return false;
	}
      return true;
    }

  return false;
}

/* Function vect_analyze_data_ref_accesses.

   Analyze the access pattern of all the data references in the loop.

   FORNOW: the only access pattern that is considered vectorizable is a
	   simple step 1 (consecutive) access.

   FORNOW: handle only arrays and pointer accesses.  */

bool
vect_analyze_data_ref_accesses (vec_info *vinfo)
{
  unsigned int i;
  vec<data_reference_p> datarefs = vinfo->datarefs;
  struct data_reference *dr;

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
		     "=== vect_analyze_data_ref_accesses ===\n");

  if (datarefs.is_empty ())
    return true;

  /* Sort the array of datarefs to make building the interleaving chains
     linear.  Don't modify the original vector's order, it is needed for
     determining what dependencies are reversed.  */
  vec<data_reference_p> datarefs_copy = datarefs.copy ();
  datarefs_copy.qsort (dr_group_sort_cmp);

  /* Build the interleaving chains.  The outer loop picks a group leader
     DRA; the inner loop consumes the sorted datarefs that belong to the
     same group, breaking out at the first dataref that cannot join.  */
  for (i = 0; i < datarefs_copy.length () - 1;)
    {
      data_reference_p dra = datarefs_copy[i];
      stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
      stmt_vec_info lastinfo = NULL;
      if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
	  || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a))
	{
	  ++i;
	  continue;
	}
      for (i = i + 1; i < datarefs_copy.length (); ++i)
	{
	  data_reference_p drb = datarefs_copy[i];
	  stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
	  if (!STMT_VINFO_VECTORIZABLE (stmtinfo_b)
	      || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
	    break;

	  /* ??? Imperfect sorting (non-compatible types, non-modulo
	     accesses, same accesses) can lead to a group to be artificially
	     split here as we don't just skip over those.  If it really
	     matters we can push those to a worklist and re-iterate
	     over them.  The we can just skip ahead to the next DR here.  */

	  /* DRs in a different loop should not be put into the same
	     interleaving group.  */
	  if (gimple_bb (DR_STMT (dra))->loop_father
	      != gimple_bb (DR_STMT (drb))->loop_father)
	    break;

	  /* Check that the data-refs have same first location (except init)
	     and they are both either store or load (not load and store,
	     not masked loads or stores).  */
	  if (DR_IS_READ (dra) != DR_IS_READ (drb)
	      || data_ref_compare_tree (DR_BASE_ADDRESS (dra),
					DR_BASE_ADDRESS (drb)) != 0
	      || data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb)) != 0
	      || !can_group_stmts_p (DR_STMT (dra), DR_STMT (drb)))
	    break;

	  /* Check that the data-refs have the same constant size.  */
	  tree sza = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra)));
	  tree szb = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb)));
	  if (!tree_fits_uhwi_p (sza)
	      || !tree_fits_uhwi_p (szb)
	      || !tree_int_cst_equal (sza, szb))
	    break;

	  /* Check that the data-refs have the same step.  */
	  if (data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb)) != 0)
	    break;

	  /* Check the types are compatible.
	     ??? We don't distinguish this during sorting.  */
	  if (!types_compatible_p (TREE_TYPE (DR_REF (dra)),
				   TREE_TYPE (DR_REF (drb))))
	    break;

	  /* Check that the DR_INITs are compile-time constants.  */
	  if (TREE_CODE (DR_INIT (dra)) != INTEGER_CST
	      || TREE_CODE (DR_INIT (drb)) != INTEGER_CST)
	    break;

	  /* Sorting has ensured that DR_INIT (dra) <= DR_INIT (drb).  */
	  HOST_WIDE_INT init_a = TREE_INT_CST_LOW (DR_INIT (dra));
	  HOST_WIDE_INT init_b = TREE_INT_CST_LOW (DR_INIT (drb));
	  HOST_WIDE_INT init_prev
	    = TREE_INT_CST_LOW (DR_INIT (datarefs_copy[i-1]));
	  gcc_assert (init_a <= init_b
		      && init_a <= init_prev
		      && init_prev <= init_b);

	  /* Do not place the same access in the interleaving chain twice.  */
	  if (init_b == init_prev)
	    {
	      gcc_assert (gimple_uid (DR_STMT (datarefs_copy[i-1]))
			  < gimple_uid (DR_STMT (drb)));
	      /* ??? For now we simply "drop" the later reference which is
	         otherwise the same rather than finishing off this group.
		 In the end we'd want to re-process duplicates forming
		 multiple groups from the refs, likely by just collecting
		 all candidates (including duplicates and split points
		 below) in a vector and then process them together.  */
	      continue;
	    }

	  /* If init_b == init_a + the size of the type * k, we have an
	     interleaving, and DRA is accessed before DRB.  */
	  HOST_WIDE_INT type_size_a = tree_to_uhwi (sza);
	  if (type_size_a == 0
	      || (init_b - init_a) % type_size_a != 0)
	    break;

	  /* If we have a store, the accesses are adjacent.  This splits
	     groups into chunks we support (we don't support vectorization
	     of stores with gaps).  */
	  if (!DR_IS_READ (dra) && init_b - init_prev != type_size_a)
	    break;

	  /* If the step (if not zero or non-constant) is greater than the
	     difference between data-refs' inits this splits groups into
	     suitable sizes.  */
	  if (tree_fits_shwi_p (DR_STEP (dra)))
	    {
	      HOST_WIDE_INT step = tree_to_shwi (DR_STEP (dra));
	      if (step != 0 && step <= (init_b - init_a))
		break;
	    }

	  if (dump_enabled_p ())
	    {
	      dump_printf_loc (MSG_NOTE, vect_location,
			       "Detected interleaving ");
	      if (DR_IS_READ (dra))
		dump_printf (MSG_NOTE, "load ");
	      else
		dump_printf (MSG_NOTE, "store ");
	      dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
	      dump_printf (MSG_NOTE, " and ");
	      dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
	      dump_printf (MSG_NOTE, "\n");
	    }

	  /* Link the found element into the group list.  */
	  if (!GROUP_FIRST_ELEMENT (stmtinfo_a))
	    {
	      GROUP_FIRST_ELEMENT (stmtinfo_a) = DR_STMT (dra);
	      lastinfo = stmtinfo_a;
	    }
	  GROUP_FIRST_ELEMENT (stmtinfo_b) = DR_STMT (dra);
	  GROUP_NEXT_ELEMENT (lastinfo) = DR_STMT (drb);
	  lastinfo = stmtinfo_b;
	}
    }

  /* Now analyze each access pattern individually.  */
  FOR_EACH_VEC_ELT (datarefs_copy, i, dr)
    if (STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr)))
	&& !vect_analyze_data_ref_access (dr))
      {
	if (dump_enabled_p ())
	  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			   "not vectorized: complicated access pattern.\n");

	if (is_a <bb_vec_info> (vinfo))
	  {
	    /* Mark the statement as not vectorizable.  */
	    STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
	    continue;
	  }
	else
	  {
	    datarefs_copy.release ();
	    return false;
	  }
      }

  datarefs_copy.release ();
  return true;
}

/* Function vect_vfa_segment_size.

   Input:
   DR: The data reference.
   LENGTH_FACTOR: segment length to consider.
3091 3092 Return a value suitable for the dr_with_seg_len::seg_len field. 3093 This is the "distance travelled" by the pointer from the first 3094 iteration in the segment to the last. Note that it does not include 3095 the size of the access; in effect it only describes the first byte. */ 3096 3097 static tree 3098 vect_vfa_segment_size (struct data_reference *dr, tree length_factor) 3099 { 3100 length_factor = size_binop (MINUS_EXPR, 3101 fold_convert (sizetype, length_factor), 3102 size_one_node); 3103 return size_binop (MULT_EXPR, fold_convert (sizetype, DR_STEP (dr)), 3104 length_factor); 3105 } 3106 3107 /* Return a value that, when added to abs (vect_vfa_segment_size (dr)), 3108 gives the worst-case number of bytes covered by the segment. */ 3109 3110 static unsigned HOST_WIDE_INT 3111 vect_vfa_access_size (data_reference *dr) 3112 { 3113 stmt_vec_info stmt_vinfo = vinfo_for_stmt (DR_STMT (dr)); 3114 tree ref_type = TREE_TYPE (DR_REF (dr)); 3115 unsigned HOST_WIDE_INT ref_size = tree_to_uhwi (TYPE_SIZE_UNIT (ref_type)); 3116 unsigned HOST_WIDE_INT access_size = ref_size; 3117 if (GROUP_FIRST_ELEMENT (stmt_vinfo)) 3118 { 3119 gcc_assert (GROUP_FIRST_ELEMENT (stmt_vinfo) == DR_STMT (dr)); 3120 access_size *= GROUP_SIZE (stmt_vinfo) - GROUP_GAP (stmt_vinfo); 3121 } 3122 if (STMT_VINFO_VEC_STMT (stmt_vinfo) 3123 && (vect_supportable_dr_alignment (dr, false) 3124 == dr_explicit_realign_optimized)) 3125 { 3126 /* We might access a full vector's worth. */ 3127 tree vectype = STMT_VINFO_VECTYPE (stmt_vinfo); 3128 access_size += tree_to_uhwi (TYPE_SIZE_UNIT (vectype)) - ref_size; 3129 } 3130 return access_size; 3131 } 3132 3133 /* Get the minimum alignment for all the scalar accesses that DR describes. */ 3134 3135 static unsigned int 3136 vect_vfa_align (const data_reference *dr) 3137 { 3138 return TYPE_ALIGN_UNIT (TREE_TYPE (DR_REF (dr))); 3139 } 3140 3141 /* Function vect_no_alias_p. 
3142 3143 Given data references A and B with equal base and offset, see whether 3144 the alias relation can be decided at compilation time. Return 1 if 3145 it can and the references alias, 0 if it can and the references do 3146 not alias, and -1 if we cannot decide at compile time. SEGMENT_LENGTH_A, 3147 SEGMENT_LENGTH_B, ACCESS_SIZE_A and ACCESS_SIZE_B are the equivalent 3148 of dr_with_seg_len::{seg_len,access_size} for A and B. */ 3149 3150 static int 3151 vect_compile_time_alias (struct data_reference *a, struct data_reference *b, 3152 tree segment_length_a, tree segment_length_b, 3153 unsigned HOST_WIDE_INT access_size_a, 3154 unsigned HOST_WIDE_INT access_size_b) 3155 { 3156 poly_offset_int offset_a = wi::to_poly_offset (DR_INIT (a)); 3157 poly_offset_int offset_b = wi::to_poly_offset (DR_INIT (b)); 3158 poly_uint64 const_length_a; 3159 poly_uint64 const_length_b; 3160 3161 /* For negative step, we need to adjust address range by TYPE_SIZE_UNIT 3162 bytes, e.g., int a[3] -> a[1] range is [a+4, a+16) instead of 3163 [a, a+12) */ 3164 if (tree_int_cst_compare (DR_STEP (a), size_zero_node) < 0) 3165 { 3166 const_length_a = (-wi::to_poly_wide (segment_length_a)).force_uhwi (); 3167 offset_a = (offset_a + access_size_a) - const_length_a; 3168 } 3169 else 3170 const_length_a = tree_to_poly_uint64 (segment_length_a); 3171 if (tree_int_cst_compare (DR_STEP (b), size_zero_node) < 0) 3172 { 3173 const_length_b = (-wi::to_poly_wide (segment_length_b)).force_uhwi (); 3174 offset_b = (offset_b + access_size_b) - const_length_b; 3175 } 3176 else 3177 const_length_b = tree_to_poly_uint64 (segment_length_b); 3178 3179 const_length_a += access_size_a; 3180 const_length_b += access_size_b; 3181 3182 if (ranges_known_overlap_p (offset_a, const_length_a, 3183 offset_b, const_length_b)) 3184 return 1; 3185 3186 if (!ranges_maybe_overlap_p (offset_a, const_length_a, 3187 offset_b, const_length_b)) 3188 return 0; 3189 3190 return -1; 3191 } 3192 3193 /* Return true if the minimum 
nonzero dependence distance for loop LOOP_DEPTH 3194 in DDR is >= VF. */ 3195 3196 static bool 3197 dependence_distance_ge_vf (data_dependence_relation *ddr, 3198 unsigned int loop_depth, poly_uint64 vf) 3199 { 3200 if (DDR_ARE_DEPENDENT (ddr) != NULL_TREE 3201 || DDR_NUM_DIST_VECTS (ddr) == 0) 3202 return false; 3203 3204 /* If the dependence is exact, we should have limited the VF instead. */ 3205 gcc_checking_assert (DDR_COULD_BE_INDEPENDENT_P (ddr)); 3206 3207 unsigned int i; 3208 lambda_vector dist_v; 3209 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v) 3210 { 3211 HOST_WIDE_INT dist = dist_v[loop_depth]; 3212 if (dist != 0 3213 && !(dist > 0 && DDR_REVERSED_P (ddr)) 3214 && maybe_lt ((unsigned HOST_WIDE_INT) abs_hwi (dist), vf)) 3215 return false; 3216 } 3217 3218 if (dump_enabled_p ()) 3219 { 3220 dump_printf_loc (MSG_NOTE, vect_location, 3221 "dependence distance between "); 3222 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_A (ddr))); 3223 dump_printf (MSG_NOTE, " and "); 3224 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_B (ddr))); 3225 dump_printf (MSG_NOTE, " is >= VF\n"); 3226 } 3227 3228 return true; 3229 } 3230 3231 /* Dump LOWER_BOUND using flags DUMP_KIND. Dumps are known to be enabled. */ 3232 3233 static void 3234 dump_lower_bound (int dump_kind, const vec_lower_bound &lower_bound) 3235 { 3236 dump_printf (dump_kind, "%s (", lower_bound.unsigned_p ? "unsigned" : "abs"); 3237 dump_generic_expr (dump_kind, TDF_SLIM, lower_bound.expr); 3238 dump_printf (dump_kind, ") >= "); 3239 dump_dec (dump_kind, lower_bound.min_value); 3240 } 3241 3242 /* Record that the vectorized loop requires the vec_lower_bound described 3243 by EXPR, UNSIGNED_P and MIN_VALUE. 
*/ 3244 3245 static void 3246 vect_check_lower_bound (loop_vec_info loop_vinfo, tree expr, bool unsigned_p, 3247 poly_uint64 min_value) 3248 { 3249 vec<vec_lower_bound> lower_bounds = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo); 3250 for (unsigned int i = 0; i < lower_bounds.length (); ++i) 3251 if (operand_equal_p (lower_bounds[i].expr, expr, 0)) 3252 { 3253 unsigned_p &= lower_bounds[i].unsigned_p; 3254 min_value = upper_bound (lower_bounds[i].min_value, min_value); 3255 if (lower_bounds[i].unsigned_p != unsigned_p 3256 || maybe_lt (lower_bounds[i].min_value, min_value)) 3257 { 3258 lower_bounds[i].unsigned_p = unsigned_p; 3259 lower_bounds[i].min_value = min_value; 3260 if (dump_enabled_p ()) 3261 { 3262 dump_printf_loc (MSG_NOTE, vect_location, 3263 "updating run-time check to "); 3264 dump_lower_bound (MSG_NOTE, lower_bounds[i]); 3265 dump_printf (MSG_NOTE, "\n"); 3266 } 3267 } 3268 return; 3269 } 3270 3271 vec_lower_bound lower_bound (expr, unsigned_p, min_value); 3272 if (dump_enabled_p ()) 3273 { 3274 dump_printf_loc (MSG_NOTE, vect_location, "need a run-time check that "); 3275 dump_lower_bound (MSG_NOTE, lower_bound); 3276 dump_printf (MSG_NOTE, "\n"); 3277 } 3278 LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).safe_push (lower_bound); 3279 } 3280 3281 /* Return true if it's unlikely that the step of the vectorized form of DR 3282 will span fewer than GAP bytes. */ 3283 3284 static bool 3285 vect_small_gap_p (loop_vec_info loop_vinfo, data_reference *dr, poly_int64 gap) 3286 { 3287 stmt_vec_info stmt_info = vinfo_for_stmt (DR_STMT (dr)); 3288 HOST_WIDE_INT count 3289 = estimated_poly_value (LOOP_VINFO_VECT_FACTOR (loop_vinfo)); 3290 if (GROUP_FIRST_ELEMENT (stmt_info)) 3291 count *= GROUP_SIZE (vinfo_for_stmt (GROUP_FIRST_ELEMENT (stmt_info))); 3292 return estimated_poly_value (gap) <= count * vect_get_scalar_dr_size (dr); 3293 } 3294 3295 /* Return true if we know that there is no alias between DR_A and DR_B 3296 when abs (DR_STEP (DR_A)) >= N for some N. 
When returning true, set 3297 *LOWER_BOUND_OUT to this N. */ 3298 3299 static bool 3300 vectorizable_with_step_bound_p (data_reference *dr_a, data_reference *dr_b, 3301 poly_uint64 *lower_bound_out) 3302 { 3303 /* Check that there is a constant gap of known sign between DR_A 3304 and DR_B. */ 3305 poly_int64 init_a, init_b; 3306 if (!operand_equal_p (DR_BASE_ADDRESS (dr_a), DR_BASE_ADDRESS (dr_b), 0) 3307 || !operand_equal_p (DR_OFFSET (dr_a), DR_OFFSET (dr_b), 0) 3308 || !operand_equal_p (DR_STEP (dr_a), DR_STEP (dr_b), 0) 3309 || !poly_int_tree_p (DR_INIT (dr_a), &init_a) 3310 || !poly_int_tree_p (DR_INIT (dr_b), &init_b) 3311 || !ordered_p (init_a, init_b)) 3312 return false; 3313 3314 /* Sort DR_A and DR_B by the address they access. */ 3315 if (maybe_lt (init_b, init_a)) 3316 { 3317 std::swap (init_a, init_b); 3318 std::swap (dr_a, dr_b); 3319 } 3320 3321 /* If the two accesses could be dependent within a scalar iteration, 3322 make sure that we'd retain their order. */ 3323 if (maybe_gt (init_a + vect_get_scalar_dr_size (dr_a), init_b) 3324 && !vect_preserves_scalar_order_p (DR_STMT (dr_a), DR_STMT (dr_b))) 3325 return false; 3326 3327 /* There is no alias if abs (DR_STEP) is greater than or equal to 3328 the bytes spanned by the combination of the two accesses. */ 3329 *lower_bound_out = init_b + vect_get_scalar_dr_size (dr_b) - init_a; 3330 return true; 3331 } 3332 3333 /* Function vect_prune_runtime_alias_test_list. 3334 3335 Prune a list of ddrs to be tested at run-time by versioning for alias. 3336 Merge several alias checks into one if possible. 3337 Return FALSE if resulting list of ddrs is longer then allowed by 3338 PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS, otherwise return TRUE. 
*/

bool
vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo)
{
  typedef pair_hash <tree_operand_hash, tree_operand_hash> tree_pair_hash;
  hash_set <tree_pair_hash> compared_objects;

  vec<ddr_p> may_alias_ddrs = LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo);
  vec<dr_with_seg_len_pair_t> &comp_alias_ddrs
    = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo);
  vec<vec_object_pair> &check_unequal_addrs
    = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo);
  poly_uint64 vect_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
  tree scalar_loop_iters = LOOP_VINFO_NITERS (loop_vinfo);

  ddr_p ddr;
  unsigned int i;
  tree length_factor;

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
		     "=== vect_prune_runtime_alias_test_list ===\n");

  /* Step values are irrelevant for aliasing if the number of vector
     iterations is equal to the number of scalar iterations (which can
     happen for fully-SLP loops).  */
  bool ignore_step_p = known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1U);

  if (!ignore_step_p)
    {
      /* Convert the checks for nonzero steps into bound tests.  */
      tree value;
      FOR_EACH_VEC_ELT (LOOP_VINFO_CHECK_NONZERO (loop_vinfo), i, value)
	vect_check_lower_bound (loop_vinfo, value, true, 1);
    }

  if (may_alias_ddrs.is_empty ())
    return true;

  comp_alias_ddrs.create (may_alias_ddrs.length ());

  unsigned int loop_depth
    = index_in_loop_nest (LOOP_VINFO_LOOP (loop_vinfo)->num,
			  LOOP_VINFO_LOOP_NEST (loop_vinfo));

  /* First, we collect all data ref pairs for aliasing checks.  */
  FOR_EACH_VEC_ELT (may_alias_ddrs, i, ddr)
    {
      int comp_res;
      poly_uint64 lower_bound;
      struct data_reference *dr_a, *dr_b;
      gimple *dr_group_first_a, *dr_group_first_b;
      tree segment_length_a, segment_length_b;
      unsigned HOST_WIDE_INT access_size_a, access_size_b;
      unsigned int align_a, align_b;
      gimple *stmt_a, *stmt_b;

      /* Ignore the alias if the VF we chose ended up being no greater
	 than the dependence distance.  */
      if (dependence_distance_ge_vf (ddr, loop_depth, vect_factor))
	continue;

      if (DDR_OBJECT_A (ddr))
	{
	  vec_object_pair new_pair (DDR_OBJECT_A (ddr), DDR_OBJECT_B (ddr));
	  if (!compared_objects.add (new_pair))
	    {
	      if (dump_enabled_p ())
		{
		  dump_printf_loc (MSG_NOTE, vect_location, "checking that ");
		  dump_generic_expr (MSG_NOTE, TDF_SLIM, new_pair.first);
		  dump_printf (MSG_NOTE, " and ");
		  dump_generic_expr (MSG_NOTE, TDF_SLIM, new_pair.second);
		  dump_printf (MSG_NOTE, " have different addresses\n");
		}
	      LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).safe_push (new_pair);
	    }
	  continue;
	}

      dr_a = DDR_A (ddr);
      stmt_a = DR_STMT (DDR_A (ddr));

      dr_b = DDR_B (ddr);
      stmt_b = DR_STMT (DDR_B (ddr));

      /* Skip the pair if inter-iteration dependencies are irrelevant
	 and intra-iteration dependencies are guaranteed to be honored.  */
      if (ignore_step_p
	  && (vect_preserves_scalar_order_p (stmt_a, stmt_b)
	      || vectorizable_with_step_bound_p (dr_a, dr_b, &lower_bound)))
	{
	  if (dump_enabled_p ())
	    {
	      dump_printf_loc (MSG_NOTE, vect_location,
			       "no need for alias check between ");
	      dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_a));
	      dump_printf (MSG_NOTE, " and ");
	      dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_b));
	      dump_printf (MSG_NOTE, " when VF is 1\n");
	    }
	  continue;
	}

      /* See whether we can handle the alias using a bounds check on
	 the step, and whether that's likely to be the best approach.
	 (It might not be, for example, if the minimum step is much larger
	 than the number of bytes handled by one vector iteration.)  */
      if (!ignore_step_p
	  && TREE_CODE (DR_STEP (dr_a)) != INTEGER_CST
	  && vectorizable_with_step_bound_p (dr_a, dr_b, &lower_bound)
	  && (vect_small_gap_p (loop_vinfo, dr_a, lower_bound)
	      || vect_small_gap_p (loop_vinfo, dr_b, lower_bound)))
	{
	  bool unsigned_p = dr_known_forward_stride_p (dr_a);
	  if (dump_enabled_p ())
	    {
	      dump_printf_loc (MSG_NOTE, vect_location, "no alias between ");
	      dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_a));
	      dump_printf (MSG_NOTE, " and ");
	      dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_b));
	      dump_printf (MSG_NOTE, " when the step ");
	      dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_STEP (dr_a));
	      dump_printf (MSG_NOTE, " is outside ");
	      if (unsigned_p)
		dump_printf (MSG_NOTE, "[0");
	      else
		{
		  dump_printf (MSG_NOTE, "(");
		  dump_dec (MSG_NOTE, poly_int64 (-lower_bound));
		}
	      dump_printf (MSG_NOTE, ", ");
	      dump_dec (MSG_NOTE, lower_bound);
	      dump_printf (MSG_NOTE, ")\n");
	    }
	  vect_check_lower_bound (loop_vinfo, DR_STEP (dr_a), unsigned_p,
				  lower_bound);
	  continue;
	}

      dr_group_first_a = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_a));
      if (dr_group_first_a)
	{
	  stmt_a = dr_group_first_a;
	  dr_a = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_a));
	}

      dr_group_first_b = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_b));
      if (dr_group_first_b)
	{
	  stmt_b = dr_group_first_b;
	  dr_b = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_b));
	}

      if (ignore_step_p)
	{
	  segment_length_a = size_zero_node;
	  segment_length_b = size_zero_node;
	}
      else
	{
	  if (!operand_equal_p (DR_STEP (dr_a), DR_STEP (dr_b), 0))
	    length_factor = scalar_loop_iters;
	  else
	    length_factor = size_int (vect_factor);
	  segment_length_a = vect_vfa_segment_size (dr_a, length_factor);
	  segment_length_b = vect_vfa_segment_size (dr_b, length_factor);
	}
      access_size_a = vect_vfa_access_size (dr_a);
      access_size_b = vect_vfa_access_size (dr_b);
      align_a = vect_vfa_align (dr_a);
      align_b = vect_vfa_align (dr_b);

      comp_res = data_ref_compare_tree (DR_BASE_ADDRESS (dr_a),
					DR_BASE_ADDRESS (dr_b));
      if (comp_res == 0)
	comp_res = data_ref_compare_tree (DR_OFFSET (dr_a),
					  DR_OFFSET (dr_b));

      /* See whether the alias is known at compilation time.  */
      if (comp_res == 0
	  && TREE_CODE (DR_STEP (dr_a)) == INTEGER_CST
	  && TREE_CODE (DR_STEP (dr_b)) == INTEGER_CST
	  && poly_int_tree_p (segment_length_a)
	  && poly_int_tree_p (segment_length_b))
	{
	  int res = vect_compile_time_alias (dr_a, dr_b,
					     segment_length_a,
					     segment_length_b,
					     access_size_a,
					     access_size_b);
	  if (res >= 0 && dump_enabled_p ())
	    {
	      dump_printf_loc (MSG_NOTE, vect_location,
			       "can tell at compile time that ");
	      dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_a));
	      dump_printf (MSG_NOTE, " and ");
	      dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_b));
	      if (res == 0)
		dump_printf (MSG_NOTE, " do not alias\n");
	      else
		dump_printf (MSG_NOTE, " alias\n");
	    }

	  if (res == 0)
	    continue;

	  if (res == 1)
	    {
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_NOTE, vect_location,
				 "not vectorized: compilation time alias.\n");
	      return false;
	    }
	}

      dr_with_seg_len_pair_t dr_with_seg_len_pair
	(dr_with_seg_len (dr_a, segment_length_a, access_size_a, align_a),
	 dr_with_seg_len (dr_b, segment_length_b, access_size_b, align_b));

      /* Canonicalize pairs by sorting the two DR members.  */
      if (comp_res > 0)
	std::swap (dr_with_seg_len_pair.first, dr_with_seg_len_pair.second);

      comp_alias_ddrs.safe_push (dr_with_seg_len_pair);
    }

  prune_runtime_alias_test_list (&comp_alias_ddrs, vect_factor);

  unsigned int count = (comp_alias_ddrs.length ()
			+ check_unequal_addrs.length ());

  /* Guard the dump output like every other dump call in this function;
     without the check this line was printed even when dumping is
     disabled.  */
  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
		     "improved number of alias checks from %d to %d\n",
		     may_alias_ddrs.length (), count);
  if ((int) count > PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS))
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			 "number of versioning for alias "
			 "run-time tests exceeds %d "
			 "(--param vect-max-version-for-alias-checks)\n",
			 PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS));
      return false;
    }

  return true;
}

/* Check whether we can use an internal function for a gather load
   or scatter store.  READ_P is true for loads and false for stores.
   MASKED_P is true if the load or store is conditional.  MEMORY_TYPE is
   the type of the memory elements being loaded or stored.  OFFSET_BITS
   is the number of bits in each scalar offset and OFFSET_SIGN is the
   sign of the offset.  SCALE is the amount by which the offset should
   be multiplied *after* it has been converted to address width.

   Return true if the function is supported, storing the function
   id in *IFN_OUT and the type of a vector element in *ELEMENT_TYPE_OUT.
*/ 3598 3599 bool 3600 vect_gather_scatter_fn_p (bool read_p, bool masked_p, tree vectype, 3601 tree memory_type, unsigned int offset_bits, 3602 signop offset_sign, int scale, 3603 internal_fn *ifn_out, tree *element_type_out) 3604 { 3605 unsigned int memory_bits = tree_to_uhwi (TYPE_SIZE (memory_type)); 3606 unsigned int element_bits = tree_to_uhwi (TYPE_SIZE (TREE_TYPE (vectype))); 3607 if (offset_bits > element_bits) 3608 /* Internal functions require the offset to be the same width as 3609 the vector elements. We can extend narrower offsets, but it isn't 3610 safe to truncate wider offsets. */ 3611 return false; 3612 3613 if (element_bits != memory_bits) 3614 /* For now the vector elements must be the same width as the 3615 memory elements. */ 3616 return false; 3617 3618 /* Work out which function we need. */ 3619 internal_fn ifn; 3620 if (read_p) 3621 ifn = masked_p ? IFN_MASK_GATHER_LOAD : IFN_GATHER_LOAD; 3622 else 3623 ifn = masked_p ? IFN_MASK_SCATTER_STORE : IFN_SCATTER_STORE; 3624 3625 /* Test whether the target supports this combination. */ 3626 if (!internal_gather_scatter_fn_supported_p (ifn, vectype, memory_type, 3627 offset_sign, scale)) 3628 return false; 3629 3630 *ifn_out = ifn; 3631 *element_type_out = TREE_TYPE (vectype); 3632 return true; 3633 } 3634 3635 /* CALL is a call to an internal gather load or scatter store function. 3636 Describe the operation in INFO. 
*/

static void
vect_describe_gather_scatter_call (gcall *call, gather_scatter_info *info)
{
  stmt_vec_info stmt_info = vinfo_for_stmt (call);
  tree vectype = STMT_VINFO_VECTYPE (stmt_info);
  data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);

  /* Arguments of the internal function: 0 = base, 1 = offset, 2 = scale.  */
  info->ifn = gimple_call_internal_fn (call);
  info->decl = NULL_TREE;
  info->base = gimple_call_arg (call, 0);
  info->offset = gimple_call_arg (call, 1);
  info->offset_dt = vect_unknown_def_type;
  info->offset_vectype = NULL_TREE;
  info->scale = TREE_INT_CST_LOW (gimple_call_arg (call, 2));
  info->element_type = TREE_TYPE (vectype);
  info->memory_type = TREE_TYPE (DR_REF (dr));
}

/* Return true if a non-affine read or write in STMT is suitable for a
   gather load or scatter store.  Describe the operation in *INFO if so.  */

bool
vect_check_gather_scatter (gimple *stmt, loop_vec_info loop_vinfo,
			   gather_scatter_info *info)
{
  HOST_WIDE_INT scale = 1;
  poly_int64 pbitpos, pbitsize;
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
  struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
  tree offtype = NULL_TREE;
  tree decl = NULL_TREE, base, off;
  tree vectype = STMT_VINFO_VECTYPE (stmt_info);
  tree memory_type = TREE_TYPE (DR_REF (dr));
  machine_mode pmode;
  int punsignedp, reversep, pvolatilep = 0;
  internal_fn ifn;
  tree element_type;
  bool masked_p = false;

  /* See whether this is already a call to a gather/scatter internal function.
     If not, see whether it's a masked load or store.  */
  gcall *call = dyn_cast <gcall *> (stmt);
  if (call && gimple_call_internal_p (call))
    {
      ifn = gimple_call_internal_fn (stmt);
      if (internal_gather_scatter_fn_p (ifn))
	{
	  vect_describe_gather_scatter_call (call, info);
	  return true;
	}
      masked_p = (ifn == IFN_MASK_LOAD || ifn == IFN_MASK_STORE);
    }

  /* True if we should aim to use internal functions rather than
     built-in functions.  */
  bool use_ifn_p = (DR_IS_READ (dr)
		    ? supports_vec_gather_load_p ()
		    : supports_vec_scatter_store_p ());

  base = DR_REF (dr);
  /* For masked loads/stores, DR_REF (dr) is an artificial MEM_REF,
     see if we can use the def stmt of the address.  */
  if (masked_p
      && TREE_CODE (base) == MEM_REF
      && TREE_CODE (TREE_OPERAND (base, 0)) == SSA_NAME
      && integer_zerop (TREE_OPERAND (base, 1))
      && !expr_invariant_in_loop_p (loop, TREE_OPERAND (base, 0)))
    {
      gimple *def_stmt = SSA_NAME_DEF_STMT (TREE_OPERAND (base, 0));
      if (is_gimple_assign (def_stmt)
	  && gimple_assign_rhs_code (def_stmt) == ADDR_EXPR)
	base = TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 0);
    }

  /* The gather and scatter builtins need address of the form
     loop_invariant + vector * {1, 2, 4, 8}
     or
     loop_invariant + sign_extend (vector) * { 1, 2, 4, 8 }.
     Unfortunately DR_BASE_ADDRESS/DR_OFFSET can be a mixture
     of loop invariants/SSA_NAMEs defined in the loop, with casts,
     multiplications and additions in it.  To get a vector, we need
     a single SSA_NAME that will be defined in the loop and will
     contain everything that is not loop invariant and that can be
     vectorized.  The following code attempts to find such a preexistng
     SSA_NAME OFF and put the loop invariants into a tree BASE
     that can be gimplified before the loop.  */
  base = get_inner_reference (base, &pbitsize, &pbitpos, &off, &pmode,
			      &punsignedp, &reversep, &pvolatilep);
  gcc_assert (base && !reversep);
  poly_int64 pbytepos = exact_div (pbitpos, BITS_PER_UNIT);

  if (TREE_CODE (base) == MEM_REF)
    {
      if (!integer_zerop (TREE_OPERAND (base, 1)))
	{
	  if (off == NULL_TREE)
	    off = wide_int_to_tree (sizetype, mem_ref_offset (base));
	  else
	    off = size_binop (PLUS_EXPR, off,
			      fold_convert (sizetype, TREE_OPERAND (base, 1)));
	}
      base = TREE_OPERAND (base, 0);
    }
  else
    base = build_fold_addr_expr (base);

  if (off == NULL_TREE)
    off = size_zero_node;

  /* If base is not loop invariant, either off is 0, then we start with just
     the constant offset in the loop invariant BASE and continue with base
     as OFF, otherwise give up.
     We could handle that case by gimplifying the addition of base + off
     into some SSA_NAME and use that as off, but for now punt.  */
  if (!expr_invariant_in_loop_p (loop, base))
    {
      if (!integer_zerop (off))
	return false;
      off = base;
      base = size_int (pbytepos);
    }
  /* Otherwise put base + constant offset into the loop invariant BASE
     and continue with OFF.  */
  else
    {
      base = fold_convert (sizetype, base);
      base = size_binop (PLUS_EXPR, base, size_int (pbytepos));
    }

  /* OFF at this point may be either a SSA_NAME or some tree expression
     from get_inner_reference.  Try to peel off loop invariants from it
     into BASE as long as possible.  */
  STRIP_NOPS (off);
  while (offtype == NULL_TREE)
    {
      enum tree_code code;
      tree op0, op1, add = NULL_TREE;

      if (TREE_CODE (off) == SSA_NAME)
	{
	  gimple *def_stmt = SSA_NAME_DEF_STMT (off);

	  if (expr_invariant_in_loop_p (loop, off))
	    return false;

	  if (gimple_code (def_stmt) != GIMPLE_ASSIGN)
	    break;

	  op0 = gimple_assign_rhs1 (def_stmt);
	  code = gimple_assign_rhs_code (def_stmt);
	  op1 = gimple_assign_rhs2 (def_stmt);
	}
      else
	{
	  if (get_gimple_rhs_class (TREE_CODE (off)) == GIMPLE_TERNARY_RHS)
	    return false;
	  code = TREE_CODE (off);
	  extract_ops_from_tree (off, &code, &op0, &op1);
	}
      switch (code)
	{
	case POINTER_PLUS_EXPR:
	case PLUS_EXPR:
	  if (expr_invariant_in_loop_p (loop, op0))
	    {
	      add = op0;
	      off = op1;
	    do_add:
	      add = fold_convert (sizetype, add);
	      if (scale != 1)
		add = size_binop (MULT_EXPR, add, size_int (scale));
	      base = size_binop (PLUS_EXPR, base, add);
	      continue;
	    }
	  if (expr_invariant_in_loop_p (loop, op1))
	    {
	      add = op1;
	      off = op0;
	      goto do_add;
	    }
	  break;
	case MINUS_EXPR:
	  if (expr_invariant_in_loop_p (loop, op1))
	    {
	      add = fold_convert (sizetype, op1);
	      add = size_binop (MINUS_EXPR, size_zero_node, add);
	      off = op0;
	      goto do_add;
	    }
	  break;
	case MULT_EXPR:
	  if (scale == 1 && tree_fits_shwi_p (op1))
	    {
	      int new_scale = tree_to_shwi (op1);
	      /* Only treat this as a scaling operation if the target
		 supports it.  */
	      if (use_ifn_p
		  && !vect_gather_scatter_fn_p (DR_IS_READ (dr), masked_p,
						vectype, memory_type, 1,
						TYPE_SIGN (TREE_TYPE (op0)),
						new_scale, &ifn,
						&element_type))
		break;
	      scale = new_scale;
	      off = op0;
	      continue;
	    }
	  break;
	case SSA_NAME:
	  off = op0;
	  continue;
	CASE_CONVERT:
	  if (!POINTER_TYPE_P (TREE_TYPE (op0))
	      && !INTEGRAL_TYPE_P (TREE_TYPE (op0)))
	    break;
	  if (TYPE_PRECISION (TREE_TYPE (op0))
	      == TYPE_PRECISION (TREE_TYPE (off)))
	    {
	      off = op0;
	      continue;
	    }

	  /* The internal functions need the offset to be the same width
	     as the elements of VECTYPE.  Don't include operations that
	     cast the offset from that width to a different width.  */
	  if (use_ifn_p
	      && (int_size_in_bytes (TREE_TYPE (vectype))
		  == int_size_in_bytes (TREE_TYPE (off))))
	    break;

	  if (TYPE_PRECISION (TREE_TYPE (op0))
	      < TYPE_PRECISION (TREE_TYPE (off)))
	    {
	      off = op0;
	      offtype = TREE_TYPE (off);
	      STRIP_NOPS (off);
	      continue;
	    }
	  break;
	default:
	  break;
	}
      break;
    }

  /* If at the end OFF still isn't a SSA_NAME or isn't
     defined in the loop, punt.  */
  if (TREE_CODE (off) != SSA_NAME
      || expr_invariant_in_loop_p (loop, off))
    return false;

  if (offtype == NULL_TREE)
    offtype = TREE_TYPE (off);

  if (use_ifn_p)
    {
      if (!vect_gather_scatter_fn_p (DR_IS_READ (dr), masked_p, vectype,
				     memory_type, TYPE_PRECISION (offtype),
				     TYPE_SIGN (offtype), scale, &ifn,
				     &element_type))
	return false;
    }
  else
    {
      if (DR_IS_READ (dr))
	{
	  if (targetm.vectorize.builtin_gather)
	    decl = targetm.vectorize.builtin_gather (vectype, offtype, scale);
	}
      else
	{
	  if (targetm.vectorize.builtin_scatter)
	    decl = targetm.vectorize.builtin_scatter (vectype, offtype, scale);
	}

      if (!decl)
	return false;

      ifn = IFN_LAST;
      element_type = TREE_TYPE (vectype);
    }

  info->ifn = ifn;
  info->decl = decl;
  info->base = base;
  info->offset = off;
  info->offset_dt = vect_unknown_def_type;
  info->offset_vectype = NULL_TREE;
  info->scale = scale;
  info->element_type = element_type;
  info->memory_type = memory_type;
  return true;
}

/* Function vect_analyze_data_refs.

  Find all the data references in the loop or basic block.

   The general structure of the analysis of data refs in the vectorizer is as
   follows:
   1- vect_analyze_data_refs(loop/bb): call
      compute_data_dependences_for_loop/bb to find and analyze all data-refs
      in the loop/bb and their dependences.
   2- vect_analyze_dependences(): apply dependence testing using ddrs.
   3- vect_analyze_drs_alignment(): check that ref_stmt.alignment is ok.
   4- vect_analyze_drs_access(): check that ref_stmt.step is ok.
3945 3946 */ 3947 3948 bool 3949 vect_analyze_data_refs (vec_info *vinfo, poly_uint64 *min_vf) 3950 { 3951 struct loop *loop = NULL; 3952 unsigned int i; 3953 struct data_reference *dr; 3954 tree scalar_type; 3955 3956 if (dump_enabled_p ()) 3957 dump_printf_loc (MSG_NOTE, vect_location, 3958 "=== vect_analyze_data_refs ===\n"); 3959 3960 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo)) 3961 loop = LOOP_VINFO_LOOP (loop_vinfo); 3962 3963 /* Go through the data-refs, check that the analysis succeeded. Update 3964 pointer from stmt_vec_info struct to DR and vectype. */ 3965 3966 vec<data_reference_p> datarefs = vinfo->datarefs; 3967 FOR_EACH_VEC_ELT (datarefs, i, dr) 3968 { 3969 gimple *stmt; 3970 stmt_vec_info stmt_info; 3971 tree base, offset, init; 3972 enum { SG_NONE, GATHER, SCATTER } gatherscatter = SG_NONE; 3973 bool simd_lane_access = false; 3974 poly_uint64 vf; 3975 3976 again: 3977 if (!dr || !DR_REF (dr)) 3978 { 3979 if (dump_enabled_p ()) 3980 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 3981 "not vectorized: unhandled data-ref\n"); 3982 return false; 3983 } 3984 3985 stmt = DR_STMT (dr); 3986 stmt_info = vinfo_for_stmt (stmt); 3987 3988 /* Discard clobbers from the dataref vector. We will remove 3989 clobber stmts during vectorization. */ 3990 if (gimple_clobber_p (stmt)) 3991 { 3992 free_data_ref (dr); 3993 if (i == datarefs.length () - 1) 3994 { 3995 datarefs.pop (); 3996 break; 3997 } 3998 datarefs.ordered_remove (i); 3999 dr = datarefs[i]; 4000 goto again; 4001 } 4002 4003 /* Check that analysis of the data-ref succeeded. 
*/ 4004 if (!DR_BASE_ADDRESS (dr) || !DR_OFFSET (dr) || !DR_INIT (dr) 4005 || !DR_STEP (dr)) 4006 { 4007 bool maybe_gather 4008 = DR_IS_READ (dr) 4009 && !TREE_THIS_VOLATILE (DR_REF (dr)) 4010 && (targetm.vectorize.builtin_gather != NULL 4011 || supports_vec_gather_load_p ()); 4012 bool maybe_scatter 4013 = DR_IS_WRITE (dr) 4014 && !TREE_THIS_VOLATILE (DR_REF (dr)) 4015 && (targetm.vectorize.builtin_scatter != NULL 4016 || supports_vec_scatter_store_p ()); 4017 bool maybe_simd_lane_access 4018 = is_a <loop_vec_info> (vinfo) && loop->simduid; 4019 4020 /* If target supports vector gather loads or scatter stores, or if 4021 this might be a SIMD lane access, see if they can't be used. */ 4022 if (is_a <loop_vec_info> (vinfo) 4023 && (maybe_gather || maybe_scatter || maybe_simd_lane_access) 4024 && !nested_in_vect_loop_p (loop, stmt)) 4025 { 4026 struct data_reference *newdr 4027 = create_data_ref (NULL, loop_containing_stmt (stmt), 4028 DR_REF (dr), stmt, !maybe_scatter, 4029 DR_IS_CONDITIONAL_IN_STMT (dr)); 4030 gcc_assert (newdr != NULL && DR_REF (newdr)); 4031 if (DR_BASE_ADDRESS (newdr) 4032 && DR_OFFSET (newdr) 4033 && DR_INIT (newdr) 4034 && DR_STEP (newdr) 4035 && integer_zerop (DR_STEP (newdr))) 4036 { 4037 if (maybe_simd_lane_access) 4038 { 4039 tree off = DR_OFFSET (newdr); 4040 STRIP_NOPS (off); 4041 if (TREE_CODE (DR_INIT (newdr)) == INTEGER_CST 4042 && TREE_CODE (off) == MULT_EXPR 4043 && tree_fits_uhwi_p (TREE_OPERAND (off, 1))) 4044 { 4045 tree step = TREE_OPERAND (off, 1); 4046 off = TREE_OPERAND (off, 0); 4047 STRIP_NOPS (off); 4048 if (CONVERT_EXPR_P (off) 4049 && TYPE_PRECISION (TREE_TYPE (TREE_OPERAND (off, 4050 0))) 4051 < TYPE_PRECISION (TREE_TYPE (off))) 4052 off = TREE_OPERAND (off, 0); 4053 if (TREE_CODE (off) == SSA_NAME) 4054 { 4055 gimple *def = SSA_NAME_DEF_STMT (off); 4056 tree reft = TREE_TYPE (DR_REF (newdr)); 4057 if (is_gimple_call (def) 4058 && gimple_call_internal_p (def) 4059 && (gimple_call_internal_fn (def) 4060 == 
IFN_GOMP_SIMD_LANE)) 4061 { 4062 tree arg = gimple_call_arg (def, 0); 4063 gcc_assert (TREE_CODE (arg) == SSA_NAME); 4064 arg = SSA_NAME_VAR (arg); 4065 if (arg == loop->simduid 4066 /* For now. */ 4067 && tree_int_cst_equal 4068 (TYPE_SIZE_UNIT (reft), 4069 step)) 4070 { 4071 DR_OFFSET (newdr) = ssize_int (0); 4072 DR_STEP (newdr) = step; 4073 DR_OFFSET_ALIGNMENT (newdr) 4074 = BIGGEST_ALIGNMENT; 4075 DR_STEP_ALIGNMENT (newdr) 4076 = highest_pow2_factor (step); 4077 dr = newdr; 4078 simd_lane_access = true; 4079 } 4080 } 4081 } 4082 } 4083 } 4084 if (!simd_lane_access && (maybe_gather || maybe_scatter)) 4085 { 4086 dr = newdr; 4087 if (maybe_gather) 4088 gatherscatter = GATHER; 4089 else 4090 gatherscatter = SCATTER; 4091 } 4092 } 4093 if (gatherscatter == SG_NONE && !simd_lane_access) 4094 free_data_ref (newdr); 4095 } 4096 4097 if (gatherscatter == SG_NONE && !simd_lane_access) 4098 { 4099 if (dump_enabled_p ()) 4100 { 4101 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 4102 "not vectorized: data ref analysis " 4103 "failed "); 4104 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0); 4105 } 4106 4107 if (is_a <bb_vec_info> (vinfo)) 4108 break; 4109 4110 return false; 4111 } 4112 } 4113 4114 if (TREE_CODE (DR_BASE_ADDRESS (dr)) == INTEGER_CST) 4115 { 4116 if (dump_enabled_p ()) 4117 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 4118 "not vectorized: base addr of dr is a " 4119 "constant\n"); 4120 4121 if (is_a <bb_vec_info> (vinfo)) 4122 break; 4123 4124 if (gatherscatter != SG_NONE || simd_lane_access) 4125 free_data_ref (dr); 4126 return false; 4127 } 4128 4129 if (TREE_THIS_VOLATILE (DR_REF (dr))) 4130 { 4131 if (dump_enabled_p ()) 4132 { 4133 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 4134 "not vectorized: volatile type "); 4135 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0); 4136 } 4137 4138 if (is_a <bb_vec_info> (vinfo)) 4139 break; 4140 4141 return false; 4142 } 4143 4144 if 
(stmt_can_throw_internal (stmt)) 4145 { 4146 if (dump_enabled_p ()) 4147 { 4148 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 4149 "not vectorized: statement can throw an " 4150 "exception "); 4151 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0); 4152 } 4153 4154 if (is_a <bb_vec_info> (vinfo)) 4155 break; 4156 4157 if (gatherscatter != SG_NONE || simd_lane_access) 4158 free_data_ref (dr); 4159 return false; 4160 } 4161 4162 if (TREE_CODE (DR_REF (dr)) == COMPONENT_REF 4163 && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (dr), 1))) 4164 { 4165 if (dump_enabled_p ()) 4166 { 4167 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 4168 "not vectorized: statement is bitfield " 4169 "access "); 4170 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0); 4171 } 4172 4173 if (is_a <bb_vec_info> (vinfo)) 4174 break; 4175 4176 if (gatherscatter != SG_NONE || simd_lane_access) 4177 free_data_ref (dr); 4178 return false; 4179 } 4180 4181 base = unshare_expr (DR_BASE_ADDRESS (dr)); 4182 offset = unshare_expr (DR_OFFSET (dr)); 4183 init = unshare_expr (DR_INIT (dr)); 4184 4185 if (is_gimple_call (stmt) 4186 && (!gimple_call_internal_p (stmt) 4187 || (gimple_call_internal_fn (stmt) != IFN_MASK_LOAD 4188 && gimple_call_internal_fn (stmt) != IFN_MASK_STORE))) 4189 { 4190 if (dump_enabled_p ()) 4191 { 4192 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 4193 "not vectorized: dr in a call "); 4194 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0); 4195 } 4196 4197 if (is_a <bb_vec_info> (vinfo)) 4198 break; 4199 4200 if (gatherscatter != SG_NONE || simd_lane_access) 4201 free_data_ref (dr); 4202 return false; 4203 } 4204 4205 /* Update DR field in stmt_vec_info struct. */ 4206 4207 /* If the dataref is in an inner-loop of the loop that is considered for 4208 for vectorization, we also want to analyze the access relative to 4209 the outer-loop (DR contains information only relative to the 4210 inner-most enclosing loop). 
We do that by building a reference to the 4211 first location accessed by the inner-loop, and analyze it relative to 4212 the outer-loop. */ 4213 if (loop && nested_in_vect_loop_p (loop, stmt)) 4214 { 4215 /* Build a reference to the first location accessed by the 4216 inner loop: *(BASE + INIT + OFFSET). By construction, 4217 this address must be invariant in the inner loop, so we 4218 can consider it as being used in the outer loop. */ 4219 tree init_offset = fold_build2 (PLUS_EXPR, TREE_TYPE (offset), 4220 init, offset); 4221 tree init_addr = fold_build_pointer_plus (base, init_offset); 4222 tree init_ref = build_fold_indirect_ref (init_addr); 4223 4224 if (dump_enabled_p ()) 4225 { 4226 dump_printf_loc (MSG_NOTE, vect_location, 4227 "analyze in outer loop: "); 4228 dump_generic_expr (MSG_NOTE, TDF_SLIM, init_ref); 4229 dump_printf (MSG_NOTE, "\n"); 4230 } 4231 4232 if (!dr_analyze_innermost (&STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info), 4233 init_ref, loop)) 4234 /* dr_analyze_innermost already explained the failure. 
*/ 4235 return false; 4236 4237 if (dump_enabled_p ()) 4238 { 4239 dump_printf_loc (MSG_NOTE, vect_location, 4240 "\touter base_address: "); 4241 dump_generic_expr (MSG_NOTE, TDF_SLIM, 4242 STMT_VINFO_DR_BASE_ADDRESS (stmt_info)); 4243 dump_printf (MSG_NOTE, "\n\touter offset from base address: "); 4244 dump_generic_expr (MSG_NOTE, TDF_SLIM, 4245 STMT_VINFO_DR_OFFSET (stmt_info)); 4246 dump_printf (MSG_NOTE, 4247 "\n\touter constant offset from base address: "); 4248 dump_generic_expr (MSG_NOTE, TDF_SLIM, 4249 STMT_VINFO_DR_INIT (stmt_info)); 4250 dump_printf (MSG_NOTE, "\n\touter step: "); 4251 dump_generic_expr (MSG_NOTE, TDF_SLIM, 4252 STMT_VINFO_DR_STEP (stmt_info)); 4253 dump_printf (MSG_NOTE, "\n\touter base alignment: %d\n", 4254 STMT_VINFO_DR_BASE_ALIGNMENT (stmt_info)); 4255 dump_printf (MSG_NOTE, "\n\touter base misalignment: %d\n", 4256 STMT_VINFO_DR_BASE_MISALIGNMENT (stmt_info)); 4257 dump_printf (MSG_NOTE, "\n\touter offset alignment: %d\n", 4258 STMT_VINFO_DR_OFFSET_ALIGNMENT (stmt_info)); 4259 dump_printf (MSG_NOTE, "\n\touter step alignment: %d\n", 4260 STMT_VINFO_DR_STEP_ALIGNMENT (stmt_info)); 4261 } 4262 } 4263 4264 if (STMT_VINFO_DATA_REF (stmt_info)) 4265 { 4266 if (dump_enabled_p ()) 4267 { 4268 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 4269 "not vectorized: more than one data ref " 4270 "in stmt: "); 4271 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0); 4272 } 4273 4274 if (is_a <bb_vec_info> (vinfo)) 4275 break; 4276 4277 if (gatherscatter != SG_NONE || simd_lane_access) 4278 free_data_ref (dr); 4279 return false; 4280 } 4281 4282 STMT_VINFO_DATA_REF (stmt_info) = dr; 4283 if (simd_lane_access) 4284 { 4285 STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info) = true; 4286 free_data_ref (datarefs[i]); 4287 datarefs[i] = dr; 4288 } 4289 4290 if (TREE_CODE (DR_BASE_ADDRESS (dr)) == ADDR_EXPR 4291 && VAR_P (TREE_OPERAND (DR_BASE_ADDRESS (dr), 0)) 4292 && DECL_NONALIASED (TREE_OPERAND (DR_BASE_ADDRESS (dr), 0))) 4293 { 4294 if 
(dump_enabled_p ()) 4295 { 4296 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 4297 "not vectorized: base object not addressable " 4298 "for stmt: "); 4299 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0); 4300 } 4301 if (is_a <bb_vec_info> (vinfo)) 4302 { 4303 /* In BB vectorization the ref can still participate 4304 in dependence analysis, we just can't vectorize it. */ 4305 STMT_VINFO_VECTORIZABLE (stmt_info) = false; 4306 continue; 4307 } 4308 return false; 4309 } 4310 4311 /* Set vectype for STMT. */ 4312 scalar_type = TREE_TYPE (DR_REF (dr)); 4313 STMT_VINFO_VECTYPE (stmt_info) 4314 = get_vectype_for_scalar_type (scalar_type); 4315 if (!STMT_VINFO_VECTYPE (stmt_info)) 4316 { 4317 if (dump_enabled_p ()) 4318 { 4319 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 4320 "not vectorized: no vectype for stmt: "); 4321 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0); 4322 dump_printf (MSG_MISSED_OPTIMIZATION, " scalar_type: "); 4323 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_DETAILS, 4324 scalar_type); 4325 dump_printf (MSG_MISSED_OPTIMIZATION, "\n"); 4326 } 4327 4328 if (is_a <bb_vec_info> (vinfo)) 4329 { 4330 /* No vector type is fine, the ref can still participate 4331 in dependence analysis, we just can't vectorize it. */ 4332 STMT_VINFO_VECTORIZABLE (stmt_info) = false; 4333 continue; 4334 } 4335 4336 if (gatherscatter != SG_NONE || simd_lane_access) 4337 { 4338 STMT_VINFO_DATA_REF (stmt_info) = NULL; 4339 if (gatherscatter != SG_NONE) 4340 free_data_ref (dr); 4341 } 4342 return false; 4343 } 4344 else 4345 { 4346 if (dump_enabled_p ()) 4347 { 4348 dump_printf_loc (MSG_NOTE, vect_location, 4349 "got vectype for stmt: "); 4350 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0); 4351 dump_generic_expr (MSG_NOTE, TDF_SLIM, 4352 STMT_VINFO_VECTYPE (stmt_info)); 4353 dump_printf (MSG_NOTE, "\n"); 4354 } 4355 } 4356 4357 /* Adjust the minimal vectorization factor according to the 4358 vector type. 
*/ 4359 vf = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info)); 4360 *min_vf = upper_bound (*min_vf, vf); 4361 4362 if (gatherscatter != SG_NONE) 4363 { 4364 gather_scatter_info gs_info; 4365 if (!vect_check_gather_scatter (stmt, as_a <loop_vec_info> (vinfo), 4366 &gs_info) 4367 || !get_vectype_for_scalar_type (TREE_TYPE (gs_info.offset))) 4368 { 4369 STMT_VINFO_DATA_REF (stmt_info) = NULL; 4370 free_data_ref (dr); 4371 if (dump_enabled_p ()) 4372 { 4373 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 4374 (gatherscatter == GATHER) ? 4375 "not vectorized: not suitable for gather " 4376 "load " : 4377 "not vectorized: not suitable for scatter " 4378 "store "); 4379 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0); 4380 } 4381 return false; 4382 } 4383 4384 free_data_ref (datarefs[i]); 4385 datarefs[i] = dr; 4386 STMT_VINFO_GATHER_SCATTER_P (stmt_info) = gatherscatter; 4387 } 4388 4389 else if (is_a <loop_vec_info> (vinfo) 4390 && TREE_CODE (DR_STEP (dr)) != INTEGER_CST) 4391 { 4392 if (nested_in_vect_loop_p (loop, stmt)) 4393 { 4394 if (dump_enabled_p ()) 4395 { 4396 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 4397 "not vectorized: not suitable for strided " 4398 "load "); 4399 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0); 4400 } 4401 return false; 4402 } 4403 STMT_VINFO_STRIDED_P (stmt_info) = true; 4404 } 4405 } 4406 4407 /* If we stopped analysis at the first dataref we could not analyze 4408 when trying to vectorize a basic-block mark the rest of the datarefs 4409 as not vectorizable and truncate the vector of datarefs. That 4410 avoids spending useless time in analyzing their dependence. 
*/ 4411 if (i != datarefs.length ()) 4412 { 4413 gcc_assert (is_a <bb_vec_info> (vinfo)); 4414 for (unsigned j = i; j < datarefs.length (); ++j) 4415 { 4416 data_reference_p dr = datarefs[j]; 4417 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false; 4418 free_data_ref (dr); 4419 } 4420 datarefs.truncate (i); 4421 } 4422 4423 return true; 4424 } 4425 4426 4427 /* Function vect_get_new_vect_var. 4428 4429 Returns a name for a new variable. The current naming scheme appends the 4430 prefix "vect_" or "vect_p" (depending on the value of VAR_KIND) to 4431 the name of vectorizer generated variables, and appends that to NAME if 4432 provided. */ 4433 4434 tree 4435 vect_get_new_vect_var (tree type, enum vect_var_kind var_kind, const char *name) 4436 { 4437 const char *prefix; 4438 tree new_vect_var; 4439 4440 switch (var_kind) 4441 { 4442 case vect_simple_var: 4443 prefix = "vect"; 4444 break; 4445 case vect_scalar_var: 4446 prefix = "stmp"; 4447 break; 4448 case vect_mask_var: 4449 prefix = "mask"; 4450 break; 4451 case vect_pointer_var: 4452 prefix = "vectp"; 4453 break; 4454 default: 4455 gcc_unreachable (); 4456 } 4457 4458 if (name) 4459 { 4460 char* tmp = concat (prefix, "_", name, NULL); 4461 new_vect_var = create_tmp_reg (type, tmp); 4462 free (tmp); 4463 } 4464 else 4465 new_vect_var = create_tmp_reg (type, prefix); 4466 4467 return new_vect_var; 4468 } 4469 4470 /* Like vect_get_new_vect_var but return an SSA name. 
*/ 4471 4472 tree 4473 vect_get_new_ssa_name (tree type, enum vect_var_kind var_kind, const char *name) 4474 { 4475 const char *prefix; 4476 tree new_vect_var; 4477 4478 switch (var_kind) 4479 { 4480 case vect_simple_var: 4481 prefix = "vect"; 4482 break; 4483 case vect_scalar_var: 4484 prefix = "stmp"; 4485 break; 4486 case vect_pointer_var: 4487 prefix = "vectp"; 4488 break; 4489 default: 4490 gcc_unreachable (); 4491 } 4492 4493 if (name) 4494 { 4495 char* tmp = concat (prefix, "_", name, NULL); 4496 new_vect_var = make_temp_ssa_name (type, NULL, tmp); 4497 free (tmp); 4498 } 4499 else 4500 new_vect_var = make_temp_ssa_name (type, NULL, prefix); 4501 4502 return new_vect_var; 4503 } 4504 4505 /* Duplicate ptr info and set alignment/misaligment on NAME from DR. */ 4506 4507 static void 4508 vect_duplicate_ssa_name_ptr_info (tree name, data_reference *dr) 4509 { 4510 duplicate_ssa_name_ptr_info (name, DR_PTR_INFO (dr)); 4511 int misalign = DR_MISALIGNMENT (dr); 4512 if (misalign == DR_MISALIGNMENT_UNKNOWN) 4513 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (name)); 4514 else 4515 set_ptr_info_alignment (SSA_NAME_PTR_INFO (name), 4516 DR_TARGET_ALIGNMENT (dr), misalign); 4517 } 4518 4519 /* Function vect_create_addr_base_for_vector_ref. 4520 4521 Create an expression that computes the address of the first memory location 4522 that will be accessed for a data reference. 4523 4524 Input: 4525 STMT: The statement containing the data reference. 4526 NEW_STMT_LIST: Must be initialized to NULL_TREE or a statement list. 4527 OFFSET: Optional. If supplied, it is be added to the initial address. 4528 LOOP: Specify relative to which loop-nest should the address be computed. 4529 For example, when the dataref is in an inner-loop nested in an 4530 outer-loop that is now being vectorized, LOOP can be either the 4531 outer-loop, or the inner-loop. 
The first memory location accessed 4532 by the following dataref ('in' points to short): 4533 4534 for (i=0; i<N; i++) 4535 for (j=0; j<M; j++) 4536 s += in[i+j] 4537 4538 is as follows: 4539 if LOOP=i_loop: &in (relative to i_loop) 4540 if LOOP=j_loop: &in+i*2B (relative to j_loop) 4541 BYTE_OFFSET: Optional, defaulted to NULL. If supplied, it is added to the 4542 initial address. Unlike OFFSET, which is number of elements to 4543 be added, BYTE_OFFSET is measured in bytes. 4544 4545 Output: 4546 1. Return an SSA_NAME whose value is the address of the memory location of 4547 the first vector of the data reference. 4548 2. If new_stmt_list is not NULL_TREE after return then the caller must insert 4549 these statement(s) which define the returned SSA_NAME. 4550 4551 FORNOW: We are only handling array accesses with step 1. */ 4552 4553 tree 4554 vect_create_addr_base_for_vector_ref (gimple *stmt, 4555 gimple_seq *new_stmt_list, 4556 tree offset, 4557 tree byte_offset) 4558 { 4559 stmt_vec_info stmt_info = vinfo_for_stmt (stmt); 4560 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info); 4561 const char *base_name; 4562 tree addr_base; 4563 tree dest; 4564 gimple_seq seq = NULL; 4565 tree vect_ptr_type; 4566 tree step = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr))); 4567 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info); 4568 innermost_loop_behavior *drb = vect_dr_behavior (dr); 4569 4570 tree data_ref_base = unshare_expr (drb->base_address); 4571 tree base_offset = unshare_expr (drb->offset); 4572 tree init = unshare_expr (drb->init); 4573 4574 if (loop_vinfo) 4575 base_name = get_name (data_ref_base); 4576 else 4577 { 4578 base_offset = ssize_int (0); 4579 init = ssize_int (0); 4580 base_name = get_name (DR_REF (dr)); 4581 } 4582 4583 /* Create base_offset */ 4584 base_offset = size_binop (PLUS_EXPR, 4585 fold_convert (sizetype, base_offset), 4586 fold_convert (sizetype, init)); 4587 4588 if (offset) 4589 { 4590 offset = fold_build2 (MULT_EXPR, sizetype, 
4591 fold_convert (sizetype, offset), step); 4592 base_offset = fold_build2 (PLUS_EXPR, sizetype, 4593 base_offset, offset); 4594 } 4595 if (byte_offset) 4596 { 4597 byte_offset = fold_convert (sizetype, byte_offset); 4598 base_offset = fold_build2 (PLUS_EXPR, sizetype, 4599 base_offset, byte_offset); 4600 } 4601 4602 /* base + base_offset */ 4603 if (loop_vinfo) 4604 addr_base = fold_build_pointer_plus (data_ref_base, base_offset); 4605 else 4606 { 4607 addr_base = build1 (ADDR_EXPR, 4608 build_pointer_type (TREE_TYPE (DR_REF (dr))), 4609 unshare_expr (DR_REF (dr))); 4610 } 4611 4612 vect_ptr_type = build_pointer_type (STMT_VINFO_VECTYPE (stmt_info)); 4613 dest = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var, base_name); 4614 addr_base = force_gimple_operand (addr_base, &seq, true, dest); 4615 gimple_seq_add_seq (new_stmt_list, seq); 4616 4617 if (DR_PTR_INFO (dr) 4618 && TREE_CODE (addr_base) == SSA_NAME 4619 && !SSA_NAME_PTR_INFO (addr_base)) 4620 { 4621 vect_duplicate_ssa_name_ptr_info (addr_base, dr); 4622 if (offset || byte_offset) 4623 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (addr_base)); 4624 } 4625 4626 if (dump_enabled_p ()) 4627 { 4628 dump_printf_loc (MSG_NOTE, vect_location, "created "); 4629 dump_generic_expr (MSG_NOTE, TDF_SLIM, addr_base); 4630 dump_printf (MSG_NOTE, "\n"); 4631 } 4632 4633 return addr_base; 4634 } 4635 4636 4637 /* Function vect_create_data_ref_ptr. 4638 4639 Create a new pointer-to-AGGR_TYPE variable (ap), that points to the first 4640 location accessed in the loop by STMT, along with the def-use update 4641 chain to appropriately advance the pointer through the loop iterations. 4642 Also set aliasing information for the pointer. This pointer is used by 4643 the callers to this function to create a memory reference expression for 4644 vector load/store access. 4645 4646 Input: 4647 1. STMT: a stmt that references memory. 
Expected to be of the form
         GIMPLE_ASSIGN <name, data-ref> or
	 GIMPLE_ASSIGN <data-ref, name>.
   2. AGGR_TYPE: the type of the reference, which should be either a vector
        or an array.
   3. AT_LOOP: the loop where the vector memref is to be created.
   4. OFFSET (optional): an offset to be added to the initial address accessed
        by the data-ref in STMT.
   5. BSI: location where the new stmts are to be placed if there is no loop.
   6. ONLY_INIT: indicate if ap is to be updated in the loop, or remain
        pointing to the initial address.
   7. BYTE_OFFSET (optional, defaults to NULL): a byte offset to be added
	to the initial address accessed by the data-ref in STMT.  This is
	similar to OFFSET, but OFFSET is counted in elements, while BYTE_OFFSET
	in bytes.
   8. IV_STEP (optional, defaults to NULL): the amount that should be added
	to the IV during each iteration of the loop.  NULL says to move
	by one copy of AGGR_TYPE up or down, depending on the step of the
	data reference.

   Output:
   1. Declare a new ptr to vector_type, and have it point to the base of the
      data reference (initial address accessed by the data reference).
      For example, for vector of type V8HI, the following code is generated:

      v8hi *ap;
      ap = (v8hi *)initial_address;

      if OFFSET is not supplied:
         initial_address = &a[init];
      if OFFSET is supplied:
         initial_address = &a[init + OFFSET];
      if BYTE_OFFSET is supplied:
	 initial_address = &a[init] + BYTE_OFFSET;

      Return the initial_address in INITIAL_ADDRESS.

   2. If ONLY_INIT is true, just return the initial pointer.  Otherwise, also
      update the pointer in each iteration of the loop.

      Return the increment stmt that updates the pointer in PTR_INCR.

   3. Set INV_P to true if the access pattern of the data reference in the
      vectorized loop is invariant.  Set it to false otherwise.

   4.
Return the pointer. */ 4693 4694 tree 4695 vect_create_data_ref_ptr (gimple *stmt, tree aggr_type, struct loop *at_loop, 4696 tree offset, tree *initial_address, 4697 gimple_stmt_iterator *gsi, gimple **ptr_incr, 4698 bool only_init, bool *inv_p, tree byte_offset, 4699 tree iv_step) 4700 { 4701 const char *base_name; 4702 stmt_vec_info stmt_info = vinfo_for_stmt (stmt); 4703 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info); 4704 struct loop *loop = NULL; 4705 bool nested_in_vect_loop = false; 4706 struct loop *containing_loop = NULL; 4707 tree aggr_ptr_type; 4708 tree aggr_ptr; 4709 tree new_temp; 4710 gimple_seq new_stmt_list = NULL; 4711 edge pe = NULL; 4712 basic_block new_bb; 4713 tree aggr_ptr_init; 4714 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info); 4715 tree aptr; 4716 gimple_stmt_iterator incr_gsi; 4717 bool insert_after; 4718 tree indx_before_incr, indx_after_incr; 4719 gimple *incr; 4720 tree step; 4721 bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info); 4722 4723 gcc_assert (iv_step != NULL_TREE 4724 || TREE_CODE (aggr_type) == ARRAY_TYPE 4725 || TREE_CODE (aggr_type) == VECTOR_TYPE); 4726 4727 if (loop_vinfo) 4728 { 4729 loop = LOOP_VINFO_LOOP (loop_vinfo); 4730 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt); 4731 containing_loop = (gimple_bb (stmt))->loop_father; 4732 pe = loop_preheader_edge (loop); 4733 } 4734 else 4735 { 4736 gcc_assert (bb_vinfo); 4737 only_init = true; 4738 *ptr_incr = NULL; 4739 } 4740 4741 /* Check the step (evolution) of the load in LOOP, and record 4742 whether it's invariant. */ 4743 step = vect_dr_behavior (dr)->step; 4744 if (integer_zerop (step)) 4745 *inv_p = true; 4746 else 4747 *inv_p = false; 4748 4749 /* Create an expression for the first address accessed by this load 4750 in LOOP. 
*/ 4751 base_name = get_name (DR_BASE_ADDRESS (dr)); 4752 4753 if (dump_enabled_p ()) 4754 { 4755 tree dr_base_type = TREE_TYPE (DR_BASE_OBJECT (dr)); 4756 dump_printf_loc (MSG_NOTE, vect_location, 4757 "create %s-pointer variable to type: ", 4758 get_tree_code_name (TREE_CODE (aggr_type))); 4759 dump_generic_expr (MSG_NOTE, TDF_SLIM, aggr_type); 4760 if (TREE_CODE (dr_base_type) == ARRAY_TYPE) 4761 dump_printf (MSG_NOTE, " vectorizing an array ref: "); 4762 else if (TREE_CODE (dr_base_type) == VECTOR_TYPE) 4763 dump_printf (MSG_NOTE, " vectorizing a vector ref: "); 4764 else if (TREE_CODE (dr_base_type) == RECORD_TYPE) 4765 dump_printf (MSG_NOTE, " vectorizing a record based array ref: "); 4766 else 4767 dump_printf (MSG_NOTE, " vectorizing a pointer ref: "); 4768 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_BASE_OBJECT (dr)); 4769 dump_printf (MSG_NOTE, "\n"); 4770 } 4771 4772 /* (1) Create the new aggregate-pointer variable. 4773 Vector and array types inherit the alias set of their component 4774 type by default so we need to use a ref-all pointer if the data 4775 reference does not conflict with the created aggregated data 4776 reference because it is not addressable. */ 4777 bool need_ref_all = false; 4778 if (!alias_sets_conflict_p (get_alias_set (aggr_type), 4779 get_alias_set (DR_REF (dr)))) 4780 need_ref_all = true; 4781 /* Likewise for any of the data references in the stmt group. 
*/ 4782 else if (STMT_VINFO_GROUP_SIZE (stmt_info) > 1) 4783 { 4784 gimple *orig_stmt = STMT_VINFO_GROUP_FIRST_ELEMENT (stmt_info); 4785 do 4786 { 4787 stmt_vec_info sinfo = vinfo_for_stmt (orig_stmt); 4788 struct data_reference *sdr = STMT_VINFO_DATA_REF (sinfo); 4789 if (!alias_sets_conflict_p (get_alias_set (aggr_type), 4790 get_alias_set (DR_REF (sdr)))) 4791 { 4792 need_ref_all = true; 4793 break; 4794 } 4795 orig_stmt = STMT_VINFO_GROUP_NEXT_ELEMENT (sinfo); 4796 } 4797 while (orig_stmt); 4798 } 4799 aggr_ptr_type = build_pointer_type_for_mode (aggr_type, ptr_mode, 4800 need_ref_all); 4801 aggr_ptr = vect_get_new_vect_var (aggr_ptr_type, vect_pointer_var, base_name); 4802 4803 4804 /* Note: If the dataref is in an inner-loop nested in LOOP, and we are 4805 vectorizing LOOP (i.e., outer-loop vectorization), we need to create two 4806 def-use update cycles for the pointer: one relative to the outer-loop 4807 (LOOP), which is what steps (3) and (4) below do. The other is relative 4808 to the inner-loop (which is the inner-most loop containing the dataref), 4809 and this is done be step (5) below. 4810 4811 When vectorizing inner-most loops, the vectorized loop (LOOP) is also the 4812 inner-most loop, and so steps (3),(4) work the same, and step (5) is 4813 redundant. Steps (3),(4) create the following: 4814 4815 vp0 = &base_addr; 4816 LOOP: vp1 = phi(vp0,vp2) 4817 ... 4818 ... 4819 vp2 = vp1 + step 4820 goto LOOP 4821 4822 If there is an inner-loop nested in loop, then step (5) will also be 4823 applied, and an additional update in the inner-loop will be created: 4824 4825 vp0 = &base_addr; 4826 LOOP: vp1 = phi(vp0,vp2) 4827 ... 4828 inner: vp3 = phi(vp1,vp4) 4829 vp4 = vp3 + inner_step 4830 if () goto inner 4831 ... 4832 vp2 = vp1 + step 4833 if () goto LOOP */ 4834 4835 /* (2) Calculate the initial address of the aggregate-pointer, and set 4836 the aggregate-pointer to point to it before the loop. 
*/ 4837 4838 /* Create: (&(base[init_val+offset]+byte_offset) in the loop preheader. */ 4839 4840 new_temp = vect_create_addr_base_for_vector_ref (stmt, &new_stmt_list, 4841 offset, byte_offset); 4842 if (new_stmt_list) 4843 { 4844 if (pe) 4845 { 4846 new_bb = gsi_insert_seq_on_edge_immediate (pe, new_stmt_list); 4847 gcc_assert (!new_bb); 4848 } 4849 else 4850 gsi_insert_seq_before (gsi, new_stmt_list, GSI_SAME_STMT); 4851 } 4852 4853 *initial_address = new_temp; 4854 aggr_ptr_init = new_temp; 4855 4856 /* (3) Handle the updating of the aggregate-pointer inside the loop. 4857 This is needed when ONLY_INIT is false, and also when AT_LOOP is the 4858 inner-loop nested in LOOP (during outer-loop vectorization). */ 4859 4860 /* No update in loop is required. */ 4861 if (only_init && (!loop_vinfo || at_loop == loop)) 4862 aptr = aggr_ptr_init; 4863 else 4864 { 4865 if (iv_step == NULL_TREE) 4866 { 4867 /* The step of the aggregate pointer is the type size. */ 4868 iv_step = TYPE_SIZE_UNIT (aggr_type); 4869 /* One exception to the above is when the scalar step of the load in 4870 LOOP is zero. In this case the step here is also zero. */ 4871 if (*inv_p) 4872 iv_step = size_zero_node; 4873 else if (tree_int_cst_sgn (step) == -1) 4874 iv_step = fold_build1 (NEGATE_EXPR, TREE_TYPE (iv_step), iv_step); 4875 } 4876 4877 standard_iv_increment_position (loop, &incr_gsi, &insert_after); 4878 4879 create_iv (aggr_ptr_init, 4880 fold_convert (aggr_ptr_type, iv_step), 4881 aggr_ptr, loop, &incr_gsi, insert_after, 4882 &indx_before_incr, &indx_after_incr); 4883 incr = gsi_stmt (incr_gsi); 4884 set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo)); 4885 4886 /* Copy the points-to information if it exists. 
*/ 4887 if (DR_PTR_INFO (dr)) 4888 { 4889 vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr); 4890 vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr); 4891 } 4892 if (ptr_incr) 4893 *ptr_incr = incr; 4894 4895 aptr = indx_before_incr; 4896 } 4897 4898 if (!nested_in_vect_loop || only_init) 4899 return aptr; 4900 4901 4902 /* (4) Handle the updating of the aggregate-pointer inside the inner-loop 4903 nested in LOOP, if exists. */ 4904 4905 gcc_assert (nested_in_vect_loop); 4906 if (!only_init) 4907 { 4908 standard_iv_increment_position (containing_loop, &incr_gsi, 4909 &insert_after); 4910 create_iv (aptr, fold_convert (aggr_ptr_type, DR_STEP (dr)), aggr_ptr, 4911 containing_loop, &incr_gsi, insert_after, &indx_before_incr, 4912 &indx_after_incr); 4913 incr = gsi_stmt (incr_gsi); 4914 set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo)); 4915 4916 /* Copy the points-to information if it exists. */ 4917 if (DR_PTR_INFO (dr)) 4918 { 4919 vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr); 4920 vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr); 4921 } 4922 if (ptr_incr) 4923 *ptr_incr = incr; 4924 4925 return indx_before_incr; 4926 } 4927 else 4928 gcc_unreachable (); 4929 } 4930 4931 4932 /* Function bump_vector_ptr 4933 4934 Increment a pointer (to a vector type) by vector-size. If requested, 4935 i.e. if PTR-INCR is given, then also connect the new increment stmt 4936 to the existing def-use update-chain of the pointer, by modifying 4937 the PTR_INCR as illustrated below: 4938 4939 The pointer def-use update-chain before this function: 4940 DATAREF_PTR = phi (p_0, p_2) 4941 .... 4942 PTR_INCR: p_2 = DATAREF_PTR + step 4943 4944 The pointer def-use update-chain after this function: 4945 DATAREF_PTR = phi (p_0, p_2) 4946 .... 4947 NEW_DATAREF_PTR = DATAREF_PTR + BUMP 4948 .... 4949 PTR_INCR: p_2 = NEW_DATAREF_PTR + step 4950 4951 Input: 4952 DATAREF_PTR - ssa_name of a pointer (to vector type) that is being updated 4953 in the loop. 
   PTR_INCR - optional.  The stmt that updates the pointer in each iteration of
	      the loop.  The increment amount across iterations is expected
	      to be vector_size.
   BSI - location where the new update stmt is to be placed.
   STMT - the original scalar memory-access stmt that is being vectorized.
   BUMP - optional.  The offset by which to bump the pointer.  If not given,
	  the offset is assumed to be vector_size.

   Output: Return NEW_DATAREF_PTR as illustrated above.

*/

tree
bump_vector_ptr (tree dataref_ptr, gimple *ptr_incr, gimple_stmt_iterator *gsi,
		 gimple *stmt, tree bump)
{
  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
  struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
  tree vectype = STMT_VINFO_VECTYPE (stmt_info);
  /* Default bump amount: one vector's worth of bytes.  */
  tree update = TYPE_SIZE_UNIT (vectype);
  gassign *incr_stmt;
  ssa_op_iter iter;
  use_operand_p use_p;
  tree new_dataref_ptr;

  /* A caller-supplied BUMP overrides the default vector-size bump.  */
  if (bump)
    update = bump;

  /* Reuse the base identifier of DATAREF_PTR for the bumped pointer when
     DATAREF_PTR is itself an SSA name; otherwise create a fresh name of
     the same pointer type.  */
  if (TREE_CODE (dataref_ptr) == SSA_NAME)
    new_dataref_ptr = copy_ssa_name (dataref_ptr);
  else
    new_dataref_ptr = make_ssa_name (TREE_TYPE (dataref_ptr));
  /* NEW_DATAREF_PTR = DATAREF_PTR + UPDATE.  */
  incr_stmt = gimple_build_assign (new_dataref_ptr, POINTER_PLUS_EXPR,
				   dataref_ptr, update);
  vect_finish_stmt_generation (stmt, incr_stmt, gsi);

  /* Copy the points-to information if it exists.  Alignment knowledge does
     not carry across an arbitrary bump, so it is explicitly dropped.  */
  if (DR_PTR_INFO (dr))
    {
      duplicate_ssa_name_ptr_info (new_dataref_ptr, DR_PTR_INFO (dr));
      mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (new_dataref_ptr));
    }

  if (!ptr_incr)
    return new_dataref_ptr;

  /* Update the vector-pointer's cross-iteration increment.
*/
  /* Retarget PTR_INCR's use of the old pointer to the bumped pointer; any
     other operand of PTR_INCR must be the step itself (asserted below).  */
  FOR_EACH_SSA_USE_OPERAND (use_p, ptr_incr, iter, SSA_OP_USE)
    {
      tree use = USE_FROM_PTR (use_p);

      if (use == dataref_ptr)
	SET_USE (use_p, new_dataref_ptr);
      else
	gcc_assert (operand_equal_p (use, update, 0));
    }

  return new_dataref_ptr;
}


/* Copy memory reference info such as base/clique from the SRC reference
   to the DEST MEM_REF.  */

void
vect_copy_ref_info (tree dest, tree src)
{
  /* Only MEM_REFs carry the dependence clique/base fields we copy.  */
  if (TREE_CODE (dest) != MEM_REF)
    return;

  /* Strip component references to reach the base memory reference of SRC.  */
  tree src_base = src;
  while (handled_component_p (src_base))
    src_base = TREE_OPERAND (src_base, 0);
  if (TREE_CODE (src_base) != MEM_REF
      && TREE_CODE (src_base) != TARGET_MEM_REF)
    return;

  MR_DEPENDENCE_CLIQUE (dest) = MR_DEPENDENCE_CLIQUE (src_base);
  MR_DEPENDENCE_BASE (dest) = MR_DEPENDENCE_BASE (src_base);
}


/* Function vect_create_destination_var.

   Create a new temporary of type VECTYPE.  */

tree
vect_create_destination_var (tree scalar_dest, tree vectype)
{
  tree vec_dest;
  const char *name;
  char *new_name;
  tree type;
  enum vect_var_kind kind;

  /* Pick the variable kind from VECTYPE: mask variable for boolean vector
     types, plain vector variable otherwise, and a scalar variable when no
     VECTYPE is given.  */
  kind = vectype
    ? VECTOR_BOOLEAN_TYPE_P (vectype)
    ? vect_mask_var
    : vect_simple_var
    : vect_scalar_var;
  type = vectype ? vectype : TREE_TYPE (scalar_dest);

  gcc_assert (TREE_CODE (scalar_dest) == SSA_NAME);

  /* Derive a name from SCALAR_DEST's identifier (if any) and its SSA
     version number.  */
  name = get_name (scalar_dest);
  if (name)
    new_name = xasprintf ("%s_%u", name, SSA_NAME_VERSION (scalar_dest));
  else
    new_name = xasprintf ("_%u", SSA_NAME_VERSION (scalar_dest));
  vec_dest = vect_get_new_vect_var (type, kind, new_name);
  /* The new variable does not take ownership of NEW_NAME.  */
  free (new_name);

  return vec_dest;
}

/* Function vect_grouped_store_supported.

   Returns TRUE if interleave high and interleave low permutations
   are supported, and FALSE otherwise.
*/ 5073 5074 bool 5075 vect_grouped_store_supported (tree vectype, unsigned HOST_WIDE_INT count) 5076 { 5077 machine_mode mode = TYPE_MODE (vectype); 5078 5079 /* vect_permute_store_chain requires the group size to be equal to 3 or 5080 be a power of two. */ 5081 if (count != 3 && exact_log2 (count) == -1) 5082 { 5083 if (dump_enabled_p ()) 5084 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 5085 "the size of the group of accesses" 5086 " is not a power of 2 or not eqaul to 3\n"); 5087 return false; 5088 } 5089 5090 /* Check that the permutation is supported. */ 5091 if (VECTOR_MODE_P (mode)) 5092 { 5093 unsigned int i; 5094 if (count == 3) 5095 { 5096 unsigned int j0 = 0, j1 = 0, j2 = 0; 5097 unsigned int i, j; 5098 5099 unsigned int nelt; 5100 if (!GET_MODE_NUNITS (mode).is_constant (&nelt)) 5101 { 5102 if (dump_enabled_p ()) 5103 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 5104 "cannot handle groups of 3 stores for" 5105 " variable-length vectors\n"); 5106 return false; 5107 } 5108 5109 vec_perm_builder sel (nelt, nelt, 1); 5110 sel.quick_grow (nelt); 5111 vec_perm_indices indices; 5112 for (j = 0; j < 3; j++) 5113 { 5114 int nelt0 = ((3 - j) * nelt) % 3; 5115 int nelt1 = ((3 - j) * nelt + 1) % 3; 5116 int nelt2 = ((3 - j) * nelt + 2) % 3; 5117 for (i = 0; i < nelt; i++) 5118 { 5119 if (3 * i + nelt0 < nelt) 5120 sel[3 * i + nelt0] = j0++; 5121 if (3 * i + nelt1 < nelt) 5122 sel[3 * i + nelt1] = nelt + j1++; 5123 if (3 * i + nelt2 < nelt) 5124 sel[3 * i + nelt2] = 0; 5125 } 5126 indices.new_vector (sel, 2, nelt); 5127 if (!can_vec_perm_const_p (mode, indices)) 5128 { 5129 if (dump_enabled_p ()) 5130 dump_printf (MSG_MISSED_OPTIMIZATION, 5131 "permutation op not supported by target.\n"); 5132 return false; 5133 } 5134 5135 for (i = 0; i < nelt; i++) 5136 { 5137 if (3 * i + nelt0 < nelt) 5138 sel[3 * i + nelt0] = 3 * i + nelt0; 5139 if (3 * i + nelt1 < nelt) 5140 sel[3 * i + nelt1] = 3 * i + nelt1; 5141 if (3 * i + nelt2 < nelt) 5142 sel[3 * 
i + nelt2] = nelt + j2++; 5143 } 5144 indices.new_vector (sel, 2, nelt); 5145 if (!can_vec_perm_const_p (mode, indices)) 5146 { 5147 if (dump_enabled_p ()) 5148 dump_printf (MSG_MISSED_OPTIMIZATION, 5149 "permutation op not supported by target.\n"); 5150 return false; 5151 } 5152 } 5153 return true; 5154 } 5155 else 5156 { 5157 /* If length is not equal to 3 then only power of 2 is supported. */ 5158 gcc_assert (pow2p_hwi (count)); 5159 poly_uint64 nelt = GET_MODE_NUNITS (mode); 5160 5161 /* The encoding has 2 interleaved stepped patterns. */ 5162 vec_perm_builder sel (nelt, 2, 3); 5163 sel.quick_grow (6); 5164 for (i = 0; i < 3; i++) 5165 { 5166 sel[i * 2] = i; 5167 sel[i * 2 + 1] = i + nelt; 5168 } 5169 vec_perm_indices indices (sel, 2, nelt); 5170 if (can_vec_perm_const_p (mode, indices)) 5171 { 5172 for (i = 0; i < 6; i++) 5173 sel[i] += exact_div (nelt, 2); 5174 indices.new_vector (sel, 2, nelt); 5175 if (can_vec_perm_const_p (mode, indices)) 5176 return true; 5177 } 5178 } 5179 } 5180 5181 if (dump_enabled_p ()) 5182 dump_printf (MSG_MISSED_OPTIMIZATION, 5183 "permutaion op not supported by target.\n"); 5184 return false; 5185 } 5186 5187 5188 /* Return TRUE if vec_{mask_}store_lanes is available for COUNT vectors of 5189 type VECTYPE. MASKED_P says whether the masked form is needed. */ 5190 5191 bool 5192 vect_store_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count, 5193 bool masked_p) 5194 { 5195 if (masked_p) 5196 return vect_lanes_optab_supported_p ("vec_mask_store_lanes", 5197 vec_mask_store_lanes_optab, 5198 vectype, count); 5199 else 5200 return vect_lanes_optab_supported_p ("vec_store_lanes", 5201 vec_store_lanes_optab, 5202 vectype, count); 5203 } 5204 5205 5206 /* Function vect_permute_store_chain. 5207 5208 Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be 5209 a power of 2 or equal to 3, generate interleave_high/low stmts to reorder 5210 the data correctly for the stores. 
Return the final references for stores
   in RESULT_CHAIN.

   E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
   The input is 4 vectors each containing 8 elements.  We assign a number to
   each element, the input sequence is:

   1st vec:   0  1  2  3  4  5  6  7
   2nd vec:   8  9 10 11 12 13 14 15
   3rd vec:  16 17 18 19 20 21 22 23
   4th vec:  24 25 26 27 28 29 30 31

   The output sequence should be:

   1st vec:  0  8 16 24  1  9 17 25
   2nd vec:  2 10 18 26  3 11 19 27
   3rd vec:  4 12 20 28  5 13 21 29
   4th vec:  6 14 22 30  7 15 23 31

   i.e., we interleave the contents of the four vectors in their order.

   We use interleave_high/low instructions to create such output.  The input of
   each interleave_high/low operation is two vectors:
   1st vec    2nd vec
   0 1 2 3    4 5 6 7
   the even elements of the result vector are obtained left-to-right from the
   high/low elements of the first vector.  The odd elements of the result are
   obtained left-to-right from the high/low elements of the second vector.
   The output of interleave_high will be:   0 4 1 5
   and of interleave_low:                   2 6 3 7


   The permutation is done in log LENGTH stages.  In each stage interleave_high
   and interleave_low stmts are created for each pair of vectors in DR_CHAIN,
   where the first argument is taken from the first half of DR_CHAIN and the
   second argument from its second half.
   In our example,

   I1: interleave_high (1st vec, 3rd vec)
   I2: interleave_low (1st vec, 3rd vec)
   I3: interleave_high (2nd vec, 4th vec)
   I4: interleave_low (2nd vec, 4th vec)

   The output for the first stage is:

   I1:  0 16  1 17  2 18  3 19
   I2:  4 20  5 21  6 22  7 23
   I3:  8 24  9 25 10 26 11 27
   I4: 12 28 13 29 14 30 15 31

   The output of the second stage, i.e.
the final result is:

   I1:  0  8 16 24  1  9 17 25
   I2:  2 10 18 26  3 11 19 27
   I3:  4 12 20 28  5 13 21 29
   I4:  6 14 22 30  7 15 23 31.  */

void
vect_permute_store_chain (vec<tree> dr_chain,
			  unsigned int length,
			  gimple *stmt,
			  gimple_stmt_iterator *gsi,
			  vec<tree> *result_chain)
{
  tree vect1, vect2, high, low;
  gimple *perm_stmt;
  tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
  tree perm_mask_low, perm_mask_high;
  tree data_ref;
  tree perm3_mask_low, perm3_mask_high;
  unsigned int i, j, n, log_length = exact_log2 (length);

  result_chain->quick_grow (length);
  memcpy (result_chain->address (), dr_chain.address (),
	  length * sizeof (tree));

  if (length == 3)
    {
      /* vect_grouped_store_supported ensures that this is constant.  */
      unsigned int nelt = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
      unsigned int j0 = 0, j1 = 0, j2 = 0;

      vec_perm_builder sel (nelt, nelt, 1);
      sel.quick_grow (nelt);
      vec_perm_indices indices;
      for (j = 0; j < 3; j++)
	{
	  int nelt0 = ((3 - j) * nelt) % 3;
	  int nelt1 = ((3 - j) * nelt + 1) % 3;
	  int nelt2 = ((3 - j) * nelt + 2) % 3;

	  /* First mask: pull elements from DR_CHAIN[0]/DR_CHAIN[1]; lanes
	     assigned 0 here are filled from DR_CHAIN[2] by the second
	     permute below.  */
	  for (i = 0; i < nelt; i++)
	    {
	      if (3 * i + nelt0 < nelt)
		sel[3 * i + nelt0] = j0++;
	      if (3 * i + nelt1 < nelt)
		sel[3 * i + nelt1] = nelt + j1++;
	      if (3 * i + nelt2 < nelt)
		sel[3 * i + nelt2] = 0;
	    }
	  indices.new_vector (sel, 2, nelt);
	  perm3_mask_low = vect_gen_perm_mask_checked (vectype, indices);

	  /* Second mask: keep the lanes produced by the first permute and
	     merge in elements from DR_CHAIN[2].  */
	  for (i = 0; i < nelt; i++)
	    {
	      if (3 * i + nelt0 < nelt)
		sel[3 * i + nelt0] = 3 * i + nelt0;
	      if (3 * i + nelt1 < nelt)
		sel[3 * i + nelt1] = 3 * i + nelt1;
	      if (3 * i + nelt2 < nelt)
		sel[3 * i + nelt2] = nelt + j2++;
	    }
	  indices.new_vector (sel, 2, nelt);
	  perm3_mask_high = vect_gen_perm_mask_checked (vectype, indices);

	  vect1 = dr_chain[0];
	  vect2 = dr_chain[1];

	  /* Create interleaving stmt:
	     low = VEC_PERM_EXPR <vect1, vect2,
				  {j, nelt, *, j + 1, nelt + j + 1, *,
				   j + 2, nelt + j + 2, *, ...}>  */
	  data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
	  perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
					   vect2, perm3_mask_low);
	  vect_finish_stmt_generation (stmt, perm_stmt, gsi);

	  vect1 = data_ref;
	  vect2 = dr_chain[2];
	  /* Create interleaving stmt:
	     low = VEC_PERM_EXPR <vect1, vect2,
				  {0, 1, nelt + j, 3, 4, nelt + j + 1,
				   6, 7, nelt + j + 2, ...}>  */
	  data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
	  perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
					   vect2, perm3_mask_high);
	  vect_finish_stmt_generation (stmt, perm_stmt, gsi);
	  (*result_chain)[j] = data_ref;
	}
    }
  else
    {
      /* If length is not equal to 3 then only power of 2 is supported.  */
      gcc_assert (pow2p_hwi (length));

      /* The encoding has 2 interleaved stepped patterns.
*/
      poly_uint64 nelt = TYPE_VECTOR_SUBPARTS (vectype);
      vec_perm_builder sel (nelt, 2, 3);
      sel.quick_grow (6);
      for (i = 0; i < 3; i++)
	{
	  sel[i * 2] = i;
	  sel[i * 2 + 1] = i + nelt;
	}
      vec_perm_indices indices (sel, 2, nelt);
      perm_mask_high = vect_gen_perm_mask_checked (vectype, indices);

      for (i = 0; i < 6; i++)
	sel[i] += exact_div (nelt, 2);
      indices.new_vector (sel, 2, nelt);
      perm_mask_low = vect_gen_perm_mask_checked (vectype, indices);

      /* log LENGTH stages; each stage interleaves pairs taken from the two
	 halves of DR_CHAIN.  */
      for (i = 0, n = log_length; i < n; i++)
	{
	  for (j = 0; j < length/2; j++)
	    {
	      vect1 = dr_chain[j];
	      vect2 = dr_chain[j+length/2];

	      /* Create interleaving stmt:
		 high = VEC_PERM_EXPR <vect1, vect2, {0, nelt, 1, nelt+1,
						      ...}>  */
	      high = make_temp_ssa_name (vectype, NULL, "vect_inter_high");
	      perm_stmt = gimple_build_assign (high, VEC_PERM_EXPR, vect1,
					       vect2, perm_mask_high);
	      vect_finish_stmt_generation (stmt, perm_stmt, gsi);
	      (*result_chain)[2*j] = high;

	      /* Create interleaving stmt:
		 low = VEC_PERM_EXPR <vect1, vect2,
				      {nelt/2, nelt*3/2, nelt/2+1, nelt*3/2+1,
				       ...}>  */
	      low = make_temp_ssa_name (vectype, NULL, "vect_inter_low");
	      perm_stmt = gimple_build_assign (low, VEC_PERM_EXPR, vect1,
					       vect2, perm_mask_low);
	      vect_finish_stmt_generation (stmt, perm_stmt, gsi);
	      (*result_chain)[2*j+1] = low;
	    }
	  /* DR_CHAIN is reused as scratch: feed this stage's output to the
	     next stage.  */
	  memcpy (dr_chain.address (), result_chain->address (),
		  length * sizeof (tree));
	}
    }
}

/* Function vect_setup_realignment

   This function is called when vectorizing an unaligned load using
   the dr_explicit_realign[_optimized] scheme.
5408 This function generates the following code at the loop prolog: 5409 5410 p = initial_addr; 5411 x msq_init = *(floor(p)); # prolog load 5412 realignment_token = call target_builtin; 5413 loop: 5414 x msq = phi (msq_init, ---) 5415 5416 The stmts marked with x are generated only for the case of 5417 dr_explicit_realign_optimized. 5418 5419 The code above sets up a new (vector) pointer, pointing to the first 5420 location accessed by STMT, and a "floor-aligned" load using that pointer. 5421 It also generates code to compute the "realignment-token" (if the relevant 5422 target hook was defined), and creates a phi-node at the loop-header bb 5423 whose arguments are the result of the prolog-load (created by this 5424 function) and the result of a load that takes place in the loop (to be 5425 created by the caller to this function). 5426 5427 For the case of dr_explicit_realign_optimized: 5428 The caller to this function uses the phi-result (msq) to create the 5429 realignment code inside the loop, and sets up the missing phi argument, 5430 as follows: 5431 loop: 5432 msq = phi (msq_init, lsq) 5433 lsq = *(floor(p')); # load in loop 5434 result = realign_load (msq, lsq, realignment_token); 5435 5436 For the case of dr_explicit_realign: 5437 loop: 5438 msq = *(floor(p)); # load in loop 5439 p' = p + (VS-1); 5440 lsq = *(floor(p')); # load in loop 5441 result = realign_load (msq, lsq, realignment_token); 5442 5443 Input: 5444 STMT - (scalar) load stmt to be vectorized. This load accesses 5445 a memory location that may be unaligned. 5446 BSI - place where new code is to be inserted. 5447 ALIGNMENT_SUPPORT_SCHEME - which of the two misalignment handling schemes 5448 is used. 5449 5450 Output: 5451 REALIGNMENT_TOKEN - the result of a call to the builtin_mask_for_load 5452 target hook, if defined. 5453 Return value - the result of the loop-header phi node. 
*/ 5454 5455 tree 5456 vect_setup_realignment (gimple *stmt, gimple_stmt_iterator *gsi, 5457 tree *realignment_token, 5458 enum dr_alignment_support alignment_support_scheme, 5459 tree init_addr, 5460 struct loop **at_loop) 5461 { 5462 stmt_vec_info stmt_info = vinfo_for_stmt (stmt); 5463 tree vectype = STMT_VINFO_VECTYPE (stmt_info); 5464 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info); 5465 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info); 5466 struct loop *loop = NULL; 5467 edge pe = NULL; 5468 tree scalar_dest = gimple_assign_lhs (stmt); 5469 tree vec_dest; 5470 gimple *inc; 5471 tree ptr; 5472 tree data_ref; 5473 basic_block new_bb; 5474 tree msq_init = NULL_TREE; 5475 tree new_temp; 5476 gphi *phi_stmt; 5477 tree msq = NULL_TREE; 5478 gimple_seq stmts = NULL; 5479 bool inv_p; 5480 bool compute_in_loop = false; 5481 bool nested_in_vect_loop = false; 5482 struct loop *containing_loop = (gimple_bb (stmt))->loop_father; 5483 struct loop *loop_for_initial_load = NULL; 5484 5485 if (loop_vinfo) 5486 { 5487 loop = LOOP_VINFO_LOOP (loop_vinfo); 5488 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt); 5489 } 5490 5491 gcc_assert (alignment_support_scheme == dr_explicit_realign 5492 || alignment_support_scheme == dr_explicit_realign_optimized); 5493 5494 /* We need to generate three things: 5495 1. the misalignment computation 5496 2. the extra vector load (for the optimized realignment scheme). 5497 3. the phi node for the two vectors from which the realignment is 5498 done (for the optimized realignment scheme). */ 5499 5500 /* 1. Determine where to generate the misalignment computation. 5501 5502 If INIT_ADDR is NULL_TREE, this indicates that the misalignment 5503 calculation will be generated by this function, outside the loop (in the 5504 preheader). Otherwise, INIT_ADDR had already been computed for us by the 5505 caller, inside the loop. 
5506 5507 Background: If the misalignment remains fixed throughout the iterations of 5508 the loop, then both realignment schemes are applicable, and also the 5509 misalignment computation can be done outside LOOP. This is because we are 5510 vectorizing LOOP, and so the memory accesses in LOOP advance in steps that 5511 are a multiple of VS (the Vector Size), and therefore the misalignment in 5512 different vectorized LOOP iterations is always the same. 5513 The problem arises only if the memory access is in an inner-loop nested 5514 inside LOOP, which is now being vectorized using outer-loop vectorization. 5515 This is the only case when the misalignment of the memory access may not 5516 remain fixed throughout the iterations of the inner-loop (as explained in 5517 detail in vect_supportable_dr_alignment). In this case, not only is the 5518 optimized realignment scheme not applicable, but also the misalignment 5519 computation (and generation of the realignment token that is passed to 5520 REALIGN_LOAD) have to be done inside the loop. 5521 5522 In short, INIT_ADDR indicates whether we are in a COMPUTE_IN_LOOP mode 5523 or not, which in turn determines if the misalignment is computed inside 5524 the inner-loop, or outside LOOP. */ 5525 5526 if (init_addr != NULL_TREE || !loop_vinfo) 5527 { 5528 compute_in_loop = true; 5529 gcc_assert (alignment_support_scheme == dr_explicit_realign); 5530 } 5531 5532 5533 /* 2. Determine where to generate the extra vector load. 5534 5535 For the optimized realignment scheme, instead of generating two vector 5536 loads in each iteration, we generate a single extra vector load in the 5537 preheader of the loop, and in each iteration reuse the result of the 5538 vector load from the previous iteration. 
In case the memory access is in 5539 an inner-loop nested inside LOOP, which is now being vectorized using 5540 outer-loop vectorization, we need to determine whether this initial vector 5541 load should be generated at the preheader of the inner-loop, or can be 5542 generated at the preheader of LOOP. If the memory access has no evolution 5543 in LOOP, it can be generated in the preheader of LOOP. Otherwise, it has 5544 to be generated inside LOOP (in the preheader of the inner-loop). */ 5545 5546 if (nested_in_vect_loop) 5547 { 5548 tree outerloop_step = STMT_VINFO_DR_STEP (stmt_info); 5549 bool invariant_in_outerloop = 5550 (tree_int_cst_compare (outerloop_step, size_zero_node) == 0); 5551 loop_for_initial_load = (invariant_in_outerloop ? loop : loop->inner); 5552 } 5553 else 5554 loop_for_initial_load = loop; 5555 if (at_loop) 5556 *at_loop = loop_for_initial_load; 5557 5558 if (loop_for_initial_load) 5559 pe = loop_preheader_edge (loop_for_initial_load); 5560 5561 /* 3. For the case of the optimized realignment, create the first vector 5562 load at the loop preheader. 
*/ 5563 5564 if (alignment_support_scheme == dr_explicit_realign_optimized) 5565 { 5566 /* Create msq_init = *(floor(p1)) in the loop preheader */ 5567 gassign *new_stmt; 5568 5569 gcc_assert (!compute_in_loop); 5570 vec_dest = vect_create_destination_var (scalar_dest, vectype); 5571 ptr = vect_create_data_ref_ptr (stmt, vectype, loop_for_initial_load, 5572 NULL_TREE, &init_addr, NULL, &inc, 5573 true, &inv_p); 5574 if (TREE_CODE (ptr) == SSA_NAME) 5575 new_temp = copy_ssa_name (ptr); 5576 else 5577 new_temp = make_ssa_name (TREE_TYPE (ptr)); 5578 unsigned int align = DR_TARGET_ALIGNMENT (dr); 5579 new_stmt = gimple_build_assign 5580 (new_temp, BIT_AND_EXPR, ptr, 5581 build_int_cst (TREE_TYPE (ptr), -(HOST_WIDE_INT) align)); 5582 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt); 5583 gcc_assert (!new_bb); 5584 data_ref 5585 = build2 (MEM_REF, TREE_TYPE (vec_dest), new_temp, 5586 build_int_cst (reference_alias_ptr_type (DR_REF (dr)), 0)); 5587 vect_copy_ref_info (data_ref, DR_REF (dr)); 5588 new_stmt = gimple_build_assign (vec_dest, data_ref); 5589 new_temp = make_ssa_name (vec_dest, new_stmt); 5590 gimple_assign_set_lhs (new_stmt, new_temp); 5591 if (pe) 5592 { 5593 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt); 5594 gcc_assert (!new_bb); 5595 } 5596 else 5597 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT); 5598 5599 msq_init = gimple_assign_lhs (new_stmt); 5600 } 5601 5602 /* 4. Create realignment token using a target builtin, if available. 5603 It is done either inside the containing loop, or before LOOP (as 5604 determined above). */ 5605 5606 if (targetm.vectorize.builtin_mask_for_load) 5607 { 5608 gcall *new_stmt; 5609 tree builtin_decl; 5610 5611 /* Compute INIT_ADDR - the initial addressed accessed by this memref. */ 5612 if (!init_addr) 5613 { 5614 /* Generate the INIT_ADDR computation outside LOOP. 
*/ 5615 init_addr = vect_create_addr_base_for_vector_ref (stmt, &stmts, 5616 NULL_TREE); 5617 if (loop) 5618 { 5619 pe = loop_preheader_edge (loop); 5620 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts); 5621 gcc_assert (!new_bb); 5622 } 5623 else 5624 gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT); 5625 } 5626 5627 builtin_decl = targetm.vectorize.builtin_mask_for_load (); 5628 new_stmt = gimple_build_call (builtin_decl, 1, init_addr); 5629 vec_dest = 5630 vect_create_destination_var (scalar_dest, 5631 gimple_call_return_type (new_stmt)); 5632 new_temp = make_ssa_name (vec_dest, new_stmt); 5633 gimple_call_set_lhs (new_stmt, new_temp); 5634 5635 if (compute_in_loop) 5636 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT); 5637 else 5638 { 5639 /* Generate the misalignment computation outside LOOP. */ 5640 pe = loop_preheader_edge (loop); 5641 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt); 5642 gcc_assert (!new_bb); 5643 } 5644 5645 *realignment_token = gimple_call_lhs (new_stmt); 5646 5647 /* The result of the CALL_EXPR to this builtin is determined from 5648 the value of the parameter and no global variables are touched 5649 which makes the builtin a "const" function. Requiring the 5650 builtin to have the "const" attribute makes it unnecessary 5651 to call mark_call_clobbered. */ 5652 gcc_assert (TREE_READONLY (builtin_decl)); 5653 } 5654 5655 if (alignment_support_scheme == dr_explicit_realign) 5656 return msq; 5657 5658 gcc_assert (!compute_in_loop); 5659 gcc_assert (alignment_support_scheme == dr_explicit_realign_optimized); 5660 5661 5662 /* 5. 
Create msq = phi <msq_init, lsq> in loop */ 5663 5664 pe = loop_preheader_edge (containing_loop); 5665 vec_dest = vect_create_destination_var (scalar_dest, vectype); 5666 msq = make_ssa_name (vec_dest); 5667 phi_stmt = create_phi_node (msq, containing_loop->header); 5668 add_phi_arg (phi_stmt, msq_init, pe, UNKNOWN_LOCATION); 5669 5670 return msq; 5671 } 5672 5673 5674 /* Function vect_grouped_load_supported. 5675 5676 COUNT is the size of the load group (the number of statements plus the 5677 number of gaps). SINGLE_ELEMENT_P is true if there is actually 5678 only one statement, with a gap of COUNT - 1. 5679 5680 Returns true if a suitable permute exists. */ 5681 5682 bool 5683 vect_grouped_load_supported (tree vectype, bool single_element_p, 5684 unsigned HOST_WIDE_INT count) 5685 { 5686 machine_mode mode = TYPE_MODE (vectype); 5687 5688 /* If this is single-element interleaving with an element distance 5689 that leaves unused vector loads around punt - we at least create 5690 very sub-optimal code in that case (and blow up memory, 5691 see PR65518). */ 5692 if (single_element_p && maybe_gt (count, TYPE_VECTOR_SUBPARTS (vectype))) 5693 { 5694 if (dump_enabled_p ()) 5695 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 5696 "single-element interleaving not supported " 5697 "for not adjacent vector loads\n"); 5698 return false; 5699 } 5700 5701 /* vect_permute_load_chain requires the group size to be equal to 3 or 5702 be a power of two. */ 5703 if (count != 3 && exact_log2 (count) == -1) 5704 { 5705 if (dump_enabled_p ()) 5706 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 5707 "the size of the group of accesses" 5708 " is not a power of 2 or not equal to 3\n"); 5709 return false; 5710 } 5711 5712 /* Check that the permutation is supported. 
*/ 5713 if (VECTOR_MODE_P (mode)) 5714 { 5715 unsigned int i, j; 5716 if (count == 3) 5717 { 5718 unsigned int nelt; 5719 if (!GET_MODE_NUNITS (mode).is_constant (&nelt)) 5720 { 5721 if (dump_enabled_p ()) 5722 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 5723 "cannot handle groups of 3 loads for" 5724 " variable-length vectors\n"); 5725 return false; 5726 } 5727 5728 vec_perm_builder sel (nelt, nelt, 1); 5729 sel.quick_grow (nelt); 5730 vec_perm_indices indices; 5731 unsigned int k; 5732 for (k = 0; k < 3; k++) 5733 { 5734 for (i = 0; i < nelt; i++) 5735 if (3 * i + k < 2 * nelt) 5736 sel[i] = 3 * i + k; 5737 else 5738 sel[i] = 0; 5739 indices.new_vector (sel, 2, nelt); 5740 if (!can_vec_perm_const_p (mode, indices)) 5741 { 5742 if (dump_enabled_p ()) 5743 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 5744 "shuffle of 3 loads is not supported by" 5745 " target\n"); 5746 return false; 5747 } 5748 for (i = 0, j = 0; i < nelt; i++) 5749 if (3 * i + k < 2 * nelt) 5750 sel[i] = i; 5751 else 5752 sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++); 5753 indices.new_vector (sel, 2, nelt); 5754 if (!can_vec_perm_const_p (mode, indices)) 5755 { 5756 if (dump_enabled_p ()) 5757 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 5758 "shuffle of 3 loads is not supported by" 5759 " target\n"); 5760 return false; 5761 } 5762 } 5763 return true; 5764 } 5765 else 5766 { 5767 /* If length is not equal to 3 then only power of 2 is supported. */ 5768 gcc_assert (pow2p_hwi (count)); 5769 poly_uint64 nelt = GET_MODE_NUNITS (mode); 5770 5771 /* The encoding has a single stepped pattern. 
*/ 5772 vec_perm_builder sel (nelt, 1, 3); 5773 sel.quick_grow (3); 5774 for (i = 0; i < 3; i++) 5775 sel[i] = i * 2; 5776 vec_perm_indices indices (sel, 2, nelt); 5777 if (can_vec_perm_const_p (mode, indices)) 5778 { 5779 for (i = 0; i < 3; i++) 5780 sel[i] = i * 2 + 1; 5781 indices.new_vector (sel, 2, nelt); 5782 if (can_vec_perm_const_p (mode, indices)) 5783 return true; 5784 } 5785 } 5786 } 5787 5788 if (dump_enabled_p ()) 5789 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 5790 "extract even/odd not supported by target\n"); 5791 return false; 5792 } 5793 5794 /* Return TRUE if vec_{masked_}load_lanes is available for COUNT vectors of 5795 type VECTYPE. MASKED_P says whether the masked form is needed. */ 5796 5797 bool 5798 vect_load_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count, 5799 bool masked_p) 5800 { 5801 if (masked_p) 5802 return vect_lanes_optab_supported_p ("vec_mask_load_lanes", 5803 vec_mask_load_lanes_optab, 5804 vectype, count); 5805 else 5806 return vect_lanes_optab_supported_p ("vec_load_lanes", 5807 vec_load_lanes_optab, 5808 vectype, count); 5809 } 5810 5811 /* Function vect_permute_load_chain. 5812 5813 Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be 5814 a power of 2 or equal to 3, generate extract_even/odd stmts to reorder 5815 the input data correctly. Return the final references for loads in 5816 RESULT_CHAIN. 5817 5818 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8. 5819 The input is 4 vectors each containing 8 elements. 
   We assign a number to each
   element, the input sequence is:

   1st vec:   0  1  2  3  4  5  6  7
   2nd vec:   8  9 10 11 12 13 14 15
   3rd vec:  16 17 18 19 20 21 22 23
   4th vec:  24 25 26 27 28 29 30 31

   The output sequence should be:

   1st vec:   0  4  8 12 16 20 24 28
   2nd vec:   1  5  9 13 17 21 25 29
   3rd vec:   2  6 10 14 18 22 26 30
   4th vec:   3  7 11 15 19 23 27 31

   i.e., the first output vector should contain the first elements of each
   interleaving group, etc.

   We use extract_even/odd instructions to create such output.  The input of
   each extract_even/odd operation is two vectors
   1st vec    2nd vec
   0 1 2 3    4 5 6 7

   and the output is the vector of extracted even/odd elements.  The output of
   extract_even will be:   0 2 4 6
   and of extract_odd:     1 3 5 7


   The permutation is done in log LENGTH stages.  In each stage extract_even
   and extract_odd stmts are created for each pair of vectors in DR_CHAIN in
   their order.  In our example,

   E1: extract_even (1st vec, 2nd vec)
   E2: extract_odd (1st vec, 2nd vec)
   E3: extract_even (3rd vec, 4th vec)
   E4: extract_odd (3rd vec, 4th vec)

   The output for the first stage will be:

   E1:  0  2  4  6  8 10 12 14
   E2:  1  3  5  7  9 11 13 15
   E3: 16 18 20 22 24 26 28 30
   E4: 17 19 21 23 25 27 29 31

   In order to proceed and create the correct sequence for the next stage (or
   for the correct output, if the second stage is the last one, as in our
   example), we first put the output of extract_even operation and then the
   output of extract_odd in RESULT_CHAIN (which is then copied to DR_CHAIN).
   The input for the second stage is:

   1st vec (E1):  0  2  4  6  8 10 12 14
   2nd vec (E3): 16 18 20 22 24 26 28 30
   3rd vec (E2):  1  3  5  7  9 11 13 15
   4th vec (E4): 17 19 21 23 25 27 29 31

   The output of the second stage:

   E1:  0  4  8 12 16 20 24 28
   E2:  2  6 10 14 18 22 26 30
   E3:  1  5  9 13 17 21 25 29
   E4:  3  7 11 15 19 23 27 31

   And RESULT_CHAIN after reordering:

   1st vec (E1):  0  4  8 12 16 20 24 28
   2nd vec (E3):  1  5  9 13 17 21 25 29
   3rd vec (E2):  2  6 10 14 18 22 26 30
   4th vec (E4):  3  7 11 15 19 23 27 31.  */

static void
vect_permute_load_chain (vec<tree> dr_chain,
			 unsigned int length,
			 gimple *stmt,
			 gimple_stmt_iterator *gsi,
			 vec<tree> *result_chain)
{
  tree data_ref, first_vect, second_vect;
  tree perm_mask_even, perm_mask_odd;
  tree perm3_mask_low, perm3_mask_high;
  gimple *perm_stmt;
  tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
  /* LOG_LENGTH is the number of even/odd stages for the power-of-2 case;
     it is unused (and meaningless) when LENGTH == 3.  */
  unsigned int i, j, log_length = exact_log2 (length);

  /* Seed RESULT_CHAIN with a copy of DR_CHAIN; each stage below overwrites
     the entries with the permuted references.  */
  result_chain->quick_grow (length);
  memcpy (result_chain->address (), dr_chain.address (),
	  length * sizeof (tree));

  if (length == 3)
    {
      /* vect_grouped_load_supported ensures that this is constant.
	 */
      unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
      unsigned int k;

      vec_perm_builder sel (nelt, nelt, 1);
      sel.quick_grow (nelt);
      vec_perm_indices indices;
      /* One pass per output vector K: two chained permutations gather the
	 elements K, 3 + K, 6 + K, ... from the three input vectors.  */
      for (k = 0; k < 3; k++)
	{
	  for (i = 0; i < nelt; i++)
	    if (3 * i + k < 2 * nelt)
	      sel[i] = 3 * i + k;
	    else
	      sel[i] = 0;
	  indices.new_vector (sel, 2, nelt);
	  perm3_mask_low = vect_gen_perm_mask_checked (vectype, indices);

	  for (i = 0, j = 0; i < nelt; i++)
	    if (3 * i + k < 2 * nelt)
	      sel[i] = i;
	    else
	      sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
	  indices.new_vector (sel, 2, nelt);
	  perm3_mask_high = vect_gen_perm_mask_checked (vectype, indices);

	  first_vect = dr_chain[0];
	  second_vect = dr_chain[1];

	  /* Create interleaving stmt (low part of):
	     low = VEC_PERM_EXPR <first_vect, second_vect2, {k, 3 + k, 6 + k,
							     ...}>  */
	  data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
	  perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
					   second_vect, perm3_mask_low);
	  vect_finish_stmt_generation (stmt, perm_stmt, gsi);

	  /* Create interleaving stmt (high part of):
	     high = VEC_PERM_EXPR <first_vect, second_vect2, {k, 3 + k, 6 + k,
							      ...}>  */
	  first_vect = data_ref;
	  second_vect = dr_chain[2];
	  data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
	  perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
					   second_vect, perm3_mask_high);
	  vect_finish_stmt_generation (stmt, perm_stmt, gsi);
	  (*result_chain)[k] = data_ref;
	}
    }
  else
    {
      /* If length is not equal to 3 then only power of 2 is supported.  */
      gcc_assert (pow2p_hwi (length));

      /* The encoding has a single stepped pattern.
	 */
      poly_uint64 nelt = TYPE_VECTOR_SUBPARTS (vectype);
      vec_perm_builder sel (nelt, 1, 3);
      sel.quick_grow (3);
      /* { 0, 2, 4, ... } extracts the even-numbered elements...  */
      for (i = 0; i < 3; ++i)
	sel[i] = i * 2;
      vec_perm_indices indices (sel, 2, nelt);
      perm_mask_even = vect_gen_perm_mask_checked (vectype, indices);

      /* ...and { 1, 3, 5, ... } the odd-numbered ones.  */
      for (i = 0; i < 3; ++i)
	sel[i] = i * 2 + 1;
      indices.new_vector (sel, 2, nelt);
      perm_mask_odd = vect_gen_perm_mask_checked (vectype, indices);

      /* LOG_LENGTH stages; after each stage RESULT_CHAIN is copied back
	 into DR_CHAIN so it feeds the next stage.  */
      for (i = 0; i < log_length; i++)
	{
	  for (j = 0; j < length; j += 2)
	    {
	      first_vect = dr_chain[j];
	      second_vect = dr_chain[j+1];

	      /* data_ref = permute_even (first_data_ref, second_data_ref);  */
	      data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_even");
	      perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
					       first_vect, second_vect,
					       perm_mask_even);
	      vect_finish_stmt_generation (stmt, perm_stmt, gsi);
	      (*result_chain)[j/2] = data_ref;

	      /* data_ref = permute_odd (first_data_ref, second_data_ref);  */
	      data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_odd");
	      perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
					       first_vect, second_vect,
					       perm_mask_odd);
	      vect_finish_stmt_generation (stmt, perm_stmt, gsi);
	      (*result_chain)[j/2+length/2] = data_ref;
	    }
	  memcpy (dr_chain.address (), result_chain->address (),
		  length * sizeof (tree));
	}
    }
}

/* Function vect_shift_permute_load_chain.

   Given a chain of loads in DR_CHAIN of LENGTH 2 or 3, generate
   sequence of stmts to reorder the input data accordingly.
   Return the final references for loads in RESULT_CHAIN.
   Return true if successful, false otherwise.

   E.g., LENGTH is 3 and the scalar type is short, i.e., VF is 8.
   The input is 3 vectors each containing 8 elements.
   We assign a
   number to each element, the input sequence is:

   1st vec:   0  1  2  3  4  5  6  7
   2nd vec:   8  9 10 11 12 13 14 15
   3rd vec:  16 17 18 19 20 21 22 23

   The output sequence should be:

   1st vec:   0  3  6  9 12 15 18 21
   2nd vec:   1  4  7 10 13 16 19 22
   3rd vec:   2  5  8 11 14 17 20 23

   We use 3 shuffle instructions and 3 * 3 - 1 shifts to create such output.

   First we shuffle all 3 vectors to get correct elements order:

   1st vec:  ( 0  3  6) ( 1  4  7) ( 2  5)
   2nd vec:  ( 8 11 14) ( 9 12 15) (10 13)
   3rd vec:  (16 19 22) (17 20 23) (18 21)

   Next we unite and shift vector 3 times:

   1st step:
     shift right by 6 the concatenation of:
     "1st vec" and "2nd vec"
       ( 0  3  6) ( 1  4  7) |( 2  5) _ ( 8 11 14) ( 9 12 15)| (10 13)
     "2nd vec" and "3rd vec"
       ( 8 11 14) ( 9 12 15) |(10 13) _ (16 19 22) (17 20 23)| (18 21)
     "3rd vec" and "1st vec"
       (16 19 22) (17 20 23) |(18 21) _ ( 0  3  6) ( 1  4  7)| ( 2  5)
			     | New vectors                   |

     So that now new vectors are:

     1st vec:  ( 2  5) ( 8 11 14) ( 9 12 15)
     2nd vec:  (10 13) (16 19 22) (17 20 23)
     3rd vec:  (18 21) ( 0  3  6) ( 1  4  7)

   2nd step:
     shift right by 5 the concatenation of:
     "1st vec" and "3rd vec"
       ( 2  5) ( 8 11 14) |( 9 12 15) _ (18 21) ( 0  3  6)| ( 1  4  7)
     "2nd vec" and "1st vec"
       (10 13) (16 19 22) |(17 20 23) _ ( 2  5) ( 8 11 14)| ( 9 12 15)
     "3rd vec" and "2nd vec"
       (18 21) ( 0  3  6) |( 1  4  7) _ (10 13) (16 19 22)| (17 20 23)
			  | New vectors                   |

     So that now new vectors are:

     1st vec:  ( 9 12 15) (18 21) ( 0  3  6)
     2nd vec:  (17 20 23) ( 2  5) ( 8 11 14)
     3rd vec:  ( 1  4  7) (10 13) (16 19 22)   READY

   3rd step:
     shift right by 5 the concatenation of:
     "1st vec" and "1st vec"
       ( 9 12 15) (18 21) |( 0  3  6) _ ( 9 12 15) (18 21)| ( 0  3  6)
     shift right by 3 the concatenation of:
     "2nd vec" and "2nd vec"
	       (17 20 23) |( 2  5) ( 8 11 14) _ (17 20 23)| ( 2  5) ( 8 11 14)
			  | New vectors                   |

   So that now all vectors are READY:
     1st vec:  ( 0  3  6) ( 9 12 15) (18 21)
     2nd vec:  ( 2  5) ( 8 11 14) (17 20 23)
     3rd vec:  ( 1  4  7) (10 13) (16 19 22)

   This algorithm is faster than one in vect_permute_load_chain if:
   1.  "shift of a concatenation" is faster than general permutation.
       This is usually so.
   2.  The TARGET machine can't execute vector instructions in parallel.
       This is because each step of the algorithm depends on previous.
       The algorithm in vect_permute_load_chain is much more parallel.

   The algorithm is applicable only for LOAD CHAIN LENGTH less than VF.
*/

static bool
vect_shift_permute_load_chain (vec<tree> dr_chain,
			       unsigned int length,
			       gimple *stmt,
			       gimple_stmt_iterator *gsi,
			       vec<tree> *result_chain)
{
  tree vect[3], vect_shift[3], data_ref, first_vect, second_vect;
  tree perm2_mask1, perm2_mask2, perm3_mask;
  tree select_mask, shift1_mask, shift2_mask, shift3_mask, shift4_mask;
  gimple *perm_stmt;

  tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
  unsigned int i;
  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);

  /* Both the number of vector elements and the vectorization factor must
     be compile-time constants for this scheme.  */
  unsigned HOST_WIDE_INT nelt, vf;
  if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant (&nelt)
      || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&vf))
    /* Not supported for variable-length vectors.
       */
    return false;

  /* SEL is reused to build each permutation mask in turn.  */
  vec_perm_builder sel (nelt, nelt, 1);
  sel.quick_grow (nelt);

  /* Seed RESULT_CHAIN with a copy of DR_CHAIN; entries are overwritten
     with the permuted references below.  */
  result_chain->quick_grow (length);
  memcpy (result_chain->address (), dr_chain.address (),
	  length * sizeof (tree));

  if (pow2p_hwi (length) && vf > 4)
    {
      unsigned int j, log_length = exact_log2 (length);
      /* Shuffle the even elements of a vector into its low half and the
	 odd elements into its high half.  */
      for (i = 0; i < nelt / 2; ++i)
	sel[i] = i * 2;
      for (i = 0; i < nelt / 2; ++i)
	sel[nelt / 2 + i] = i * 2 + 1;
      vec_perm_indices indices (sel, 2, nelt);
      if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "shuffle of 2 fields structure is not \
			      supported by target\n");
	  return false;
	}
      perm2_mask1 = vect_gen_perm_mask_checked (vectype, indices);

      /* The mirror image: odd elements low, even elements high.  */
      for (i = 0; i < nelt / 2; ++i)
	sel[i] = i * 2 + 1;
      for (i = 0; i < nelt / 2; ++i)
	sel[nelt / 2 + i] = i * 2;
      indices.new_vector (sel, 2, nelt);
      if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "shuffle of 2 fields structure is not \
			      supported by target\n");
	  return false;
	}
      perm2_mask2 = vect_gen_perm_mask_checked (vectype, indices);

      /* Generating permutation constant to shift all elements.
	 For vector length 8 it is {4 5 6 7 8 9 10 11}.  */
      for (i = 0; i < nelt; i++)
	sel[i] = nelt / 2 + i;
      indices.new_vector (sel, 2, nelt);
      if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "shift permutation is not supported by target\n");
	  return false;
	}
      shift1_mask = vect_gen_perm_mask_checked (vectype, indices);

      /* Generating permutation constant to select vector from 2.
	 For vector length 8 it is {0 1 2 3 12 13 14 15}.
	 */
      for (i = 0; i < nelt / 2; i++)
	sel[i] = i;
      for (i = nelt / 2; i < nelt; i++)
	sel[i] = nelt + i;
      indices.new_vector (sel, 2, nelt);
      if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "select is not supported by target\n");
	  return false;
	}
      select_mask = vect_gen_perm_mask_checked (vectype, indices);

      for (i = 0; i < log_length; i++)
	{
	  for (j = 0; j < length; j += 2)
	    {
	      first_vect = dr_chain[j];
	      second_vect = dr_chain[j + 1];

	      data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
	      perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
					       first_vect, first_vect,
					       perm2_mask1);
	      vect_finish_stmt_generation (stmt, perm_stmt, gsi);
	      vect[0] = data_ref;

	      data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
	      perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
					       second_vect, second_vect,
					       perm2_mask2);
	      vect_finish_stmt_generation (stmt, perm_stmt, gsi);
	      vect[1] = data_ref;

	      /* The shift picks the even halves, the select the odd
		 halves, of the two shuffled vectors.  */
	      data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift");
	      perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
					       vect[0], vect[1], shift1_mask);
	      vect_finish_stmt_generation (stmt, perm_stmt, gsi);
	      (*result_chain)[j/2 + length/2] = data_ref;

	      data_ref = make_temp_ssa_name (vectype, NULL, "vect_select");
	      perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
					       vect[0], vect[1], select_mask);
	      vect_finish_stmt_generation (stmt, perm_stmt, gsi);
	      (*result_chain)[j/2] = data_ref;
	    }
	  memcpy (dr_chain.address (), result_chain->address (),
		  length * sizeof (tree));
	}
      return true;
    }
  if (length == 3 && vf > 2)
    {
      unsigned int k = 0, l = 0;

      /* Generating permutation constant to get all elements in right order.
	 For vector length 8 it is {0 3 6 1 4 7 2 5}.  */
      for (i = 0; i < nelt; i++)
	{
	  if (3 * k + (l % 3) >= nelt)
	    {
	      k = 0;
	      l += (3 - (nelt % 3));
	    }
	  sel[i] = 3 * k + (l % 3);
	  k++;
	}
      vec_perm_indices indices (sel, 2, nelt);
      if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "shuffle of 3 fields structure is not \
			      supported by target\n");
	  return false;
	}
      perm3_mask = vect_gen_perm_mask_checked (vectype, indices);

      /* Generating permutation constant to shift all elements.
	 For vector length 8 it is {6 7 8 9 10 11 12 13}.  */
      for (i = 0; i < nelt; i++)
	sel[i] = 2 * (nelt / 3) + (nelt % 3) + i;
      indices.new_vector (sel, 2, nelt);
      if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "shift permutation is not supported by target\n");
	  return false;
	}
      shift1_mask = vect_gen_perm_mask_checked (vectype, indices);

      /* Generating permutation constant to shift all elements.
	 For vector length 8 it is {5 6 7 8 9 10 11 12}.  */
      for (i = 0; i < nelt; i++)
	sel[i] = 2 * (nelt / 3) + 1 + i;
      indices.new_vector (sel, 2, nelt);
      if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "shift permutation is not supported by target\n");
	  return false;
	}
      shift2_mask = vect_gen_perm_mask_checked (vectype, indices);

      /* Generating permutation constant to shift all elements.
	 For vector length 8 it is {3 4 5 6 7 8 9 10}.
	 */
      for (i = 0; i < nelt; i++)
	sel[i] = (nelt / 3) + (nelt % 3) / 2 + i;
      indices.new_vector (sel, 2, nelt);
      if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "shift permutation is not supported by target\n");
	  return false;
	}
      shift3_mask = vect_gen_perm_mask_checked (vectype, indices);

      /* Generating permutation constant to shift all elements.
	 For vector length 8 it is {5 6 7 8 9 10 11 12}.  */
      for (i = 0; i < nelt; i++)
	sel[i] = 2 * (nelt / 3) + (nelt % 3) / 2 + i;
      indices.new_vector (sel, 2, nelt);
      if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "shift permutation is not supported by target\n");
	  return false;
	}
      shift4_mask = vect_gen_perm_mask_checked (vectype, indices);

      /* First shuffle each input vector into (K, 3 + K, 6 + K, ...) groups
	 (see the function comment above).  */
      for (k = 0; k < 3; k++)
	{
	  data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3");
	  perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
					   dr_chain[k], dr_chain[k],
					   perm3_mask);
	  vect_finish_stmt_generation (stmt, perm_stmt, gsi);
	  vect[k] = data_ref;
	}

      /* 1st shift step: each result combines two of the shuffled vectors.  */
      for (k = 0; k < 3; k++)
	{
	  data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift1");
	  perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
					   vect[k % 3], vect[(k + 1) % 3],
					   shift1_mask);
	  vect_finish_stmt_generation (stmt, perm_stmt, gsi);
	  vect_shift[k] = data_ref;
	}

      /* 2nd shift step, pairing the intermediate vectors in reverse
	 rotation order.  */
      for (k = 0; k < 3; k++)
	{
	  data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift2");
	  perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
					   vect_shift[(4 - k) % 3],
					   vect_shift[(3 - k) % 3],
					   shift2_mask);
	  vect_finish_stmt_generation (stmt, perm_stmt, gsi);
	  vect[k] = data_ref;
	}

      /* 3rd step: VECT[2] is already in its final order; the other two
	 need one more self-shift each.  The output slots depend on
	 NELT % 3 (see the worked example in the function comment).  */
      (*result_chain)[3 - (nelt % 3)] = vect[2];

      data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift3");
      perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[0],
				       vect[0], shift3_mask);
      vect_finish_stmt_generation (stmt, perm_stmt, gsi);
      (*result_chain)[nelt % 3] = data_ref;

      data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift4");
      perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[1],
				       vect[1], shift4_mask);
      vect_finish_stmt_generation (stmt, perm_stmt, gsi);
      (*result_chain)[0] = data_ref;
      return true;
    }
  return false;
}

/* Function vect_transform_grouped_load.

   Given a chain of input interleaved data-refs (in DR_CHAIN), build statements
   to perform their permutation and ascribe the result vectorized statements to
   the scalar statements.
*/

void
vect_transform_grouped_load (gimple *stmt, vec<tree> dr_chain, int size,
			     gimple_stmt_iterator *gsi)
{
  machine_mode mode;
  vec<tree> result_chain = vNULL;

  /* DR_CHAIN contains input data-refs that are a part of the interleaving.
     RESULT_CHAIN is the output of vect_permute_load_chain, it contains permuted
     vectors, that are ready for vector computation.  */
  result_chain.create (size);

  /* If reassociation width for vector type is 2 or greater target machine can
     execute 2 or more vector instructions in parallel.  Otherwise try to
     get chain for loads group using vect_shift_permute_load_chain.
     */
  mode = TYPE_MODE (STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt)));
  /* Fall back to the extract-even/odd scheme when the shift-based scheme
     is not applicable (power-of-2 size) or not supported by the target.  */
  if (targetm.sched.reassociation_width (VEC_PERM_EXPR, mode) > 1
      || pow2p_hwi (size)
      || !vect_shift_permute_load_chain (dr_chain, size, stmt,
					 gsi, &result_chain))
    vect_permute_load_chain (dr_chain, size, stmt, gsi, &result_chain);
  vect_record_grouped_load_vectors (stmt, result_chain);
  result_chain.release ();
}

/* RESULT_CHAIN contains the output of a group of grouped loads that were
   generated as part of the vectorization of STMT.  Assign the statement
   for each vector to the associated scalar statement.  */

void
vect_record_grouped_load_vectors (gimple *stmt, vec<tree> result_chain)
{
  gimple *first_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt));
  gimple *next_stmt, *new_stmt;
  unsigned int i, gap_count;
  tree tmp_data_ref;

  /* Put a permuted data-ref in the VECTORIZED_STMT field.
     Since we scan the chain starting from its first node, their order
     corresponds to the order of data-refs in RESULT_CHAIN.  */
  next_stmt = first_stmt;
  gap_count = 1;
  FOR_EACH_VEC_ELT (result_chain, i, tmp_data_ref)
    {
      if (!next_stmt)
	break;

      /* Skip the gaps.  Loads created for the gaps will be removed by dead
	 code elimination pass later.  No need to check for the first stmt in
	 the group, since it always exists.
	 GROUP_GAP is the number of steps in elements from the previous
	 access (if there is no gap GROUP_GAP is 1).  We skip loads that
	 correspond to the gaps.  */
      if (next_stmt != first_stmt
	  && gap_count < GROUP_GAP (vinfo_for_stmt (next_stmt)))
	{
	  gap_count++;
	  continue;
	}

      while (next_stmt)
	{
	  new_stmt = SSA_NAME_DEF_STMT (tmp_data_ref);
	  /* We assume that if VEC_STMT is not NULL, this is a case of multiple
	     copies, and we put the new vector statement in the first available
	     RELATED_STMT.
	     */
	  if (!STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)))
	    STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)) = new_stmt;
	  else
	    {
	      if (!GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
		{
		  /* Walk to the end of the RELATED_STMT chain and append
		     the new vector statement there.  */
		  gimple *prev_stmt =
		    STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt));
		  gimple *rel_stmt =
		    STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt));
		  while (rel_stmt)
		    {
		      prev_stmt = rel_stmt;
		      rel_stmt =
			STMT_VINFO_RELATED_STMT (vinfo_for_stmt (rel_stmt));
		    }

		  STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt)) =
		    new_stmt;
		}
	    }

	  next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
	  gap_count = 1;
	  /* If NEXT_STMT accesses the same DR as the previous statement,
	     put the same TMP_DATA_REF as its vectorized statement; otherwise
	     get the next data-ref from RESULT_CHAIN.  */
	  if (!next_stmt || !GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
	    break;
	}
    }
}

/* Function vect_can_force_dr_alignment_p.

   Returns whether the alignment of a DECL can be forced to be aligned
   on ALIGNMENT bit boundary.  */

bool
vect_can_force_dr_alignment_p (const_tree decl, unsigned int alignment)
{
  if (!VAR_P (decl))
    return false;

  /* For symbols visible to the symbol table, the alignment may only be
     increased when the symbol table says so (e.g. not for symbols whose
     definition we do not control).  */
  if (decl_in_symtab_p (decl)
      && !symtab_node::get (decl)->can_increase_alignment_p ())
    return false;

  /* Static storage is limited by the maximum object-file alignment,
     automatic storage by the maximum stack alignment.  */
  if (TREE_STATIC (decl))
    return (alignment <= MAX_OFILE_ALIGNMENT);
  else
    return (alignment <= MAX_STACK_ALIGNMENT);
}


/* Return whether the data reference DR is supported with respect to its
   alignment.
   If CHECK_ALIGNED_ACCESSES is TRUE, check if the access is supported even
   if it is aligned, i.e., check if it is possible to vectorize it with different
   alignment.
   */

enum dr_alignment_support
vect_supportable_dr_alignment (struct data_reference *dr,
                               bool check_aligned_accesses)
{
  gimple *stmt = DR_STMT (dr);
  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
  tree vectype = STMT_VINFO_VECTYPE (stmt_info);
  machine_mode mode = TYPE_MODE (vectype);
  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
  struct loop *vect_loop = NULL;
  bool nested_in_vect_loop = false;

  if (aligned_access_p (dr) && !check_aligned_accesses)
    return dr_aligned;

  /* For now assume all conditional loads/stores support unaligned
     access without any special code.  */
  if (is_gimple_call (stmt)
      && gimple_call_internal_p (stmt)
      && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
	  || gimple_call_internal_fn (stmt) == IFN_MASK_STORE))
    return dr_unaligned_supported;

  if (loop_vinfo)
    {
      vect_loop = LOOP_VINFO_LOOP (loop_vinfo);
      nested_in_vect_loop = nested_in_vect_loop_p (vect_loop, stmt);
    }

  /* Possibly unaligned access.  */

  /* We can choose between using the implicit realignment scheme (generating
     a misaligned_move stmt) and the explicit realignment scheme (generating
     aligned loads with a REALIGN_LOAD).  There are two variants to the
     explicit realignment scheme: optimized, and unoptimized.
     We can optimize the realignment only if the step between consecutive
     vector loads is equal to the vector size.  Since the vector memory
     accesses advance in steps of VS (Vector Size) in the vectorized loop, it
     is guaranteed that the misalignment amount remains the same throughout the
     execution of the vectorized loop.  Therefore, we can create the
     "realignment token" (the permutation mask that is passed to REALIGN_LOAD)
     at the loop preheader.

     However, in the case of outer-loop vectorization, when vectorizing a
     memory access in the inner-loop nested within the LOOP that is now being
     vectorized, while it is guaranteed that the misalignment of the
     vectorized memory access will remain the same in different outer-loop
     iterations, it is *not* guaranteed that it will remain the same throughout
     the execution of the inner-loop.  This is because the inner-loop advances
     with the original scalar step (and not in steps of VS).  If the inner-loop
     step happens to be a multiple of VS, then the misalignment remains fixed
     and we can use the optimized realignment scheme.  For example:

      for (i=0; i<N; i++)
        for (j=0; j<M; j++)
          s += a[i+j];

     When vectorizing the i-loop in the above example, the step between
     consecutive vector loads is 1, and so the misalignment does not remain
     fixed across the execution of the inner-loop, and the realignment cannot
     be optimized (as illustrated in the following pseudo vectorized loop):

      for (i=0; i<N; i+=4)
        for (j=0; j<M; j++){
          vs += vp[i+j]; // misalignment of &vp[i+j] is {0,1,2,3,0,1,2,3,...}
                         // when j is {0,1,2,3,4,5,6,7,...} respectively.
                         // (assuming that we start from an aligned address).
          }

     We therefore have to use the unoptimized realignment scheme:

      for (i=0; i<N; i+=4)
          for (j=k; j<M; j+=4)
          vs += vp[i+j]; // misalignment of &vp[i+j] is always k (assuming
                           // that the misalignment of the initial address is
                           // 0).

     The loop can then be vectorized as follows:

      for (k=0; k<4; k++){
        rt = get_realignment_token (&vp[k]);
        for (i=0; i<N; i+=4){
          v1 = vp[i+k];
          for (j=k; j<M; j+=4){
            v2 = vp[i+j+VS-1];
            va = REALIGN_LOAD <v1,v2,rt>;
            vs += va;
            v1 = v2;
          }
        }
    } */

  if (DR_IS_READ (dr))
    {
      bool is_packed = false;
      tree type = (TREE_TYPE (DR_REF (dr)));

      if (optab_handler (vec_realign_load_optab, mode) != CODE_FOR_nothing
	  && (!targetm.vectorize.builtin_mask_for_load
	      || targetm.vectorize.builtin_mask_for_load ()))
	{
	  tree vectype = STMT_VINFO_VECTYPE (stmt_info);

	  /* If we are doing SLP then the accesses need not have the
	     same alignment, instead it depends on the SLP group size.  */
	  if (loop_vinfo
	      && STMT_SLP_TYPE (stmt_info)
	      && !multiple_p (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
			      * GROUP_SIZE (vinfo_for_stmt
					    (GROUP_FIRST_ELEMENT (stmt_info))),
			      TYPE_VECTOR_SUBPARTS (vectype)))
	    ;
	  else if (!loop_vinfo
		   || (nested_in_vect_loop
		       && maybe_ne (TREE_INT_CST_LOW (DR_STEP (dr)),
				    GET_MODE_SIZE (TYPE_MODE (vectype)))))
	    return dr_explicit_realign;
	  else
	    return dr_explicit_realign_optimized;
	}
      if (!known_alignment_for_access_p (dr))
	is_packed = not_size_aligned (DR_REF (dr));

      if (targetm.vectorize.support_vector_misalignment
	    (mode, type, DR_MISALIGNMENT (dr), is_packed))
	/* Can't software pipeline the loads, but can at least do them.  */
	return dr_unaligned_supported;
    }
  else
    {
      bool is_packed = false;
      tree type = (TREE_TYPE (DR_REF (dr)));

      if (!known_alignment_for_access_p (dr))
	is_packed = not_size_aligned (DR_REF (dr));

      if (targetm.vectorize.support_vector_misalignment
	    (mode, type, DR_MISALIGNMENT (dr), is_packed))
	return dr_unaligned_supported;
    }

  /* Unsupported.
     */
  return dr_unaligned_unsupported;
}