1 /* Data References Analysis and Manipulation Utilities for Vectorization. 2 Copyright (C) 2003-2018 Free Software Foundation, Inc. 3 Contributed by Dorit Naishlos <dorit@il.ibm.com> 4 and Ira Rosen <irar@il.ibm.com> 5 6 This file is part of GCC. 7 8 GCC is free software; you can redistribute it and/or modify it under 9 the terms of the GNU General Public License as published by the Free 10 Software Foundation; either version 3, or (at your option) any later 11 version. 12 13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY 14 WARRANTY; without even the implied warranty of MERCHANTABILITY or 15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 16 for more details. 17 18 You should have received a copy of the GNU General Public License 19 along with GCC; see the file COPYING3. If not see 20 <http://www.gnu.org/licenses/>. */ 21 22 #include "config.h" 23 #include "system.h" 24 #include "coretypes.h" 25 #include "backend.h" 26 #include "target.h" 27 #include "rtl.h" 28 #include "tree.h" 29 #include "gimple.h" 30 #include "predict.h" 31 #include "memmodel.h" 32 #include "tm_p.h" 33 #include "ssa.h" 34 #include "optabs-tree.h" 35 #include "cgraph.h" 36 #include "dumpfile.h" 37 #include "alias.h" 38 #include "fold-const.h" 39 #include "stor-layout.h" 40 #include "tree-eh.h" 41 #include "gimplify.h" 42 #include "gimple-iterator.h" 43 #include "gimplify-me.h" 44 #include "tree-ssa-loop-ivopts.h" 45 #include "tree-ssa-loop-manip.h" 46 #include "tree-ssa-loop.h" 47 #include "cfgloop.h" 48 #include "tree-scalar-evolution.h" 49 #include "tree-vectorizer.h" 50 #include "expr.h" 51 #include "builtins.h" 52 #include "params.h" 53 #include "tree-cfg.h" 54 #include "tree-hash-traits.h" 55 #include "vec-perm-indices.h" 56 #include "internal-fn.h" 57 58 /* Return true if load- or store-lanes optab OPTAB is implemented for 59 COUNT vectors of type VECTYPE. NAME is the name of OPTAB. */ 60 61 static bool 62 vect_lanes_optab_supported_p (const char *name, convert_optab optab, 63 tree vectype, unsigned HOST_WIDE_INT count) 64 { 65 machine_mode mode, array_mode; 66 bool limit_p; 67 68 mode = TYPE_MODE (vectype); 69 if (!targetm.array_mode (mode, count).exists (&array_mode)) 70 { 71 poly_uint64 bits = count * GET_MODE_BITSIZE (mode); 72 limit_p = !targetm.array_mode_supported_p (mode, count); 73 if (!int_mode_for_size (bits, limit_p).exists (&array_mode)) 74 { 75 if (dump_enabled_p ()) 76 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 77 "no array mode for %s[" 78 HOST_WIDE_INT_PRINT_DEC "]\n", 79 GET_MODE_NAME (mode), count); 80 return false; 81 } 82 } 83 84 if (convert_optab_handler (optab, array_mode, mode) == CODE_FOR_nothing) 85 { 86 if (dump_enabled_p ()) 87 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 88 "cannot use %s<%s><%s>\n", name, 89 GET_MODE_NAME (array_mode), GET_MODE_NAME (mode)); 90 return false; 91 } 92 93 if (dump_enabled_p ()) 94 dump_printf_loc (MSG_NOTE, vect_location, 95 "can use %s<%s><%s>\n", name, GET_MODE_NAME (array_mode), 96 GET_MODE_NAME (mode)); 97 98 return true; 99 } 100 101 102 /* Return the smallest scalar part of STMT. 103 This is used to determine the vectype of the stmt. We generally set the 104 vectype according to the type of the result (lhs). For stmts whose 105 result-type is different than the type of the arguments (e.g., demotion, 106 promotion), vectype will be reset appropriately (later). Note that we have 107 to visit the smallest datatype in this function, because that determines the 108 VF. 
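   As an illustrative example (assuming 128-bit vectors): in a loop body like
     short_x = sbuf[i];
     int_y = (int) short_x + ibuf[i];
   the smallest scalar type is 'short int', so the VF is based on
   128 / 16 = 8 lanes rather than the 4 lanes suggested by the 'int'
   result alone.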
If the smallest datatype in the loop is present only as the rhs of a 109 promotion operation - we'd miss it. 110 Such a case, where a variable of this datatype does not appear in the lhs 111 anywhere in the loop, can only occur if it's an invariant: e.g.: 112 'int_x = (int) short_inv', which we'd expect to have been optimized away by 113 invariant motion. However, we cannot rely on invariant motion to always 114 take invariants out of the loop, and so in the case of promotion we also 115 have to check the rhs. 116 LHS_SIZE_UNIT and RHS_SIZE_UNIT contain the sizes of the corresponding 117 types. */ 118 119 tree 120 vect_get_smallest_scalar_type (gimple *stmt, HOST_WIDE_INT *lhs_size_unit, 121 HOST_WIDE_INT *rhs_size_unit) 122 { 123 tree scalar_type = gimple_expr_type (stmt); 124 HOST_WIDE_INT lhs, rhs; 125 126 /* During the analysis phase, this function is called on arbitrary 127 statements that might not have scalar results. */ 128 if (!tree_fits_uhwi_p (TYPE_SIZE_UNIT (scalar_type))) 129 return scalar_type; 130 131 lhs = rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type)); 132 133 if (is_gimple_assign (stmt) 134 && (gimple_assign_cast_p (stmt) 135 || gimple_assign_rhs_code (stmt) == DOT_PROD_EXPR 136 || gimple_assign_rhs_code (stmt) == WIDEN_SUM_EXPR 137 || gimple_assign_rhs_code (stmt) == WIDEN_MULT_EXPR 138 || gimple_assign_rhs_code (stmt) == WIDEN_LSHIFT_EXPR 139 || gimple_assign_rhs_code (stmt) == FLOAT_EXPR)) 140 { 141 tree rhs_type = TREE_TYPE (gimple_assign_rhs1 (stmt)); 142 143 rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type)); 144 if (rhs < lhs) 145 scalar_type = rhs_type; 146 } 147 148 *lhs_size_unit = lhs; 149 *rhs_size_unit = rhs; 150 return scalar_type; 151 } 152 153 154 /* Insert DDR into LOOP_VINFO list of ddrs that may alias and need to be 155 tested at run-time. Return TRUE if DDR was successfully inserted. 156 Return false if versioning is not supported. */ 157 158 static bool 159 vect_mark_for_runtime_alias_test (ddr_p ddr, loop_vec_info loop_vinfo) 160 { 161 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo); 162 163 if ((unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS) == 0) 164 return false; 165 166 if (!runtime_alias_check_p (ddr, loop, 167 optimize_loop_nest_for_speed_p (loop))) 168 return false; 169 170 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).safe_push (ddr); 171 return true; 172 } 173 174 /* Record that loop LOOP_VINFO needs to check that VALUE is nonzero. */ 175 176 static void 177 vect_check_nonzero_value (loop_vec_info loop_vinfo, tree value) 178 { 179 vec<tree> checks = LOOP_VINFO_CHECK_NONZERO (loop_vinfo); 180 for (unsigned int i = 0; i < checks.length(); ++i) 181 if (checks[i] == value) 182 return; 183 184 if (dump_enabled_p ()) 185 { 186 dump_printf_loc (MSG_NOTE, vect_location, "need run-time check that "); 187 dump_generic_expr (MSG_NOTE, TDF_SLIM, value); 188 dump_printf (MSG_NOTE, " is nonzero\n"); 189 } 190 LOOP_VINFO_CHECK_NONZERO (loop_vinfo).safe_push (value); 191 } 192 193 /* Return true if we know that the order of vectorized STMT_A and 194 vectorized STMT_B will be the same as the order of STMT_A and STMT_B. 195 At least one of the statements is a write. */ 196 197 static bool 198 vect_preserves_scalar_order_p (gimple *stmt_a, gimple *stmt_b) 199 { 200 stmt_vec_info stmtinfo_a = vinfo_for_stmt (stmt_a); 201 stmt_vec_info stmtinfo_b = vinfo_for_stmt (stmt_b); 202 203 /* Single statements are always kept in their original order. 
*/ 204 if (!STMT_VINFO_GROUPED_ACCESS (stmtinfo_a) 205 && !STMT_VINFO_GROUPED_ACCESS (stmtinfo_b)) 206 return true; 207 208 /* STMT_A and STMT_B belong to overlapping groups. All loads in a 209 SLP group are emitted at the position of the last scalar load and 210 all loads in an interleaving group are emitted at the position 211 of the first scalar load. 212 Stores in a group are emitted at the position of the last scalar store. 213 Compute that position and check whether the resulting order matches 214 the current one. 215 We have not yet decided between SLP and interleaving so we have 216 to conservatively assume both. */ 217 gimple *il_a; 218 gimple *last_a = il_a = GROUP_FIRST_ELEMENT (stmtinfo_a); 219 if (last_a) 220 { 221 for (gimple *s = GROUP_NEXT_ELEMENT (vinfo_for_stmt (last_a)); s; 222 s = GROUP_NEXT_ELEMENT (vinfo_for_stmt (s))) 223 last_a = get_later_stmt (last_a, s); 224 if (!DR_IS_WRITE (STMT_VINFO_DATA_REF (stmtinfo_a))) 225 { 226 for (gimple *s = GROUP_NEXT_ELEMENT (vinfo_for_stmt (il_a)); s; 227 s = GROUP_NEXT_ELEMENT (vinfo_for_stmt (s))) 228 if (get_later_stmt (il_a, s) == il_a) 229 il_a = s; 230 } 231 else 232 il_a = last_a; 233 } 234 else 235 last_a = il_a = stmt_a; 236 gimple *il_b; 237 gimple *last_b = il_b = GROUP_FIRST_ELEMENT (stmtinfo_b); 238 if (last_b) 239 { 240 for (gimple *s = GROUP_NEXT_ELEMENT (vinfo_for_stmt (last_b)); s; 241 s = GROUP_NEXT_ELEMENT (vinfo_for_stmt (s))) 242 last_b = get_later_stmt (last_b, s); 243 if (!DR_IS_WRITE (STMT_VINFO_DATA_REF (stmtinfo_b))) 244 { 245 for (gimple *s = GROUP_NEXT_ELEMENT (vinfo_for_stmt (il_b)); s; 246 s = GROUP_NEXT_ELEMENT (vinfo_for_stmt (s))) 247 if (get_later_stmt (il_b, s) == il_b) 248 il_b = s; 249 } 250 else 251 il_b = last_b; 252 } 253 else 254 last_b = il_b = stmt_b; 255 bool a_after_b = (get_later_stmt (stmt_a, stmt_b) == stmt_a); 256 return (/* SLP */ 257 (get_later_stmt (last_a, last_b) == last_a) == a_after_b 258 /* Interleaving */ 259 && (get_later_stmt (il_a, il_b) == il_a) == a_after_b 260 /* Mixed */ 261 && (get_later_stmt (il_a, last_b) == il_a) == a_after_b 262 && (get_later_stmt (last_a, il_b) == last_a) == a_after_b); 263 } 264 265 /* A subroutine of vect_analyze_data_ref_dependence. Handle 266 DDR_COULD_BE_INDEPENDENT_P ddr DDR that has a known set of dependence 267 distances. These distances are conservatively correct but they don't 268 reflect a guaranteed dependence. 269 270 Return true if this function does all the work necessary to avoid 271 an alias or false if the caller should use the dependence distances 272 to limit the vectorization factor in the usual way. LOOP_DEPTH is 273 the depth of the loop described by LOOP_VINFO and the other arguments 274 are as for vect_analyze_data_ref_dependence. */ 275 276 static bool 277 vect_analyze_possibly_independent_ddr (data_dependence_relation *ddr, 278 loop_vec_info loop_vinfo, 279 int loop_depth, unsigned int *max_vf) 280 { 281 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo); 282 lambda_vector dist_v; 283 unsigned int i; 284 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v) 285 { 286 int dist = dist_v[loop_depth]; 287 if (dist != 0 && !(dist > 0 && DDR_REVERSED_P (ddr))) 288 { 289 /* If the user asserted safelen >= DIST consecutive iterations 290 can be executed concurrently, assume independence. 291 292 ??? An alternative would be to add the alias check even 293 in this case, and vectorize the fallback loop with the 294 maximum VF set to safelen. 
However, if the user has 295 explicitly given a length, it's less likely that that 296 would be a win. */ 297 if (loop->safelen >= 2 && abs_hwi (dist) <= loop->safelen) 298 { 299 if ((unsigned int) loop->safelen < *max_vf) 300 *max_vf = loop->safelen; 301 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false; 302 continue; 303 } 304 305 /* For dependence distances of 2 or more, we have the option 306 of limiting VF or checking for an alias at runtime. 307 Prefer to check at runtime if we can, to avoid limiting 308 the VF unnecessarily when the bases are in fact independent. 309 310 Note that the alias checks will be removed if the VF ends up 311 being small enough. */ 312 return vect_mark_for_runtime_alias_test (ddr, loop_vinfo); 313 } 314 } 315 return true; 316 } 317 318 319 /* Function vect_analyze_data_ref_dependence. 320 321 Return TRUE if there (might) exist a dependence between a memory-reference 322 DRA and a memory-reference DRB. When versioning for alias may check a 323 dependence at run-time, return FALSE. Adjust *MAX_VF according to 324 the data dependence. */ 325 326 static bool 327 vect_analyze_data_ref_dependence (struct data_dependence_relation *ddr, 328 loop_vec_info loop_vinfo, 329 unsigned int *max_vf) 330 { 331 unsigned int i; 332 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo); 333 struct data_reference *dra = DDR_A (ddr); 334 struct data_reference *drb = DDR_B (ddr); 335 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra)); 336 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb)); 337 lambda_vector dist_v; 338 unsigned int loop_depth; 339 340 /* In loop analysis all data references should be vectorizable. */ 341 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a) 342 || !STMT_VINFO_VECTORIZABLE (stmtinfo_b)) 343 gcc_unreachable (); 344 345 /* Independent data accesses. */ 346 if (DDR_ARE_DEPENDENT (ddr) == chrec_known) 347 return false; 348 349 if (dra == drb 350 || (DR_IS_READ (dra) && DR_IS_READ (drb))) 351 return false; 352 353 /* We do not have to consider dependences between accesses that belong 354 to the same group, unless the stride could be smaller than the 355 group size. */ 356 if (GROUP_FIRST_ELEMENT (stmtinfo_a) 357 && GROUP_FIRST_ELEMENT (stmtinfo_a) == GROUP_FIRST_ELEMENT (stmtinfo_b) 358 && !STMT_VINFO_STRIDED_P (stmtinfo_a)) 359 return false; 360 361 /* Even if we have an anti-dependence then, as the vectorized loop covers at 362 least two scalar iterations, there is always also a true dependence. 363 As the vectorizer does not re-order loads and stores we can ignore 364 the anti-dependence if TBAA can disambiguate both DRs similar to the 365 case with known negative distance anti-dependences (positive 366 distance anti-dependences would violate TBAA constraints). */ 367 if (((DR_IS_READ (dra) && DR_IS_WRITE (drb)) 368 || (DR_IS_WRITE (dra) && DR_IS_READ (drb))) 369 && !alias_sets_conflict_p (get_alias_set (DR_REF (dra)), 370 get_alias_set (DR_REF (drb)))) 371 return false; 372 373 /* Unknown data dependence. */ 374 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know) 375 { 376 /* If user asserted safelen consecutive iterations can be 377 executed concurrently, assume independence. 
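   (Illustrative note: loop->safelen typically comes from an annotation
   such as '#pragma omp simd safelen(8)', which promises that up to 8
   consecutive iterations may execute concurrently; the unknown dependence
   can then be ignored as long as *MAX_VF is capped at 8, which is what
   the code below does.)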
*/ 378 if (loop->safelen >= 2) 379 { 380 if ((unsigned int) loop->safelen < *max_vf) 381 *max_vf = loop->safelen; 382 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false; 383 return false; 384 } 385 386 if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a) 387 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b)) 388 { 389 if (dump_enabled_p ()) 390 { 391 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 392 "versioning for alias not supported for: " 393 "can't determine dependence between "); 394 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, 395 DR_REF (dra)); 396 dump_printf (MSG_MISSED_OPTIMIZATION, " and "); 397 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, 398 DR_REF (drb)); 399 dump_printf (MSG_MISSED_OPTIMIZATION, "\n"); 400 } 401 return true; 402 } 403 404 if (dump_enabled_p ()) 405 { 406 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 407 "versioning for alias required: " 408 "can't determine dependence between "); 409 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, 410 DR_REF (dra)); 411 dump_printf (MSG_MISSED_OPTIMIZATION, " and "); 412 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, 413 DR_REF (drb)); 414 dump_printf (MSG_MISSED_OPTIMIZATION, "\n"); 415 } 416 417 /* Add to list of ddrs that need to be tested at run-time. */ 418 return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo); 419 } 420 421 /* Known data dependence. */ 422 if (DDR_NUM_DIST_VECTS (ddr) == 0) 423 { 424 /* If user asserted safelen consecutive iterations can be 425 executed concurrently, assume independence. */ 426 if (loop->safelen >= 2) 427 { 428 if ((unsigned int) loop->safelen < *max_vf) 429 *max_vf = loop->safelen; 430 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false; 431 return false; 432 } 433 434 if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a) 435 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b)) 436 { 437 if (dump_enabled_p ()) 438 { 439 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 440 "versioning for alias not supported for: " 441 "bad dist vector for "); 442 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, 443 DR_REF (dra)); 444 dump_printf (MSG_MISSED_OPTIMIZATION, " and "); 445 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, 446 DR_REF (drb)); 447 dump_printf (MSG_MISSED_OPTIMIZATION, "\n"); 448 } 449 return true; 450 } 451 452 if (dump_enabled_p ()) 453 { 454 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 455 "versioning for alias required: " 456 "bad dist vector for "); 457 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra)); 458 dump_printf (MSG_MISSED_OPTIMIZATION, " and "); 459 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb)); 460 dump_printf (MSG_MISSED_OPTIMIZATION, "\n"); 461 } 462 /* Add to list of ddrs that need to be tested at run-time. 
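   Conceptually (an illustrative sketch rather than the exact code that is
   emitted later), the versioning check guards the vectorized loop with an
   overlap test on the two access ranges:
     if (a + len_a <= b || b + len_b <= a)
       ... vectorized loop ...
     else
       ... unvectorized fallback loop ...
   where a and b are the first addresses accessed and len_a/len_b are the
   number of bytes each reference touches in the loop.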
*/ 463 return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo); 464 } 465 466 loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr)); 467 468 if (DDR_COULD_BE_INDEPENDENT_P (ddr) 469 && vect_analyze_possibly_independent_ddr (ddr, loop_vinfo, 470 loop_depth, max_vf)) 471 return false; 472 473 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v) 474 { 475 int dist = dist_v[loop_depth]; 476 477 if (dump_enabled_p ()) 478 dump_printf_loc (MSG_NOTE, vect_location, 479 "dependence distance = %d.\n", dist); 480 481 if (dist == 0) 482 { 483 if (dump_enabled_p ()) 484 { 485 dump_printf_loc (MSG_NOTE, vect_location, 486 "dependence distance == 0 between "); 487 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra)); 488 dump_printf (MSG_NOTE, " and "); 489 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb)); 490 dump_printf (MSG_MISSED_OPTIMIZATION, "\n"); 491 } 492 493 /* When we perform grouped accesses and perform implicit CSE 494 by detecting equal accesses and doing disambiguation with 495 runtime alias tests like for 496 .. = a[i]; 497 .. = a[i+1]; 498 a[i] = ..; 499 a[i+1] = ..; 500 *p = ..; 501 .. = a[i]; 502 .. = a[i+1]; 503 where we will end up loading { a[i], a[i+1] } once, make 504 sure that inserting group loads before the first load and 505 stores after the last store will do the right thing. 506 Similar for groups like 507 a[i] = ...; 508 ... = a[i]; 509 a[i+1] = ...; 510 where loads from the group interleave with the store. */ 511 if (!vect_preserves_scalar_order_p (DR_STMT (dra), DR_STMT (drb))) 512 { 513 if (dump_enabled_p ()) 514 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 515 "READ_WRITE dependence in interleaving.\n"); 516 return true; 517 } 518 519 if (loop->safelen < 2) 520 { 521 tree indicator = dr_zero_step_indicator (dra); 522 if (TREE_CODE (indicator) != INTEGER_CST) 523 vect_check_nonzero_value (loop_vinfo, indicator); 524 else if (integer_zerop (indicator)) 525 { 526 if (dump_enabled_p ()) 527 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 528 "access also has a zero step\n"); 529 return true; 530 } 531 } 532 continue; 533 } 534 535 if (dist > 0 && DDR_REVERSED_P (ddr)) 536 { 537 /* If DDR_REVERSED_P the order of the data-refs in DDR was 538 reversed (to make distance vector positive), and the actual 539 distance is negative. */ 540 if (dump_enabled_p ()) 541 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 542 "dependence distance negative.\n"); 543 /* Record a negative dependence distance to later limit the 544 amount of stmt copying / unrolling we can perform. 545 Only need to handle read-after-write dependence. */ 546 if (DR_IS_READ (drb) 547 && (STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) == 0 548 || STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) > (unsigned)dist)) 549 STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) = dist; 550 continue; 551 } 552 553 unsigned int abs_dist = abs (dist); 554 if (abs_dist >= 2 && abs_dist < *max_vf) 555 { 556 /* The dependence distance requires reduction of the maximal 557 vectorization factor. */ 558 *max_vf = abs (dist); 559 if (dump_enabled_p ()) 560 dump_printf_loc (MSG_NOTE, vect_location, 561 "adjusting maximal vectorization factor to %i\n", 562 *max_vf); 563 } 564 565 if (abs_dist >= *max_vf) 566 { 567 /* Dependence distance does not create dependence, as far as 568 vectorization is concerned, in this case. 
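   For example (illustrative): with VF = 4, a distance-4 dependence such as
     a[i] = ...;
     ... = a[i - 4];
   is harmless, because the loads of vector iteration K only use values
   stored by vector iteration K - 1, which has already completed.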
*/ 569 if (dump_enabled_p ()) 570 dump_printf_loc (MSG_NOTE, vect_location, 571 "dependence distance >= VF.\n"); 572 continue; 573 } 574 575 if (dump_enabled_p ()) 576 { 577 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 578 "not vectorized, possible dependence " 579 "between data-refs "); 580 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra)); 581 dump_printf (MSG_NOTE, " and "); 582 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb)); 583 dump_printf (MSG_NOTE, "\n"); 584 } 585 586 return true; 587 } 588 589 return false; 590 } 591 592 /* Function vect_analyze_data_ref_dependences. 593 594 Examine all the data references in the loop, and make sure there do not 595 exist any data dependences between them. Set *MAX_VF according to 596 the maximum vectorization factor the data dependences allow. */ 597 598 bool 599 vect_analyze_data_ref_dependences (loop_vec_info loop_vinfo, 600 unsigned int *max_vf) 601 { 602 unsigned int i; 603 struct data_dependence_relation *ddr; 604 605 if (dump_enabled_p ()) 606 dump_printf_loc (MSG_NOTE, vect_location, 607 "=== vect_analyze_data_ref_dependences ===\n"); 608 609 LOOP_VINFO_DDRS (loop_vinfo) 610 .create (LOOP_VINFO_DATAREFS (loop_vinfo).length () 611 * LOOP_VINFO_DATAREFS (loop_vinfo).length ()); 612 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = true; 613 /* We need read-read dependences to compute STMT_VINFO_SAME_ALIGN_REFS. */ 614 if (!compute_all_dependences (LOOP_VINFO_DATAREFS (loop_vinfo), 615 &LOOP_VINFO_DDRS (loop_vinfo), 616 LOOP_VINFO_LOOP_NEST (loop_vinfo), true)) 617 return false; 618 619 /* For epilogues we either have no aliases or alias versioning 620 was applied to original loop. Therefore we may just get max_vf 621 using VF of original loop. */ 622 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo)) 623 *max_vf = LOOP_VINFO_ORIG_MAX_VECT_FACTOR (loop_vinfo); 624 else 625 FOR_EACH_VEC_ELT (LOOP_VINFO_DDRS (loop_vinfo), i, ddr) 626 if (vect_analyze_data_ref_dependence (ddr, loop_vinfo, max_vf)) 627 return false; 628 629 return true; 630 } 631 632 633 /* Function vect_slp_analyze_data_ref_dependence. 634 635 Return TRUE if there (might) exist a dependence between a memory-reference 636 DRA and a memory-reference DRB. When versioning for alias may check a 637 dependence at run-time, return FALSE. Adjust *MAX_VF according to 638 the data dependence. */ 639 640 static bool 641 vect_slp_analyze_data_ref_dependence (struct data_dependence_relation *ddr) 642 { 643 struct data_reference *dra = DDR_A (ddr); 644 struct data_reference *drb = DDR_B (ddr); 645 646 /* We need to check dependences of statements marked as unvectorizable 647 as well, they still can prohibit vectorization. */ 648 649 /* Independent data accesses. */ 650 if (DDR_ARE_DEPENDENT (ddr) == chrec_known) 651 return false; 652 653 if (dra == drb) 654 return false; 655 656 /* Read-read is OK. */ 657 if (DR_IS_READ (dra) && DR_IS_READ (drb)) 658 return false; 659 660 /* If dra and drb are part of the same interleaving chain consider 661 them independent. */ 662 if (STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (DR_STMT (dra))) 663 && (GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (dra))) 664 == GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (drb))))) 665 return false; 666 667 /* Unknown data dependence. 
*/ 668 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know) 669 { 670 if (dump_enabled_p ()) 671 { 672 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 673 "can't determine dependence between "); 674 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra)); 675 dump_printf (MSG_MISSED_OPTIMIZATION, " and "); 676 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb)); 677 dump_printf (MSG_MISSED_OPTIMIZATION, "\n"); 678 } 679 } 680 else if (dump_enabled_p ()) 681 { 682 dump_printf_loc (MSG_NOTE, vect_location, 683 "determined dependence between "); 684 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra)); 685 dump_printf (MSG_NOTE, " and "); 686 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb)); 687 dump_printf (MSG_NOTE, "\n"); 688 } 689 690 return true; 691 } 692 693 694 /* Analyze dependences involved in the transform of SLP NODE. STORES 695 contain the vector of scalar stores of this instance if we are 696 disambiguating the loads. */ 697 698 static bool 699 vect_slp_analyze_node_dependences (slp_instance instance, slp_tree node, 700 vec<gimple *> stores, gimple *last_store) 701 { 702 /* This walks over all stmts involved in the SLP load/store done 703 in NODE verifying we can sink them up to the last stmt in the 704 group. */ 705 gimple *last_access = vect_find_last_scalar_stmt_in_slp (node); 706 for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k) 707 { 708 gimple *access = SLP_TREE_SCALAR_STMTS (node)[k]; 709 if (access == last_access) 710 continue; 711 data_reference *dr_a = STMT_VINFO_DATA_REF (vinfo_for_stmt (access)); 712 for (gimple_stmt_iterator gsi = gsi_for_stmt (access); 713 gsi_stmt (gsi) != last_access; gsi_next (&gsi)) 714 { 715 gimple *stmt = gsi_stmt (gsi); 716 if (! gimple_vuse (stmt) 717 || (DR_IS_READ (dr_a) && ! gimple_vdef (stmt))) 718 continue; 719 720 /* If we couldn't record a (single) data reference for this 721 stmt we have to give up. */ 722 /* ??? Here and below if dependence analysis fails we can resort 723 to the alias oracle which can handle more kinds of stmts. */ 724 data_reference *dr_b = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt)); 725 if (!dr_b) 726 return false; 727 728 bool dependent = false; 729 /* If we run into a store of this same instance (we've just 730 marked those) then delay dependence checking until we run 731 into the last store because this is where it will have 732 been sunk to (and we verify if we can do that as well). */ 733 if (gimple_visited_p (stmt)) 734 { 735 if (stmt != last_store) 736 continue; 737 unsigned i; 738 gimple *store; 739 FOR_EACH_VEC_ELT (stores, i, store) 740 { 741 data_reference *store_dr 742 = STMT_VINFO_DATA_REF (vinfo_for_stmt (store)); 743 ddr_p ddr = initialize_data_dependence_relation 744 (dr_a, store_dr, vNULL); 745 dependent = vect_slp_analyze_data_ref_dependence (ddr); 746 free_dependence_relation (ddr); 747 if (dependent) 748 break; 749 } 750 } 751 else 752 { 753 ddr_p ddr = initialize_data_dependence_relation (dr_a, 754 dr_b, vNULL); 755 dependent = vect_slp_analyze_data_ref_dependence (ddr); 756 free_dependence_relation (ddr); 757 } 758 if (dependent) 759 return false; 760 } 761 } 762 return true; 763 } 764 765 766 /* Function vect_analyze_data_ref_dependences. 767 768 Examine all the data references in the basic-block, and make sure there 769 do not exist any data dependences between them. Set *MAX_VF according to 770 the maximum vectorization factor the data dependences allow. 
*/ 771 772 bool 773 vect_slp_analyze_instance_dependence (slp_instance instance) 774 { 775 if (dump_enabled_p ()) 776 dump_printf_loc (MSG_NOTE, vect_location, 777 "=== vect_slp_analyze_instance_dependence ===\n"); 778 779 /* The stores of this instance are at the root of the SLP tree. */ 780 slp_tree store = SLP_INSTANCE_TREE (instance); 781 if (! STMT_VINFO_DATA_REF (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (store)[0]))) 782 store = NULL; 783 784 /* Verify we can sink stores to the vectorized stmt insert location. */ 785 gimple *last_store = NULL; 786 if (store) 787 { 788 if (! vect_slp_analyze_node_dependences (instance, store, vNULL, NULL)) 789 return false; 790 791 /* Mark stores in this instance and remember the last one. */ 792 last_store = vect_find_last_scalar_stmt_in_slp (store); 793 for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k) 794 gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k], true); 795 } 796 797 bool res = true; 798 799 /* Verify we can sink loads to the vectorized stmt insert location, 800 special-casing stores of this instance. */ 801 slp_tree load; 802 unsigned int i; 803 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load) 804 if (! vect_slp_analyze_node_dependences (instance, load, 805 store 806 ? SLP_TREE_SCALAR_STMTS (store) 807 : vNULL, last_store)) 808 { 809 res = false; 810 break; 811 } 812 813 /* Unset the visited flag. */ 814 if (store) 815 for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k) 816 gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k], false); 817 818 return res; 819 } 820 821 /* Record in VINFO the base alignment guarantee given by DRB. STMT is 822 the statement that contains DRB, which is useful for recording in the 823 dump file. */ 824 825 static void 826 vect_record_base_alignment (vec_info *vinfo, gimple *stmt, 827 innermost_loop_behavior *drb) 828 { 829 bool existed; 830 innermost_loop_behavior *&entry 831 = vinfo->base_alignments.get_or_insert (drb->base_address, &existed); 832 if (!existed || entry->base_alignment < drb->base_alignment) 833 { 834 entry = drb; 835 if (dump_enabled_p ()) 836 { 837 dump_printf_loc (MSG_NOTE, vect_location, 838 "recording new base alignment for "); 839 dump_generic_expr (MSG_NOTE, TDF_SLIM, drb->base_address); 840 dump_printf (MSG_NOTE, "\n"); 841 dump_printf_loc (MSG_NOTE, vect_location, 842 " alignment: %d\n", drb->base_alignment); 843 dump_printf_loc (MSG_NOTE, vect_location, 844 " misalignment: %d\n", drb->base_misalignment); 845 dump_printf_loc (MSG_NOTE, vect_location, 846 " based on: "); 847 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0); 848 } 849 } 850 } 851 852 /* If the region we're going to vectorize is reached, all unconditional 853 data references occur at least once. We can therefore pool the base 854 alignment guarantees from each unconditional reference. Do this by 855 going through all the data references in VINFO and checking whether 856 the containing statement makes the reference unconditionally. If so, 857 record the alignment of the base address in VINFO so that it can be 858 used for all other references with the same base. */ 859 860 void 861 vect_record_base_alignments (vec_info *vinfo) 862 { 863 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo); 864 struct loop *loop = loop_vinfo ? 
LOOP_VINFO_LOOP (loop_vinfo) : NULL; 865 data_reference *dr; 866 unsigned int i; 867 FOR_EACH_VEC_ELT (vinfo->datarefs, i, dr) 868 if (!DR_IS_CONDITIONAL_IN_STMT (dr)) 869 { 870 gimple *stmt = DR_STMT (dr); 871 vect_record_base_alignment (vinfo, stmt, &DR_INNERMOST (dr)); 872 873 /* If DR is nested in the loop that is being vectorized, we can also 874 record the alignment of the base wrt the outer loop. */ 875 if (loop && nested_in_vect_loop_p (loop, stmt)) 876 { 877 stmt_vec_info stmt_info = vinfo_for_stmt (stmt); 878 vect_record_base_alignment 879 (vinfo, stmt, &STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info)); 880 } 881 } 882 } 883 884 /* Return the target alignment for the vectorized form of DR. */ 885 886 static unsigned int 887 vect_calculate_target_alignment (struct data_reference *dr) 888 { 889 gimple *stmt = DR_STMT (dr); 890 stmt_vec_info stmt_info = vinfo_for_stmt (stmt); 891 tree vectype = STMT_VINFO_VECTYPE (stmt_info); 892 return targetm.vectorize.preferred_vector_alignment (vectype); 893 } 894 895 /* Function vect_compute_data_ref_alignment 896 897 Compute the misalignment of the data reference DR. 898 899 Output: 900 1. If during the misalignment computation it is found that the data reference 901 cannot be vectorized then false is returned. 902 2. DR_MISALIGNMENT (DR) is defined. 903 904 FOR NOW: No analysis is actually performed. Misalignment is calculated 905 only for trivial cases. TODO. */ 906 907 bool 908 vect_compute_data_ref_alignment (struct data_reference *dr) 909 { 910 gimple *stmt = DR_STMT (dr); 911 stmt_vec_info stmt_info = vinfo_for_stmt (stmt); 912 vec_base_alignments *base_alignments = &stmt_info->vinfo->base_alignments; 913 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info); 914 struct loop *loop = NULL; 915 tree ref = DR_REF (dr); 916 tree vectype = STMT_VINFO_VECTYPE (stmt_info); 917 918 if (dump_enabled_p ()) 919 dump_printf_loc (MSG_NOTE, vect_location, 920 "vect_compute_data_ref_alignment:\n"); 921 922 if (loop_vinfo) 923 loop = LOOP_VINFO_LOOP (loop_vinfo); 924 925 /* Initialize misalignment to unknown. */ 926 SET_DR_MISALIGNMENT (dr, DR_MISALIGNMENT_UNKNOWN); 927 928 innermost_loop_behavior *drb = vect_dr_behavior (dr); 929 bool step_preserves_misalignment_p; 930 931 unsigned HOST_WIDE_INT vector_alignment 932 = vect_calculate_target_alignment (dr) / BITS_PER_UNIT; 933 DR_TARGET_ALIGNMENT (dr) = vector_alignment; 934 935 /* No step for BB vectorization. */ 936 if (!loop) 937 { 938 gcc_assert (integer_zerop (drb->step)); 939 step_preserves_misalignment_p = true; 940 } 941 942 /* In case the dataref is in an inner-loop of the loop that is being 943 vectorized (LOOP), we use the base and misalignment information 944 relative to the outer-loop (LOOP). This is ok only if the misalignment 945 stays the same throughout the execution of the inner-loop, which is why 946 we have to check that the stride of the dataref in the inner-loop evenly 947 divides by the vector alignment. 
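   For example (illustrative, assuming a 16-byte vector alignment): for an
   access a[j][i] in an inner loop over j, the inner-loop step is
   sizeof (a[0]) bytes; if that step is a multiple of 16, the access has
   the same misalignment in every inner-loop iteration, so the information
   computed relative to the outer loop stays valid.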
*/ 948 else if (nested_in_vect_loop_p (loop, stmt)) 949 { 950 step_preserves_misalignment_p 951 = (DR_STEP_ALIGNMENT (dr) % vector_alignment) == 0; 952 953 if (dump_enabled_p ()) 954 { 955 if (step_preserves_misalignment_p) 956 dump_printf_loc (MSG_NOTE, vect_location, 957 "inner step divides the vector alignment.\n"); 958 else 959 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 960 "inner step doesn't divide the vector" 961 " alignment.\n"); 962 } 963 } 964 965 /* Similarly we can only use base and misalignment information relative to 966 an innermost loop if the misalignment stays the same throughout the 967 execution of the loop. As above, this is the case if the stride of 968 the dataref evenly divides by the alignment. */ 969 else 970 { 971 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo); 972 step_preserves_misalignment_p 973 = multiple_p (DR_STEP_ALIGNMENT (dr) * vf, vector_alignment); 974 975 if (!step_preserves_misalignment_p && dump_enabled_p ()) 976 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 977 "step doesn't divide the vector alignment.\n"); 978 } 979 980 unsigned int base_alignment = drb->base_alignment; 981 unsigned int base_misalignment = drb->base_misalignment; 982 983 /* Calculate the maximum of the pooled base address alignment and the 984 alignment that we can compute for DR itself. */ 985 innermost_loop_behavior **entry = base_alignments->get (drb->base_address); 986 if (entry && base_alignment < (*entry)->base_alignment) 987 { 988 base_alignment = (*entry)->base_alignment; 989 base_misalignment = (*entry)->base_misalignment; 990 } 991 992 if (drb->offset_alignment < vector_alignment 993 || !step_preserves_misalignment_p 994 /* We need to know whether the step wrt the vectorized loop is 995 negative when computing the starting misalignment below. */ 996 || TREE_CODE (drb->step) != INTEGER_CST) 997 { 998 if (dump_enabled_p ()) 999 { 1000 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 1001 "Unknown alignment for access: "); 1002 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref); 1003 dump_printf (MSG_MISSED_OPTIMIZATION, "\n"); 1004 } 1005 return true; 1006 } 1007 1008 if (base_alignment < vector_alignment) 1009 { 1010 unsigned int max_alignment; 1011 tree base = get_base_for_alignment (drb->base_address, &max_alignment); 1012 if (max_alignment < vector_alignment 1013 || !vect_can_force_dr_alignment_p (base, 1014 vector_alignment * BITS_PER_UNIT)) 1015 { 1016 if (dump_enabled_p ()) 1017 { 1018 dump_printf_loc (MSG_NOTE, vect_location, 1019 "can't force alignment of ref: "); 1020 dump_generic_expr (MSG_NOTE, TDF_SLIM, ref); 1021 dump_printf (MSG_NOTE, "\n"); 1022 } 1023 return true; 1024 } 1025 1026 /* Force the alignment of the decl. 1027 NOTE: This is the only change to the code we make during 1028 the analysis phase, before deciding to vectorize the loop. */ 1029 if (dump_enabled_p ()) 1030 { 1031 dump_printf_loc (MSG_NOTE, vect_location, "force alignment of "); 1032 dump_generic_expr (MSG_NOTE, TDF_SLIM, ref); 1033 dump_printf (MSG_NOTE, "\n"); 1034 } 1035 1036 DR_VECT_AUX (dr)->base_decl = base; 1037 DR_VECT_AUX (dr)->base_misaligned = true; 1038 base_misalignment = 0; 1039 } 1040 poly_int64 misalignment 1041 = base_misalignment + wi::to_poly_offset (drb->init).force_shwi (); 1042 1043 /* If this is a backward running DR then first access in the larger 1044 vectype actually is N-1 elements before the address in the DR. 1045 Adjust misalign accordingly. 
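   For example (illustrative): for a negative-step access with 4-byte
   elements and a 4-element vector, the vector load covers a[i - 3] .. a[i],
   i.e. it starts 12 bytes below the address recorded in the DR, and it is
   that lower address whose misalignment matters.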
*/ 1046 if (tree_int_cst_sgn (drb->step) < 0) 1047 /* PLUS because STEP is negative. */ 1048 misalignment += ((TYPE_VECTOR_SUBPARTS (vectype) - 1) 1049 * TREE_INT_CST_LOW (drb->step)); 1050 1051 unsigned int const_misalignment; 1052 if (!known_misalignment (misalignment, vector_alignment, 1053 &const_misalignment)) 1054 { 1055 if (dump_enabled_p ()) 1056 { 1057 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 1058 "Non-constant misalignment for access: "); 1059 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref); 1060 dump_printf (MSG_MISSED_OPTIMIZATION, "\n"); 1061 } 1062 return true; 1063 } 1064 1065 SET_DR_MISALIGNMENT (dr, const_misalignment); 1066 1067 if (dump_enabled_p ()) 1068 { 1069 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 1070 "misalign = %d bytes of ref ", DR_MISALIGNMENT (dr)); 1071 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref); 1072 dump_printf (MSG_MISSED_OPTIMIZATION, "\n"); 1073 } 1074 1075 return true; 1076 } 1077 1078 /* Function vect_update_misalignment_for_peel. 1079 Sets DR's misalignment 1080 - to 0 if it has the same alignment as DR_PEEL, 1081 - to the misalignment computed using NPEEL if DR's misalignment is known, 1082 - to -1 (unknown) otherwise. 1083 1084 DR - the data reference whose misalignment is to be adjusted. 1085 DR_PEEL - the data reference whose misalignment is being made 1086 zero in the vector loop by the peel. 1087 NPEEL - the number of iterations in the peel loop if the misalignment 1088 of DR_PEEL is known at compile time. */ 1089 1090 static void 1091 vect_update_misalignment_for_peel (struct data_reference *dr, 1092 struct data_reference *dr_peel, int npeel) 1093 { 1094 unsigned int i; 1095 vec<dr_p> same_aligned_drs; 1096 struct data_reference *current_dr; 1097 int dr_size = vect_get_scalar_dr_size (dr); 1098 int dr_peel_size = vect_get_scalar_dr_size (dr_peel); 1099 stmt_vec_info stmt_info = vinfo_for_stmt (DR_STMT (dr)); 1100 stmt_vec_info peel_stmt_info = vinfo_for_stmt (DR_STMT (dr_peel)); 1101 1102 /* For interleaved data accesses the step in the loop must be multiplied by 1103 the size of the interleaving group. */ 1104 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)) 1105 dr_size *= GROUP_SIZE (vinfo_for_stmt (GROUP_FIRST_ELEMENT (stmt_info))); 1106 if (STMT_VINFO_GROUPED_ACCESS (peel_stmt_info)) 1107 dr_peel_size *= GROUP_SIZE (peel_stmt_info); 1108 1109 /* It can be assumed that the data refs with the same alignment as dr_peel 1110 are aligned in the vector loop. */ 1111 same_aligned_drs 1112 = STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (DR_STMT (dr_peel))); 1113 FOR_EACH_VEC_ELT (same_aligned_drs, i, current_dr) 1114 { 1115 if (current_dr != dr) 1116 continue; 1117 gcc_assert (!known_alignment_for_access_p (dr) 1118 || !known_alignment_for_access_p (dr_peel) 1119 || (DR_MISALIGNMENT (dr) / dr_size 1120 == DR_MISALIGNMENT (dr_peel) / dr_peel_size)); 1121 SET_DR_MISALIGNMENT (dr, 0); 1122 return; 1123 } 1124 1125 if (known_alignment_for_access_p (dr) 1126 && known_alignment_for_access_p (dr_peel)) 1127 { 1128 bool negative = tree_int_cst_compare (DR_STEP (dr), size_zero_node) < 0; 1129 int misal = DR_MISALIGNMENT (dr); 1130 misal += negative ?
-npeel * dr_size : npeel * dr_size; 1131 misal &= DR_TARGET_ALIGNMENT (dr) - 1; 1132 SET_DR_MISALIGNMENT (dr, misal); 1133 return; 1134 } 1135 1136 if (dump_enabled_p ()) 1137 dump_printf_loc (MSG_NOTE, vect_location, "Setting misalignment " \ 1138 "to unknown (-1).\n"); 1139 SET_DR_MISALIGNMENT (dr, DR_MISALIGNMENT_UNKNOWN); 1140 } 1141 1142 1143 /* Function verify_data_ref_alignment 1144 1145 Return TRUE if DR can be handled with respect to alignment. */ 1146 1147 static bool 1148 verify_data_ref_alignment (data_reference_p dr) 1149 { 1150 enum dr_alignment_support supportable_dr_alignment 1151 = vect_supportable_dr_alignment (dr, false); 1152 if (!supportable_dr_alignment) 1153 { 1154 if (dump_enabled_p ()) 1155 { 1156 if (DR_IS_READ (dr)) 1157 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 1158 "not vectorized: unsupported unaligned load."); 1159 else 1160 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 1161 "not vectorized: unsupported unaligned " 1162 "store."); 1163 1164 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, 1165 DR_REF (dr)); 1166 dump_printf (MSG_MISSED_OPTIMIZATION, "\n"); 1167 } 1168 return false; 1169 } 1170 1171 if (supportable_dr_alignment != dr_aligned && dump_enabled_p ()) 1172 dump_printf_loc (MSG_NOTE, vect_location, 1173 "Vectorizing an unaligned access.\n"); 1174 1175 return true; 1176 } 1177 1178 /* Function vect_verify_datarefs_alignment 1179 1180 Return TRUE if all data references in the loop can be 1181 handled with respect to alignment. */ 1182 1183 bool 1184 vect_verify_datarefs_alignment (loop_vec_info vinfo) 1185 { 1186 vec<data_reference_p> datarefs = vinfo->datarefs; 1187 struct data_reference *dr; 1188 unsigned int i; 1189 1190 FOR_EACH_VEC_ELT (datarefs, i, dr) 1191 { 1192 gimple *stmt = DR_STMT (dr); 1193 stmt_vec_info stmt_info = vinfo_for_stmt (stmt); 1194 1195 if (!STMT_VINFO_RELEVANT_P (stmt_info)) 1196 continue; 1197 1198 /* For interleaving, only the alignment of the first access matters. */ 1199 if (STMT_VINFO_GROUPED_ACCESS (stmt_info) 1200 && GROUP_FIRST_ELEMENT (stmt_info) != stmt) 1201 continue; 1202 1203 /* Strided accesses perform only component accesses, alignment is 1204 irrelevant for them. */ 1205 if (STMT_VINFO_STRIDED_P (stmt_info) 1206 && !STMT_VINFO_GROUPED_ACCESS (stmt_info)) 1207 continue; 1208 1209 if (! verify_data_ref_alignment (dr)) 1210 return false; 1211 } 1212 1213 return true; 1214 } 1215 1216 /* Given an memory reference EXP return whether its alignment is less 1217 than its size. */ 1218 1219 static bool 1220 not_size_aligned (tree exp) 1221 { 1222 if (!tree_fits_uhwi_p (TYPE_SIZE (TREE_TYPE (exp)))) 1223 return true; 1224 1225 return (tree_to_uhwi (TYPE_SIZE (TREE_TYPE (exp))) 1226 > get_object_alignment (exp)); 1227 } 1228 1229 /* Function vector_alignment_reachable_p 1230 1231 Return true if vector alignment for DR is reachable by peeling 1232 a few loop iterations. Return false otherwise. */ 1233 1234 static bool 1235 vector_alignment_reachable_p (struct data_reference *dr) 1236 { 1237 gimple *stmt = DR_STMT (dr); 1238 stmt_vec_info stmt_info = vinfo_for_stmt (stmt); 1239 tree vectype = STMT_VINFO_VECTYPE (stmt_info); 1240 1241 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)) 1242 { 1243 /* For interleaved access we peel only if number of iterations in 1244 the prolog loop ({VF - misalignment}), is a multiple of the 1245 number of the interleaved accesses. */ 1246 int elem_size, mis_in_elements; 1247 1248 /* FORNOW: handle only known alignment. 
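   As a worked example of the rule above (illustrative): with an 8-element
   vector and a misalignment of 2 elements, the prolog loop runs
   8 - 2 = 6 scalar iterations; that is acceptable for an interleaving
   group of size 3 but not for a group of size 4, in which case peeling
   cannot reach alignment.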
*/ 1249 if (!known_alignment_for_access_p (dr)) 1250 return false; 1251 1252 poly_uint64 nelements = TYPE_VECTOR_SUBPARTS (vectype); 1253 poly_uint64 vector_size = GET_MODE_SIZE (TYPE_MODE (vectype)); 1254 elem_size = vector_element_size (vector_size, nelements); 1255 mis_in_elements = DR_MISALIGNMENT (dr) / elem_size; 1256 1257 if (!multiple_p (nelements - mis_in_elements, GROUP_SIZE (stmt_info))) 1258 return false; 1259 } 1260 1261 /* If misalignment is known at the compile time then allow peeling 1262 only if natural alignment is reachable through peeling. */ 1263 if (known_alignment_for_access_p (dr) && !aligned_access_p (dr)) 1264 { 1265 HOST_WIDE_INT elmsize = 1266 int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype))); 1267 if (dump_enabled_p ()) 1268 { 1269 dump_printf_loc (MSG_NOTE, vect_location, 1270 "data size =" HOST_WIDE_INT_PRINT_DEC, elmsize); 1271 dump_printf (MSG_NOTE, 1272 ". misalignment = %d.\n", DR_MISALIGNMENT (dr)); 1273 } 1274 if (DR_MISALIGNMENT (dr) % elmsize) 1275 { 1276 if (dump_enabled_p ()) 1277 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 1278 "data size does not divide the misalignment.\n"); 1279 return false; 1280 } 1281 } 1282 1283 if (!known_alignment_for_access_p (dr)) 1284 { 1285 tree type = TREE_TYPE (DR_REF (dr)); 1286 bool is_packed = not_size_aligned (DR_REF (dr)); 1287 if (dump_enabled_p ()) 1288 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 1289 "Unknown misalignment, %snaturally aligned\n", 1290 is_packed ? "not " : ""); 1291 return targetm.vectorize.vector_alignment_reachable (type, is_packed); 1292 } 1293 1294 return true; 1295 } 1296 1297 1298 /* Calculate the cost of the memory access represented by DR. */ 1299 1300 static void 1301 vect_get_data_access_cost (struct data_reference *dr, 1302 unsigned int *inside_cost, 1303 unsigned int *outside_cost, 1304 stmt_vector_for_cost *body_cost_vec) 1305 { 1306 gimple *stmt = DR_STMT (dr); 1307 stmt_vec_info stmt_info = vinfo_for_stmt (stmt); 1308 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info); 1309 int ncopies; 1310 1311 if (PURE_SLP_STMT (stmt_info)) 1312 ncopies = 1; 1313 else 1314 ncopies = vect_get_num_copies (loop_vinfo, STMT_VINFO_VECTYPE (stmt_info)); 1315 1316 if (DR_IS_READ (dr)) 1317 vect_get_load_cost (dr, ncopies, true, inside_cost, outside_cost, 1318 NULL, body_cost_vec, false); 1319 else 1320 vect_get_store_cost (dr, ncopies, inside_cost, body_cost_vec); 1321 1322 if (dump_enabled_p ()) 1323 dump_printf_loc (MSG_NOTE, vect_location, 1324 "vect_get_data_access_cost: inside_cost = %d, " 1325 "outside_cost = %d.\n", *inside_cost, *outside_cost); 1326 } 1327 1328 1329 typedef struct _vect_peel_info 1330 { 1331 struct data_reference *dr; 1332 int npeel; 1333 unsigned int count; 1334 } *vect_peel_info; 1335 1336 typedef struct _vect_peel_extended_info 1337 { 1338 struct _vect_peel_info peel_info; 1339 unsigned int inside_cost; 1340 unsigned int outside_cost; 1341 } *vect_peel_extended_info; 1342 1343 1344 /* Peeling hashtable helpers. 
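   Each entry describes one candidate peeling amount: NPEEL is the key,
   DR is a representative data reference that the peel would align, and
   COUNT is the number of references recorded for that amount (inflated
   by VECT_MAX_COST for references whose alignment would otherwise be
   unsupportable, see vect_peeling_hash_insert).  For example
   (illustrative), if peeling 1 iteration aligns two references and
   peeling 5 iterations aligns one, the table holds the entries
   {npeel = 1, count = 2} and {npeel = 5, count = 1}.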
*/ 1345 1346 struct peel_info_hasher : free_ptr_hash <_vect_peel_info> 1347 { 1348 static inline hashval_t hash (const _vect_peel_info *); 1349 static inline bool equal (const _vect_peel_info *, const _vect_peel_info *); 1350 }; 1351 1352 inline hashval_t 1353 peel_info_hasher::hash (const _vect_peel_info *peel_info) 1354 { 1355 return (hashval_t) peel_info->npeel; 1356 } 1357 1358 inline bool 1359 peel_info_hasher::equal (const _vect_peel_info *a, const _vect_peel_info *b) 1360 { 1361 return (a->npeel == b->npeel); 1362 } 1363 1364 1365 /* Insert DR into peeling hash table with NPEEL as key. */ 1366 1367 static void 1368 vect_peeling_hash_insert (hash_table<peel_info_hasher> *peeling_htab, 1369 loop_vec_info loop_vinfo, struct data_reference *dr, 1370 int npeel) 1371 { 1372 struct _vect_peel_info elem, *slot; 1373 _vect_peel_info **new_slot; 1374 bool supportable_dr_alignment = vect_supportable_dr_alignment (dr, true); 1375 1376 elem.npeel = npeel; 1377 slot = peeling_htab->find (&elem); 1378 if (slot) 1379 slot->count++; 1380 else 1381 { 1382 slot = XNEW (struct _vect_peel_info); 1383 slot->npeel = npeel; 1384 slot->dr = dr; 1385 slot->count = 1; 1386 new_slot = peeling_htab->find_slot (slot, INSERT); 1387 *new_slot = slot; 1388 } 1389 1390 if (!supportable_dr_alignment 1391 && unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo))) 1392 slot->count += VECT_MAX_COST; 1393 } 1394 1395 1396 /* Traverse peeling hash table to find peeling option that aligns maximum 1397 number of data accesses. */ 1398 1399 int 1400 vect_peeling_hash_get_most_frequent (_vect_peel_info **slot, 1401 _vect_peel_extended_info *max) 1402 { 1403 vect_peel_info elem = *slot; 1404 1405 if (elem->count > max->peel_info.count 1406 || (elem->count == max->peel_info.count 1407 && max->peel_info.npeel > elem->npeel)) 1408 { 1409 max->peel_info.npeel = elem->npeel; 1410 max->peel_info.count = elem->count; 1411 max->peel_info.dr = elem->dr; 1412 } 1413 1414 return 1; 1415 } 1416 1417 /* Get the costs of peeling NPEEL iterations checking data access costs 1418 for all data refs. If UNKNOWN_MISALIGNMENT is true, we assume DR0's 1419 misalignment will be zero after peeling. */ 1420 1421 static void 1422 vect_get_peeling_costs_all_drs (vec<data_reference_p> datarefs, 1423 struct data_reference *dr0, 1424 unsigned int *inside_cost, 1425 unsigned int *outside_cost, 1426 stmt_vector_for_cost *body_cost_vec, 1427 unsigned int npeel, 1428 bool unknown_misalignment) 1429 { 1430 unsigned i; 1431 data_reference *dr; 1432 1433 FOR_EACH_VEC_ELT (datarefs, i, dr) 1434 { 1435 gimple *stmt = DR_STMT (dr); 1436 stmt_vec_info stmt_info = vinfo_for_stmt (stmt); 1437 if (!STMT_VINFO_RELEVANT_P (stmt_info)) 1438 continue; 1439 1440 /* For interleaving, only the alignment of the first access 1441 matters. */ 1442 if (STMT_VINFO_GROUPED_ACCESS (stmt_info) 1443 && GROUP_FIRST_ELEMENT (stmt_info) != stmt) 1444 continue; 1445 1446 /* Strided accesses perform only component accesses, alignment is 1447 irrelevant for them. 
*/ 1448 if (STMT_VINFO_STRIDED_P (stmt_info) 1449 && !STMT_VINFO_GROUPED_ACCESS (stmt_info)) 1450 continue; 1451 1452 int save_misalignment; 1453 save_misalignment = DR_MISALIGNMENT (dr); 1454 if (npeel == 0) 1455 ; 1456 else if (unknown_misalignment && dr == dr0) 1457 SET_DR_MISALIGNMENT (dr, 0); 1458 else 1459 vect_update_misalignment_for_peel (dr, dr0, npeel); 1460 vect_get_data_access_cost (dr, inside_cost, outside_cost, 1461 body_cost_vec); 1462 SET_DR_MISALIGNMENT (dr, save_misalignment); 1463 } 1464 } 1465 1466 /* Traverse peeling hash table and calculate cost for each peeling option. 1467 Find the one with the lowest cost. */ 1468 1469 int 1470 vect_peeling_hash_get_lowest_cost (_vect_peel_info **slot, 1471 _vect_peel_extended_info *min) 1472 { 1473 vect_peel_info elem = *slot; 1474 int dummy; 1475 unsigned int inside_cost = 0, outside_cost = 0; 1476 gimple *stmt = DR_STMT (elem->dr); 1477 stmt_vec_info stmt_info = vinfo_for_stmt (stmt); 1478 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info); 1479 stmt_vector_for_cost prologue_cost_vec, body_cost_vec, 1480 epilogue_cost_vec; 1481 1482 prologue_cost_vec.create (2); 1483 body_cost_vec.create (2); 1484 epilogue_cost_vec.create (2); 1485 1486 vect_get_peeling_costs_all_drs (LOOP_VINFO_DATAREFS (loop_vinfo), 1487 elem->dr, &inside_cost, &outside_cost, 1488 &body_cost_vec, elem->npeel, false); 1489 1490 body_cost_vec.release (); 1491 1492 outside_cost += vect_get_known_peeling_cost 1493 (loop_vinfo, elem->npeel, &dummy, 1494 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), 1495 &prologue_cost_vec, &epilogue_cost_vec); 1496 1497 /* Prologue and epilogue costs are added to the target model later. 1498 These costs depend only on the scalar iteration cost, the 1499 number of peeling iterations finally chosen, and the number of 1500 misaligned statements. So discard the information found here. */ 1501 prologue_cost_vec.release (); 1502 epilogue_cost_vec.release (); 1503 1504 if (inside_cost < min->inside_cost 1505 || (inside_cost == min->inside_cost 1506 && outside_cost < min->outside_cost)) 1507 { 1508 min->inside_cost = inside_cost; 1509 min->outside_cost = outside_cost; 1510 min->peel_info.dr = elem->dr; 1511 min->peel_info.npeel = elem->npeel; 1512 min->peel_info.count = elem->count; 1513 } 1514 1515 return 1; 1516 } 1517 1518 1519 /* Choose best peeling option by traversing peeling hash table and either 1520 choosing an option with the lowest cost (if cost model is enabled) or the 1521 option that aligns as many accesses as possible. */ 1522 1523 static struct _vect_peel_extended_info 1524 vect_peeling_hash_choose_best_peeling (hash_table<peel_info_hasher> *peeling_htab, 1525 loop_vec_info loop_vinfo) 1526 { 1527 struct _vect_peel_extended_info res; 1528 1529 res.peel_info.dr = NULL; 1530 1531 if (!unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo))) 1532 { 1533 res.inside_cost = INT_MAX; 1534 res.outside_cost = INT_MAX; 1535 peeling_htab->traverse <_vect_peel_extended_info *, 1536 vect_peeling_hash_get_lowest_cost> (&res); 1537 } 1538 else 1539 { 1540 res.peel_info.count = 0; 1541 peeling_htab->traverse <_vect_peel_extended_info *, 1542 vect_peeling_hash_get_most_frequent> (&res); 1543 res.inside_cost = 0; 1544 res.outside_cost = 0; 1545 } 1546 1547 return res; 1548 } 1549 1550 /* Return true if the new peeling NPEEL is supported. 
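   That is, check that after peeling NPEEL iterations for DR0 every other
   relevant data reference is still left with an alignment that
   vect_supportable_dr_alignment accepts.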
*/ 1551 1552 static bool 1553 vect_peeling_supportable (loop_vec_info loop_vinfo, struct data_reference *dr0, 1554 unsigned npeel) 1555 { 1556 unsigned i; 1557 struct data_reference *dr = NULL; 1558 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo); 1559 gimple *stmt; 1560 stmt_vec_info stmt_info; 1561 enum dr_alignment_support supportable_dr_alignment; 1562 1563 /* Ensure that all data refs can be vectorized after the peel. */ 1564 FOR_EACH_VEC_ELT (datarefs, i, dr) 1565 { 1566 int save_misalignment; 1567 1568 if (dr == dr0) 1569 continue; 1570 1571 stmt = DR_STMT (dr); 1572 stmt_info = vinfo_for_stmt (stmt); 1573 /* For interleaving, only the alignment of the first access 1574 matters. */ 1575 if (STMT_VINFO_GROUPED_ACCESS (stmt_info) 1576 && GROUP_FIRST_ELEMENT (stmt_info) != stmt) 1577 continue; 1578 1579 /* Strided accesses perform only component accesses, alignment is 1580 irrelevant for them. */ 1581 if (STMT_VINFO_STRIDED_P (stmt_info) 1582 && !STMT_VINFO_GROUPED_ACCESS (stmt_info)) 1583 continue; 1584 1585 save_misalignment = DR_MISALIGNMENT (dr); 1586 vect_update_misalignment_for_peel (dr, dr0, npeel); 1587 supportable_dr_alignment = vect_supportable_dr_alignment (dr, false); 1588 SET_DR_MISALIGNMENT (dr, save_misalignment); 1589 1590 if (!supportable_dr_alignment) 1591 return false; 1592 } 1593 1594 return true; 1595 } 1596 1597 /* Function vect_enhance_data_refs_alignment 1598 1599 This pass will use loop versioning and loop peeling in order to enhance 1600 the alignment of data references in the loop. 1601 1602 FOR NOW: we assume that whatever versioning/peeling takes place, only the 1603 original loop is to be vectorized. Any other loops that are created by 1604 the transformations performed in this pass are not supposed to be 1605 vectorized. This restriction will be relaxed. 1606 1607 This pass will require a cost model to guide it in deciding whether to apply 1608 peeling, versioning, or a combination of the two. For example, the scheme that 1609 Intel uses when given a loop with several memory accesses is as follows: 1610 choose one memory access ('p') whose alignment you want to force by doing 1611 peeling. Then, either (1) generate a loop in which 'p' is aligned and all 1612 other accesses are not necessarily aligned, or (2) use loop versioning to 1613 generate one loop in which all accesses are aligned, and another loop in 1614 which only 'p' is necessarily aligned. 1615 1616 ("Automatic Intra-Register Vectorization for the Intel Architecture", 1617 Aart J.C. Bik, Milind Girkar, Paul M. Grey and Xinmin Tian, International 1618 Journal of Parallel Programming, Vol. 30, No. 2, April 2002.) 1619 1620 Devising a cost model is the most critical aspect of this work. It will 1621 guide us on which access to peel for, whether to use loop versioning, how 1622 many versions to create, etc. The cost model will probably consist of 1623 generic considerations as well as target specific considerations (on 1624 powerpc for example, misaligned stores are more painful than misaligned 1625 loads).
1626 1627 Here are the general steps involved in alignment enhancements: 1628 1629 -- original loop, before alignment analysis: 1630 for (i=0; i<N; i++){ 1631 x = q[i]; # DR_MISALIGNMENT(q) = unknown 1632 p[i] = y; # DR_MISALIGNMENT(p) = unknown 1633 } 1634 1635 -- After vect_compute_data_refs_alignment: 1636 for (i=0; i<N; i++){ 1637 x = q[i]; # DR_MISALIGNMENT(q) = 3 1638 p[i] = y; # DR_MISALIGNMENT(p) = unknown 1639 } 1640 1641 -- Possibility 1: we do loop versioning: 1642 if (p is aligned) { 1643 for (i=0; i<N; i++){ # loop 1A 1644 x = q[i]; # DR_MISALIGNMENT(q) = 3 1645 p[i] = y; # DR_MISALIGNMENT(p) = 0 1646 } 1647 } 1648 else { 1649 for (i=0; i<N; i++){ # loop 1B 1650 x = q[i]; # DR_MISALIGNMENT(q) = 3 1651 p[i] = y; # DR_MISALIGNMENT(p) = unaligned 1652 } 1653 } 1654 1655 -- Possibility 2: we do loop peeling: 1656 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized). 1657 x = q[i]; 1658 p[i] = y; 1659 } 1660 for (i = 3; i < N; i++){ # loop 2A 1661 x = q[i]; # DR_MISALIGNMENT(q) = 0 1662 p[i] = y; # DR_MISALIGNMENT(p) = unknown 1663 } 1664 1665 -- Possibility 3: combination of loop peeling and versioning: 1666 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized). 1667 x = q[i]; 1668 p[i] = y; 1669 } 1670 if (p is aligned) { 1671 for (i = 3; i<N; i++){ # loop 3A 1672 x = q[i]; # DR_MISALIGNMENT(q) = 0 1673 p[i] = y; # DR_MISALIGNMENT(p) = 0 1674 } 1675 } 1676 else { 1677 for (i = 3; i<N; i++){ # loop 3B 1678 x = q[i]; # DR_MISALIGNMENT(q) = 0 1679 p[i] = y; # DR_MISALIGNMENT(p) = unaligned 1680 } 1681 } 1682 1683 These loops are later passed to loop_transform to be vectorized. The 1684 vectorizer will use the alignment information to guide the transformation 1685 (whether to generate regular loads/stores, or with special handling for 1686 misalignment). */ 1687 1688 bool 1689 vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo) 1690 { 1691 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo); 1692 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo); 1693 enum dr_alignment_support supportable_dr_alignment; 1694 struct data_reference *dr0 = NULL, *first_store = NULL; 1695 struct data_reference *dr; 1696 unsigned int i, j; 1697 bool do_peeling = false; 1698 bool do_versioning = false; 1699 bool stat; 1700 gimple *stmt; 1701 stmt_vec_info stmt_info; 1702 unsigned int npeel = 0; 1703 bool one_misalignment_known = false; 1704 bool one_misalignment_unknown = false; 1705 bool one_dr_unsupportable = false; 1706 struct data_reference *unsupportable_dr = NULL; 1707 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo); 1708 unsigned possible_npeel_number = 1; 1709 tree vectype; 1710 unsigned int mis, same_align_drs_max = 0; 1711 hash_table<peel_info_hasher> peeling_htab (1); 1712 1713 if (dump_enabled_p ()) 1714 dump_printf_loc (MSG_NOTE, vect_location, 1715 "=== vect_enhance_data_refs_alignment ===\n"); 1716 1717 /* Reset data so we can safely be called multiple times. */ 1718 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0); 1719 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = 0; 1720 1721 /* While cost model enhancements are expected in the future, the high level 1722 view of the code at this time is as follows: 1723 1724 A) If there is a misaligned access then see if peeling to align 1725 this access can make all data references satisfy 1726 vect_supportable_dr_alignment. If so, update data structures 1727 as needed and return true. 
1728
1729      B) If peeling wasn't possible and there is a data reference with an
1730         unknown misalignment that does not satisfy vect_supportable_dr_alignment
1731         then see if loop versioning checks can be used to make all data
1732         references satisfy vect_supportable_dr_alignment.  If so, update
1733         data structures as needed and return true.
1734
1735      C) If neither peeling nor versioning were successful then return false if
1736         any data reference does not satisfy vect_supportable_dr_alignment.
1737
1738      D) Return true (all data references satisfy vect_supportable_dr_alignment).
1739
1740      Note, Possibility 3 above (which is peeling and versioning together) is not
1741      being done at this time.  */
1742
1743   /* (1) Peeling to force alignment.  */
1744
1745   /* (1.1) Decide whether to perform peeling, and how many iterations to peel:
1746      Considerations:
1747      + How many accesses will become aligned due to the peeling
1748      - How many accesses will become unaligned due to the peeling,
1749        and the cost of misaligned accesses.
1750      - The cost of peeling (the extra runtime checks, the increase
1751        in code size).  */
1752
1753   FOR_EACH_VEC_ELT (datarefs, i, dr)
1754     {
1755       stmt = DR_STMT (dr);
1756       stmt_info = vinfo_for_stmt (stmt);
1757
1758       if (!STMT_VINFO_RELEVANT_P (stmt_info))
1759         continue;
1760
1761       /* For interleaving, only the alignment of the first access
1762          matters.  */
1763       if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1764           && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1765         continue;
1766
1767       /* For invariant accesses there is nothing to enhance.  */
1768       if (integer_zerop (DR_STEP (dr)))
1769         continue;
1770
1771       /* Strided accesses perform only component accesses, alignment is
1772          irrelevant for them.  */
1773       if (STMT_VINFO_STRIDED_P (stmt_info)
1774           && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1775         continue;
1776
1777       supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);
1778       do_peeling = vector_alignment_reachable_p (dr);
1779       if (do_peeling)
1780         {
1781           if (known_alignment_for_access_p (dr))
1782             {
1783               unsigned int npeel_tmp = 0;
1784               bool negative = tree_int_cst_compare (DR_STEP (dr),
1785                                                     size_zero_node) < 0;
1786
1787               vectype = STMT_VINFO_VECTYPE (stmt_info);
1788               unsigned int target_align = DR_TARGET_ALIGNMENT (dr);
1789               unsigned int dr_size = vect_get_scalar_dr_size (dr);
1790               mis = (negative ? DR_MISALIGNMENT (dr) : -DR_MISALIGNMENT (dr));
1791               if (DR_MISALIGNMENT (dr) != 0)
1792                 npeel_tmp = (mis & (target_align - 1)) / dr_size;
1793
1794               /* For multiple types, it is possible that the bigger type access
1795                  will have more than one peeling option.  E.g., a loop with two
1796                  types: one of size (vector size / 4), and the other one of
1797                  size (vector size / 8).  The vectorization factor will be 8.
1798                  If both accesses are misaligned by 3, the first one needs one
1799                  scalar iteration to be aligned, and the second one needs 5.
1800                  But the first one will be aligned also by peeling 5 scalar
1801                  iterations, and in that case both accesses will be aligned.
1802                  Hence, in addition to the immediate peeling amount, we also
1803                  want to try adding a full vector size, as long as we don't
1804                  exceed the vectorization factor.
1805                  We do this automatically for the cost model, since we
1806                  calculate the cost for every peeling option.  */
1807               if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1808                 {
1809                   poly_uint64 nscalars = (STMT_SLP_TYPE (stmt_info)
1810                                           ?
vf * GROUP_SIZE (stmt_info) : vf); 1811 possible_npeel_number 1812 = vect_get_num_vectors (nscalars, vectype); 1813 1814 /* NPEEL_TMP is 0 when there is no misalignment, but also 1815 allow peeling NELEMENTS. */ 1816 if (DR_MISALIGNMENT (dr) == 0) 1817 possible_npeel_number++; 1818 } 1819 1820 /* Save info about DR in the hash table. Also include peeling 1821 amounts according to the explanation above. */ 1822 for (j = 0; j < possible_npeel_number; j++) 1823 { 1824 vect_peeling_hash_insert (&peeling_htab, loop_vinfo, 1825 dr, npeel_tmp); 1826 npeel_tmp += target_align / dr_size; 1827 } 1828 1829 one_misalignment_known = true; 1830 } 1831 else 1832 { 1833 /* If we don't know any misalignment values, we prefer 1834 peeling for data-ref that has the maximum number of data-refs 1835 with the same alignment, unless the target prefers to align 1836 stores over load. */ 1837 unsigned same_align_drs 1838 = STMT_VINFO_SAME_ALIGN_REFS (stmt_info).length (); 1839 if (!dr0 1840 || same_align_drs_max < same_align_drs) 1841 { 1842 same_align_drs_max = same_align_drs; 1843 dr0 = dr; 1844 } 1845 /* For data-refs with the same number of related 1846 accesses prefer the one where the misalign 1847 computation will be invariant in the outermost loop. */ 1848 else if (same_align_drs_max == same_align_drs) 1849 { 1850 struct loop *ivloop0, *ivloop; 1851 ivloop0 = outermost_invariant_loop_for_expr 1852 (loop, DR_BASE_ADDRESS (dr0)); 1853 ivloop = outermost_invariant_loop_for_expr 1854 (loop, DR_BASE_ADDRESS (dr)); 1855 if ((ivloop && !ivloop0) 1856 || (ivloop && ivloop0 1857 && flow_loop_nested_p (ivloop, ivloop0))) 1858 dr0 = dr; 1859 } 1860 1861 one_misalignment_unknown = true; 1862 1863 /* Check for data refs with unsupportable alignment that 1864 can be peeled. */ 1865 if (!supportable_dr_alignment) 1866 { 1867 one_dr_unsupportable = true; 1868 unsupportable_dr = dr; 1869 } 1870 1871 if (!first_store && DR_IS_WRITE (dr)) 1872 first_store = dr; 1873 } 1874 } 1875 else 1876 { 1877 if (!aligned_access_p (dr)) 1878 { 1879 if (dump_enabled_p ()) 1880 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 1881 "vector alignment may not be reachable\n"); 1882 break; 1883 } 1884 } 1885 } 1886 1887 /* Check if we can possibly peel the loop. */ 1888 if (!vect_can_advance_ivs_p (loop_vinfo) 1889 || !slpeel_can_duplicate_loop_p (loop, single_exit (loop)) 1890 || loop->inner) 1891 do_peeling = false; 1892 1893 struct _vect_peel_extended_info peel_for_known_alignment; 1894 struct _vect_peel_extended_info peel_for_unknown_alignment; 1895 struct _vect_peel_extended_info best_peel; 1896 1897 peel_for_unknown_alignment.inside_cost = INT_MAX; 1898 peel_for_unknown_alignment.outside_cost = INT_MAX; 1899 peel_for_unknown_alignment.peel_info.count = 0; 1900 1901 if (do_peeling 1902 && one_misalignment_unknown) 1903 { 1904 /* Check if the target requires to prefer stores over loads, i.e., if 1905 misaligned stores are more expensive than misaligned loads (taking 1906 drs with same alignment into account). 
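     As an illustrative sketch only (the numbers here are invented; real
     values come from the target's cost hooks): if peeling for the first
     unaligned store gives inside/outside costs of 8/2 while peeling for the
     current DR0 load gives 10/2, the store is cheaper to fix up, so DR0 is
     switched to FIRST_STORE below.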
*/ 1907 unsigned int load_inside_cost = 0; 1908 unsigned int load_outside_cost = 0; 1909 unsigned int store_inside_cost = 0; 1910 unsigned int store_outside_cost = 0; 1911 unsigned int estimated_npeels = vect_vf_for_cost (loop_vinfo) / 2; 1912 1913 stmt_vector_for_cost dummy; 1914 dummy.create (2); 1915 vect_get_peeling_costs_all_drs (datarefs, dr0, 1916 &load_inside_cost, 1917 &load_outside_cost, 1918 &dummy, estimated_npeels, true); 1919 dummy.release (); 1920 1921 if (first_store) 1922 { 1923 dummy.create (2); 1924 vect_get_peeling_costs_all_drs (datarefs, first_store, 1925 &store_inside_cost, 1926 &store_outside_cost, 1927 &dummy, estimated_npeels, true); 1928 dummy.release (); 1929 } 1930 else 1931 { 1932 store_inside_cost = INT_MAX; 1933 store_outside_cost = INT_MAX; 1934 } 1935 1936 if (load_inside_cost > store_inside_cost 1937 || (load_inside_cost == store_inside_cost 1938 && load_outside_cost > store_outside_cost)) 1939 { 1940 dr0 = first_store; 1941 peel_for_unknown_alignment.inside_cost = store_inside_cost; 1942 peel_for_unknown_alignment.outside_cost = store_outside_cost; 1943 } 1944 else 1945 { 1946 peel_for_unknown_alignment.inside_cost = load_inside_cost; 1947 peel_for_unknown_alignment.outside_cost = load_outside_cost; 1948 } 1949 1950 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec; 1951 prologue_cost_vec.create (2); 1952 epilogue_cost_vec.create (2); 1953 1954 int dummy2; 1955 peel_for_unknown_alignment.outside_cost += vect_get_known_peeling_cost 1956 (loop_vinfo, estimated_npeels, &dummy2, 1957 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), 1958 &prologue_cost_vec, &epilogue_cost_vec); 1959 1960 prologue_cost_vec.release (); 1961 epilogue_cost_vec.release (); 1962 1963 peel_for_unknown_alignment.peel_info.count = 1 1964 + STMT_VINFO_SAME_ALIGN_REFS 1965 (vinfo_for_stmt (DR_STMT (dr0))).length (); 1966 } 1967 1968 peel_for_unknown_alignment.peel_info.npeel = 0; 1969 peel_for_unknown_alignment.peel_info.dr = dr0; 1970 1971 best_peel = peel_for_unknown_alignment; 1972 1973 peel_for_known_alignment.inside_cost = INT_MAX; 1974 peel_for_known_alignment.outside_cost = INT_MAX; 1975 peel_for_known_alignment.peel_info.count = 0; 1976 peel_for_known_alignment.peel_info.dr = NULL; 1977 1978 if (do_peeling && one_misalignment_known) 1979 { 1980 /* Peeling is possible, but there is no data access that is not supported 1981 unless aligned. So we try to choose the best possible peeling from 1982 the hash table. */ 1983 peel_for_known_alignment = vect_peeling_hash_choose_best_peeling 1984 (&peeling_htab, loop_vinfo); 1985 } 1986 1987 /* Compare costs of peeling for known and unknown alignment. */ 1988 if (peel_for_known_alignment.peel_info.dr != NULL 1989 && peel_for_unknown_alignment.inside_cost 1990 >= peel_for_known_alignment.inside_cost) 1991 { 1992 best_peel = peel_for_known_alignment; 1993 1994 /* If the best peeling for known alignment has NPEEL == 0, perform no 1995 peeling at all except if there is an unsupportable dr that we can 1996 align. */ 1997 if (best_peel.peel_info.npeel == 0 && !one_dr_unsupportable) 1998 do_peeling = false; 1999 } 2000 2001 /* If there is an unsupportable data ref, prefer this over all choices so far 2002 since we'd have to discard a chosen peeling except when it accidentally 2003 aligned the unsupportable data ref. */ 2004 if (one_dr_unsupportable) 2005 dr0 = unsupportable_dr; 2006 else if (do_peeling) 2007 { 2008 /* Calculate the penalty for no peeling, i.e. leaving everything as-is. 2009 TODO: Use nopeel_outside_cost or get rid of it? 
*/ 2010 unsigned nopeel_inside_cost = 0; 2011 unsigned nopeel_outside_cost = 0; 2012 2013 stmt_vector_for_cost dummy; 2014 dummy.create (2); 2015 vect_get_peeling_costs_all_drs (datarefs, NULL, &nopeel_inside_cost, 2016 &nopeel_outside_cost, &dummy, 0, false); 2017 dummy.release (); 2018 2019 /* Add epilogue costs. As we do not peel for alignment here, no prologue 2020 costs will be recorded. */ 2021 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec; 2022 prologue_cost_vec.create (2); 2023 epilogue_cost_vec.create (2); 2024 2025 int dummy2; 2026 nopeel_outside_cost += vect_get_known_peeling_cost 2027 (loop_vinfo, 0, &dummy2, 2028 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), 2029 &prologue_cost_vec, &epilogue_cost_vec); 2030 2031 prologue_cost_vec.release (); 2032 epilogue_cost_vec.release (); 2033 2034 npeel = best_peel.peel_info.npeel; 2035 dr0 = best_peel.peel_info.dr; 2036 2037 /* If no peeling is not more expensive than the best peeling we 2038 have so far, don't perform any peeling. */ 2039 if (nopeel_inside_cost <= best_peel.inside_cost) 2040 do_peeling = false; 2041 } 2042 2043 if (do_peeling) 2044 { 2045 stmt = DR_STMT (dr0); 2046 stmt_info = vinfo_for_stmt (stmt); 2047 vectype = STMT_VINFO_VECTYPE (stmt_info); 2048 2049 if (known_alignment_for_access_p (dr0)) 2050 { 2051 bool negative = tree_int_cst_compare (DR_STEP (dr0), 2052 size_zero_node) < 0; 2053 if (!npeel) 2054 { 2055 /* Since it's known at compile time, compute the number of 2056 iterations in the peeled loop (the peeling factor) for use in 2057 updating DR_MISALIGNMENT values. The peeling factor is the 2058 vectorization factor minus the misalignment as an element 2059 count. */ 2060 mis = negative ? DR_MISALIGNMENT (dr0) : -DR_MISALIGNMENT (dr0); 2061 unsigned int target_align = DR_TARGET_ALIGNMENT (dr0); 2062 npeel = ((mis & (target_align - 1)) 2063 / vect_get_scalar_dr_size (dr0)); 2064 } 2065 2066 /* For interleaved data access every iteration accesses all the 2067 members of the group, therefore we divide the number of iterations 2068 by the group size. */ 2069 stmt_info = vinfo_for_stmt (DR_STMT (dr0)); 2070 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)) 2071 npeel /= GROUP_SIZE (stmt_info); 2072 2073 if (dump_enabled_p ()) 2074 dump_printf_loc (MSG_NOTE, vect_location, 2075 "Try peeling by %d\n", npeel); 2076 } 2077 2078 /* Ensure that all datarefs can be vectorized after the peel. */ 2079 if (!vect_peeling_supportable (loop_vinfo, dr0, npeel)) 2080 do_peeling = false; 2081 2082 /* Check if all datarefs are supportable and log. */ 2083 if (do_peeling && known_alignment_for_access_p (dr0) && npeel == 0) 2084 { 2085 stat = vect_verify_datarefs_alignment (loop_vinfo); 2086 if (!stat) 2087 do_peeling = false; 2088 else 2089 return stat; 2090 } 2091 2092 /* Cost model #1 - honor --param vect-max-peeling-for-alignment. 
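     As an illustrative example of this knob (a hypothetical command line,
     not taken from this file), the amount of peeling the cost model may
     accept can be capped by the user with something like

       gcc -O3 -ftree-vectorize --param vect-max-peeling-for-alignment=4 foo.c

     where small values limit the number of scalar prologue iterations and 0
     effectively disables peeling for alignment.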
*/ 2093 if (do_peeling) 2094 { 2095 unsigned max_allowed_peel 2096 = PARAM_VALUE (PARAM_VECT_MAX_PEELING_FOR_ALIGNMENT); 2097 if (max_allowed_peel != (unsigned)-1) 2098 { 2099 unsigned max_peel = npeel; 2100 if (max_peel == 0) 2101 { 2102 unsigned int target_align = DR_TARGET_ALIGNMENT (dr0); 2103 max_peel = target_align / vect_get_scalar_dr_size (dr0) - 1; 2104 } 2105 if (max_peel > max_allowed_peel) 2106 { 2107 do_peeling = false; 2108 if (dump_enabled_p ()) 2109 dump_printf_loc (MSG_NOTE, vect_location, 2110 "Disable peeling, max peels reached: %d\n", max_peel); 2111 } 2112 } 2113 } 2114 2115 /* Cost model #2 - if peeling may result in a remaining loop not 2116 iterating enough to be vectorized then do not peel. Since this 2117 is a cost heuristic rather than a correctness decision, use the 2118 most likely runtime value for variable vectorization factors. */ 2119 if (do_peeling 2120 && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)) 2121 { 2122 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo); 2123 unsigned int max_peel = npeel == 0 ? assumed_vf - 1 : npeel; 2124 if ((unsigned HOST_WIDE_INT) LOOP_VINFO_INT_NITERS (loop_vinfo) 2125 < assumed_vf + max_peel) 2126 do_peeling = false; 2127 } 2128 2129 if (do_peeling) 2130 { 2131 /* (1.2) Update the DR_MISALIGNMENT of each data reference DR_i. 2132 If the misalignment of DR_i is identical to that of dr0 then set 2133 DR_MISALIGNMENT (DR_i) to zero. If the misalignment of DR_i and 2134 dr0 are known at compile time then increment DR_MISALIGNMENT (DR_i) 2135 by the peeling factor times the element size of DR_i (MOD the 2136 vectorization factor times the size). Otherwise, the 2137 misalignment of DR_i must be set to unknown. */ 2138 FOR_EACH_VEC_ELT (datarefs, i, dr) 2139 if (dr != dr0) 2140 { 2141 /* Strided accesses perform only component accesses, alignment 2142 is irrelevant for them. */ 2143 stmt_info = vinfo_for_stmt (DR_STMT (dr)); 2144 if (STMT_VINFO_STRIDED_P (stmt_info) 2145 && !STMT_VINFO_GROUPED_ACCESS (stmt_info)) 2146 continue; 2147 2148 vect_update_misalignment_for_peel (dr, dr0, npeel); 2149 } 2150 2151 LOOP_VINFO_UNALIGNED_DR (loop_vinfo) = dr0; 2152 if (npeel) 2153 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = npeel; 2154 else 2155 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) 2156 = DR_MISALIGNMENT (dr0); 2157 SET_DR_MISALIGNMENT (dr0, 0); 2158 if (dump_enabled_p ()) 2159 { 2160 dump_printf_loc (MSG_NOTE, vect_location, 2161 "Alignment of access forced using peeling.\n"); 2162 dump_printf_loc (MSG_NOTE, vect_location, 2163 "Peeling for alignment will be applied.\n"); 2164 } 2165 2166 /* The inside-loop cost will be accounted for in vectorizable_load 2167 and vectorizable_store correctly with adjusted alignments. 2168 Drop the body_cst_vec on the floor here. */ 2169 stat = vect_verify_datarefs_alignment (loop_vinfo); 2170 gcc_assert (stat); 2171 return stat; 2172 } 2173 } 2174 2175 /* (2) Versioning to force alignment. */ 2176 2177 /* Try versioning if: 2178 1) optimize loop for speed 2179 2) there is at least one unsupported misaligned data ref with an unknown 2180 misalignment, and 2181 3) all misaligned data refs with a known misalignment are supported, and 2182 4) the number of runtime alignment checks is within reason. 
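     As a conceptual sketch (names are illustrative; the real mask and
     statement list are built below from LOOP_VINFO_PTR_MASK and
     LOOP_VINFO_MAY_MISALIGN_STMTS), versioning a loop that accesses 'p' and
     'q' with a 16-byte vector mode produces:

       if (((((uintptr_t) p) | ((uintptr_t) q)) & 15) == 0)
         {
           # loop version in which both accesses are known to be aligned
         }
       else
         {
           # original (unvectorized) loop
         }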
*/ 2183 2184 do_versioning = 2185 optimize_loop_nest_for_speed_p (loop) 2186 && (!loop->inner); /* FORNOW */ 2187 2188 if (do_versioning) 2189 { 2190 FOR_EACH_VEC_ELT (datarefs, i, dr) 2191 { 2192 stmt = DR_STMT (dr); 2193 stmt_info = vinfo_for_stmt (stmt); 2194 2195 /* For interleaving, only the alignment of the first access 2196 matters. */ 2197 if (aligned_access_p (dr) 2198 || (STMT_VINFO_GROUPED_ACCESS (stmt_info) 2199 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)) 2200 continue; 2201 2202 if (STMT_VINFO_STRIDED_P (stmt_info)) 2203 { 2204 /* Strided loads perform only component accesses, alignment is 2205 irrelevant for them. */ 2206 if (!STMT_VINFO_GROUPED_ACCESS (stmt_info)) 2207 continue; 2208 do_versioning = false; 2209 break; 2210 } 2211 2212 supportable_dr_alignment = vect_supportable_dr_alignment (dr, false); 2213 2214 if (!supportable_dr_alignment) 2215 { 2216 gimple *stmt; 2217 int mask; 2218 tree vectype; 2219 2220 if (known_alignment_for_access_p (dr) 2221 || LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length () 2222 >= (unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIGNMENT_CHECKS)) 2223 { 2224 do_versioning = false; 2225 break; 2226 } 2227 2228 stmt = DR_STMT (dr); 2229 vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt)); 2230 gcc_assert (vectype); 2231 2232 /* At present we don't support versioning for alignment 2233 with variable VF, since there's no guarantee that the 2234 VF is a power of two. We could relax this if we added 2235 a way of enforcing a power-of-two size. */ 2236 unsigned HOST_WIDE_INT size; 2237 if (!GET_MODE_SIZE (TYPE_MODE (vectype)).is_constant (&size)) 2238 { 2239 do_versioning = false; 2240 break; 2241 } 2242 2243 /* The rightmost bits of an aligned address must be zeros. 2244 Construct the mask needed for this test. For example, 2245 GET_MODE_SIZE for the vector mode V4SI is 16 bytes so the 2246 mask must be 15 = 0xf. */ 2247 mask = size - 1; 2248 2249 /* FORNOW: use the same mask to test all potentially unaligned 2250 references in the loop. The vectorizer currently supports 2251 a single vector size, see the reference to 2252 GET_MODE_NUNITS (TYPE_MODE (vectype)) where the 2253 vectorization factor is computed. */ 2254 gcc_assert (!LOOP_VINFO_PTR_MASK (loop_vinfo) 2255 || LOOP_VINFO_PTR_MASK (loop_vinfo) == mask); 2256 LOOP_VINFO_PTR_MASK (loop_vinfo) = mask; 2257 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).safe_push ( 2258 DR_STMT (dr)); 2259 } 2260 } 2261 2262 /* Versioning requires at least one misaligned data reference. */ 2263 if (!LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)) 2264 do_versioning = false; 2265 else if (!do_versioning) 2266 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0); 2267 } 2268 2269 if (do_versioning) 2270 { 2271 vec<gimple *> may_misalign_stmts 2272 = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo); 2273 gimple *stmt; 2274 2275 /* It can now be assumed that the data references in the statements 2276 in LOOP_VINFO_MAY_MISALIGN_STMTS will be aligned in the version 2277 of the loop being vectorized. 
*/ 2278 FOR_EACH_VEC_ELT (may_misalign_stmts, i, stmt) 2279 { 2280 stmt_vec_info stmt_info = vinfo_for_stmt (stmt); 2281 dr = STMT_VINFO_DATA_REF (stmt_info); 2282 SET_DR_MISALIGNMENT (dr, 0); 2283 if (dump_enabled_p ()) 2284 dump_printf_loc (MSG_NOTE, vect_location, 2285 "Alignment of access forced using versioning.\n"); 2286 } 2287 2288 if (dump_enabled_p ()) 2289 dump_printf_loc (MSG_NOTE, vect_location, 2290 "Versioning for alignment will be applied.\n"); 2291 2292 /* Peeling and versioning can't be done together at this time. */ 2293 gcc_assert (! (do_peeling && do_versioning)); 2294 2295 stat = vect_verify_datarefs_alignment (loop_vinfo); 2296 gcc_assert (stat); 2297 return stat; 2298 } 2299 2300 /* This point is reached if neither peeling nor versioning is being done. */ 2301 gcc_assert (! (do_peeling || do_versioning)); 2302 2303 stat = vect_verify_datarefs_alignment (loop_vinfo); 2304 return stat; 2305 } 2306 2307 2308 /* Function vect_find_same_alignment_drs. 2309 2310 Update group and alignment relations according to the chosen 2311 vectorization factor. */ 2312 2313 static void 2314 vect_find_same_alignment_drs (struct data_dependence_relation *ddr) 2315 { 2316 struct data_reference *dra = DDR_A (ddr); 2317 struct data_reference *drb = DDR_B (ddr); 2318 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra)); 2319 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb)); 2320 2321 if (DDR_ARE_DEPENDENT (ddr) == chrec_known) 2322 return; 2323 2324 if (dra == drb) 2325 return; 2326 2327 if (!operand_equal_p (DR_BASE_ADDRESS (dra), DR_BASE_ADDRESS (drb), 0) 2328 || !operand_equal_p (DR_OFFSET (dra), DR_OFFSET (drb), 0) 2329 || !operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0)) 2330 return; 2331 2332 /* Two references with distance zero have the same alignment. */ 2333 poly_offset_int diff = (wi::to_poly_offset (DR_INIT (dra)) 2334 - wi::to_poly_offset (DR_INIT (drb))); 2335 if (maybe_ne (diff, 0)) 2336 { 2337 /* Get the wider of the two alignments. */ 2338 unsigned int align_a = (vect_calculate_target_alignment (dra) 2339 / BITS_PER_UNIT); 2340 unsigned int align_b = (vect_calculate_target_alignment (drb) 2341 / BITS_PER_UNIT); 2342 unsigned int max_align = MAX (align_a, align_b); 2343 2344 /* Require the gap to be a multiple of the larger vector alignment. */ 2345 if (!multiple_p (diff, max_align)) 2346 return; 2347 } 2348 2349 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_a).safe_push (drb); 2350 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_b).safe_push (dra); 2351 if (dump_enabled_p ()) 2352 { 2353 dump_printf_loc (MSG_NOTE, vect_location, 2354 "accesses have the same alignment: "); 2355 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra)); 2356 dump_printf (MSG_NOTE, " and "); 2357 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb)); 2358 dump_printf (MSG_NOTE, "\n"); 2359 } 2360 } 2361 2362 2363 /* Function vect_analyze_data_refs_alignment 2364 2365 Analyze the alignment of the data-references in the loop. 2366 Return FALSE if a data reference is found that cannot be vectorized. */ 2367 2368 bool 2369 vect_analyze_data_refs_alignment (loop_vec_info vinfo) 2370 { 2371 if (dump_enabled_p ()) 2372 dump_printf_loc (MSG_NOTE, vect_location, 2373 "=== vect_analyze_data_refs_alignment ===\n"); 2374 2375 /* Mark groups of data references with same alignment using 2376 data dependence information. 
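     For example (an illustrative case assuming 4-byte ints and a 16-byte
     target alignment): p[i] and p[i+4] are 16 bytes apart, a multiple of the
     vector alignment, so they are recorded as having the same alignment,
     whereas p[i] and p[i+1] are only 4 bytes apart and are not.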
*/ 2377 vec<ddr_p> ddrs = vinfo->ddrs; 2378 struct data_dependence_relation *ddr; 2379 unsigned int i; 2380 2381 FOR_EACH_VEC_ELT (ddrs, i, ddr) 2382 vect_find_same_alignment_drs (ddr); 2383 2384 vec<data_reference_p> datarefs = vinfo->datarefs; 2385 struct data_reference *dr; 2386 2387 vect_record_base_alignments (vinfo); 2388 FOR_EACH_VEC_ELT (datarefs, i, dr) 2389 { 2390 stmt_vec_info stmt_info = vinfo_for_stmt (DR_STMT (dr)); 2391 if (STMT_VINFO_VECTORIZABLE (stmt_info) 2392 && !vect_compute_data_ref_alignment (dr)) 2393 { 2394 /* Strided accesses perform only component accesses, misalignment 2395 information is irrelevant for them. */ 2396 if (STMT_VINFO_STRIDED_P (stmt_info) 2397 && !STMT_VINFO_GROUPED_ACCESS (stmt_info)) 2398 continue; 2399 2400 if (dump_enabled_p ()) 2401 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 2402 "not vectorized: can't calculate alignment " 2403 "for data ref.\n"); 2404 2405 return false; 2406 } 2407 } 2408 2409 return true; 2410 } 2411 2412 2413 /* Analyze alignment of DRs of stmts in NODE. */ 2414 2415 static bool 2416 vect_slp_analyze_and_verify_node_alignment (slp_tree node) 2417 { 2418 /* We vectorize from the first scalar stmt in the node unless 2419 the node is permuted in which case we start from the first 2420 element in the group. */ 2421 gimple *first_stmt = SLP_TREE_SCALAR_STMTS (node)[0]; 2422 data_reference_p first_dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt)); 2423 if (SLP_TREE_LOAD_PERMUTATION (node).exists ()) 2424 first_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (first_stmt)); 2425 2426 data_reference_p dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt)); 2427 if (! vect_compute_data_ref_alignment (dr) 2428 /* For creating the data-ref pointer we need alignment of the 2429 first element anyway. */ 2430 || (dr != first_dr 2431 && ! vect_compute_data_ref_alignment (first_dr)) 2432 || ! verify_data_ref_alignment (dr)) 2433 { 2434 if (dump_enabled_p ()) 2435 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 2436 "not vectorized: bad data alignment in basic " 2437 "block.\n"); 2438 return false; 2439 } 2440 2441 return true; 2442 } 2443 2444 /* Function vect_slp_analyze_instance_alignment 2445 2446 Analyze the alignment of the data-references in the SLP instance. 2447 Return FALSE if a data reference is found that cannot be vectorized. */ 2448 2449 bool 2450 vect_slp_analyze_and_verify_instance_alignment (slp_instance instance) 2451 { 2452 if (dump_enabled_p ()) 2453 dump_printf_loc (MSG_NOTE, vect_location, 2454 "=== vect_slp_analyze_and_verify_instance_alignment ===\n"); 2455 2456 slp_tree node; 2457 unsigned i; 2458 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, node) 2459 if (! vect_slp_analyze_and_verify_node_alignment (node)) 2460 return false; 2461 2462 node = SLP_INSTANCE_TREE (instance); 2463 if (STMT_VINFO_DATA_REF (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (node)[0])) 2464 && ! vect_slp_analyze_and_verify_node_alignment 2465 (SLP_INSTANCE_TREE (instance))) 2466 return false; 2467 2468 return true; 2469 } 2470 2471 2472 /* Analyze groups of accesses: check that DR belongs to a group of 2473 accesses of legal size, step, etc. Detect gaps, single element 2474 interleaving, and other special cases. Set grouped access info. 2475 Collect groups of strided stores for further use in SLP analysis. 2476 Worker for vect_analyze_group_access. 
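     For example (illustrative only):

       for (i = 0; i < n; i++)
         {
           x = a[3*i];       # load, DR_INIT 0
           y = a[3*i + 1];   # load, DR_INIT 4 (4-byte elements)
         }

     has STEP 12, so GROUPSIZE is 3 and there is a gap of one element after
     the second load; the same pattern with stores would be rejected, since
     interleaved stores with gaps are not supported.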
*/
2477
2478 static bool
2479 vect_analyze_group_access_1 (struct data_reference *dr)
2480 {
2481   tree step = DR_STEP (dr);
2482   tree scalar_type = TREE_TYPE (DR_REF (dr));
2483   HOST_WIDE_INT type_size = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
2484   gimple *stmt = DR_STMT (dr);
2485   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2486   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2487   bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
2488   HOST_WIDE_INT dr_step = -1;
2489   HOST_WIDE_INT groupsize, last_accessed_element = 1;
2490   bool slp_impossible = false;
2491
2492   /* For interleaving, GROUPSIZE is STEP counted in elements, i.e., the
2493      size of the interleaving group (including gaps).  */
2494   if (tree_fits_shwi_p (step))
2495     {
2496       dr_step = tree_to_shwi (step);
2497       /* Check that STEP is a multiple of type size.  Otherwise there is
2498          a non-element-sized gap at the end of the group which we
2499          cannot represent in GROUP_GAP or GROUP_SIZE.
2500          ??? As we can handle non-constant step fine here we should
2501          simply remove uses of GROUP_GAP between the last and first
2502          element and instead rely on DR_STEP.  GROUP_SIZE then would
2503          simply not include that gap.  */
2504       if ((dr_step % type_size) != 0)
2505         {
2506           if (dump_enabled_p ())
2507             {
2508               dump_printf_loc (MSG_NOTE, vect_location,
2509                                "Step ");
2510               dump_generic_expr (MSG_NOTE, TDF_SLIM, step);
2511               dump_printf (MSG_NOTE,
2512                            " is not a multiple of the element size for ");
2513               dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr));
2514               dump_printf (MSG_NOTE, "\n");
2515             }
2516           return false;
2517         }
2518       groupsize = absu_hwi (dr_step) / type_size;
2519     }
2520   else
2521     groupsize = 0;
2522
2523   /* A non-consecutive access is possible only as part of interleaving.  */
2524   if (!GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
2525     {
2526       /* Check whether this DR is part of an interleaving group of which
2527          it is the only element accessed in the loop.  */
2528
2529       /* Gaps are supported only for loads.  STEP must be a multiple of the type
2530          size.  */
2531       if (DR_IS_READ (dr)
2532           && (dr_step % type_size) == 0
2533           && groupsize > 0)
2534         {
2535           GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = stmt;
2536           GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize;
2537           GROUP_GAP (stmt_info) = groupsize - 1;
2538           if (dump_enabled_p ())
2539             {
2540               dump_printf_loc (MSG_NOTE, vect_location,
2541                                "Detected single element interleaving ");
2542               dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr));
2543               dump_printf (MSG_NOTE, " step ");
2544               dump_generic_expr (MSG_NOTE, TDF_SLIM, step);
2545               dump_printf (MSG_NOTE, "\n");
2546             }
2547
2548           return true;
2549         }
2550
2551       if (dump_enabled_p ())
2552         {
2553           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2554                            "not consecutive access ");
2555           dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
2556         }
2557
2558       if (bb_vinfo)
2559         {
2560           /* Mark the statement as unvectorizable.  */
2561           STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
2562           return true;
2563         }
2564
2565       dump_printf_loc (MSG_NOTE, vect_location, "using strided accesses\n");
2566       STMT_VINFO_STRIDED_P (stmt_info) = true;
2567       return true;
2568     }
2569
2570   if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) == stmt)
2571     {
2572       /* First stmt in the interleaving chain.  Check the chain.
*/ 2573 gimple *next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt)); 2574 struct data_reference *data_ref = dr; 2575 unsigned int count = 1; 2576 tree prev_init = DR_INIT (data_ref); 2577 gimple *prev = stmt; 2578 HOST_WIDE_INT diff, gaps = 0; 2579 2580 /* By construction, all group members have INTEGER_CST DR_INITs. */ 2581 while (next) 2582 { 2583 /* Skip same data-refs. In case that two or more stmts share 2584 data-ref (supported only for loads), we vectorize only the first 2585 stmt, and the rest get their vectorized loads from the first 2586 one. */ 2587 if (!tree_int_cst_compare (DR_INIT (data_ref), 2588 DR_INIT (STMT_VINFO_DATA_REF ( 2589 vinfo_for_stmt (next))))) 2590 { 2591 if (DR_IS_WRITE (data_ref)) 2592 { 2593 if (dump_enabled_p ()) 2594 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 2595 "Two store stmts share the same dr.\n"); 2596 return false; 2597 } 2598 2599 if (dump_enabled_p ()) 2600 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 2601 "Two or more load stmts share the same dr.\n"); 2602 2603 /* For load use the same data-ref load. */ 2604 GROUP_SAME_DR_STMT (vinfo_for_stmt (next)) = prev; 2605 2606 prev = next; 2607 next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next)); 2608 continue; 2609 } 2610 2611 prev = next; 2612 data_ref = STMT_VINFO_DATA_REF (vinfo_for_stmt (next)); 2613 2614 /* All group members have the same STEP by construction. */ 2615 gcc_checking_assert (operand_equal_p (DR_STEP (data_ref), step, 0)); 2616 2617 /* Check that the distance between two accesses is equal to the type 2618 size. Otherwise, we have gaps. */ 2619 diff = (TREE_INT_CST_LOW (DR_INIT (data_ref)) 2620 - TREE_INT_CST_LOW (prev_init)) / type_size; 2621 if (diff != 1) 2622 { 2623 /* FORNOW: SLP of accesses with gaps is not supported. */ 2624 slp_impossible = true; 2625 if (DR_IS_WRITE (data_ref)) 2626 { 2627 if (dump_enabled_p ()) 2628 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 2629 "interleaved store with gaps\n"); 2630 return false; 2631 } 2632 2633 gaps += diff - 1; 2634 } 2635 2636 last_accessed_element += diff; 2637 2638 /* Store the gap from the previous member of the group. If there is no 2639 gap in the access, GROUP_GAP is always 1. */ 2640 GROUP_GAP (vinfo_for_stmt (next)) = diff; 2641 2642 prev_init = DR_INIT (data_ref); 2643 next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next)); 2644 /* Count the number of data-refs in the chain. */ 2645 count++; 2646 } 2647 2648 if (groupsize == 0) 2649 groupsize = count + gaps; 2650 2651 /* This could be UINT_MAX but as we are generating code in a very 2652 inefficient way we have to cap earlier. See PR78699 for example. */ 2653 if (groupsize > 4096) 2654 { 2655 if (dump_enabled_p ()) 2656 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 2657 "group is too large\n"); 2658 return false; 2659 } 2660 2661 /* Check that the size of the interleaving is equal to count for stores, 2662 i.e., that there are no gaps. */ 2663 if (groupsize != count 2664 && !DR_IS_READ (dr)) 2665 { 2666 if (dump_enabled_p ()) 2667 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 2668 "interleaved store with gaps\n"); 2669 return false; 2670 } 2671 2672 /* If there is a gap after the last load in the group it is the 2673 difference between the groupsize and the last accessed 2674 element. 2675 When there is no gap, this difference should be 0. 
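     For instance (an illustrative example with 4-byte elements): a group
     loading a[4*i] and a[4*i + 1] with step 16 has GROUPSIZE 4 and a last
     accessed element of 2, so the gap recorded below is 2; loading all of
     a[4*i] .. a[4*i + 3] gives a last accessed element of 4 and a gap of 0.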
*/ 2676 GROUP_GAP (vinfo_for_stmt (stmt)) = groupsize - last_accessed_element; 2677 2678 GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize; 2679 if (dump_enabled_p ()) 2680 { 2681 dump_printf_loc (MSG_NOTE, vect_location, 2682 "Detected interleaving "); 2683 if (DR_IS_READ (dr)) 2684 dump_printf (MSG_NOTE, "load "); 2685 else 2686 dump_printf (MSG_NOTE, "store "); 2687 dump_printf (MSG_NOTE, "of size %u starting with ", 2688 (unsigned)groupsize); 2689 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0); 2690 if (GROUP_GAP (vinfo_for_stmt (stmt)) != 0) 2691 dump_printf_loc (MSG_NOTE, vect_location, 2692 "There is a gap of %u elements after the group\n", 2693 GROUP_GAP (vinfo_for_stmt (stmt))); 2694 } 2695 2696 /* SLP: create an SLP data structure for every interleaving group of 2697 stores for further analysis in vect_analyse_slp. */ 2698 if (DR_IS_WRITE (dr) && !slp_impossible) 2699 { 2700 if (loop_vinfo) 2701 LOOP_VINFO_GROUPED_STORES (loop_vinfo).safe_push (stmt); 2702 if (bb_vinfo) 2703 BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (stmt); 2704 } 2705 } 2706 2707 return true; 2708 } 2709 2710 /* Analyze groups of accesses: check that DR belongs to a group of 2711 accesses of legal size, step, etc. Detect gaps, single element 2712 interleaving, and other special cases. Set grouped access info. 2713 Collect groups of strided stores for further use in SLP analysis. */ 2714 2715 static bool 2716 vect_analyze_group_access (struct data_reference *dr) 2717 { 2718 if (!vect_analyze_group_access_1 (dr)) 2719 { 2720 /* Dissolve the group if present. */ 2721 gimple *next; 2722 gimple *stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (dr))); 2723 while (stmt) 2724 { 2725 stmt_vec_info vinfo = vinfo_for_stmt (stmt); 2726 next = GROUP_NEXT_ELEMENT (vinfo); 2727 GROUP_FIRST_ELEMENT (vinfo) = NULL; 2728 GROUP_NEXT_ELEMENT (vinfo) = NULL; 2729 stmt = next; 2730 } 2731 return false; 2732 } 2733 return true; 2734 } 2735 2736 /* Analyze the access pattern of the data-reference DR. 2737 In case of non-consecutive accesses call vect_analyze_group_access() to 2738 analyze groups of accesses. */ 2739 2740 static bool 2741 vect_analyze_data_ref_access (struct data_reference *dr) 2742 { 2743 tree step = DR_STEP (dr); 2744 tree scalar_type = TREE_TYPE (DR_REF (dr)); 2745 gimple *stmt = DR_STMT (dr); 2746 stmt_vec_info stmt_info = vinfo_for_stmt (stmt); 2747 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info); 2748 struct loop *loop = NULL; 2749 2750 if (STMT_VINFO_GATHER_SCATTER_P (stmt_info)) 2751 return true; 2752 2753 if (loop_vinfo) 2754 loop = LOOP_VINFO_LOOP (loop_vinfo); 2755 2756 if (loop_vinfo && !step) 2757 { 2758 if (dump_enabled_p ()) 2759 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 2760 "bad data-ref access in loop\n"); 2761 return false; 2762 } 2763 2764 /* Allow loads with zero step in inner-loop vectorization. */ 2765 if (loop_vinfo && integer_zerop (step)) 2766 { 2767 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL; 2768 if (!nested_in_vect_loop_p (loop, stmt)) 2769 return DR_IS_READ (dr); 2770 /* Allow references with zero step for outer loops marked 2771 with pragma omp simd only - it guarantees absence of 2772 loop-carried dependencies between inner loop iterations. 
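     For example (illustrative source, not taken from this file):

       #pragma omp simd
       for (i = 0; i < n; i++)
         for (j = 0; j < m; j++)
           sum += *p;        # zero-step load in the inner loop

     Here the pragma sets loop->safelen for the outer loop, so the check
     below accepts the access.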
*/ 2773 if (loop->safelen < 2) 2774 { 2775 if (dump_enabled_p ()) 2776 dump_printf_loc (MSG_NOTE, vect_location, 2777 "zero step in inner loop of nest\n"); 2778 return false; 2779 } 2780 } 2781 2782 if (loop && nested_in_vect_loop_p (loop, stmt)) 2783 { 2784 /* Interleaved accesses are not yet supported within outer-loop 2785 vectorization for references in the inner-loop. */ 2786 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL; 2787 2788 /* For the rest of the analysis we use the outer-loop step. */ 2789 step = STMT_VINFO_DR_STEP (stmt_info); 2790 if (integer_zerop (step)) 2791 { 2792 if (dump_enabled_p ()) 2793 dump_printf_loc (MSG_NOTE, vect_location, 2794 "zero step in outer loop.\n"); 2795 return DR_IS_READ (dr); 2796 } 2797 } 2798 2799 /* Consecutive? */ 2800 if (TREE_CODE (step) == INTEGER_CST) 2801 { 2802 HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step); 2803 if (!tree_int_cst_compare (step, TYPE_SIZE_UNIT (scalar_type)) 2804 || (dr_step < 0 2805 && !compare_tree_int (TYPE_SIZE_UNIT (scalar_type), -dr_step))) 2806 { 2807 /* Mark that it is not interleaving. */ 2808 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL; 2809 return true; 2810 } 2811 } 2812 2813 if (loop && nested_in_vect_loop_p (loop, stmt)) 2814 { 2815 if (dump_enabled_p ()) 2816 dump_printf_loc (MSG_NOTE, vect_location, 2817 "grouped access in outer loop.\n"); 2818 return false; 2819 } 2820 2821 2822 /* Assume this is a DR handled by non-constant strided load case. */ 2823 if (TREE_CODE (step) != INTEGER_CST) 2824 return (STMT_VINFO_STRIDED_P (stmt_info) 2825 && (!STMT_VINFO_GROUPED_ACCESS (stmt_info) 2826 || vect_analyze_group_access (dr))); 2827 2828 /* Not consecutive access - check if it's a part of interleaving group. */ 2829 return vect_analyze_group_access (dr); 2830 } 2831 2832 /* Compare two data-references DRA and DRB to group them into chunks 2833 suitable for grouping. */ 2834 2835 static int 2836 dr_group_sort_cmp (const void *dra_, const void *drb_) 2837 { 2838 data_reference_p dra = *(data_reference_p *)const_cast<void *>(dra_); 2839 data_reference_p drb = *(data_reference_p *)const_cast<void *>(drb_); 2840 int cmp; 2841 2842 /* Stabilize sort. */ 2843 if (dra == drb) 2844 return 0; 2845 2846 /* DRs in different loops never belong to the same group. */ 2847 loop_p loopa = gimple_bb (DR_STMT (dra))->loop_father; 2848 loop_p loopb = gimple_bb (DR_STMT (drb))->loop_father; 2849 if (loopa != loopb) 2850 return loopa->num < loopb->num ? -1 : 1; 2851 2852 /* Ordering of DRs according to base. */ 2853 cmp = data_ref_compare_tree (DR_BASE_ADDRESS (dra), 2854 DR_BASE_ADDRESS (drb)); 2855 if (cmp != 0) 2856 return cmp; 2857 2858 /* And according to DR_OFFSET. */ 2859 cmp = data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb)); 2860 if (cmp != 0) 2861 return cmp; 2862 2863 /* Put reads before writes. */ 2864 if (DR_IS_READ (dra) != DR_IS_READ (drb)) 2865 return DR_IS_READ (dra) ? -1 : 1; 2866 2867 /* Then sort after access size. */ 2868 cmp = data_ref_compare_tree (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))), 2869 TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb)))); 2870 if (cmp != 0) 2871 return cmp; 2872 2873 /* And after step. */ 2874 cmp = data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb)); 2875 if (cmp != 0) 2876 return cmp; 2877 2878 /* Then sort after DR_INIT. In case of identical DRs sort after stmt UID. */ 2879 cmp = data_ref_compare_tree (DR_INIT (dra), DR_INIT (drb)); 2880 if (cmp == 0) 2881 return gimple_uid (DR_STMT (dra)) < gimple_uid (DR_STMT (drb)) ? 
-1 : 1; 2882 return cmp; 2883 } 2884 2885 /* If OP is the result of a conversion, return the unconverted value, 2886 otherwise return null. */ 2887 2888 static tree 2889 strip_conversion (tree op) 2890 { 2891 if (TREE_CODE (op) != SSA_NAME) 2892 return NULL_TREE; 2893 gimple *stmt = SSA_NAME_DEF_STMT (op); 2894 if (!is_gimple_assign (stmt) 2895 || !CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (stmt))) 2896 return NULL_TREE; 2897 return gimple_assign_rhs1 (stmt); 2898 } 2899 2900 /* Return true if vectorizable_* routines can handle statements STMT1 2901 and STMT2 being in a single group. */ 2902 2903 static bool 2904 can_group_stmts_p (gimple *stmt1, gimple *stmt2) 2905 { 2906 if (gimple_assign_single_p (stmt1)) 2907 return gimple_assign_single_p (stmt2); 2908 2909 if (is_gimple_call (stmt1) && gimple_call_internal_p (stmt1)) 2910 { 2911 /* Check for two masked loads or two masked stores. */ 2912 if (!is_gimple_call (stmt2) || !gimple_call_internal_p (stmt2)) 2913 return false; 2914 internal_fn ifn = gimple_call_internal_fn (stmt1); 2915 if (ifn != IFN_MASK_LOAD && ifn != IFN_MASK_STORE) 2916 return false; 2917 if (ifn != gimple_call_internal_fn (stmt2)) 2918 return false; 2919 2920 /* Check that the masks are the same. Cope with casts of masks, 2921 like those created by build_mask_conversion. */ 2922 tree mask1 = gimple_call_arg (stmt1, 2); 2923 tree mask2 = gimple_call_arg (stmt2, 2); 2924 if (!operand_equal_p (mask1, mask2, 0)) 2925 { 2926 mask1 = strip_conversion (mask1); 2927 if (!mask1) 2928 return false; 2929 mask2 = strip_conversion (mask2); 2930 if (!mask2) 2931 return false; 2932 if (!operand_equal_p (mask1, mask2, 0)) 2933 return false; 2934 } 2935 return true; 2936 } 2937 2938 return false; 2939 } 2940 2941 /* Function vect_analyze_data_ref_accesses. 2942 2943 Analyze the access pattern of all the data references in the loop. 2944 2945 FORNOW: the only access pattern that is considered vectorizable is a 2946 simple step 1 (consecutive) access. 2947 2948 FORNOW: handle only arrays and pointer accesses. */ 2949 2950 bool 2951 vect_analyze_data_ref_accesses (vec_info *vinfo) 2952 { 2953 unsigned int i; 2954 vec<data_reference_p> datarefs = vinfo->datarefs; 2955 struct data_reference *dr; 2956 2957 if (dump_enabled_p ()) 2958 dump_printf_loc (MSG_NOTE, vect_location, 2959 "=== vect_analyze_data_ref_accesses ===\n"); 2960 2961 if (datarefs.is_empty ()) 2962 return true; 2963 2964 /* Sort the array of datarefs to make building the interleaving chains 2965 linear. Don't modify the original vector's order, it is needed for 2966 determining what dependencies are reversed. */ 2967 vec<data_reference_p> datarefs_copy = datarefs.copy (); 2968 datarefs_copy.qsort (dr_group_sort_cmp); 2969 2970 /* Build the interleaving chains. */ 2971 for (i = 0; i < datarefs_copy.length () - 1;) 2972 { 2973 data_reference_p dra = datarefs_copy[i]; 2974 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra)); 2975 stmt_vec_info lastinfo = NULL; 2976 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a) 2977 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)) 2978 { 2979 ++i; 2980 continue; 2981 } 2982 for (i = i + 1; i < datarefs_copy.length (); ++i) 2983 { 2984 data_reference_p drb = datarefs_copy[i]; 2985 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb)); 2986 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_b) 2987 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b)) 2988 break; 2989 2990 /* ??? 
Imperfect sorting (non-compatible types, non-modulo 2991 accesses, same accesses) can lead to a group to be artificially 2992 split here as we don't just skip over those. If it really 2993 matters we can push those to a worklist and re-iterate 2994 over them. The we can just skip ahead to the next DR here. */ 2995 2996 /* DRs in a different loop should not be put into the same 2997 interleaving group. */ 2998 if (gimple_bb (DR_STMT (dra))->loop_father 2999 != gimple_bb (DR_STMT (drb))->loop_father) 3000 break; 3001 3002 /* Check that the data-refs have same first location (except init) 3003 and they are both either store or load (not load and store, 3004 not masked loads or stores). */ 3005 if (DR_IS_READ (dra) != DR_IS_READ (drb) 3006 || data_ref_compare_tree (DR_BASE_ADDRESS (dra), 3007 DR_BASE_ADDRESS (drb)) != 0 3008 || data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb)) != 0 3009 || !can_group_stmts_p (DR_STMT (dra), DR_STMT (drb))) 3010 break; 3011 3012 /* Check that the data-refs have the same constant size. */ 3013 tree sza = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))); 3014 tree szb = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))); 3015 if (!tree_fits_uhwi_p (sza) 3016 || !tree_fits_uhwi_p (szb) 3017 || !tree_int_cst_equal (sza, szb)) 3018 break; 3019 3020 /* Check that the data-refs have the same step. */ 3021 if (data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb)) != 0) 3022 break; 3023 3024 /* Check the types are compatible. 3025 ??? We don't distinguish this during sorting. */ 3026 if (!types_compatible_p (TREE_TYPE (DR_REF (dra)), 3027 TREE_TYPE (DR_REF (drb)))) 3028 break; 3029 3030 /* Check that the DR_INITs are compile-time constants. */ 3031 if (TREE_CODE (DR_INIT (dra)) != INTEGER_CST 3032 || TREE_CODE (DR_INIT (drb)) != INTEGER_CST) 3033 break; 3034 3035 /* Sorting has ensured that DR_INIT (dra) <= DR_INIT (drb). */ 3036 HOST_WIDE_INT init_a = TREE_INT_CST_LOW (DR_INIT (dra)); 3037 HOST_WIDE_INT init_b = TREE_INT_CST_LOW (DR_INIT (drb)); 3038 HOST_WIDE_INT init_prev 3039 = TREE_INT_CST_LOW (DR_INIT (datarefs_copy[i-1])); 3040 gcc_assert (init_a <= init_b 3041 && init_a <= init_prev 3042 && init_prev <= init_b); 3043 3044 /* Do not place the same access in the interleaving chain twice. */ 3045 if (init_b == init_prev) 3046 { 3047 gcc_assert (gimple_uid (DR_STMT (datarefs_copy[i-1])) 3048 < gimple_uid (DR_STMT (drb))); 3049 /* ??? For now we simply "drop" the later reference which is 3050 otherwise the same rather than finishing off this group. 3051 In the end we'd want to re-process duplicates forming 3052 multiple groups from the refs, likely by just collecting 3053 all candidates (including duplicates and split points 3054 below) in a vector and then process them together. */ 3055 continue; 3056 } 3057 3058 /* If init_b == init_a + the size of the type * k, we have an 3059 interleaving, and DRA is accessed before DRB. */ 3060 HOST_WIDE_INT type_size_a = tree_to_uhwi (sza); 3061 if (type_size_a == 0 3062 || (init_b - init_a) % type_size_a != 0) 3063 break; 3064 3065 /* If we have a store, the accesses are adjacent. This splits 3066 groups into chunks we support (we don't support vectorization 3067 of stores with gaps). */ 3068 if (!DR_IS_READ (dra) && init_b - init_prev != type_size_a) 3069 break; 3070 3071 /* If the step (if not zero or non-constant) is greater than the 3072 difference between data-refs' inits this splits groups into 3073 suitable sizes. 
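     For example (illustrative, 4-byte elements, constant step 8): accesses
     with DR_INITs 0 and 4 are kept in one group, whereas an access with
     DR_INIT 8 starts a new group, because the first access already reaches
     that address in its next iteration.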
*/ 3074 if (tree_fits_shwi_p (DR_STEP (dra))) 3075 { 3076 HOST_WIDE_INT step = tree_to_shwi (DR_STEP (dra)); 3077 if (step != 0 && step <= (init_b - init_a)) 3078 break; 3079 } 3080 3081 if (dump_enabled_p ()) 3082 { 3083 dump_printf_loc (MSG_NOTE, vect_location, 3084 "Detected interleaving "); 3085 if (DR_IS_READ (dra)) 3086 dump_printf (MSG_NOTE, "load "); 3087 else 3088 dump_printf (MSG_NOTE, "store "); 3089 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra)); 3090 dump_printf (MSG_NOTE, " and "); 3091 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb)); 3092 dump_printf (MSG_NOTE, "\n"); 3093 } 3094 3095 /* Link the found element into the group list. */ 3096 if (!GROUP_FIRST_ELEMENT (stmtinfo_a)) 3097 { 3098 GROUP_FIRST_ELEMENT (stmtinfo_a) = DR_STMT (dra); 3099 lastinfo = stmtinfo_a; 3100 } 3101 GROUP_FIRST_ELEMENT (stmtinfo_b) = DR_STMT (dra); 3102 GROUP_NEXT_ELEMENT (lastinfo) = DR_STMT (drb); 3103 lastinfo = stmtinfo_b; 3104 } 3105 } 3106 3107 FOR_EACH_VEC_ELT (datarefs_copy, i, dr) 3108 if (STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) 3109 && !vect_analyze_data_ref_access (dr)) 3110 { 3111 if (dump_enabled_p ()) 3112 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 3113 "not vectorized: complicated access pattern.\n"); 3114 3115 if (is_a <bb_vec_info> (vinfo)) 3116 { 3117 /* Mark the statement as not vectorizable. */ 3118 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false; 3119 continue; 3120 } 3121 else 3122 { 3123 datarefs_copy.release (); 3124 return false; 3125 } 3126 } 3127 3128 datarefs_copy.release (); 3129 return true; 3130 } 3131 3132 /* Function vect_vfa_segment_size. 3133 3134 Input: 3135 DR: The data reference. 3136 LENGTH_FACTOR: segment length to consider. 3137 3138 Return a value suitable for the dr_with_seg_len::seg_len field. 3139 This is the "distance travelled" by the pointer from the first 3140 iteration in the segment to the last. Note that it does not include 3141 the size of the access; in effect it only describes the first byte. */ 3142 3143 static tree 3144 vect_vfa_segment_size (struct data_reference *dr, tree length_factor) 3145 { 3146 length_factor = size_binop (MINUS_EXPR, 3147 fold_convert (sizetype, length_factor), 3148 size_one_node); 3149 return size_binop (MULT_EXPR, fold_convert (sizetype, DR_STEP (dr)), 3150 length_factor); 3151 } 3152 3153 /* Return a value that, when added to abs (vect_vfa_segment_size (dr)), 3154 gives the worst-case number of bytes covered by the segment. */ 3155 3156 static unsigned HOST_WIDE_INT 3157 vect_vfa_access_size (data_reference *dr) 3158 { 3159 stmt_vec_info stmt_vinfo = vinfo_for_stmt (DR_STMT (dr)); 3160 tree ref_type = TREE_TYPE (DR_REF (dr)); 3161 unsigned HOST_WIDE_INT ref_size = tree_to_uhwi (TYPE_SIZE_UNIT (ref_type)); 3162 unsigned HOST_WIDE_INT access_size = ref_size; 3163 if (GROUP_FIRST_ELEMENT (stmt_vinfo)) 3164 { 3165 gcc_assert (GROUP_FIRST_ELEMENT (stmt_vinfo) == DR_STMT (dr)); 3166 access_size *= GROUP_SIZE (stmt_vinfo) - GROUP_GAP (stmt_vinfo); 3167 } 3168 if (STMT_VINFO_VEC_STMT (stmt_vinfo) 3169 && (vect_supportable_dr_alignment (dr, false) 3170 == dr_explicit_realign_optimized)) 3171 { 3172 /* We might access a full vector's worth. */ 3173 tree vectype = STMT_VINFO_VECTYPE (stmt_vinfo); 3174 access_size += tree_to_uhwi (TYPE_SIZE_UNIT (vectype)) - ref_size; 3175 } 3176 return access_size; 3177 } 3178 3179 /* Get the minimum alignment for all the scalar accesses that DR describes. 
*/
3180
3181 static unsigned int
3182 vect_vfa_align (const data_reference *dr)
3183 {
3184   return TYPE_ALIGN_UNIT (TREE_TYPE (DR_REF (dr)));
3185 }
3186
3187 /* Function vect_compile_time_alias.
3188
3189    Given data references A and B with equal base and offset, see whether
3190    the alias relation can be decided at compilation time.  Return 1 if
3191    it can and the references alias, 0 if it can and the references do
3192    not alias, and -1 if we cannot decide at compile time.  SEGMENT_LENGTH_A,
3193    SEGMENT_LENGTH_B, ACCESS_SIZE_A and ACCESS_SIZE_B are the equivalent
3194    of dr_with_seg_len::{seg_len,access_size} for A and B.  */
3195
3196 static int
3197 vect_compile_time_alias (struct data_reference *a, struct data_reference *b,
3198                          tree segment_length_a, tree segment_length_b,
3199                          unsigned HOST_WIDE_INT access_size_a,
3200                          unsigned HOST_WIDE_INT access_size_b)
3201 {
3202   poly_offset_int offset_a = wi::to_poly_offset (DR_INIT (a));
3203   poly_offset_int offset_b = wi::to_poly_offset (DR_INIT (b));
3204   poly_uint64 const_length_a;
3205   poly_uint64 const_length_b;
3206
3207   /* For negative step, we need to adjust address range by TYPE_SIZE_UNIT
3208      bytes, e.g., int a[3] -> a[1] range is [a+4, a+16) instead of
3209      [a, a+12) */
3210   if (tree_int_cst_compare (DR_STEP (a), size_zero_node) < 0)
3211     {
3212       const_length_a = (-wi::to_poly_wide (segment_length_a)).force_uhwi ();
3213       offset_a = (offset_a + access_size_a) - const_length_a;
3214     }
3215   else
3216     const_length_a = tree_to_poly_uint64 (segment_length_a);
3217   if (tree_int_cst_compare (DR_STEP (b), size_zero_node) < 0)
3218     {
3219       const_length_b = (-wi::to_poly_wide (segment_length_b)).force_uhwi ();
3220       offset_b = (offset_b + access_size_b) - const_length_b;
3221     }
3222   else
3223     const_length_b = tree_to_poly_uint64 (segment_length_b);
3224
3225   const_length_a += access_size_a;
3226   const_length_b += access_size_b;
3227
3228   if (ranges_known_overlap_p (offset_a, const_length_a,
3229                               offset_b, const_length_b))
3230     return 1;
3231
3232   if (!ranges_maybe_overlap_p (offset_a, const_length_a,
3233                                offset_b, const_length_b))
3234     return 0;
3235
3236   return -1;
3237 }
3238
3239 /* Return true if the minimum nonzero dependence distance for loop LOOP_DEPTH
3240    in DDR is >= VF.  */
3241
3242 static bool
3243 dependence_distance_ge_vf (data_dependence_relation *ddr,
3244                            unsigned int loop_depth, poly_uint64 vf)
3245 {
3246   if (DDR_ARE_DEPENDENT (ddr) != NULL_TREE
3247       || DDR_NUM_DIST_VECTS (ddr) == 0)
3248     return false;
3249
3250   /* If the dependence is exact, we should have limited the VF instead.  */
3251   gcc_checking_assert (DDR_COULD_BE_INDEPENDENT_P (ddr));
3252
3253   unsigned int i;
3254   lambda_vector dist_v;
3255   FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
3256     {
3257       HOST_WIDE_INT dist = dist_v[loop_depth];
3258       if (dist != 0
3259           && !(dist > 0 && DDR_REVERSED_P (ddr))
3260           && maybe_lt ((unsigned HOST_WIDE_INT) abs_hwi (dist), vf))
3261         return false;
3262     }
3263
3264   if (dump_enabled_p ())
3265     {
3266       dump_printf_loc (MSG_NOTE, vect_location,
3267                        "dependence distance between ");
3268       dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_A (ddr)));
3269       dump_printf (MSG_NOTE, " and ");
3270       dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_B (ddr)));
3271       dump_printf (MSG_NOTE, " is >= VF\n");
3272     }
3273
3274   return true;
3275 }
3276
3277 /* Dump LOWER_BOUND using flags DUMP_KIND.  Dumps are known to be enabled.
*/ 3278 3279 static void 3280 dump_lower_bound (int dump_kind, const vec_lower_bound &lower_bound) 3281 { 3282 dump_printf (dump_kind, "%s (", lower_bound.unsigned_p ? "unsigned" : "abs"); 3283 dump_generic_expr (dump_kind, TDF_SLIM, lower_bound.expr); 3284 dump_printf (dump_kind, ") >= "); 3285 dump_dec (dump_kind, lower_bound.min_value); 3286 } 3287 3288 /* Record that the vectorized loop requires the vec_lower_bound described 3289 by EXPR, UNSIGNED_P and MIN_VALUE. */ 3290 3291 static void 3292 vect_check_lower_bound (loop_vec_info loop_vinfo, tree expr, bool unsigned_p, 3293 poly_uint64 min_value) 3294 { 3295 vec<vec_lower_bound> lower_bounds = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo); 3296 for (unsigned int i = 0; i < lower_bounds.length (); ++i) 3297 if (operand_equal_p (lower_bounds[i].expr, expr, 0)) 3298 { 3299 unsigned_p &= lower_bounds[i].unsigned_p; 3300 min_value = upper_bound (lower_bounds[i].min_value, min_value); 3301 if (lower_bounds[i].unsigned_p != unsigned_p 3302 || maybe_lt (lower_bounds[i].min_value, min_value)) 3303 { 3304 lower_bounds[i].unsigned_p = unsigned_p; 3305 lower_bounds[i].min_value = min_value; 3306 if (dump_enabled_p ()) 3307 { 3308 dump_printf_loc (MSG_NOTE, vect_location, 3309 "updating run-time check to "); 3310 dump_lower_bound (MSG_NOTE, lower_bounds[i]); 3311 dump_printf (MSG_NOTE, "\n"); 3312 } 3313 } 3314 return; 3315 } 3316 3317 vec_lower_bound lower_bound (expr, unsigned_p, min_value); 3318 if (dump_enabled_p ()) 3319 { 3320 dump_printf_loc (MSG_NOTE, vect_location, "need a run-time check that "); 3321 dump_lower_bound (MSG_NOTE, lower_bound); 3322 dump_printf (MSG_NOTE, "\n"); 3323 } 3324 LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).safe_push (lower_bound); 3325 } 3326 3327 /* Return true if it's unlikely that the step of the vectorized form of DR 3328 will span fewer than GAP bytes. */ 3329 3330 static bool 3331 vect_small_gap_p (loop_vec_info loop_vinfo, data_reference *dr, poly_int64 gap) 3332 { 3333 stmt_vec_info stmt_info = vinfo_for_stmt (DR_STMT (dr)); 3334 HOST_WIDE_INT count 3335 = estimated_poly_value (LOOP_VINFO_VECT_FACTOR (loop_vinfo)); 3336 if (GROUP_FIRST_ELEMENT (stmt_info)) 3337 count *= GROUP_SIZE (vinfo_for_stmt (GROUP_FIRST_ELEMENT (stmt_info))); 3338 return estimated_poly_value (gap) <= count * vect_get_scalar_dr_size (dr); 3339 } 3340 3341 /* Return true if we know that there is no alias between DR_A and DR_B 3342 when abs (DR_STEP (DR_A)) >= N for some N. When returning true, set 3343 *LOWER_BOUND_OUT to this N. */ 3344 3345 static bool 3346 vectorizable_with_step_bound_p (data_reference *dr_a, data_reference *dr_b, 3347 poly_uint64 *lower_bound_out) 3348 { 3349 /* Check that there is a constant gap of known sign between DR_A 3350 and DR_B. */ 3351 poly_int64 init_a, init_b; 3352 if (!operand_equal_p (DR_BASE_ADDRESS (dr_a), DR_BASE_ADDRESS (dr_b), 0) 3353 || !operand_equal_p (DR_OFFSET (dr_a), DR_OFFSET (dr_b), 0) 3354 || !operand_equal_p (DR_STEP (dr_a), DR_STEP (dr_b), 0) 3355 || !poly_int_tree_p (DR_INIT (dr_a), &init_a) 3356 || !poly_int_tree_p (DR_INIT (dr_b), &init_b) 3357 || !ordered_p (init_a, init_b)) 3358 return false; 3359 3360 /* Sort DR_A and DR_B by the address they access. */ 3361 if (maybe_lt (init_b, init_a)) 3362 { 3363 std::swap (init_a, init_b); 3364 std::swap (dr_a, dr_b); 3365 } 3366 3367 /* If the two accesses could be dependent within a scalar iteration, 3368 make sure that we'd retain their order. 
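     As an illustrative example (two 4-byte accesses off the same base): if
     DR_A starts at byte 0 and DR_B at byte 4, the bound computed at the end
     of this function is 4 + 4 - 0 = 8, i.e. the accesses cannot alias as
     long as abs (DR_STEP) is at least 8.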
*/ 3369 if (maybe_gt (init_a + vect_get_scalar_dr_size (dr_a), init_b) 3370 && !vect_preserves_scalar_order_p (DR_STMT (dr_a), DR_STMT (dr_b))) 3371 return false; 3372 3373 /* There is no alias if abs (DR_STEP) is greater than or equal to 3374 the bytes spanned by the combination of the two accesses. */ 3375 *lower_bound_out = init_b + vect_get_scalar_dr_size (dr_b) - init_a; 3376 return true; 3377 } 3378 3379 /* Function vect_prune_runtime_alias_test_list. 3380 3381 Prune a list of ddrs to be tested at run-time by versioning for alias. 3382 Merge several alias checks into one if possible. 3383 Return FALSE if resulting list of ddrs is longer then allowed by 3384 PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS, otherwise return TRUE. */ 3385 3386 bool 3387 vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo) 3388 { 3389 typedef pair_hash <tree_operand_hash, tree_operand_hash> tree_pair_hash; 3390 hash_set <tree_pair_hash> compared_objects; 3391 3392 vec<ddr_p> may_alias_ddrs = LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo); 3393 vec<dr_with_seg_len_pair_t> &comp_alias_ddrs 3394 = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo); 3395 vec<vec_object_pair> &check_unequal_addrs 3396 = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo); 3397 poly_uint64 vect_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo); 3398 tree scalar_loop_iters = LOOP_VINFO_NITERS (loop_vinfo); 3399 3400 ddr_p ddr; 3401 unsigned int i; 3402 tree length_factor; 3403 3404 if (dump_enabled_p ()) 3405 dump_printf_loc (MSG_NOTE, vect_location, 3406 "=== vect_prune_runtime_alias_test_list ===\n"); 3407 3408 /* Step values are irrelevant for aliasing if the number of vector 3409 iterations is equal to the number of scalar iterations (which can 3410 happen for fully-SLP loops). */ 3411 bool ignore_step_p = known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1U); 3412 3413 if (!ignore_step_p) 3414 { 3415 /* Convert the checks for nonzero steps into bound tests. */ 3416 tree value; 3417 FOR_EACH_VEC_ELT (LOOP_VINFO_CHECK_NONZERO (loop_vinfo), i, value) 3418 vect_check_lower_bound (loop_vinfo, value, true, 1); 3419 } 3420 3421 if (may_alias_ddrs.is_empty ()) 3422 return true; 3423 3424 comp_alias_ddrs.create (may_alias_ddrs.length ()); 3425 3426 unsigned int loop_depth 3427 = index_in_loop_nest (LOOP_VINFO_LOOP (loop_vinfo)->num, 3428 LOOP_VINFO_LOOP_NEST (loop_vinfo)); 3429 3430 /* First, we collect all data ref pairs for aliasing checks. */ 3431 FOR_EACH_VEC_ELT (may_alias_ddrs, i, ddr) 3432 { 3433 int comp_res; 3434 poly_uint64 lower_bound; 3435 struct data_reference *dr_a, *dr_b; 3436 gimple *dr_group_first_a, *dr_group_first_b; 3437 tree segment_length_a, segment_length_b; 3438 unsigned HOST_WIDE_INT access_size_a, access_size_b; 3439 unsigned int align_a, align_b; 3440 gimple *stmt_a, *stmt_b; 3441 3442 /* Ignore the alias if the VF we chose ended up being no greater 3443 than the dependence distance. 
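     For instance (illustrative numbers): a dependence with a constant
     distance of 8 iterations needs no runtime alias check when the chosen
     VF is 8 or less, roughly because the two dependent accesses then always
     fall into different vector iterations.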
*/ 3444 if (dependence_distance_ge_vf (ddr, loop_depth, vect_factor)) 3445 continue; 3446 3447 if (DDR_OBJECT_A (ddr)) 3448 { 3449 vec_object_pair new_pair (DDR_OBJECT_A (ddr), DDR_OBJECT_B (ddr)); 3450 if (!compared_objects.add (new_pair)) 3451 { 3452 if (dump_enabled_p ()) 3453 { 3454 dump_printf_loc (MSG_NOTE, vect_location, "checking that "); 3455 dump_generic_expr (MSG_NOTE, TDF_SLIM, new_pair.first); 3456 dump_printf (MSG_NOTE, " and "); 3457 dump_generic_expr (MSG_NOTE, TDF_SLIM, new_pair.second); 3458 dump_printf (MSG_NOTE, " have different addresses\n"); 3459 } 3460 LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).safe_push (new_pair); 3461 } 3462 continue; 3463 } 3464 3465 dr_a = DDR_A (ddr); 3466 stmt_a = DR_STMT (DDR_A (ddr)); 3467 3468 dr_b = DDR_B (ddr); 3469 stmt_b = DR_STMT (DDR_B (ddr)); 3470 3471 /* Skip the pair if inter-iteration dependencies are irrelevant 3472 and intra-iteration dependencies are guaranteed to be honored. */ 3473 if (ignore_step_p 3474 && (vect_preserves_scalar_order_p (stmt_a, stmt_b) 3475 || vectorizable_with_step_bound_p (dr_a, dr_b, &lower_bound))) 3476 { 3477 if (dump_enabled_p ()) 3478 { 3479 dump_printf_loc (MSG_NOTE, vect_location, 3480 "no need for alias check between "); 3481 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_a)); 3482 dump_printf (MSG_NOTE, " and "); 3483 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_b)); 3484 dump_printf (MSG_NOTE, " when VF is 1\n"); 3485 } 3486 continue; 3487 } 3488 3489 /* See whether we can handle the alias using a bounds check on 3490 the step, and whether that's likely to be the best approach. 3491 (It might not be, for example, if the minimum step is much larger 3492 than the number of bytes handled by one vector iteration.) */ 3493 if (!ignore_step_p 3494 && TREE_CODE (DR_STEP (dr_a)) != INTEGER_CST 3495 && vectorizable_with_step_bound_p (dr_a, dr_b, &lower_bound) 3496 && (vect_small_gap_p (loop_vinfo, dr_a, lower_bound) 3497 || vect_small_gap_p (loop_vinfo, dr_b, lower_bound))) 3498 { 3499 bool unsigned_p = dr_known_forward_stride_p (dr_a); 3500 if (dump_enabled_p ()) 3501 { 3502 dump_printf_loc (MSG_NOTE, vect_location, "no alias between "); 3503 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_a)); 3504 dump_printf (MSG_NOTE, " and "); 3505 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_b)); 3506 dump_printf (MSG_NOTE, " when the step "); 3507 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_STEP (dr_a)); 3508 dump_printf (MSG_NOTE, " is outside "); 3509 if (unsigned_p) 3510 dump_printf (MSG_NOTE, "[0"); 3511 else 3512 { 3513 dump_printf (MSG_NOTE, "("); 3514 dump_dec (MSG_NOTE, poly_int64 (-lower_bound)); 3515 } 3516 dump_printf (MSG_NOTE, ", "); 3517 dump_dec (MSG_NOTE, lower_bound); 3518 dump_printf (MSG_NOTE, ")\n"); 3519 } 3520 vect_check_lower_bound (loop_vinfo, DR_STEP (dr_a), unsigned_p, 3521 lower_bound); 3522 continue; 3523 } 3524 3525 dr_group_first_a = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_a)); 3526 if (dr_group_first_a) 3527 { 3528 stmt_a = dr_group_first_a; 3529 dr_a = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_a)); 3530 } 3531 3532 dr_group_first_b = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_b)); 3533 if (dr_group_first_b) 3534 { 3535 stmt_b = dr_group_first_b; 3536 dr_b = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_b)); 3537 } 3538 3539 if (ignore_step_p) 3540 { 3541 segment_length_a = size_zero_node; 3542 segment_length_b = size_zero_node; 3543 } 3544 else 3545 { 3546 if (!operand_equal_p (DR_STEP (dr_a), DR_STEP (dr_b), 0)) 3547 length_factor = scalar_loop_iters; 3548 else 3549 
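/* The steps are equal, so the two references keep a constant distance from each other throughout the loop; the bytes spanned by VF scalar steps (one vector iteration) are then enough to bound the segments being compared, whereas the branch above falls back to the full scalar iteration count when the steps differ. */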
length_factor = size_int (vect_factor); 3550 segment_length_a = vect_vfa_segment_size (dr_a, length_factor); 3551 segment_length_b = vect_vfa_segment_size (dr_b, length_factor); 3552 } 3553 access_size_a = vect_vfa_access_size (dr_a); 3554 access_size_b = vect_vfa_access_size (dr_b); 3555 align_a = vect_vfa_align (dr_a); 3556 align_b = vect_vfa_align (dr_b); 3557 3558 comp_res = data_ref_compare_tree (DR_BASE_ADDRESS (dr_a), 3559 DR_BASE_ADDRESS (dr_b)); 3560 if (comp_res == 0) 3561 comp_res = data_ref_compare_tree (DR_OFFSET (dr_a), 3562 DR_OFFSET (dr_b)); 3563 3564 /* See whether the alias is known at compilation time. */ 3565 if (comp_res == 0 3566 && TREE_CODE (DR_STEP (dr_a)) == INTEGER_CST 3567 && TREE_CODE (DR_STEP (dr_b)) == INTEGER_CST 3568 && poly_int_tree_p (segment_length_a) 3569 && poly_int_tree_p (segment_length_b)) 3570 { 3571 int res = vect_compile_time_alias (dr_a, dr_b, 3572 segment_length_a, 3573 segment_length_b, 3574 access_size_a, 3575 access_size_b); 3576 if (res >= 0 && dump_enabled_p ()) 3577 { 3578 dump_printf_loc (MSG_NOTE, vect_location, 3579 "can tell at compile time that "); 3580 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_a)); 3581 dump_printf (MSG_NOTE, " and "); 3582 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_b)); 3583 if (res == 0) 3584 dump_printf (MSG_NOTE, " do not alias\n"); 3585 else 3586 dump_printf (MSG_NOTE, " alias\n"); 3587 } 3588 3589 if (res == 0) 3590 continue; 3591 3592 if (res == 1) 3593 { 3594 if (dump_enabled_p ()) 3595 dump_printf_loc (MSG_NOTE, vect_location, 3596 "not vectorized: compilation time alias.\n"); 3597 return false; 3598 } 3599 } 3600 3601 dr_with_seg_len_pair_t dr_with_seg_len_pair 3602 (dr_with_seg_len (dr_a, segment_length_a, access_size_a, align_a), 3603 dr_with_seg_len (dr_b, segment_length_b, access_size_b, align_b)); 3604 3605 /* Canonicalize pairs by sorting the two DR members. */ 3606 if (comp_res > 0) 3607 std::swap (dr_with_seg_len_pair.first, dr_with_seg_len_pair.second); 3608 3609 comp_alias_ddrs.safe_push (dr_with_seg_len_pair); 3610 } 3611 3612 prune_runtime_alias_test_list (&comp_alias_ddrs, vect_factor); 3613 3614 unsigned int count = (comp_alias_ddrs.length () 3615 + check_unequal_addrs.length ()); 3616 3617 dump_printf_loc (MSG_NOTE, vect_location, 3618 "improved number of alias checks from %d to %d\n", 3619 may_alias_ddrs.length (), count); 3620 if ((int) count > PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS)) 3621 { 3622 if (dump_enabled_p ()) 3623 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 3624 "number of versioning for alias " 3625 "run-time tests exceeds %d " 3626 "(--param vect-max-version-for-alias-checks)\n", 3627 PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS)); 3628 return false; 3629 } 3630 3631 return true; 3632 } 3633 3634 /* Check whether we can use an internal function for a gather load 3635 or scatter store. READ_P is true for loads and false for stores. 3636 MASKED_P is true if the load or store is conditional. MEMORY_TYPE is 3637 the type of the memory elements being loaded or stored. OFFSET_BITS 3638 is the number of bits in each scalar offset and OFFSET_SIGN is the 3639 sign of the offset. SCALE is the amount by which the offset should 3640 be multiplied *after* it has been converted to address width. 3641 3642 Return true if the function is supported, storing the function 3643 id in *IFN_OUT and the type of a vector element in *ELEMENT_TYPE_OUT. 
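For instance (an illustrative sketch only, not taken from an actual caller), a caller that has derived a 32-bit signed offset and a scale of 4 for an unconditional gather could check for support roughly as follows: internal_fn ifn; tree elt_type; if (vect_gather_scatter_fn_p (true, false, vectype, memory_type, 32, SIGNED, 4, &ifn, &elt_type)) ... use ifn, which would be IFN_GATHER_LOAD here ... A false return simply means the caller has to fall back to some other strategy.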
*/ 3644 3645 bool 3646 vect_gather_scatter_fn_p (bool read_p, bool masked_p, tree vectype, 3647 tree memory_type, unsigned int offset_bits, 3648 signop offset_sign, int scale, 3649 internal_fn *ifn_out, tree *element_type_out) 3650 { 3651 unsigned int memory_bits = tree_to_uhwi (TYPE_SIZE (memory_type)); 3652 unsigned int element_bits = tree_to_uhwi (TYPE_SIZE (TREE_TYPE (vectype))); 3653 if (offset_bits > element_bits) 3654 /* Internal functions require the offset to be the same width as 3655 the vector elements. We can extend narrower offsets, but it isn't 3656 safe to truncate wider offsets. */ 3657 return false; 3658 3659 if (element_bits != memory_bits) 3660 /* For now the vector elements must be the same width as the 3661 memory elements. */ 3662 return false; 3663 3664 /* Work out which function we need. */ 3665 internal_fn ifn; 3666 if (read_p) 3667 ifn = masked_p ? IFN_MASK_GATHER_LOAD : IFN_GATHER_LOAD; 3668 else 3669 ifn = masked_p ? IFN_MASK_SCATTER_STORE : IFN_SCATTER_STORE; 3670 3671 /* Test whether the target supports this combination. */ 3672 if (!internal_gather_scatter_fn_supported_p (ifn, vectype, memory_type, 3673 offset_sign, scale)) 3674 return false; 3675 3676 *ifn_out = ifn; 3677 *element_type_out = TREE_TYPE (vectype); 3678 return true; 3679 } 3680 3681 /* CALL is a call to an internal gather load or scatter store function. 3682 Describe the operation in INFO. */ 3683 3684 static void 3685 vect_describe_gather_scatter_call (gcall *call, gather_scatter_info *info) 3686 { 3687 stmt_vec_info stmt_info = vinfo_for_stmt (call); 3688 tree vectype = STMT_VINFO_VECTYPE (stmt_info); 3689 data_reference *dr = STMT_VINFO_DATA_REF (stmt_info); 3690 3691 info->ifn = gimple_call_internal_fn (call); 3692 info->decl = NULL_TREE; 3693 info->base = gimple_call_arg (call, 0); 3694 info->offset = gimple_call_arg (call, 1); 3695 info->offset_dt = vect_unknown_def_type; 3696 info->offset_vectype = NULL_TREE; 3697 info->scale = TREE_INT_CST_LOW (gimple_call_arg (call, 2)); 3698 info->element_type = TREE_TYPE (vectype); 3699 info->memory_type = TREE_TYPE (DR_REF (dr)); 3700 } 3701 3702 /* Return true if a non-affine read or write in STMT is suitable for a 3703 gather load or scatter store. Describe the operation in *INFO if so. */ 3704 3705 bool 3706 vect_check_gather_scatter (gimple *stmt, loop_vec_info loop_vinfo, 3707 gather_scatter_info *info) 3708 { 3709 HOST_WIDE_INT scale = 1; 3710 poly_int64 pbitpos, pbitsize; 3711 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo); 3712 stmt_vec_info stmt_info = vinfo_for_stmt (stmt); 3713 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info); 3714 tree offtype = NULL_TREE; 3715 tree decl = NULL_TREE, base, off; 3716 tree vectype = STMT_VINFO_VECTYPE (stmt_info); 3717 tree memory_type = TREE_TYPE (DR_REF (dr)); 3718 machine_mode pmode; 3719 int punsignedp, reversep, pvolatilep = 0; 3720 internal_fn ifn; 3721 tree element_type; 3722 bool masked_p = false; 3723 3724 /* See whether this is already a call to a gather/scatter internal function. 3725 If not, see whether it's a masked load or store. */ 3726 gcall *call = dyn_cast <gcall *> (stmt); 3727 if (call && gimple_call_internal_p (call)) 3728 { 3729 ifn = gimple_call_internal_fn (stmt); 3730 if (internal_gather_scatter_fn_p (ifn)) 3731 { 3732 vect_describe_gather_scatter_call (call, info); 3733 return true; 3734 } 3735 masked_p = (ifn == IFN_MASK_LOAD || ifn == IFN_MASK_STORE); 3736 } 3737 3738 /* True if we should aim to use internal functions rather than 3739 built-in functions. 
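(That is, IFN_GATHER_LOAD / IFN_SCATTER_STORE and their masked variants, rather than the target's builtin_gather / builtin_scatter decls, which are handled near the end of this function.)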
*/ 3740 bool use_ifn_p = (DR_IS_READ (dr) 3741 ? supports_vec_gather_load_p () 3742 : supports_vec_scatter_store_p ()); 3743 3744 base = DR_REF (dr); 3745 /* For masked loads/stores, DR_REF (dr) is an artificial MEM_REF, 3746 see if we can use the def stmt of the address. */ 3747 if (masked_p 3748 && TREE_CODE (base) == MEM_REF 3749 && TREE_CODE (TREE_OPERAND (base, 0)) == SSA_NAME 3750 && integer_zerop (TREE_OPERAND (base, 1)) 3751 && !expr_invariant_in_loop_p (loop, TREE_OPERAND (base, 0))) 3752 { 3753 gimple *def_stmt = SSA_NAME_DEF_STMT (TREE_OPERAND (base, 0)); 3754 if (is_gimple_assign (def_stmt) 3755 && gimple_assign_rhs_code (def_stmt) == ADDR_EXPR) 3756 base = TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 0); 3757 } 3758 3759 /* The gather and scatter builtins need address of the form 3760 loop_invariant + vector * {1, 2, 4, 8} 3761 or 3762 loop_invariant + sign_extend (vector) * { 1, 2, 4, 8 }. 3763 Unfortunately DR_BASE_ADDRESS/DR_OFFSET can be a mixture 3764 of loop invariants/SSA_NAMEs defined in the loop, with casts, 3765 multiplications and additions in it. To get a vector, we need 3766 a single SSA_NAME that will be defined in the loop and will 3767 contain everything that is not loop invariant and that can be 3768 vectorized. The following code attempts to find such a preexistng 3769 SSA_NAME OFF and put the loop invariants into a tree BASE 3770 that can be gimplified before the loop. */ 3771 base = get_inner_reference (base, &pbitsize, &pbitpos, &off, &pmode, 3772 &punsignedp, &reversep, &pvolatilep); 3773 gcc_assert (base && !reversep); 3774 poly_int64 pbytepos = exact_div (pbitpos, BITS_PER_UNIT); 3775 3776 if (TREE_CODE (base) == MEM_REF) 3777 { 3778 if (!integer_zerop (TREE_OPERAND (base, 1))) 3779 { 3780 if (off == NULL_TREE) 3781 off = wide_int_to_tree (sizetype, mem_ref_offset (base)); 3782 else 3783 off = size_binop (PLUS_EXPR, off, 3784 fold_convert (sizetype, TREE_OPERAND (base, 1))); 3785 } 3786 base = TREE_OPERAND (base, 0); 3787 } 3788 else 3789 base = build_fold_addr_expr (base); 3790 3791 if (off == NULL_TREE) 3792 off = size_zero_node; 3793 3794 /* If base is not loop invariant, either off is 0, then we start with just 3795 the constant offset in the loop invariant BASE and continue with base 3796 as OFF, otherwise give up. 3797 We could handle that case by gimplifying the addition of base + off 3798 into some SSA_NAME and use that as off, but for now punt. */ 3799 if (!expr_invariant_in_loop_p (loop, base)) 3800 { 3801 if (!integer_zerop (off)) 3802 return false; 3803 off = base; 3804 base = size_int (pbytepos); 3805 } 3806 /* Otherwise put base + constant offset into the loop invariant BASE 3807 and continue with OFF. */ 3808 else 3809 { 3810 base = fold_convert (sizetype, base); 3811 base = size_binop (PLUS_EXPR, base, size_int (pbytepos)); 3812 } 3813 3814 /* OFF at this point may be either a SSA_NAME or some tree expression 3815 from get_inner_reference. Try to peel off loop invariants from it 3816 into BASE as long as possible. 
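For example, if the analyzed offset is ((sizetype) i_7 * 4) + 16 with i_7 defined in the loop (i_7 being an illustrative SSA name), the loop below folds the invariant + 16 into BASE, records the * 4 as SCALE when the target supports that scale, peels the widening conversion where the offset-width rules allow it, and leaves OFF as the SSA_NAME i_7, which is what will be vectorized as the offset.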
*/ 3817 STRIP_NOPS (off); 3818 while (offtype == NULL_TREE) 3819 { 3820 enum tree_code code; 3821 tree op0, op1, add = NULL_TREE; 3822 3823 if (TREE_CODE (off) == SSA_NAME) 3824 { 3825 gimple *def_stmt = SSA_NAME_DEF_STMT (off); 3826 3827 if (expr_invariant_in_loop_p (loop, off)) 3828 return false; 3829 3830 if (gimple_code (def_stmt) != GIMPLE_ASSIGN) 3831 break; 3832 3833 op0 = gimple_assign_rhs1 (def_stmt); 3834 code = gimple_assign_rhs_code (def_stmt); 3835 op1 = gimple_assign_rhs2 (def_stmt); 3836 } 3837 else 3838 { 3839 if (get_gimple_rhs_class (TREE_CODE (off)) == GIMPLE_TERNARY_RHS) 3840 return false; 3841 code = TREE_CODE (off); 3842 extract_ops_from_tree (off, &code, &op0, &op1); 3843 } 3844 switch (code) 3845 { 3846 case POINTER_PLUS_EXPR: 3847 case PLUS_EXPR: 3848 if (expr_invariant_in_loop_p (loop, op0)) 3849 { 3850 add = op0; 3851 off = op1; 3852 do_add: 3853 add = fold_convert (sizetype, add); 3854 if (scale != 1) 3855 add = size_binop (MULT_EXPR, add, size_int (scale)); 3856 base = size_binop (PLUS_EXPR, base, add); 3857 continue; 3858 } 3859 if (expr_invariant_in_loop_p (loop, op1)) 3860 { 3861 add = op1; 3862 off = op0; 3863 goto do_add; 3864 } 3865 break; 3866 case MINUS_EXPR: 3867 if (expr_invariant_in_loop_p (loop, op1)) 3868 { 3869 add = fold_convert (sizetype, op1); 3870 add = size_binop (MINUS_EXPR, size_zero_node, add); 3871 off = op0; 3872 goto do_add; 3873 } 3874 break; 3875 case MULT_EXPR: 3876 if (scale == 1 && tree_fits_shwi_p (op1)) 3877 { 3878 int new_scale = tree_to_shwi (op1); 3879 /* Only treat this as a scaling operation if the target 3880 supports it. */ 3881 if (use_ifn_p 3882 && !vect_gather_scatter_fn_p (DR_IS_READ (dr), masked_p, 3883 vectype, memory_type, 1, 3884 TYPE_SIGN (TREE_TYPE (op0)), 3885 new_scale, &ifn, 3886 &element_type)) 3887 break; 3888 scale = new_scale; 3889 off = op0; 3890 continue; 3891 } 3892 break; 3893 case SSA_NAME: 3894 off = op0; 3895 continue; 3896 CASE_CONVERT: 3897 if (!POINTER_TYPE_P (TREE_TYPE (op0)) 3898 && !INTEGRAL_TYPE_P (TREE_TYPE (op0))) 3899 break; 3900 if (TYPE_PRECISION (TREE_TYPE (op0)) 3901 == TYPE_PRECISION (TREE_TYPE (off))) 3902 { 3903 off = op0; 3904 continue; 3905 } 3906 3907 /* The internal functions need the offset to be the same width 3908 as the elements of VECTYPE. Don't include operations that 3909 cast the offset from that width to a different width. */ 3910 if (use_ifn_p 3911 && (int_size_in_bytes (TREE_TYPE (vectype)) 3912 == int_size_in_bytes (TREE_TYPE (off)))) 3913 break; 3914 3915 if (TYPE_PRECISION (TREE_TYPE (op0)) 3916 < TYPE_PRECISION (TREE_TYPE (off))) 3917 { 3918 off = op0; 3919 offtype = TREE_TYPE (off); 3920 STRIP_NOPS (off); 3921 continue; 3922 } 3923 break; 3924 default: 3925 break; 3926 } 3927 break; 3928 } 3929 3930 /* If at the end OFF still isn't a SSA_NAME or isn't 3931 defined in the loop, punt. 
*/ 3932 if (TREE_CODE (off) != SSA_NAME 3933 || expr_invariant_in_loop_p (loop, off)) 3934 return false; 3935 3936 if (offtype == NULL_TREE) 3937 offtype = TREE_TYPE (off); 3938 3939 if (use_ifn_p) 3940 { 3941 if (!vect_gather_scatter_fn_p (DR_IS_READ (dr), masked_p, vectype, 3942 memory_type, TYPE_PRECISION (offtype), 3943 TYPE_SIGN (offtype), scale, &ifn, 3944 &element_type)) 3945 return false; 3946 } 3947 else 3948 { 3949 if (DR_IS_READ (dr)) 3950 { 3951 if (targetm.vectorize.builtin_gather) 3952 decl = targetm.vectorize.builtin_gather (vectype, offtype, scale); 3953 } 3954 else 3955 { 3956 if (targetm.vectorize.builtin_scatter) 3957 decl = targetm.vectorize.builtin_scatter (vectype, offtype, scale); 3958 } 3959 3960 if (!decl) 3961 return false; 3962 3963 ifn = IFN_LAST; 3964 element_type = TREE_TYPE (vectype); 3965 } 3966 3967 info->ifn = ifn; 3968 info->decl = decl; 3969 info->base = base; 3970 info->offset = off; 3971 info->offset_dt = vect_unknown_def_type; 3972 info->offset_vectype = NULL_TREE; 3973 info->scale = scale; 3974 info->element_type = element_type; 3975 info->memory_type = memory_type; 3976 return true; 3977 } 3978 3979 /* Function vect_analyze_data_refs. 3980 3981 Find all the data references in the loop or basic block. 3982 3983 The general structure of the analysis of data refs in the vectorizer is as 3984 follows: 3985 1- vect_analyze_data_refs(loop/bb): call 3986 compute_data_dependences_for_loop/bb to find and analyze all data-refs 3987 in the loop/bb and their dependences. 3988 2- vect_analyze_dependences(): apply dependence testing using ddrs. 3989 3- vect_analyze_drs_alignment(): check that ref_stmt.alignment is ok. 3990 4- vect_analyze_drs_access(): check that ref_stmt.step is ok. 3991 3992 */ 3993 3994 bool 3995 vect_analyze_data_refs (vec_info *vinfo, poly_uint64 *min_vf) 3996 { 3997 struct loop *loop = NULL; 3998 unsigned int i; 3999 struct data_reference *dr; 4000 tree scalar_type; 4001 4002 if (dump_enabled_p ()) 4003 dump_printf_loc (MSG_NOTE, vect_location, 4004 "=== vect_analyze_data_refs ===\n"); 4005 4006 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo)) 4007 loop = LOOP_VINFO_LOOP (loop_vinfo); 4008 4009 /* Go through the data-refs, check that the analysis succeeded. Update 4010 pointer from stmt_vec_info struct to DR and vectype. */ 4011 4012 vec<data_reference_p> datarefs = vinfo->datarefs; 4013 FOR_EACH_VEC_ELT (datarefs, i, dr) 4014 { 4015 gimple *stmt; 4016 stmt_vec_info stmt_info; 4017 tree base, offset, init; 4018 enum { SG_NONE, GATHER, SCATTER } gatherscatter = SG_NONE; 4019 bool simd_lane_access = false; 4020 poly_uint64 vf; 4021 4022 again: 4023 if (!dr || !DR_REF (dr)) 4024 { 4025 if (dump_enabled_p ()) 4026 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 4027 "not vectorized: unhandled data-ref\n"); 4028 return false; 4029 } 4030 4031 stmt = DR_STMT (dr); 4032 stmt_info = vinfo_for_stmt (stmt); 4033 4034 /* Discard clobbers from the dataref vector. We will remove 4035 clobber stmts during vectorization. */ 4036 if (gimple_clobber_p (stmt)) 4037 { 4038 free_data_ref (dr); 4039 if (i == datarefs.length () - 1) 4040 { 4041 datarefs.pop (); 4042 break; 4043 } 4044 datarefs.ordered_remove (i); 4045 dr = datarefs[i]; 4046 goto again; 4047 } 4048 4049 /* Check that analysis of the data-ref succeeded. 
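For a non-affine access such as a[b[i]] the analyzer cannot express the address as base + offset + init + step * iteration, so one or more of the fields tested below are missing; such refs get a second chance further down as gather/scatter candidates or as GOMP SIMD lane accesses before we finally give up on them.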
*/ 4050 if (!DR_BASE_ADDRESS (dr) || !DR_OFFSET (dr) || !DR_INIT (dr) 4051 || !DR_STEP (dr)) 4052 { 4053 bool maybe_gather 4054 = DR_IS_READ (dr) 4055 && !TREE_THIS_VOLATILE (DR_REF (dr)) 4056 && (targetm.vectorize.builtin_gather != NULL 4057 || supports_vec_gather_load_p ()); 4058 bool maybe_scatter 4059 = DR_IS_WRITE (dr) 4060 && !TREE_THIS_VOLATILE (DR_REF (dr)) 4061 && (targetm.vectorize.builtin_scatter != NULL 4062 || supports_vec_scatter_store_p ()); 4063 bool maybe_simd_lane_access 4064 = is_a <loop_vec_info> (vinfo) && loop->simduid; 4065 4066 /* If target supports vector gather loads or scatter stores, or if 4067 this might be a SIMD lane access, see if they can't be used. */ 4068 if (is_a <loop_vec_info> (vinfo) 4069 && (maybe_gather || maybe_scatter || maybe_simd_lane_access) 4070 && !nested_in_vect_loop_p (loop, stmt)) 4071 { 4072 struct data_reference *newdr 4073 = create_data_ref (NULL, loop_containing_stmt (stmt), 4074 DR_REF (dr), stmt, !maybe_scatter, 4075 DR_IS_CONDITIONAL_IN_STMT (dr)); 4076 gcc_assert (newdr != NULL && DR_REF (newdr)); 4077 if (DR_BASE_ADDRESS (newdr) 4078 && DR_OFFSET (newdr) 4079 && DR_INIT (newdr) 4080 && DR_STEP (newdr) 4081 && integer_zerop (DR_STEP (newdr))) 4082 { 4083 if (maybe_simd_lane_access) 4084 { 4085 tree off = DR_OFFSET (newdr); 4086 STRIP_NOPS (off); 4087 if (TREE_CODE (DR_INIT (newdr)) == INTEGER_CST 4088 && TREE_CODE (off) == MULT_EXPR 4089 && tree_fits_uhwi_p (TREE_OPERAND (off, 1))) 4090 { 4091 tree step = TREE_OPERAND (off, 1); 4092 off = TREE_OPERAND (off, 0); 4093 STRIP_NOPS (off); 4094 if (CONVERT_EXPR_P (off) 4095 && TYPE_PRECISION (TREE_TYPE (TREE_OPERAND (off, 4096 0))) 4097 < TYPE_PRECISION (TREE_TYPE (off))) 4098 off = TREE_OPERAND (off, 0); 4099 if (TREE_CODE (off) == SSA_NAME) 4100 { 4101 gimple *def = SSA_NAME_DEF_STMT (off); 4102 tree reft = TREE_TYPE (DR_REF (newdr)); 4103 if (is_gimple_call (def) 4104 && gimple_call_internal_p (def) 4105 && (gimple_call_internal_fn (def) 4106 == IFN_GOMP_SIMD_LANE)) 4107 { 4108 tree arg = gimple_call_arg (def, 0); 4109 gcc_assert (TREE_CODE (arg) == SSA_NAME); 4110 arg = SSA_NAME_VAR (arg); 4111 if (arg == loop->simduid 4112 /* For now. 
*/ 4113 && tree_int_cst_equal 4114 (TYPE_SIZE_UNIT (reft), 4115 step)) 4116 { 4117 DR_OFFSET (newdr) = ssize_int (0); 4118 DR_STEP (newdr) = step; 4119 DR_OFFSET_ALIGNMENT (newdr) 4120 = BIGGEST_ALIGNMENT; 4121 DR_STEP_ALIGNMENT (newdr) 4122 = highest_pow2_factor (step); 4123 dr = newdr; 4124 simd_lane_access = true; 4125 } 4126 } 4127 } 4128 } 4129 } 4130 if (!simd_lane_access && (maybe_gather || maybe_scatter)) 4131 { 4132 dr = newdr; 4133 if (maybe_gather) 4134 gatherscatter = GATHER; 4135 else 4136 gatherscatter = SCATTER; 4137 } 4138 } 4139 if (gatherscatter == SG_NONE && !simd_lane_access) 4140 free_data_ref (newdr); 4141 } 4142 4143 if (gatherscatter == SG_NONE && !simd_lane_access) 4144 { 4145 if (dump_enabled_p ()) 4146 { 4147 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 4148 "not vectorized: data ref analysis " 4149 "failed "); 4150 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0); 4151 } 4152 4153 if (is_a <bb_vec_info> (vinfo)) 4154 break; 4155 4156 return false; 4157 } 4158 } 4159 4160 if (TREE_CODE (DR_BASE_ADDRESS (dr)) == INTEGER_CST) 4161 { 4162 if (dump_enabled_p ()) 4163 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 4164 "not vectorized: base addr of dr is a " 4165 "constant\n"); 4166 4167 if (is_a <bb_vec_info> (vinfo)) 4168 break; 4169 4170 if (gatherscatter != SG_NONE || simd_lane_access) 4171 free_data_ref (dr); 4172 return false; 4173 } 4174 4175 if (TREE_THIS_VOLATILE (DR_REF (dr))) 4176 { 4177 if (dump_enabled_p ()) 4178 { 4179 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 4180 "not vectorized: volatile type "); 4181 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0); 4182 } 4183 4184 if (is_a <bb_vec_info> (vinfo)) 4185 break; 4186 4187 return false; 4188 } 4189 4190 if (stmt_can_throw_internal (stmt)) 4191 { 4192 if (dump_enabled_p ()) 4193 { 4194 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 4195 "not vectorized: statement can throw an " 4196 "exception "); 4197 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0); 4198 } 4199 4200 if (is_a <bb_vec_info> (vinfo)) 4201 break; 4202 4203 if (gatherscatter != SG_NONE || simd_lane_access) 4204 free_data_ref (dr); 4205 return false; 4206 } 4207 4208 if (TREE_CODE (DR_REF (dr)) == COMPONENT_REF 4209 && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (dr), 1))) 4210 { 4211 if (dump_enabled_p ()) 4212 { 4213 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 4214 "not vectorized: statement is bitfield " 4215 "access "); 4216 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0); 4217 } 4218 4219 if (is_a <bb_vec_info> (vinfo)) 4220 break; 4221 4222 if (gatherscatter != SG_NONE || simd_lane_access) 4223 free_data_ref (dr); 4224 return false; 4225 } 4226 4227 base = unshare_expr (DR_BASE_ADDRESS (dr)); 4228 offset = unshare_expr (DR_OFFSET (dr)); 4229 init = unshare_expr (DR_INIT (dr)); 4230 4231 if (is_gimple_call (stmt) 4232 && (!gimple_call_internal_p (stmt) 4233 || (gimple_call_internal_fn (stmt) != IFN_MASK_LOAD 4234 && gimple_call_internal_fn (stmt) != IFN_MASK_STORE))) 4235 { 4236 if (dump_enabled_p ()) 4237 { 4238 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 4239 "not vectorized: dr in a call "); 4240 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0); 4241 } 4242 4243 if (is_a <bb_vec_info> (vinfo)) 4244 break; 4245 4246 if (gatherscatter != SG_NONE || simd_lane_access) 4247 free_data_ref (dr); 4248 return false; 4249 } 4250 4251 /* Update DR field in stmt_vec_info struct. 
*/ 4252 4253 /* If the dataref is in an inner-loop of the loop that is considered for 4254 for vectorization, we also want to analyze the access relative to 4255 the outer-loop (DR contains information only relative to the 4256 inner-most enclosing loop). We do that by building a reference to the 4257 first location accessed by the inner-loop, and analyze it relative to 4258 the outer-loop. */ 4259 if (loop && nested_in_vect_loop_p (loop, stmt)) 4260 { 4261 /* Build a reference to the first location accessed by the 4262 inner loop: *(BASE + INIT + OFFSET). By construction, 4263 this address must be invariant in the inner loop, so we 4264 can consider it as being used in the outer loop. */ 4265 tree init_offset = fold_build2 (PLUS_EXPR, TREE_TYPE (offset), 4266 init, offset); 4267 tree init_addr = fold_build_pointer_plus (base, init_offset); 4268 tree init_ref = build_fold_indirect_ref (init_addr); 4269 4270 if (dump_enabled_p ()) 4271 { 4272 dump_printf_loc (MSG_NOTE, vect_location, 4273 "analyze in outer loop: "); 4274 dump_generic_expr (MSG_NOTE, TDF_SLIM, init_ref); 4275 dump_printf (MSG_NOTE, "\n"); 4276 } 4277 4278 if (!dr_analyze_innermost (&STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info), 4279 init_ref, loop)) 4280 /* dr_analyze_innermost already explained the failure. */ 4281 return false; 4282 4283 if (dump_enabled_p ()) 4284 { 4285 dump_printf_loc (MSG_NOTE, vect_location, 4286 "\touter base_address: "); 4287 dump_generic_expr (MSG_NOTE, TDF_SLIM, 4288 STMT_VINFO_DR_BASE_ADDRESS (stmt_info)); 4289 dump_printf (MSG_NOTE, "\n\touter offset from base address: "); 4290 dump_generic_expr (MSG_NOTE, TDF_SLIM, 4291 STMT_VINFO_DR_OFFSET (stmt_info)); 4292 dump_printf (MSG_NOTE, 4293 "\n\touter constant offset from base address: "); 4294 dump_generic_expr (MSG_NOTE, TDF_SLIM, 4295 STMT_VINFO_DR_INIT (stmt_info)); 4296 dump_printf (MSG_NOTE, "\n\touter step: "); 4297 dump_generic_expr (MSG_NOTE, TDF_SLIM, 4298 STMT_VINFO_DR_STEP (stmt_info)); 4299 dump_printf (MSG_NOTE, "\n\touter base alignment: %d\n", 4300 STMT_VINFO_DR_BASE_ALIGNMENT (stmt_info)); 4301 dump_printf (MSG_NOTE, "\n\touter base misalignment: %d\n", 4302 STMT_VINFO_DR_BASE_MISALIGNMENT (stmt_info)); 4303 dump_printf (MSG_NOTE, "\n\touter offset alignment: %d\n", 4304 STMT_VINFO_DR_OFFSET_ALIGNMENT (stmt_info)); 4305 dump_printf (MSG_NOTE, "\n\touter step alignment: %d\n", 4306 STMT_VINFO_DR_STEP_ALIGNMENT (stmt_info)); 4307 } 4308 } 4309 4310 if (STMT_VINFO_DATA_REF (stmt_info)) 4311 { 4312 if (dump_enabled_p ()) 4313 { 4314 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 4315 "not vectorized: more than one data ref " 4316 "in stmt: "); 4317 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0); 4318 } 4319 4320 if (is_a <bb_vec_info> (vinfo)) 4321 break; 4322 4323 if (gatherscatter != SG_NONE || simd_lane_access) 4324 free_data_ref (dr); 4325 return false; 4326 } 4327 4328 STMT_VINFO_DATA_REF (stmt_info) = dr; 4329 if (simd_lane_access) 4330 { 4331 STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info) = true; 4332 free_data_ref (datarefs[i]); 4333 datarefs[i] = dr; 4334 } 4335 4336 if (TREE_CODE (DR_BASE_ADDRESS (dr)) == ADDR_EXPR 4337 && VAR_P (TREE_OPERAND (DR_BASE_ADDRESS (dr), 0)) 4338 && DECL_NONALIASED (TREE_OPERAND (DR_BASE_ADDRESS (dr), 0))) 4339 { 4340 if (dump_enabled_p ()) 4341 { 4342 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 4343 "not vectorized: base object not addressable " 4344 "for stmt: "); 4345 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0); 4346 } 4347 if (is_a <bb_vec_info> 
(vinfo)) 4348 { 4349 /* In BB vectorization the ref can still participate 4350 in dependence analysis, we just can't vectorize it. */ 4351 STMT_VINFO_VECTORIZABLE (stmt_info) = false; 4352 continue; 4353 } 4354 return false; 4355 } 4356 4357 /* Set vectype for STMT. */ 4358 scalar_type = TREE_TYPE (DR_REF (dr)); 4359 STMT_VINFO_VECTYPE (stmt_info) 4360 = get_vectype_for_scalar_type (scalar_type); 4361 if (!STMT_VINFO_VECTYPE (stmt_info)) 4362 { 4363 if (dump_enabled_p ()) 4364 { 4365 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 4366 "not vectorized: no vectype for stmt: "); 4367 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0); 4368 dump_printf (MSG_MISSED_OPTIMIZATION, " scalar_type: "); 4369 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_DETAILS, 4370 scalar_type); 4371 dump_printf (MSG_MISSED_OPTIMIZATION, "\n"); 4372 } 4373 4374 if (is_a <bb_vec_info> (vinfo)) 4375 { 4376 /* No vector type is fine, the ref can still participate 4377 in dependence analysis, we just can't vectorize it. */ 4378 STMT_VINFO_VECTORIZABLE (stmt_info) = false; 4379 continue; 4380 } 4381 4382 if (gatherscatter != SG_NONE || simd_lane_access) 4383 { 4384 STMT_VINFO_DATA_REF (stmt_info) = NULL; 4385 if (gatherscatter != SG_NONE) 4386 free_data_ref (dr); 4387 } 4388 return false; 4389 } 4390 else 4391 { 4392 if (dump_enabled_p ()) 4393 { 4394 dump_printf_loc (MSG_NOTE, vect_location, 4395 "got vectype for stmt: "); 4396 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0); 4397 dump_generic_expr (MSG_NOTE, TDF_SLIM, 4398 STMT_VINFO_VECTYPE (stmt_info)); 4399 dump_printf (MSG_NOTE, "\n"); 4400 } 4401 } 4402 4403 /* Adjust the minimal vectorization factor according to the 4404 vector type. */ 4405 vf = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info)); 4406 *min_vf = upper_bound (*min_vf, vf); 4407 4408 if (gatherscatter != SG_NONE) 4409 { 4410 gather_scatter_info gs_info; 4411 if (!vect_check_gather_scatter (stmt, as_a <loop_vec_info> (vinfo), 4412 &gs_info) 4413 || !get_vectype_for_scalar_type (TREE_TYPE (gs_info.offset))) 4414 { 4415 STMT_VINFO_DATA_REF (stmt_info) = NULL; 4416 free_data_ref (dr); 4417 if (dump_enabled_p ()) 4418 { 4419 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 4420 (gatherscatter == GATHER) ? 4421 "not vectorized: not suitable for gather " 4422 "load " : 4423 "not vectorized: not suitable for scatter " 4424 "store "); 4425 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0); 4426 } 4427 return false; 4428 } 4429 4430 free_data_ref (datarefs[i]); 4431 datarefs[i] = dr; 4432 STMT_VINFO_GATHER_SCATTER_P (stmt_info) = gatherscatter; 4433 } 4434 4435 else if (is_a <loop_vec_info> (vinfo) 4436 && TREE_CODE (DR_STEP (dr)) != INTEGER_CST) 4437 { 4438 if (nested_in_vect_loop_p (loop, stmt)) 4439 { 4440 if (dump_enabled_p ()) 4441 { 4442 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 4443 "not vectorized: not suitable for strided " 4444 "load "); 4445 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0); 4446 } 4447 return false; 4448 } 4449 STMT_VINFO_STRIDED_P (stmt_info) = true; 4450 } 4451 } 4452 4453 /* If we stopped analysis at the first dataref we could not analyze 4454 when trying to vectorize a basic-block mark the rest of the datarefs 4455 as not vectorizable and truncate the vector of datarefs. That 4456 avoids spending useless time in analyzing their dependence. 
*/ 4457 if (i != datarefs.length ()) 4458 { 4459 gcc_assert (is_a <bb_vec_info> (vinfo)); 4460 for (unsigned j = i; j < datarefs.length (); ++j) 4461 { 4462 data_reference_p dr = datarefs[j]; 4463 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false; 4464 free_data_ref (dr); 4465 } 4466 datarefs.truncate (i); 4467 } 4468 4469 return true; 4470 } 4471 4472 4473 /* Function vect_get_new_vect_var. 4474 4475 Returns a name for a new variable. The current naming scheme appends the 4476 prefix "vect_" or "vect_p" (depending on the value of VAR_KIND) to 4477 the name of vectorizer generated variables, and appends that to NAME if 4478 provided. */ 4479 4480 tree 4481 vect_get_new_vect_var (tree type, enum vect_var_kind var_kind, const char *name) 4482 { 4483 const char *prefix; 4484 tree new_vect_var; 4485 4486 switch (var_kind) 4487 { 4488 case vect_simple_var: 4489 prefix = "vect"; 4490 break; 4491 case vect_scalar_var: 4492 prefix = "stmp"; 4493 break; 4494 case vect_mask_var: 4495 prefix = "mask"; 4496 break; 4497 case vect_pointer_var: 4498 prefix = "vectp"; 4499 break; 4500 default: 4501 gcc_unreachable (); 4502 } 4503 4504 if (name) 4505 { 4506 char* tmp = concat (prefix, "_", name, NULL); 4507 new_vect_var = create_tmp_reg (type, tmp); 4508 free (tmp); 4509 } 4510 else 4511 new_vect_var = create_tmp_reg (type, prefix); 4512 4513 return new_vect_var; 4514 } 4515 4516 /* Like vect_get_new_vect_var but return an SSA name. */ 4517 4518 tree 4519 vect_get_new_ssa_name (tree type, enum vect_var_kind var_kind, const char *name) 4520 { 4521 const char *prefix; 4522 tree new_vect_var; 4523 4524 switch (var_kind) 4525 { 4526 case vect_simple_var: 4527 prefix = "vect"; 4528 break; 4529 case vect_scalar_var: 4530 prefix = "stmp"; 4531 break; 4532 case vect_pointer_var: 4533 prefix = "vectp"; 4534 break; 4535 default: 4536 gcc_unreachable (); 4537 } 4538 4539 if (name) 4540 { 4541 char* tmp = concat (prefix, "_", name, NULL); 4542 new_vect_var = make_temp_ssa_name (type, NULL, tmp); 4543 free (tmp); 4544 } 4545 else 4546 new_vect_var = make_temp_ssa_name (type, NULL, prefix); 4547 4548 return new_vect_var; 4549 } 4550 4551 /* Duplicate ptr info and set alignment/misaligment on NAME from DR. */ 4552 4553 static void 4554 vect_duplicate_ssa_name_ptr_info (tree name, data_reference *dr) 4555 { 4556 duplicate_ssa_name_ptr_info (name, DR_PTR_INFO (dr)); 4557 int misalign = DR_MISALIGNMENT (dr); 4558 if (misalign == DR_MISALIGNMENT_UNKNOWN) 4559 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (name)); 4560 else 4561 set_ptr_info_alignment (SSA_NAME_PTR_INFO (name), 4562 DR_TARGET_ALIGNMENT (dr), misalign); 4563 } 4564 4565 /* Function vect_create_addr_base_for_vector_ref. 4566 4567 Create an expression that computes the address of the first memory location 4568 that will be accessed for a data reference. 4569 4570 Input: 4571 STMT: The statement containing the data reference. 4572 NEW_STMT_LIST: Must be initialized to NULL_TREE or a statement list. 4573 OFFSET: Optional. If supplied, it is be added to the initial address. 4574 LOOP: Specify relative to which loop-nest should the address be computed. 4575 For example, when the dataref is in an inner-loop nested in an 4576 outer-loop that is now being vectorized, LOOP can be either the 4577 outer-loop, or the inner-loop. 
The first memory location accessed 4578 by the following dataref ('in' points to short): 4579 4580 for (i=0; i<N; i++) 4581 for (j=0; j<M; j++) 4582 s += in[i+j] 4583 4584 is as follows: 4585 if LOOP=i_loop: &in (relative to i_loop) 4586 if LOOP=j_loop: &in+i*2B (relative to j_loop) 4587 BYTE_OFFSET: Optional, defaulted to NULL. If supplied, it is added to the 4588 initial address. Unlike OFFSET, which is number of elements to 4589 be added, BYTE_OFFSET is measured in bytes. 4590 4591 Output: 4592 1. Return an SSA_NAME whose value is the address of the memory location of 4593 the first vector of the data reference. 4594 2. If new_stmt_list is not NULL_TREE after return then the caller must insert 4595 these statement(s) which define the returned SSA_NAME. 4596 4597 FORNOW: We are only handling array accesses with step 1. */ 4598 4599 tree 4600 vect_create_addr_base_for_vector_ref (gimple *stmt, 4601 gimple_seq *new_stmt_list, 4602 tree offset, 4603 tree byte_offset) 4604 { 4605 stmt_vec_info stmt_info = vinfo_for_stmt (stmt); 4606 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info); 4607 const char *base_name; 4608 tree addr_base; 4609 tree dest; 4610 gimple_seq seq = NULL; 4611 tree vect_ptr_type; 4612 tree step = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr))); 4613 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info); 4614 innermost_loop_behavior *drb = vect_dr_behavior (dr); 4615 4616 tree data_ref_base = unshare_expr (drb->base_address); 4617 tree base_offset = unshare_expr (drb->offset); 4618 tree init = unshare_expr (drb->init); 4619 4620 if (loop_vinfo) 4621 base_name = get_name (data_ref_base); 4622 else 4623 { 4624 base_offset = ssize_int (0); 4625 init = ssize_int (0); 4626 base_name = get_name (DR_REF (dr)); 4627 } 4628 4629 /* Create base_offset */ 4630 base_offset = size_binop (PLUS_EXPR, 4631 fold_convert (sizetype, base_offset), 4632 fold_convert (sizetype, init)); 4633 4634 if (offset) 4635 { 4636 offset = fold_build2 (MULT_EXPR, sizetype, 4637 fold_convert (sizetype, offset), step); 4638 base_offset = fold_build2 (PLUS_EXPR, sizetype, 4639 base_offset, offset); 4640 } 4641 if (byte_offset) 4642 { 4643 byte_offset = fold_convert (sizetype, byte_offset); 4644 base_offset = fold_build2 (PLUS_EXPR, sizetype, 4645 base_offset, byte_offset); 4646 } 4647 4648 /* base + base_offset */ 4649 if (loop_vinfo) 4650 addr_base = fold_build_pointer_plus (data_ref_base, base_offset); 4651 else 4652 { 4653 addr_base = build1 (ADDR_EXPR, 4654 build_pointer_type (TREE_TYPE (DR_REF (dr))), 4655 unshare_expr (DR_REF (dr))); 4656 } 4657 4658 vect_ptr_type = build_pointer_type (STMT_VINFO_VECTYPE (stmt_info)); 4659 dest = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var, base_name); 4660 addr_base = force_gimple_operand (addr_base, &seq, true, dest); 4661 gimple_seq_add_seq (new_stmt_list, seq); 4662 4663 if (DR_PTR_INFO (dr) 4664 && TREE_CODE (addr_base) == SSA_NAME 4665 && !SSA_NAME_PTR_INFO (addr_base)) 4666 { 4667 vect_duplicate_ssa_name_ptr_info (addr_base, dr); 4668 if (offset || byte_offset) 4669 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (addr_base)); 4670 } 4671 4672 if (dump_enabled_p ()) 4673 { 4674 dump_printf_loc (MSG_NOTE, vect_location, "created "); 4675 dump_generic_expr (MSG_NOTE, TDF_SLIM, addr_base); 4676 dump_printf (MSG_NOTE, "\n"); 4677 } 4678 4679 return addr_base; 4680 } 4681 4682 4683 /* Function vect_create_data_ref_ptr. 
4684 4685 Create a new pointer-to-AGGR_TYPE variable (ap), that points to the first 4686 location accessed in the loop by STMT, along with the def-use update 4687 chain to appropriately advance the pointer through the loop iterations. 4688 Also set aliasing information for the pointer. This pointer is used by 4689 the callers to this function to create a memory reference expression for 4690 vector load/store access. 4691 4692 Input: 4693 1. STMT: a stmt that references memory. Expected to be of the form 4694 GIMPLE_ASSIGN <name, data-ref> or 4695 GIMPLE_ASSIGN <data-ref, name>. 4696 2. AGGR_TYPE: the type of the reference, which should be either a vector 4697 or an array. 4698 3. AT_LOOP: the loop where the vector memref is to be created. 4699 4. OFFSET (optional): an offset to be added to the initial address accessed 4700 by the data-ref in STMT. 4701 5. BSI: location where the new stmts are to be placed if there is no loop 4702 6. ONLY_INIT: indicate if ap is to be updated in the loop, or remain 4703 pointing to the initial address. 4704 7. BYTE_OFFSET (optional, defaults to NULL): a byte offset to be added 4705 to the initial address accessed by the data-ref in STMT. This is 4706 similar to OFFSET, but OFFSET is counted in elements, while BYTE_OFFSET 4707 in bytes. 4708 8. IV_STEP (optional, defaults to NULL): the amount that should be added 4709 to the IV during each iteration of the loop. NULL says to move 4710 by one copy of AGGR_TYPE up or down, depending on the step of the 4711 data reference. 4712 4713 Output: 4714 1. Declare a new ptr to vector_type, and have it point to the base of the 4715 data reference (initial address accessed by the data reference). 4716 For example, for vector of type V8HI, the following code is generated: 4717 4718 v8hi *ap; 4719 ap = (v8hi *)initial_address; 4720 4721 if OFFSET is not supplied: 4722 initial_address = &a[init]; 4723 if OFFSET is supplied: 4724 initial_address = &a[init + OFFSET]; 4725 if BYTE_OFFSET is supplied: 4726 initial_address = &a[init] + BYTE_OFFSET; 4727 4728 Return the initial_address in INITIAL_ADDRESS. 4729 4730 2. If ONLY_INIT is true, just return the initial pointer. Otherwise, also 4731 update the pointer in each iteration of the loop. 4732 4733 Return the increment stmt that updates the pointer in PTR_INCR. 4734 4735 3. Set INV_P to true if the access pattern of the data reference in the 4736 vectorized loop is invariant. Set it to false otherwise. 4737 4738 4. Return the pointer.
*/ 4739 4740 tree 4741 vect_create_data_ref_ptr (gimple *stmt, tree aggr_type, struct loop *at_loop, 4742 tree offset, tree *initial_address, 4743 gimple_stmt_iterator *gsi, gimple **ptr_incr, 4744 bool only_init, bool *inv_p, tree byte_offset, 4745 tree iv_step) 4746 { 4747 const char *base_name; 4748 stmt_vec_info stmt_info = vinfo_for_stmt (stmt); 4749 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info); 4750 struct loop *loop = NULL; 4751 bool nested_in_vect_loop = false; 4752 struct loop *containing_loop = NULL; 4753 tree aggr_ptr_type; 4754 tree aggr_ptr; 4755 tree new_temp; 4756 gimple_seq new_stmt_list = NULL; 4757 edge pe = NULL; 4758 basic_block new_bb; 4759 tree aggr_ptr_init; 4760 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info); 4761 tree aptr; 4762 gimple_stmt_iterator incr_gsi; 4763 bool insert_after; 4764 tree indx_before_incr, indx_after_incr; 4765 gimple *incr; 4766 tree step; 4767 bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info); 4768 4769 gcc_assert (iv_step != NULL_TREE 4770 || TREE_CODE (aggr_type) == ARRAY_TYPE 4771 || TREE_CODE (aggr_type) == VECTOR_TYPE); 4772 4773 if (loop_vinfo) 4774 { 4775 loop = LOOP_VINFO_LOOP (loop_vinfo); 4776 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt); 4777 containing_loop = (gimple_bb (stmt))->loop_father; 4778 pe = loop_preheader_edge (loop); 4779 } 4780 else 4781 { 4782 gcc_assert (bb_vinfo); 4783 only_init = true; 4784 *ptr_incr = NULL; 4785 } 4786 4787 /* Check the step (evolution) of the load in LOOP, and record 4788 whether it's invariant. */ 4789 step = vect_dr_behavior (dr)->step; 4790 if (integer_zerop (step)) 4791 *inv_p = true; 4792 else 4793 *inv_p = false; 4794 4795 /* Create an expression for the first address accessed by this load 4796 in LOOP. */ 4797 base_name = get_name (DR_BASE_ADDRESS (dr)); 4798 4799 if (dump_enabled_p ()) 4800 { 4801 tree dr_base_type = TREE_TYPE (DR_BASE_OBJECT (dr)); 4802 dump_printf_loc (MSG_NOTE, vect_location, 4803 "create %s-pointer variable to type: ", 4804 get_tree_code_name (TREE_CODE (aggr_type))); 4805 dump_generic_expr (MSG_NOTE, TDF_SLIM, aggr_type); 4806 if (TREE_CODE (dr_base_type) == ARRAY_TYPE) 4807 dump_printf (MSG_NOTE, " vectorizing an array ref: "); 4808 else if (TREE_CODE (dr_base_type) == VECTOR_TYPE) 4809 dump_printf (MSG_NOTE, " vectorizing a vector ref: "); 4810 else if (TREE_CODE (dr_base_type) == RECORD_TYPE) 4811 dump_printf (MSG_NOTE, " vectorizing a record based array ref: "); 4812 else 4813 dump_printf (MSG_NOTE, " vectorizing a pointer ref: "); 4814 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_BASE_OBJECT (dr)); 4815 dump_printf (MSG_NOTE, "\n"); 4816 } 4817 4818 /* (1) Create the new aggregate-pointer variable. 4819 Vector and array types inherit the alias set of their component 4820 type by default so we need to use a ref-all pointer if the data 4821 reference does not conflict with the created aggregated data 4822 reference because it is not addressable. */ 4823 bool need_ref_all = false; 4824 if (!alias_sets_conflict_p (get_alias_set (aggr_type), 4825 get_alias_set (DR_REF (dr)))) 4826 need_ref_all = true; 4827 /* Likewise for any of the data references in the stmt group. 
*/ 4828 else if (STMT_VINFO_GROUP_SIZE (stmt_info) > 1) 4829 { 4830 gimple *orig_stmt = STMT_VINFO_GROUP_FIRST_ELEMENT (stmt_info); 4831 do 4832 { 4833 stmt_vec_info sinfo = vinfo_for_stmt (orig_stmt); 4834 struct data_reference *sdr = STMT_VINFO_DATA_REF (sinfo); 4835 if (!alias_sets_conflict_p (get_alias_set (aggr_type), 4836 get_alias_set (DR_REF (sdr)))) 4837 { 4838 need_ref_all = true; 4839 break; 4840 } 4841 orig_stmt = STMT_VINFO_GROUP_NEXT_ELEMENT (sinfo); 4842 } 4843 while (orig_stmt); 4844 } 4845 aggr_ptr_type = build_pointer_type_for_mode (aggr_type, ptr_mode, 4846 need_ref_all); 4847 aggr_ptr = vect_get_new_vect_var (aggr_ptr_type, vect_pointer_var, base_name); 4848 4849 4850 /* Note: If the dataref is in an inner-loop nested in LOOP, and we are 4851 vectorizing LOOP (i.e., outer-loop vectorization), we need to create two 4852 def-use update cycles for the pointer: one relative to the outer-loop 4853 (LOOP), which is what steps (3) and (4) below do. The other is relative 4854 to the inner-loop (which is the inner-most loop containing the dataref), 4855 and this is done be step (5) below. 4856 4857 When vectorizing inner-most loops, the vectorized loop (LOOP) is also the 4858 inner-most loop, and so steps (3),(4) work the same, and step (5) is 4859 redundant. Steps (3),(4) create the following: 4860 4861 vp0 = &base_addr; 4862 LOOP: vp1 = phi(vp0,vp2) 4863 ... 4864 ... 4865 vp2 = vp1 + step 4866 goto LOOP 4867 4868 If there is an inner-loop nested in loop, then step (5) will also be 4869 applied, and an additional update in the inner-loop will be created: 4870 4871 vp0 = &base_addr; 4872 LOOP: vp1 = phi(vp0,vp2) 4873 ... 4874 inner: vp3 = phi(vp1,vp4) 4875 vp4 = vp3 + inner_step 4876 if () goto inner 4877 ... 4878 vp2 = vp1 + step 4879 if () goto LOOP */ 4880 4881 /* (2) Calculate the initial address of the aggregate-pointer, and set 4882 the aggregate-pointer to point to it before the loop. */ 4883 4884 /* Create: (&(base[init_val+offset]+byte_offset) in the loop preheader. */ 4885 4886 new_temp = vect_create_addr_base_for_vector_ref (stmt, &new_stmt_list, 4887 offset, byte_offset); 4888 if (new_stmt_list) 4889 { 4890 if (pe) 4891 { 4892 new_bb = gsi_insert_seq_on_edge_immediate (pe, new_stmt_list); 4893 gcc_assert (!new_bb); 4894 } 4895 else 4896 gsi_insert_seq_before (gsi, new_stmt_list, GSI_SAME_STMT); 4897 } 4898 4899 *initial_address = new_temp; 4900 aggr_ptr_init = new_temp; 4901 4902 /* (3) Handle the updating of the aggregate-pointer inside the loop. 4903 This is needed when ONLY_INIT is false, and also when AT_LOOP is the 4904 inner-loop nested in LOOP (during outer-loop vectorization). */ 4905 4906 /* No update in loop is required. */ 4907 if (only_init && (!loop_vinfo || at_loop == loop)) 4908 aptr = aggr_ptr_init; 4909 else 4910 { 4911 if (iv_step == NULL_TREE) 4912 { 4913 /* The step of the aggregate pointer is the type size. */ 4914 iv_step = TYPE_SIZE_UNIT (aggr_type); 4915 /* One exception to the above is when the scalar step of the load in 4916 LOOP is zero. In this case the step here is also zero. 
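(The access is invariant in LOOP, so the pointer must not advance between vector iterations.)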
*/ 4917 if (*inv_p) 4918 iv_step = size_zero_node; 4919 else if (tree_int_cst_sgn (step) == -1) 4920 iv_step = fold_build1 (NEGATE_EXPR, TREE_TYPE (iv_step), iv_step); 4921 } 4922 4923 standard_iv_increment_position (loop, &incr_gsi, &insert_after); 4924 4925 create_iv (aggr_ptr_init, 4926 fold_convert (aggr_ptr_type, iv_step), 4927 aggr_ptr, loop, &incr_gsi, insert_after, 4928 &indx_before_incr, &indx_after_incr); 4929 incr = gsi_stmt (incr_gsi); 4930 set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo)); 4931 4932 /* Copy the points-to information if it exists. */ 4933 if (DR_PTR_INFO (dr)) 4934 { 4935 vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr); 4936 vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr); 4937 } 4938 if (ptr_incr) 4939 *ptr_incr = incr; 4940 4941 aptr = indx_before_incr; 4942 } 4943 4944 if (!nested_in_vect_loop || only_init) 4945 return aptr; 4946 4947 4948 /* (4) Handle the updating of the aggregate-pointer inside the inner-loop 4949 nested in LOOP, if exists. */ 4950 4951 gcc_assert (nested_in_vect_loop); 4952 if (!only_init) 4953 { 4954 standard_iv_increment_position (containing_loop, &incr_gsi, 4955 &insert_after); 4956 create_iv (aptr, fold_convert (aggr_ptr_type, DR_STEP (dr)), aggr_ptr, 4957 containing_loop, &incr_gsi, insert_after, &indx_before_incr, 4958 &indx_after_incr); 4959 incr = gsi_stmt (incr_gsi); 4960 set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo)); 4961 4962 /* Copy the points-to information if it exists. */ 4963 if (DR_PTR_INFO (dr)) 4964 { 4965 vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr); 4966 vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr); 4967 } 4968 if (ptr_incr) 4969 *ptr_incr = incr; 4970 4971 return indx_before_incr; 4972 } 4973 else 4974 gcc_unreachable (); 4975 } 4976 4977 4978 /* Function bump_vector_ptr 4979 4980 Increment a pointer (to a vector type) by vector-size. If requested, 4981 i.e. if PTR-INCR is given, then also connect the new increment stmt 4982 to the existing def-use update-chain of the pointer, by modifying 4983 the PTR_INCR as illustrated below: 4984 4985 The pointer def-use update-chain before this function: 4986 DATAREF_PTR = phi (p_0, p_2) 4987 .... 4988 PTR_INCR: p_2 = DATAREF_PTR + step 4989 4990 The pointer def-use update-chain after this function: 4991 DATAREF_PTR = phi (p_0, p_2) 4992 .... 4993 NEW_DATAREF_PTR = DATAREF_PTR + BUMP 4994 .... 4995 PTR_INCR: p_2 = NEW_DATAREF_PTR + step 4996 4997 Input: 4998 DATAREF_PTR - ssa_name of a pointer (to vector type) that is being updated 4999 in the loop. 5000 PTR_INCR - optional. The stmt that updates the pointer in each iteration of 5001 the loop. The increment amount across iterations is expected 5002 to be vector_size. 5003 BSI - location where the new update stmt is to be placed. 5004 STMT - the original scalar memory-access stmt that is being vectorized. 5005 BUMP - optional. The offset by which to bump the pointer. If not given, 5006 the offset is assumed to be vector_size. 5007 5008 Output: Return NEW_DATAREF_PTR as illustrated above. 
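For example (an illustrative use only), a caller that emits two consecutive vector accesses for one scalar access could advance the pointer between them with new_dataref_ptr = bump_vector_ptr (dataref_ptr, ptr_incr, gsi, stmt, NULL_TREE); where the NULL_TREE BUMP requests the default vector-size increment.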
5009 5010 */ 5011 5012 tree 5013 bump_vector_ptr (tree dataref_ptr, gimple *ptr_incr, gimple_stmt_iterator *gsi, 5014 gimple *stmt, tree bump) 5015 { 5016 stmt_vec_info stmt_info = vinfo_for_stmt (stmt); 5017 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info); 5018 tree vectype = STMT_VINFO_VECTYPE (stmt_info); 5019 tree update = TYPE_SIZE_UNIT (vectype); 5020 gassign *incr_stmt; 5021 ssa_op_iter iter; 5022 use_operand_p use_p; 5023 tree new_dataref_ptr; 5024 5025 if (bump) 5026 update = bump; 5027 5028 if (TREE_CODE (dataref_ptr) == SSA_NAME) 5029 new_dataref_ptr = copy_ssa_name (dataref_ptr); 5030 else 5031 new_dataref_ptr = make_ssa_name (TREE_TYPE (dataref_ptr)); 5032 incr_stmt = gimple_build_assign (new_dataref_ptr, POINTER_PLUS_EXPR, 5033 dataref_ptr, update); 5034 vect_finish_stmt_generation (stmt, incr_stmt, gsi); 5035 5036 /* Copy the points-to information if it exists. */ 5037 if (DR_PTR_INFO (dr)) 5038 { 5039 duplicate_ssa_name_ptr_info (new_dataref_ptr, DR_PTR_INFO (dr)); 5040 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (new_dataref_ptr)); 5041 } 5042 5043 if (!ptr_incr) 5044 return new_dataref_ptr; 5045 5046 /* Update the vector-pointer's cross-iteration increment. */ 5047 FOR_EACH_SSA_USE_OPERAND (use_p, ptr_incr, iter, SSA_OP_USE) 5048 { 5049 tree use = USE_FROM_PTR (use_p); 5050 5051 if (use == dataref_ptr) 5052 SET_USE (use_p, new_dataref_ptr); 5053 else 5054 gcc_assert (operand_equal_p (use, update, 0)); 5055 } 5056 5057 return new_dataref_ptr; 5058 } 5059 5060 5061 /* Copy memory reference info such as base/clique from the SRC reference 5062 to the DEST MEM_REF. */ 5063 5064 void 5065 vect_copy_ref_info (tree dest, tree src) 5066 { 5067 if (TREE_CODE (dest) != MEM_REF) 5068 return; 5069 5070 tree src_base = src; 5071 while (handled_component_p (src_base)) 5072 src_base = TREE_OPERAND (src_base, 0); 5073 if (TREE_CODE (src_base) != MEM_REF 5074 && TREE_CODE (src_base) != TARGET_MEM_REF) 5075 return; 5076 5077 MR_DEPENDENCE_CLIQUE (dest) = MR_DEPENDENCE_CLIQUE (src_base); 5078 MR_DEPENDENCE_BASE (dest) = MR_DEPENDENCE_BASE (src_base); 5079 } 5080 5081 5082 /* Function vect_create_destination_var. 5083 5084 Create a new temporary of type VECTYPE. */ 5085 5086 tree 5087 vect_create_destination_var (tree scalar_dest, tree vectype) 5088 { 5089 tree vec_dest; 5090 const char *name; 5091 char *new_name; 5092 tree type; 5093 enum vect_var_kind kind; 5094 5095 kind = vectype 5096 ? VECTOR_BOOLEAN_TYPE_P (vectype) 5097 ? vect_mask_var 5098 : vect_simple_var 5099 : vect_scalar_var; 5100 type = vectype ? vectype : TREE_TYPE (scalar_dest); 5101 5102 gcc_assert (TREE_CODE (scalar_dest) == SSA_NAME); 5103 5104 name = get_name (scalar_dest); 5105 if (name) 5106 new_name = xasprintf ("%s_%u", name, SSA_NAME_VERSION (scalar_dest)); 5107 else 5108 new_name = xasprintf ("_%u", SSA_NAME_VERSION (scalar_dest)); 5109 vec_dest = vect_get_new_vect_var (type, kind, new_name); 5110 free (new_name); 5111 5112 return vec_dest; 5113 } 5114 5115 /* Function vect_grouped_store_supported. 5116 5117 Returns TRUE if interleave high and interleave low permutations 5118 are supported, and FALSE otherwise. */ 5119 5120 bool 5121 vect_grouped_store_supported (tree vectype, unsigned HOST_WIDE_INT count) 5122 { 5123 machine_mode mode = TYPE_MODE (vectype); 5124 5125 /* vect_permute_store_chain requires the group size to be equal to 3 or 5126 be a power of two. 
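Group sizes of 2, 3, 4, 8, 16, ... are therefore acceptable, while e.g. 5, 6 and 7 are rejected just below.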
*/ 5127 if (count != 3 && exact_log2 (count) == -1) 5128 { 5129 if (dump_enabled_p ()) 5130 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 5131 "the size of the group of accesses" 5132 " is not a power of 2 or not equal to 3\n"); 5133 return false; 5134 } 5135 5136 /* Check that the permutation is supported. */ 5137 if (VECTOR_MODE_P (mode)) 5138 { 5139 unsigned int i; 5140 if (count == 3) 5141 { 5142 unsigned int j0 = 0, j1 = 0, j2 = 0; 5143 unsigned int i, j; 5144 5145 unsigned int nelt; 5146 if (!GET_MODE_NUNITS (mode).is_constant (&nelt)) 5147 { 5148 if (dump_enabled_p ()) 5149 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 5150 "cannot handle groups of 3 stores for" 5151 " variable-length vectors\n"); 5152 return false; 5153 } 5154 5155 vec_perm_builder sel (nelt, nelt, 1); 5156 sel.quick_grow (nelt); 5157 vec_perm_indices indices; 5158 for (j = 0; j < 3; j++) 5159 { 5160 int nelt0 = ((3 - j) * nelt) % 3; 5161 int nelt1 = ((3 - j) * nelt + 1) % 3; 5162 int nelt2 = ((3 - j) * nelt + 2) % 3; 5163 for (i = 0; i < nelt; i++) 5164 { 5165 if (3 * i + nelt0 < nelt) 5166 sel[3 * i + nelt0] = j0++; 5167 if (3 * i + nelt1 < nelt) 5168 sel[3 * i + nelt1] = nelt + j1++; 5169 if (3 * i + nelt2 < nelt) 5170 sel[3 * i + nelt2] = 0; 5171 } 5172 indices.new_vector (sel, 2, nelt); 5173 if (!can_vec_perm_const_p (mode, indices)) 5174 { 5175 if (dump_enabled_p ()) 5176 dump_printf (MSG_MISSED_OPTIMIZATION, 5177 "permutation op not supported by target.\n"); 5178 return false; 5179 } 5180 5181 for (i = 0; i < nelt; i++) 5182 { 5183 if (3 * i + nelt0 < nelt) 5184 sel[3 * i + nelt0] = 3 * i + nelt0; 5185 if (3 * i + nelt1 < nelt) 5186 sel[3 * i + nelt1] = 3 * i + nelt1; 5187 if (3 * i + nelt2 < nelt) 5188 sel[3 * i + nelt2] = nelt + j2++; 5189 } 5190 indices.new_vector (sel, 2, nelt); 5191 if (!can_vec_perm_const_p (mode, indices)) 5192 { 5193 if (dump_enabled_p ()) 5194 dump_printf (MSG_MISSED_OPTIMIZATION, 5195 "permutation op not supported by target.\n"); 5196 return false; 5197 } 5198 } 5199 return true; 5200 } 5201 else 5202 { 5203 /* If length is not equal to 3 then only power of 2 is supported. */ 5204 gcc_assert (pow2p_hwi (count)); 5205 poly_uint64 nelt = GET_MODE_NUNITS (mode); 5206 5207 /* The encoding has 2 interleaved stepped patterns. */ 5208 vec_perm_builder sel (nelt, 2, 3); 5209 sel.quick_grow (6); 5210 for (i = 0; i < 3; i++) 5211 { 5212 sel[i * 2] = i; 5213 sel[i * 2 + 1] = i + nelt; 5214 } 5215 vec_perm_indices indices (sel, 2, nelt); 5216 if (can_vec_perm_const_p (mode, indices)) 5217 { 5218 for (i = 0; i < 6; i++) 5219 sel[i] += exact_div (nelt, 2); 5220 indices.new_vector (sel, 2, nelt); 5221 if (can_vec_perm_const_p (mode, indices)) 5222 return true; 5223 } 5224 } 5225 } 5226 5227 if (dump_enabled_p ()) 5228 dump_printf (MSG_MISSED_OPTIMIZATION, 5229 "permutation op not supported by target.\n"); 5230 return false; 5231 } 5232 5233 5234 /* Return TRUE if vec_{mask_}store_lanes is available for COUNT vectors of 5235 type VECTYPE. MASKED_P says whether the masked form is needed. */ 5236 5237 bool 5238 vect_store_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count, 5239 bool masked_p) 5240 { 5241 if (masked_p) 5242 return vect_lanes_optab_supported_p ("vec_mask_store_lanes", 5243 vec_mask_store_lanes_optab, 5244 vectype, count); 5245 else 5246 return vect_lanes_optab_supported_p ("vec_store_lanes", 5247 vec_store_lanes_optab, 5248 vectype, count); 5249 } 5250 5251 5252 /* Function vect_permute_store_chain.
5253
5254 Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
5255 a power of 2 or equal to 3, generate interleave_high/low stmts to reorder
5256 the data correctly for the stores. Return the final references for stores
5257 in RESULT_CHAIN.
5258
5259 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5260 The input is 4 vectors each containing 8 elements. We assign a number to
5261 each element, the input sequence is:
5262
5263 1st vec: 0 1 2 3 4 5 6 7
5264 2nd vec: 8 9 10 11 12 13 14 15
5265 3rd vec: 16 17 18 19 20 21 22 23
5266 4th vec: 24 25 26 27 28 29 30 31
5267
5268 The output sequence should be:
5269
5270 1st vec: 0 8 16 24 1 9 17 25
5271 2nd vec: 2 10 18 26 3 11 19 27
5272 3rd vec: 4 12 20 28 5 13 21 29
5273 4th vec: 6 14 22 30 7 15 23 31
5274
5275 i.e., we interleave the contents of the four vectors in their order.
5276
5277 We use interleave_high/low instructions to create such output. The input of
5278 each interleave_high/low operation is two vectors:
5279 1st vec 2nd vec
5280 0 1 2 3 4 5 6 7
5281 the even elements of the result vector are obtained left-to-right from the
5282 high/low elements of the first vector. The odd elements of the result are
5283 obtained left-to-right from the high/low elements of the second vector.
5284 The output of interleave_high will be: 0 4 1 5
5285 and of interleave_low: 2 6 3 7
5286
5287
5288 The permutation is done in log LENGTH stages. In each stage interleave_high
5289 and interleave_low stmts are created for each pair of vectors in DR_CHAIN,
5290 where the first argument is taken from the first half of DR_CHAIN and the
5291 second argument from its second half.
5292 In our example,
5293
5294 I1: interleave_high (1st vec, 3rd vec)
5295 I2: interleave_low (1st vec, 3rd vec)
5296 I3: interleave_high (2nd vec, 4th vec)
5297 I4: interleave_low (2nd vec, 4th vec)
5298
5299 The output for the first stage is:
5300
5301 I1: 0 16 1 17 2 18 3 19
5302 I2: 4 20 5 21 6 22 7 23
5303 I3: 8 24 9 25 10 26 11 27
5304 I4: 12 28 13 29 14 30 15 31
5305
5306 The output of the second stage, i.e. the final result is:
5307
5308 I1: 0 8 16 24 1 9 17 25
5309 I2: 2 10 18 26 3 11 19 27
5310 I3: 4 12 20 28 5 13 21 29
5311 I4: 6 14 22 30 7 15 23 31. */
5312
5313 void
5314 vect_permute_store_chain (vec<tree> dr_chain,
5315 unsigned int length,
5316 gimple *stmt,
5317 gimple_stmt_iterator *gsi,
5318 vec<tree> *result_chain)
5319 {
5320 tree vect1, vect2, high, low;
5321 gimple *perm_stmt;
5322 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
5323 tree perm_mask_low, perm_mask_high;
5324 tree data_ref;
5325 tree perm3_mask_low, perm3_mask_high;
5326 unsigned int i, j, n, log_length = exact_log2 (length);
5327
5328 result_chain->quick_grow (length);
5329 memcpy (result_chain->address (), dr_chain.address (),
5330 length * sizeof (tree));
5331
5332 if (length == 3)
5333 {
5334 /* vect_grouped_store_supported ensures that this is constant.
*/ 5335 unsigned int nelt = TYPE_VECTOR_SUBPARTS (vectype).to_constant (); 5336 unsigned int j0 = 0, j1 = 0, j2 = 0; 5337 5338 vec_perm_builder sel (nelt, nelt, 1); 5339 sel.quick_grow (nelt); 5340 vec_perm_indices indices; 5341 for (j = 0; j < 3; j++) 5342 { 5343 int nelt0 = ((3 - j) * nelt) % 3; 5344 int nelt1 = ((3 - j) * nelt + 1) % 3; 5345 int nelt2 = ((3 - j) * nelt + 2) % 3; 5346 5347 for (i = 0; i < nelt; i++) 5348 { 5349 if (3 * i + nelt0 < nelt) 5350 sel[3 * i + nelt0] = j0++; 5351 if (3 * i + nelt1 < nelt) 5352 sel[3 * i + nelt1] = nelt + j1++; 5353 if (3 * i + nelt2 < nelt) 5354 sel[3 * i + nelt2] = 0; 5355 } 5356 indices.new_vector (sel, 2, nelt); 5357 perm3_mask_low = vect_gen_perm_mask_checked (vectype, indices); 5358 5359 for (i = 0; i < nelt; i++) 5360 { 5361 if (3 * i + nelt0 < nelt) 5362 sel[3 * i + nelt0] = 3 * i + nelt0; 5363 if (3 * i + nelt1 < nelt) 5364 sel[3 * i + nelt1] = 3 * i + nelt1; 5365 if (3 * i + nelt2 < nelt) 5366 sel[3 * i + nelt2] = nelt + j2++; 5367 } 5368 indices.new_vector (sel, 2, nelt); 5369 perm3_mask_high = vect_gen_perm_mask_checked (vectype, indices); 5370 5371 vect1 = dr_chain[0]; 5372 vect2 = dr_chain[1]; 5373 5374 /* Create interleaving stmt: 5375 low = VEC_PERM_EXPR <vect1, vect2, 5376 {j, nelt, *, j + 1, nelt + j + 1, *, 5377 j + 2, nelt + j + 2, *, ...}> */ 5378 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low"); 5379 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1, 5380 vect2, perm3_mask_low); 5381 vect_finish_stmt_generation (stmt, perm_stmt, gsi); 5382 5383 vect1 = data_ref; 5384 vect2 = dr_chain[2]; 5385 /* Create interleaving stmt: 5386 low = VEC_PERM_EXPR <vect1, vect2, 5387 {0, 1, nelt + j, 3, 4, nelt + j + 1, 5388 6, 7, nelt + j + 2, ...}> */ 5389 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high"); 5390 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1, 5391 vect2, perm3_mask_high); 5392 vect_finish_stmt_generation (stmt, perm_stmt, gsi); 5393 (*result_chain)[j] = data_ref; 5394 } 5395 } 5396 else 5397 { 5398 /* If length is not equal to 3 then only power of 2 is supported. */ 5399 gcc_assert (pow2p_hwi (length)); 5400 5401 /* The encoding has 2 interleaved stepped patterns. 
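For example, with 8 elements per vector the six encoded selector elements
{0, 8, 1, 9, 2, 10} expand to the interleave-high mask
{0, 8, 1, 9, 2, 10, 3, 11}; adding nelt/2 (here 4) to each encoded element
yields the interleave-low mask {4, 12, 5, 13, 6, 14, 7, 15}.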
*/ 5402 poly_uint64 nelt = TYPE_VECTOR_SUBPARTS (vectype); 5403 vec_perm_builder sel (nelt, 2, 3); 5404 sel.quick_grow (6); 5405 for (i = 0; i < 3; i++) 5406 { 5407 sel[i * 2] = i; 5408 sel[i * 2 + 1] = i + nelt; 5409 } 5410 vec_perm_indices indices (sel, 2, nelt); 5411 perm_mask_high = vect_gen_perm_mask_checked (vectype, indices); 5412 5413 for (i = 0; i < 6; i++) 5414 sel[i] += exact_div (nelt, 2); 5415 indices.new_vector (sel, 2, nelt); 5416 perm_mask_low = vect_gen_perm_mask_checked (vectype, indices); 5417 5418 for (i = 0, n = log_length; i < n; i++) 5419 { 5420 for (j = 0; j < length/2; j++) 5421 { 5422 vect1 = dr_chain[j]; 5423 vect2 = dr_chain[j+length/2]; 5424 5425 /* Create interleaving stmt: 5426 high = VEC_PERM_EXPR <vect1, vect2, {0, nelt, 1, nelt+1, 5427 ...}> */ 5428 high = make_temp_ssa_name (vectype, NULL, "vect_inter_high"); 5429 perm_stmt = gimple_build_assign (high, VEC_PERM_EXPR, vect1, 5430 vect2, perm_mask_high); 5431 vect_finish_stmt_generation (stmt, perm_stmt, gsi); 5432 (*result_chain)[2*j] = high; 5433 5434 /* Create interleaving stmt: 5435 low = VEC_PERM_EXPR <vect1, vect2, 5436 {nelt/2, nelt*3/2, nelt/2+1, nelt*3/2+1, 5437 ...}> */ 5438 low = make_temp_ssa_name (vectype, NULL, "vect_inter_low"); 5439 perm_stmt = gimple_build_assign (low, VEC_PERM_EXPR, vect1, 5440 vect2, perm_mask_low); 5441 vect_finish_stmt_generation (stmt, perm_stmt, gsi); 5442 (*result_chain)[2*j+1] = low; 5443 } 5444 memcpy (dr_chain.address (), result_chain->address (), 5445 length * sizeof (tree)); 5446 } 5447 } 5448 } 5449 5450 /* Function vect_setup_realignment 5451 5452 This function is called when vectorizing an unaligned load using 5453 the dr_explicit_realign[_optimized] scheme. 5454 This function generates the following code at the loop prolog: 5455 5456 p = initial_addr; 5457 x msq_init = *(floor(p)); # prolog load 5458 realignment_token = call target_builtin; 5459 loop: 5460 x msq = phi (msq_init, ---) 5461 5462 The stmts marked with x are generated only for the case of 5463 dr_explicit_realign_optimized. 5464 5465 The code above sets up a new (vector) pointer, pointing to the first 5466 location accessed by STMT, and a "floor-aligned" load using that pointer. 5467 It also generates code to compute the "realignment-token" (if the relevant 5468 target hook was defined), and creates a phi-node at the loop-header bb 5469 whose arguments are the result of the prolog-load (created by this 5470 function) and the result of a load that takes place in the loop (to be 5471 created by the caller to this function). 5472 5473 For the case of dr_explicit_realign_optimized: 5474 The caller to this function uses the phi-result (msq) to create the 5475 realignment code inside the loop, and sets up the missing phi argument, 5476 as follows: 5477 loop: 5478 msq = phi (msq_init, lsq) 5479 lsq = *(floor(p')); # load in loop 5480 result = realign_load (msq, lsq, realignment_token); 5481 5482 For the case of dr_explicit_realign: 5483 loop: 5484 msq = *(floor(p)); # load in loop 5485 p' = p + (VS-1); 5486 lsq = *(floor(p')); # load in loop 5487 result = realign_load (msq, lsq, realignment_token); 5488 5489 Input: 5490 STMT - (scalar) load stmt to be vectorized. This load accesses 5491 a memory location that may be unaligned. 5492 BSI - place where new code is to be inserted. 5493 ALIGNMENT_SUPPORT_SCHEME - which of the two misalignment handling schemes 5494 is used. 5495 5496 Output: 5497 REALIGNMENT_TOKEN - the result of a call to the builtin_mask_for_load 5498 target hook, if defined. 
5499 Return value - the result of the loop-header phi node. */ 5500 5501 tree 5502 vect_setup_realignment (gimple *stmt, gimple_stmt_iterator *gsi, 5503 tree *realignment_token, 5504 enum dr_alignment_support alignment_support_scheme, 5505 tree init_addr, 5506 struct loop **at_loop) 5507 { 5508 stmt_vec_info stmt_info = vinfo_for_stmt (stmt); 5509 tree vectype = STMT_VINFO_VECTYPE (stmt_info); 5510 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info); 5511 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info); 5512 struct loop *loop = NULL; 5513 edge pe = NULL; 5514 tree scalar_dest = gimple_assign_lhs (stmt); 5515 tree vec_dest; 5516 gimple *inc; 5517 tree ptr; 5518 tree data_ref; 5519 basic_block new_bb; 5520 tree msq_init = NULL_TREE; 5521 tree new_temp; 5522 gphi *phi_stmt; 5523 tree msq = NULL_TREE; 5524 gimple_seq stmts = NULL; 5525 bool inv_p; 5526 bool compute_in_loop = false; 5527 bool nested_in_vect_loop = false; 5528 struct loop *containing_loop = (gimple_bb (stmt))->loop_father; 5529 struct loop *loop_for_initial_load = NULL; 5530 5531 if (loop_vinfo) 5532 { 5533 loop = LOOP_VINFO_LOOP (loop_vinfo); 5534 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt); 5535 } 5536 5537 gcc_assert (alignment_support_scheme == dr_explicit_realign 5538 || alignment_support_scheme == dr_explicit_realign_optimized); 5539 5540 /* We need to generate three things: 5541 1. the misalignment computation 5542 2. the extra vector load (for the optimized realignment scheme). 5543 3. the phi node for the two vectors from which the realignment is 5544 done (for the optimized realignment scheme). */ 5545 5546 /* 1. Determine where to generate the misalignment computation. 5547 5548 If INIT_ADDR is NULL_TREE, this indicates that the misalignment 5549 calculation will be generated by this function, outside the loop (in the 5550 preheader). Otherwise, INIT_ADDR had already been computed for us by the 5551 caller, inside the loop. 5552 5553 Background: If the misalignment remains fixed throughout the iterations of 5554 the loop, then both realignment schemes are applicable, and also the 5555 misalignment computation can be done outside LOOP. This is because we are 5556 vectorizing LOOP, and so the memory accesses in LOOP advance in steps that 5557 are a multiple of VS (the Vector Size), and therefore the misalignment in 5558 different vectorized LOOP iterations is always the same. 5559 The problem arises only if the memory access is in an inner-loop nested 5560 inside LOOP, which is now being vectorized using outer-loop vectorization. 5561 This is the only case when the misalignment of the memory access may not 5562 remain fixed throughout the iterations of the inner-loop (as explained in 5563 detail in vect_supportable_dr_alignment). In this case, not only is the 5564 optimized realignment scheme not applicable, but also the misalignment 5565 computation (and generation of the realignment token that is passed to 5566 REALIGN_LOAD) have to be done inside the loop. 5567 5568 In short, INIT_ADDR indicates whether we are in a COMPUTE_IN_LOOP mode 5569 or not, which in turn determines if the misalignment is computed inside 5570 the inner-loop, or outside LOOP. */ 5571 5572 if (init_addr != NULL_TREE || !loop_vinfo) 5573 { 5574 compute_in_loop = true; 5575 gcc_assert (alignment_support_scheme == dr_explicit_realign); 5576 } 5577 5578 5579 /* 2. Determine where to generate the extra vector load. 
5580 5581 For the optimized realignment scheme, instead of generating two vector 5582 loads in each iteration, we generate a single extra vector load in the 5583 preheader of the loop, and in each iteration reuse the result of the 5584 vector load from the previous iteration. In case the memory access is in 5585 an inner-loop nested inside LOOP, which is now being vectorized using 5586 outer-loop vectorization, we need to determine whether this initial vector 5587 load should be generated at the preheader of the inner-loop, or can be 5588 generated at the preheader of LOOP. If the memory access has no evolution 5589 in LOOP, it can be generated in the preheader of LOOP. Otherwise, it has 5590 to be generated inside LOOP (in the preheader of the inner-loop). */ 5591 5592 if (nested_in_vect_loop) 5593 { 5594 tree outerloop_step = STMT_VINFO_DR_STEP (stmt_info); 5595 bool invariant_in_outerloop = 5596 (tree_int_cst_compare (outerloop_step, size_zero_node) == 0); 5597 loop_for_initial_load = (invariant_in_outerloop ? loop : loop->inner); 5598 } 5599 else 5600 loop_for_initial_load = loop; 5601 if (at_loop) 5602 *at_loop = loop_for_initial_load; 5603 5604 if (loop_for_initial_load) 5605 pe = loop_preheader_edge (loop_for_initial_load); 5606 5607 /* 3. For the case of the optimized realignment, create the first vector 5608 load at the loop preheader. */ 5609 5610 if (alignment_support_scheme == dr_explicit_realign_optimized) 5611 { 5612 /* Create msq_init = *(floor(p1)) in the loop preheader */ 5613 gassign *new_stmt; 5614 5615 gcc_assert (!compute_in_loop); 5616 vec_dest = vect_create_destination_var (scalar_dest, vectype); 5617 ptr = vect_create_data_ref_ptr (stmt, vectype, loop_for_initial_load, 5618 NULL_TREE, &init_addr, NULL, &inc, 5619 true, &inv_p); 5620 if (TREE_CODE (ptr) == SSA_NAME) 5621 new_temp = copy_ssa_name (ptr); 5622 else 5623 new_temp = make_ssa_name (TREE_TYPE (ptr)); 5624 unsigned int align = DR_TARGET_ALIGNMENT (dr); 5625 new_stmt = gimple_build_assign 5626 (new_temp, BIT_AND_EXPR, ptr, 5627 build_int_cst (TREE_TYPE (ptr), -(HOST_WIDE_INT) align)); 5628 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt); 5629 gcc_assert (!new_bb); 5630 data_ref 5631 = build2 (MEM_REF, TREE_TYPE (vec_dest), new_temp, 5632 build_int_cst (reference_alias_ptr_type (DR_REF (dr)), 0)); 5633 vect_copy_ref_info (data_ref, DR_REF (dr)); 5634 new_stmt = gimple_build_assign (vec_dest, data_ref); 5635 new_temp = make_ssa_name (vec_dest, new_stmt); 5636 gimple_assign_set_lhs (new_stmt, new_temp); 5637 if (pe) 5638 { 5639 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt); 5640 gcc_assert (!new_bb); 5641 } 5642 else 5643 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT); 5644 5645 msq_init = gimple_assign_lhs (new_stmt); 5646 } 5647 5648 /* 4. Create realignment token using a target builtin, if available. 5649 It is done either inside the containing loop, or before LOOP (as 5650 determined above). */ 5651 5652 if (targetm.vectorize.builtin_mask_for_load) 5653 { 5654 gcall *new_stmt; 5655 tree builtin_decl; 5656 5657 /* Compute INIT_ADDR - the initial addressed accessed by this memref. */ 5658 if (!init_addr) 5659 { 5660 /* Generate the INIT_ADDR computation outside LOOP. 
*/ 5661 init_addr = vect_create_addr_base_for_vector_ref (stmt, &stmts, 5662 NULL_TREE); 5663 if (loop) 5664 { 5665 pe = loop_preheader_edge (loop); 5666 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts); 5667 gcc_assert (!new_bb); 5668 } 5669 else 5670 gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT); 5671 } 5672 5673 builtin_decl = targetm.vectorize.builtin_mask_for_load (); 5674 new_stmt = gimple_build_call (builtin_decl, 1, init_addr); 5675 vec_dest = 5676 vect_create_destination_var (scalar_dest, 5677 gimple_call_return_type (new_stmt)); 5678 new_temp = make_ssa_name (vec_dest, new_stmt); 5679 gimple_call_set_lhs (new_stmt, new_temp); 5680 5681 if (compute_in_loop) 5682 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT); 5683 else 5684 { 5685 /* Generate the misalignment computation outside LOOP. */ 5686 pe = loop_preheader_edge (loop); 5687 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt); 5688 gcc_assert (!new_bb); 5689 } 5690 5691 *realignment_token = gimple_call_lhs (new_stmt); 5692 5693 /* The result of the CALL_EXPR to this builtin is determined from 5694 the value of the parameter and no global variables are touched 5695 which makes the builtin a "const" function. Requiring the 5696 builtin to have the "const" attribute makes it unnecessary 5697 to call mark_call_clobbered. */ 5698 gcc_assert (TREE_READONLY (builtin_decl)); 5699 } 5700 5701 if (alignment_support_scheme == dr_explicit_realign) 5702 return msq; 5703 5704 gcc_assert (!compute_in_loop); 5705 gcc_assert (alignment_support_scheme == dr_explicit_realign_optimized); 5706 5707 5708 /* 5. Create msq = phi <msq_init, lsq> in loop */ 5709 5710 pe = loop_preheader_edge (containing_loop); 5711 vec_dest = vect_create_destination_var (scalar_dest, vectype); 5712 msq = make_ssa_name (vec_dest); 5713 phi_stmt = create_phi_node (msq, containing_loop->header); 5714 add_phi_arg (phi_stmt, msq_init, pe, UNKNOWN_LOCATION); 5715 5716 return msq; 5717 } 5718 5719 5720 /* Function vect_grouped_load_supported. 5721 5722 COUNT is the size of the load group (the number of statements plus the 5723 number of gaps). SINGLE_ELEMENT_P is true if there is actually 5724 only one statement, with a gap of COUNT - 1. 5725 5726 Returns true if a suitable permute exists. */ 5727 5728 bool 5729 vect_grouped_load_supported (tree vectype, bool single_element_p, 5730 unsigned HOST_WIDE_INT count) 5731 { 5732 machine_mode mode = TYPE_MODE (vectype); 5733 5734 /* If this is single-element interleaving with an element distance 5735 that leaves unused vector loads around punt - we at least create 5736 very sub-optimal code in that case (and blow up memory, 5737 see PR65518). */ 5738 if (single_element_p && maybe_gt (count, TYPE_VECTOR_SUBPARTS (vectype))) 5739 { 5740 if (dump_enabled_p ()) 5741 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 5742 "single-element interleaving not supported " 5743 "for not adjacent vector loads\n"); 5744 return false; 5745 } 5746 5747 /* vect_permute_load_chain requires the group size to be equal to 3 or 5748 be a power of two. */ 5749 if (count != 3 && exact_log2 (count) == -1) 5750 { 5751 if (dump_enabled_p ()) 5752 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 5753 "the size of the group of accesses" 5754 " is not a power of 2 or not equal to 3\n"); 5755 return false; 5756 } 5757 5758 /* Check that the permutation is supported. 
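For power-of-two groups this checks the extract-even and extract-odd masks;
for groups of 3 it checks the two selectors that vect_permute_load_chain
would use.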
*/ 5759 if (VECTOR_MODE_P (mode)) 5760 { 5761 unsigned int i, j; 5762 if (count == 3) 5763 { 5764 unsigned int nelt; 5765 if (!GET_MODE_NUNITS (mode).is_constant (&nelt)) 5766 { 5767 if (dump_enabled_p ()) 5768 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 5769 "cannot handle groups of 3 loads for" 5770 " variable-length vectors\n"); 5771 return false; 5772 } 5773 5774 vec_perm_builder sel (nelt, nelt, 1); 5775 sel.quick_grow (nelt); 5776 vec_perm_indices indices; 5777 unsigned int k; 5778 for (k = 0; k < 3; k++) 5779 { 5780 for (i = 0; i < nelt; i++) 5781 if (3 * i + k < 2 * nelt) 5782 sel[i] = 3 * i + k; 5783 else 5784 sel[i] = 0; 5785 indices.new_vector (sel, 2, nelt); 5786 if (!can_vec_perm_const_p (mode, indices)) 5787 { 5788 if (dump_enabled_p ()) 5789 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 5790 "shuffle of 3 loads is not supported by" 5791 " target\n"); 5792 return false; 5793 } 5794 for (i = 0, j = 0; i < nelt; i++) 5795 if (3 * i + k < 2 * nelt) 5796 sel[i] = i; 5797 else 5798 sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++); 5799 indices.new_vector (sel, 2, nelt); 5800 if (!can_vec_perm_const_p (mode, indices)) 5801 { 5802 if (dump_enabled_p ()) 5803 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 5804 "shuffle of 3 loads is not supported by" 5805 " target\n"); 5806 return false; 5807 } 5808 } 5809 return true; 5810 } 5811 else 5812 { 5813 /* If length is not equal to 3 then only power of 2 is supported. */ 5814 gcc_assert (pow2p_hwi (count)); 5815 poly_uint64 nelt = GET_MODE_NUNITS (mode); 5816 5817 /* The encoding has a single stepped pattern. */ 5818 vec_perm_builder sel (nelt, 1, 3); 5819 sel.quick_grow (3); 5820 for (i = 0; i < 3; i++) 5821 sel[i] = i * 2; 5822 vec_perm_indices indices (sel, 2, nelt); 5823 if (can_vec_perm_const_p (mode, indices)) 5824 { 5825 for (i = 0; i < 3; i++) 5826 sel[i] = i * 2 + 1; 5827 indices.new_vector (sel, 2, nelt); 5828 if (can_vec_perm_const_p (mode, indices)) 5829 return true; 5830 } 5831 } 5832 } 5833 5834 if (dump_enabled_p ()) 5835 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 5836 "extract even/odd not supported by target\n"); 5837 return false; 5838 } 5839 5840 /* Return TRUE if vec_{masked_}load_lanes is available for COUNT vectors of 5841 type VECTYPE. MASKED_P says whether the masked form is needed. */ 5842 5843 bool 5844 vect_load_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count, 5845 bool masked_p) 5846 { 5847 if (masked_p) 5848 return vect_lanes_optab_supported_p ("vec_mask_load_lanes", 5849 vec_mask_load_lanes_optab, 5850 vectype, count); 5851 else 5852 return vect_lanes_optab_supported_p ("vec_load_lanes", 5853 vec_load_lanes_optab, 5854 vectype, count); 5855 } 5856 5857 /* Function vect_permute_load_chain. 5858 5859 Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be 5860 a power of 2 or equal to 3, generate extract_even/odd stmts to reorder 5861 the input data correctly. Return the final references for loads in 5862 RESULT_CHAIN. 5863 5864 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8. 5865 The input is 4 vectors each containing 8 elements. 
We assign a number to each 5866 element, the input sequence is: 5867 5868 1st vec: 0 1 2 3 4 5 6 7 5869 2nd vec: 8 9 10 11 12 13 14 15 5870 3rd vec: 16 17 18 19 20 21 22 23 5871 4th vec: 24 25 26 27 28 29 30 31 5872 5873 The output sequence should be: 5874 5875 1st vec: 0 4 8 12 16 20 24 28 5876 2nd vec: 1 5 9 13 17 21 25 29 5877 3rd vec: 2 6 10 14 18 22 26 30 5878 4th vec: 3 7 11 15 19 23 27 31 5879 5880 i.e., the first output vector should contain the first elements of each 5881 interleaving group, etc. 5882 5883 We use extract_even/odd instructions to create such output. The input of 5884 each extract_even/odd operation is two vectors 5885 1st vec 2nd vec 5886 0 1 2 3 4 5 6 7 5887 5888 and the output is the vector of extracted even/odd elements. The output of 5889 extract_even will be: 0 2 4 6 5890 and of extract_odd: 1 3 5 7 5891 5892 5893 The permutation is done in log LENGTH stages. In each stage extract_even 5894 and extract_odd stmts are created for each pair of vectors in DR_CHAIN in 5895 their order. In our example, 5896 5897 E1: extract_even (1st vec, 2nd vec) 5898 E2: extract_odd (1st vec, 2nd vec) 5899 E3: extract_even (3rd vec, 4th vec) 5900 E4: extract_odd (3rd vec, 4th vec) 5901 5902 The output for the first stage will be: 5903 5904 E1: 0 2 4 6 8 10 12 14 5905 E2: 1 3 5 7 9 11 13 15 5906 E3: 16 18 20 22 24 26 28 30 5907 E4: 17 19 21 23 25 27 29 31 5908 5909 In order to proceed and create the correct sequence for the next stage (or 5910 for the correct output, if the second stage is the last one, as in our 5911 example), we first put the output of extract_even operation and then the 5912 output of extract_odd in RESULT_CHAIN (which is then copied to DR_CHAIN). 5913 The input for the second stage is: 5914 5915 1st vec (E1): 0 2 4 6 8 10 12 14 5916 2nd vec (E3): 16 18 20 22 24 26 28 30 5917 3rd vec (E2): 1 3 5 7 9 11 13 15 5918 4th vec (E4): 17 19 21 23 25 27 29 31 5919 5920 The output of the second stage: 5921 5922 E1: 0 4 8 12 16 20 24 28 5923 E2: 2 6 10 14 18 22 26 30 5924 E3: 1 5 9 13 17 21 25 29 5925 E4: 3 7 11 15 19 23 27 31 5926 5927 And RESULT_CHAIN after reordering: 5928 5929 1st vec (E1): 0 4 8 12 16 20 24 28 5930 2nd vec (E3): 1 5 9 13 17 21 25 29 5931 3rd vec (E2): 2 6 10 14 18 22 26 30 5932 4th vec (E4): 3 7 11 15 19 23 27 31. */ 5933 5934 static void 5935 vect_permute_load_chain (vec<tree> dr_chain, 5936 unsigned int length, 5937 gimple *stmt, 5938 gimple_stmt_iterator *gsi, 5939 vec<tree> *result_chain) 5940 { 5941 tree data_ref, first_vect, second_vect; 5942 tree perm_mask_even, perm_mask_odd; 5943 tree perm3_mask_low, perm3_mask_high; 5944 gimple *perm_stmt; 5945 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt)); 5946 unsigned int i, j, log_length = exact_log2 (length); 5947 5948 result_chain->quick_grow (length); 5949 memcpy (result_chain->address (), dr_chain.address (), 5950 length * sizeof (tree)); 5951 5952 if (length == 3) 5953 { 5954 /* vect_grouped_load_supported ensures that this is constant. 
*/ 5955 unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype).to_constant (); 5956 unsigned int k; 5957 5958 vec_perm_builder sel (nelt, nelt, 1); 5959 sel.quick_grow (nelt); 5960 vec_perm_indices indices; 5961 for (k = 0; k < 3; k++) 5962 { 5963 for (i = 0; i < nelt; i++) 5964 if (3 * i + k < 2 * nelt) 5965 sel[i] = 3 * i + k; 5966 else 5967 sel[i] = 0; 5968 indices.new_vector (sel, 2, nelt); 5969 perm3_mask_low = vect_gen_perm_mask_checked (vectype, indices); 5970 5971 for (i = 0, j = 0; i < nelt; i++) 5972 if (3 * i + k < 2 * nelt) 5973 sel[i] = i; 5974 else 5975 sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++); 5976 indices.new_vector (sel, 2, nelt); 5977 perm3_mask_high = vect_gen_perm_mask_checked (vectype, indices); 5978 5979 first_vect = dr_chain[0]; 5980 second_vect = dr_chain[1]; 5981 5982 /* Create interleaving stmt (low part of): 5983 low = VEC_PERM_EXPR <first_vect, second_vect2, {k, 3 + k, 6 + k, 5984 ...}> */ 5985 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low"); 5986 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect, 5987 second_vect, perm3_mask_low); 5988 vect_finish_stmt_generation (stmt, perm_stmt, gsi); 5989 5990 /* Create interleaving stmt (high part of): 5991 high = VEC_PERM_EXPR <first_vect, second_vect2, {k, 3 + k, 6 + k, 5992 ...}> */ 5993 first_vect = data_ref; 5994 second_vect = dr_chain[2]; 5995 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high"); 5996 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect, 5997 second_vect, perm3_mask_high); 5998 vect_finish_stmt_generation (stmt, perm_stmt, gsi); 5999 (*result_chain)[k] = data_ref; 6000 } 6001 } 6002 else 6003 { 6004 /* If length is not equal to 3 then only power of 2 is supported. */ 6005 gcc_assert (pow2p_hwi (length)); 6006 6007 /* The encoding has a single stepped pattern. */ 6008 poly_uint64 nelt = TYPE_VECTOR_SUBPARTS (vectype); 6009 vec_perm_builder sel (nelt, 1, 3); 6010 sel.quick_grow (3); 6011 for (i = 0; i < 3; ++i) 6012 sel[i] = i * 2; 6013 vec_perm_indices indices (sel, 2, nelt); 6014 perm_mask_even = vect_gen_perm_mask_checked (vectype, indices); 6015 6016 for (i = 0; i < 3; ++i) 6017 sel[i] = i * 2 + 1; 6018 indices.new_vector (sel, 2, nelt); 6019 perm_mask_odd = vect_gen_perm_mask_checked (vectype, indices); 6020 6021 for (i = 0; i < log_length; i++) 6022 { 6023 for (j = 0; j < length; j += 2) 6024 { 6025 first_vect = dr_chain[j]; 6026 second_vect = dr_chain[j+1]; 6027 6028 /* data_ref = permute_even (first_data_ref, second_data_ref); */ 6029 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_even"); 6030 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, 6031 first_vect, second_vect, 6032 perm_mask_even); 6033 vect_finish_stmt_generation (stmt, perm_stmt, gsi); 6034 (*result_chain)[j/2] = data_ref; 6035 6036 /* data_ref = permute_odd (first_data_ref, second_data_ref); */ 6037 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_odd"); 6038 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, 6039 first_vect, second_vect, 6040 perm_mask_odd); 6041 vect_finish_stmt_generation (stmt, perm_stmt, gsi); 6042 (*result_chain)[j/2+length/2] = data_ref; 6043 } 6044 memcpy (dr_chain.address (), result_chain->address (), 6045 length * sizeof (tree)); 6046 } 6047 } 6048 } 6049 6050 /* Function vect_shift_permute_load_chain. 6051 6052 Given a chain of loads in DR_CHAIN of LENGTH 2 or 3, generate 6053 sequence of stmts to reorder the input data accordingly. 6054 Return the final references for loads in RESULT_CHAIN. 
6055 Return true if successful, false otherwise.
6056
6057 E.g., LENGTH is 3 and the scalar type is short, i.e., VF is 8.
6058 The input is 3 vectors each containing 8 elements. We assign a
6059 number to each element, the input sequence is:
6060
6061 1st vec: 0 1 2 3 4 5 6 7
6062 2nd vec: 8 9 10 11 12 13 14 15
6063 3rd vec: 16 17 18 19 20 21 22 23
6064
6065 The output sequence should be:
6066
6067 1st vec: 0 3 6 9 12 15 18 21
6068 2nd vec: 1 4 7 10 13 16 19 22
6069 3rd vec: 2 5 8 11 14 17 20 23
6070
6071 We use 3 shuffle instructions and 3 * 3 - 1 shifts to create such output.
6072
6073 First we shuffle all 3 vectors to get the correct element order:
6074
6075 1st vec: ( 0 3 6) ( 1 4 7) ( 2 5)
6076 2nd vec: ( 8 11 14) ( 9 12 15) (10 13)
6077 3rd vec: (16 19 22) (17 20 23) (18 21)
6078
6079 Next we unite and shift the vectors 3 times:
6080
6081 1st step:
6082 shift right by 6 the concatenation of:
6083 "1st vec" and "2nd vec"
6084 ( 0 3 6) ( 1 4 7) |( 2 5) _ ( 8 11 14) ( 9 12 15)| (10 13)
6085 "2nd vec" and "3rd vec"
6086 ( 8 11 14) ( 9 12 15) |(10 13) _ (16 19 22) (17 20 23)| (18 21)
6087 "3rd vec" and "1st vec"
6088 (16 19 22) (17 20 23) |(18 21) _ ( 0 3 6) ( 1 4 7)| ( 2 5)
6089 | New vectors |
6090
6091 So that now the new vectors are:
6092
6093 1st vec: ( 2 5) ( 8 11 14) ( 9 12 15)
6094 2nd vec: (10 13) (16 19 22) (17 20 23)
6095 3rd vec: (18 21) ( 0 3 6) ( 1 4 7)
6096
6097 2nd step:
6098 shift right by 5 the concatenation of:
6099 "1st vec" and "3rd vec"
6100 ( 2 5) ( 8 11 14) |( 9 12 15) _ (18 21) ( 0 3 6)| ( 1 4 7)
6101 "2nd vec" and "1st vec"
6102 (10 13) (16 19 22) |(17 20 23) _ ( 2 5) ( 8 11 14)| ( 9 12 15)
6103 "3rd vec" and "2nd vec"
6104 (18 21) ( 0 3 6) |( 1 4 7) _ (10 13) (16 19 22)| (17 20 23)
6105 | New vectors |
6106
6107 So that now the new vectors are:
6108
6109 1st vec: ( 9 12 15) (18 21) ( 0 3 6)
6110 2nd vec: (17 20 23) ( 2 5) ( 8 11 14)
6111 3rd vec: ( 1 4 7) (10 13) (16 19 22) READY
6112
6113 3rd step:
6114 shift right by 5 the concatenation of:
6115 "1st vec" and "1st vec"
6116 ( 9 12 15) (18 21) |( 0 3 6) _ ( 9 12 15) (18 21)| ( 0 3 6)
6117 shift right by 3 the concatenation of:
6118 "2nd vec" and "2nd vec"
6119 (17 20 23) |( 2 5) ( 8 11 14) _ (17 20 23)| ( 2 5) ( 8 11 14)
6120 | New vectors |
6121
6122 So that now all vectors are READY:
6123 1st vec: ( 0 3 6) ( 9 12 15) (18 21)
6124 2nd vec: ( 2 5) ( 8 11 14) (17 20 23)
6125 3rd vec: ( 1 4 7) (10 13) (16 19 22)
6126
6127 This algorithm is faster than the one in vect_permute_load_chain if:
6128 1. "shift of a concatenation" is faster than general permutation.
6129 This is usually so.
6130 2. The TARGET machine can't execute vector instructions in parallel.
6131 This is because each step of the algorithm depends on the previous one.
6132 The algorithm in vect_permute_load_chain is much more parallel.
6133
6134 The algorithm is applicable only for LOAD CHAIN LENGTH less than VF.
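If this function fails, the caller (vect_transform_grouped_load) falls back
to the extract_even/odd sequence generated by vect_permute_load_chain.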
6135 */ 6136 6137 static bool 6138 vect_shift_permute_load_chain (vec<tree> dr_chain, 6139 unsigned int length, 6140 gimple *stmt, 6141 gimple_stmt_iterator *gsi, 6142 vec<tree> *result_chain) 6143 { 6144 tree vect[3], vect_shift[3], data_ref, first_vect, second_vect; 6145 tree perm2_mask1, perm2_mask2, perm3_mask; 6146 tree select_mask, shift1_mask, shift2_mask, shift3_mask, shift4_mask; 6147 gimple *perm_stmt; 6148 6149 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt)); 6150 unsigned int i; 6151 stmt_vec_info stmt_info = vinfo_for_stmt (stmt); 6152 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info); 6153 6154 unsigned HOST_WIDE_INT nelt, vf; 6155 if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant (&nelt) 6156 || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&vf)) 6157 /* Not supported for variable-length vectors. */ 6158 return false; 6159 6160 vec_perm_builder sel (nelt, nelt, 1); 6161 sel.quick_grow (nelt); 6162 6163 result_chain->quick_grow (length); 6164 memcpy (result_chain->address (), dr_chain.address (), 6165 length * sizeof (tree)); 6166 6167 if (pow2p_hwi (length) && vf > 4) 6168 { 6169 unsigned int j, log_length = exact_log2 (length); 6170 for (i = 0; i < nelt / 2; ++i) 6171 sel[i] = i * 2; 6172 for (i = 0; i < nelt / 2; ++i) 6173 sel[nelt / 2 + i] = i * 2 + 1; 6174 vec_perm_indices indices (sel, 2, nelt); 6175 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices)) 6176 { 6177 if (dump_enabled_p ()) 6178 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 6179 "shuffle of 2 fields structure is not \ 6180 supported by target\n"); 6181 return false; 6182 } 6183 perm2_mask1 = vect_gen_perm_mask_checked (vectype, indices); 6184 6185 for (i = 0; i < nelt / 2; ++i) 6186 sel[i] = i * 2 + 1; 6187 for (i = 0; i < nelt / 2; ++i) 6188 sel[nelt / 2 + i] = i * 2; 6189 indices.new_vector (sel, 2, nelt); 6190 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices)) 6191 { 6192 if (dump_enabled_p ()) 6193 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 6194 "shuffle of 2 fields structure is not \ 6195 supported by target\n"); 6196 return false; 6197 } 6198 perm2_mask2 = vect_gen_perm_mask_checked (vectype, indices); 6199 6200 /* Generating permutation constant to shift all elements. 6201 For vector length 8 it is {4 5 6 7 8 9 10 11}. */ 6202 for (i = 0; i < nelt; i++) 6203 sel[i] = nelt / 2 + i; 6204 indices.new_vector (sel, 2, nelt); 6205 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices)) 6206 { 6207 if (dump_enabled_p ()) 6208 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 6209 "shift permutation is not supported by target\n"); 6210 return false; 6211 } 6212 shift1_mask = vect_gen_perm_mask_checked (vectype, indices); 6213 6214 /* Generating permutation constant to select vector from 2. 6215 For vector length 8 it is {0 1 2 3 12 13 14 15}. 
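I.e., the selector takes the low half of the first operand followed by the
high half of the second operand.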
*/ 6216 for (i = 0; i < nelt / 2; i++) 6217 sel[i] = i; 6218 for (i = nelt / 2; i < nelt; i++) 6219 sel[i] = nelt + i; 6220 indices.new_vector (sel, 2, nelt); 6221 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices)) 6222 { 6223 if (dump_enabled_p ()) 6224 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 6225 "select is not supported by target\n"); 6226 return false; 6227 } 6228 select_mask = vect_gen_perm_mask_checked (vectype, indices); 6229 6230 for (i = 0; i < log_length; i++) 6231 { 6232 for (j = 0; j < length; j += 2) 6233 { 6234 first_vect = dr_chain[j]; 6235 second_vect = dr_chain[j + 1]; 6236 6237 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2"); 6238 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, 6239 first_vect, first_vect, 6240 perm2_mask1); 6241 vect_finish_stmt_generation (stmt, perm_stmt, gsi); 6242 vect[0] = data_ref; 6243 6244 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2"); 6245 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, 6246 second_vect, second_vect, 6247 perm2_mask2); 6248 vect_finish_stmt_generation (stmt, perm_stmt, gsi); 6249 vect[1] = data_ref; 6250 6251 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift"); 6252 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, 6253 vect[0], vect[1], shift1_mask); 6254 vect_finish_stmt_generation (stmt, perm_stmt, gsi); 6255 (*result_chain)[j/2 + length/2] = data_ref; 6256 6257 data_ref = make_temp_ssa_name (vectype, NULL, "vect_select"); 6258 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, 6259 vect[0], vect[1], select_mask); 6260 vect_finish_stmt_generation (stmt, perm_stmt, gsi); 6261 (*result_chain)[j/2] = data_ref; 6262 } 6263 memcpy (dr_chain.address (), result_chain->address (), 6264 length * sizeof (tree)); 6265 } 6266 return true; 6267 } 6268 if (length == 3 && vf > 2) 6269 { 6270 unsigned int k = 0, l = 0; 6271 6272 /* Generating permutation constant to get all elements in rigth order. 6273 For vector length 8 it is {0 3 6 1 4 7 2 5}. */ 6274 for (i = 0; i < nelt; i++) 6275 { 6276 if (3 * k + (l % 3) >= nelt) 6277 { 6278 k = 0; 6279 l += (3 - (nelt % 3)); 6280 } 6281 sel[i] = 3 * k + (l % 3); 6282 k++; 6283 } 6284 vec_perm_indices indices (sel, 2, nelt); 6285 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices)) 6286 { 6287 if (dump_enabled_p ()) 6288 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 6289 "shuffle of 3 fields structure is not \ 6290 supported by target\n"); 6291 return false; 6292 } 6293 perm3_mask = vect_gen_perm_mask_checked (vectype, indices); 6294 6295 /* Generating permutation constant to shift all elements. 6296 For vector length 8 it is {6 7 8 9 10 11 12 13}. */ 6297 for (i = 0; i < nelt; i++) 6298 sel[i] = 2 * (nelt / 3) + (nelt % 3) + i; 6299 indices.new_vector (sel, 2, nelt); 6300 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices)) 6301 { 6302 if (dump_enabled_p ()) 6303 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 6304 "shift permutation is not supported by target\n"); 6305 return false; 6306 } 6307 shift1_mask = vect_gen_perm_mask_checked (vectype, indices); 6308 6309 /* Generating permutation constant to shift all elements. 6310 For vector length 8 it is {5 6 7 8 9 10 11 12}. 
*/ 6311 for (i = 0; i < nelt; i++) 6312 sel[i] = 2 * (nelt / 3) + 1 + i; 6313 indices.new_vector (sel, 2, nelt); 6314 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices)) 6315 { 6316 if (dump_enabled_p ()) 6317 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 6318 "shift permutation is not supported by target\n"); 6319 return false; 6320 } 6321 shift2_mask = vect_gen_perm_mask_checked (vectype, indices); 6322 6323 /* Generating permutation constant to shift all elements. 6324 For vector length 8 it is {3 4 5 6 7 8 9 10}. */ 6325 for (i = 0; i < nelt; i++) 6326 sel[i] = (nelt / 3) + (nelt % 3) / 2 + i; 6327 indices.new_vector (sel, 2, nelt); 6328 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices)) 6329 { 6330 if (dump_enabled_p ()) 6331 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 6332 "shift permutation is not supported by target\n"); 6333 return false; 6334 } 6335 shift3_mask = vect_gen_perm_mask_checked (vectype, indices); 6336 6337 /* Generating permutation constant to shift all elements. 6338 For vector length 8 it is {5 6 7 8 9 10 11 12}. */ 6339 for (i = 0; i < nelt; i++) 6340 sel[i] = 2 * (nelt / 3) + (nelt % 3) / 2 + i; 6341 indices.new_vector (sel, 2, nelt); 6342 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices)) 6343 { 6344 if (dump_enabled_p ()) 6345 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 6346 "shift permutation is not supported by target\n"); 6347 return false; 6348 } 6349 shift4_mask = vect_gen_perm_mask_checked (vectype, indices); 6350 6351 for (k = 0; k < 3; k++) 6352 { 6353 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3"); 6354 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, 6355 dr_chain[k], dr_chain[k], 6356 perm3_mask); 6357 vect_finish_stmt_generation (stmt, perm_stmt, gsi); 6358 vect[k] = data_ref; 6359 } 6360 6361 for (k = 0; k < 3; k++) 6362 { 6363 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift1"); 6364 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, 6365 vect[k % 3], vect[(k + 1) % 3], 6366 shift1_mask); 6367 vect_finish_stmt_generation (stmt, perm_stmt, gsi); 6368 vect_shift[k] = data_ref; 6369 } 6370 6371 for (k = 0; k < 3; k++) 6372 { 6373 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift2"); 6374 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, 6375 vect_shift[(4 - k) % 3], 6376 vect_shift[(3 - k) % 3], 6377 shift2_mask); 6378 vect_finish_stmt_generation (stmt, perm_stmt, gsi); 6379 vect[k] = data_ref; 6380 } 6381 6382 (*result_chain)[3 - (nelt % 3)] = vect[2]; 6383 6384 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift3"); 6385 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[0], 6386 vect[0], shift3_mask); 6387 vect_finish_stmt_generation (stmt, perm_stmt, gsi); 6388 (*result_chain)[nelt % 3] = data_ref; 6389 6390 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift4"); 6391 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[1], 6392 vect[1], shift4_mask); 6393 vect_finish_stmt_generation (stmt, perm_stmt, gsi); 6394 (*result_chain)[0] = data_ref; 6395 return true; 6396 } 6397 return false; 6398 } 6399 6400 /* Function vect_transform_grouped_load. 6401 6402 Given a chain of input interleaved data-refs (in DR_CHAIN), build statements 6403 to perform their permutation and ascribe the result vectorized statements to 6404 the scalar statements. 
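Which permutation sequence is used is decided in the body: if the target can
execute several vector permutations in parallel, or the group size is a power
of two, vect_permute_load_chain is used directly; otherwise
vect_shift_permute_load_chain is tried first and vect_permute_load_chain
serves as the fallback.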
6405 */
6406
6407 void
6408 vect_transform_grouped_load (gimple *stmt, vec<tree> dr_chain, int size,
6409 gimple_stmt_iterator *gsi)
6410 {
6411 machine_mode mode;
6412 vec<tree> result_chain = vNULL;
6413
6414 /* DR_CHAIN contains input data-refs that are a part of the interleaving.
6415 RESULT_CHAIN is the output of vect_permute_load_chain; it contains the
6416 permuted vectors that are ready for vector computation. */
6417 result_chain.create (size);
6418
6419 /* If the reassociation width for the vector type is 2 or greater, the target
6420 machine can execute 2 or more vector instructions in parallel. Otherwise try
6421 to get the chain for the load group using vect_shift_permute_load_chain. */
6422 mode = TYPE_MODE (STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt)));
6423 if (targetm.sched.reassociation_width (VEC_PERM_EXPR, mode) > 1
6424 || pow2p_hwi (size)
6425 || !vect_shift_permute_load_chain (dr_chain, size, stmt,
6426 gsi, &result_chain))
6427 vect_permute_load_chain (dr_chain, size, stmt, gsi, &result_chain);
6428 vect_record_grouped_load_vectors (stmt, result_chain);
6429 result_chain.release ();
6430 }
6431
6432 /* RESULT_CHAIN contains the output of a group of grouped loads that were
6433 generated as part of the vectorization of STMT. Assign the statement
6434 for each vector to the associated scalar statement. */
6435
6436 void
6437 vect_record_grouped_load_vectors (gimple *stmt, vec<tree> result_chain)
6438 {
6439 gimple *first_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt));
6440 gimple *next_stmt, *new_stmt;
6441 unsigned int i, gap_count;
6442 tree tmp_data_ref;
6443
6444 /* Put a permuted data-ref in the VECTORIZED_STMT field.
6445 Since we scan the chain starting from its first node, their order
6446 corresponds to the order of data-refs in RESULT_CHAIN. */
6447 next_stmt = first_stmt;
6448 gap_count = 1;
6449 FOR_EACH_VEC_ELT (result_chain, i, tmp_data_ref)
6450 {
6451 if (!next_stmt)
6452 break;
6453
6454 /* Skip the gaps. Loads created for the gaps will be removed by the dead
6455 code elimination pass later. No need to check for the first stmt in
6456 the group, since it always exists.
6457 GROUP_GAP is the number of steps in elements from the previous
6458 access (if there is no gap GROUP_GAP is 1). We skip loads that
6459 correspond to the gaps. */
6460 if (next_stmt != first_stmt
6461 && gap_count < GROUP_GAP (vinfo_for_stmt (next_stmt)))
6462 {
6463 gap_count++;
6464 continue;
6465 }
6466
6467 while (next_stmt)
6468 {
6469 new_stmt = SSA_NAME_DEF_STMT (tmp_data_ref);
6470 /* We assume that if VEC_STMT is not NULL, this is a case of multiple
6471 copies, and we put the new vector statement in the first available
6472 RELATED_STMT.
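In that case the loop below walks the RELATED_STMT chain from the recorded
vector statement to its last element and appends NEW_STMT there.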
*/
6473 if (!STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)))
6474 STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)) = new_stmt;
6475 else
6476 {
6477 if (!GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
6478 {
6479 gimple *prev_stmt =
6480 STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt));
6481 gimple *rel_stmt =
6482 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt));
6483 while (rel_stmt)
6484 {
6485 prev_stmt = rel_stmt;
6486 rel_stmt =
6487 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (rel_stmt));
6488 }
6489
6490 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt)) =
6491 new_stmt;
6492 }
6493 }
6494
6495 next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
6496 gap_count = 1;
6497 /* If NEXT_STMT accesses the same DR as the previous statement,
6498 put the same TMP_DATA_REF as its vectorized statement; otherwise
6499 get the next data-ref from RESULT_CHAIN. */
6500 if (!next_stmt || !GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
6501 break;
6502 }
6503 }
6504 }
6505
6506 /* Function vect_can_force_dr_alignment_p.
6507
6508 Returns whether DECL can be forced to be aligned on an ALIGNMENT-bit
6509 boundary. */
6510
6511 bool
6512 vect_can_force_dr_alignment_p (const_tree decl, unsigned int alignment)
6513 {
6514 if (!VAR_P (decl))
6515 return false;
6516
6517 if (decl_in_symtab_p (decl)
6518 && !symtab_node::get (decl)->can_increase_alignment_p ())
6519 return false;
6520
6521 if (TREE_STATIC (decl))
6522 return (alignment <= MAX_OFILE_ALIGNMENT);
6523 else
6524 return (alignment <= MAX_STACK_ALIGNMENT);
6525 }
6526
6527
6528 /* Return whether the data reference DR is supported with respect to its
6529 alignment.
6530 If CHECK_ALIGNED_ACCESSES is TRUE, check if the access is supported even
6531 if it is aligned, i.e., check if it is possible to vectorize it with
6532 different alignment. */
6533
6534 enum dr_alignment_support
6535 vect_supportable_dr_alignment (struct data_reference *dr,
6536 bool check_aligned_accesses)
6537 {
6538 gimple *stmt = DR_STMT (dr);
6539 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
6540 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6541 machine_mode mode = TYPE_MODE (vectype);
6542 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6543 struct loop *vect_loop = NULL;
6544 bool nested_in_vect_loop = false;
6545
6546 if (aligned_access_p (dr) && !check_aligned_accesses)
6547 return dr_aligned;
6548
6549 /* For now assume all conditional loads/stores support unaligned
6550 access without any special code. */
6551 if (is_gimple_call (stmt)
6552 && gimple_call_internal_p (stmt)
6553 && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
6554 || gimple_call_internal_fn (stmt) == IFN_MASK_STORE))
6555 return dr_unaligned_supported;
6556
6557 if (loop_vinfo)
6558 {
6559 vect_loop = LOOP_VINFO_LOOP (loop_vinfo);
6560 nested_in_vect_loop = nested_in_vect_loop_p (vect_loop, stmt);
6561 }
6562
6563 /* Possibly unaligned access. */
6564
6565 /* We can choose between using the implicit realignment scheme (generating
6566 a misaligned_move stmt) and the explicit realignment scheme (generating
6567 aligned loads with a REALIGN_LOAD). There are two variants to the
6568 explicit realignment scheme: optimized, and unoptimized.
6569 We can optimize the realignment only if the step between consecutive
6570 vector loads is equal to the vector size.
Since the vector memory 6571 accesses advance in steps of VS (Vector Size) in the vectorized loop, it 6572 is guaranteed that the misalignment amount remains the same throughout the 6573 execution of the vectorized loop. Therefore, we can create the 6574 "realignment token" (the permutation mask that is passed to REALIGN_LOAD) 6575 at the loop preheader. 6576 6577 However, in the case of outer-loop vectorization, when vectorizing a 6578 memory access in the inner-loop nested within the LOOP that is now being 6579 vectorized, while it is guaranteed that the misalignment of the 6580 vectorized memory access will remain the same in different outer-loop 6581 iterations, it is *not* guaranteed that is will remain the same throughout 6582 the execution of the inner-loop. This is because the inner-loop advances 6583 with the original scalar step (and not in steps of VS). If the inner-loop 6584 step happens to be a multiple of VS, then the misalignment remains fixed 6585 and we can use the optimized realignment scheme. For example: 6586 6587 for (i=0; i<N; i++) 6588 for (j=0; j<M; j++) 6589 s += a[i+j]; 6590 6591 When vectorizing the i-loop in the above example, the step between 6592 consecutive vector loads is 1, and so the misalignment does not remain 6593 fixed across the execution of the inner-loop, and the realignment cannot 6594 be optimized (as illustrated in the following pseudo vectorized loop): 6595 6596 for (i=0; i<N; i+=4) 6597 for (j=0; j<M; j++){ 6598 vs += vp[i+j]; // misalignment of &vp[i+j] is {0,1,2,3,0,1,2,3,...} 6599 // when j is {0,1,2,3,4,5,6,7,...} respectively. 6600 // (assuming that we start from an aligned address). 6601 } 6602 6603 We therefore have to use the unoptimized realignment scheme: 6604 6605 for (i=0; i<N; i+=4) 6606 for (j=k; j<M; j+=4) 6607 vs += vp[i+j]; // misalignment of &vp[i+j] is always k (assuming 6608 // that the misalignment of the initial address is 6609 // 0). 6610 6611 The loop can then be vectorized as follows: 6612 6613 for (k=0; k<4; k++){ 6614 rt = get_realignment_token (&vp[k]); 6615 for (i=0; i<N; i+=4){ 6616 v1 = vp[i+k]; 6617 for (j=k; j<M; j+=4){ 6618 v2 = vp[i+j+VS-1]; 6619 va = REALIGN_LOAD <v1,v2,rt>; 6620 vs += va; 6621 v1 = v2; 6622 } 6623 } 6624 } */ 6625 6626 if (DR_IS_READ (dr)) 6627 { 6628 bool is_packed = false; 6629 tree type = (TREE_TYPE (DR_REF (dr))); 6630 6631 if (optab_handler (vec_realign_load_optab, mode) != CODE_FOR_nothing 6632 && (!targetm.vectorize.builtin_mask_for_load 6633 || targetm.vectorize.builtin_mask_for_load ())) 6634 { 6635 tree vectype = STMT_VINFO_VECTYPE (stmt_info); 6636 6637 /* If we are doing SLP then the accesses need not have the 6638 same alignment, instead it depends on the SLP group size. */ 6639 if (loop_vinfo 6640 && STMT_SLP_TYPE (stmt_info) 6641 && !multiple_p (LOOP_VINFO_VECT_FACTOR (loop_vinfo) 6642 * GROUP_SIZE (vinfo_for_stmt 6643 (GROUP_FIRST_ELEMENT (stmt_info))), 6644 TYPE_VECTOR_SUBPARTS (vectype))) 6645 ; 6646 else if (!loop_vinfo 6647 || (nested_in_vect_loop 6648 && maybe_ne (TREE_INT_CST_LOW (DR_STEP (dr)), 6649 GET_MODE_SIZE (TYPE_MODE (vectype))))) 6650 return dr_explicit_realign; 6651 else 6652 return dr_explicit_realign_optimized; 6653 } 6654 if (!known_alignment_for_access_p (dr)) 6655 is_packed = not_size_aligned (DR_REF (dr)); 6656 6657 if (targetm.vectorize.support_vector_misalignment 6658 (mode, type, DR_MISALIGNMENT (dr), is_packed)) 6659 /* Can't software pipeline the loads, but can at least do them. 
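I.e., fall back to plain unaligned accesses, which the target's
support_vector_misalignment hook has just confirmed it can handle.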
*/ 6660 return dr_unaligned_supported; 6661 } 6662 else 6663 { 6664 bool is_packed = false; 6665 tree type = (TREE_TYPE (DR_REF (dr))); 6666 6667 if (!known_alignment_for_access_p (dr)) 6668 is_packed = not_size_aligned (DR_REF (dr)); 6669 6670 if (targetm.vectorize.support_vector_misalignment 6671 (mode, type, DR_MISALIGNMENT (dr), is_packed)) 6672 return dr_unaligned_supported; 6673 } 6674 6675 /* Unsupported. */ 6676 return dr_unaligned_unsupported; 6677 } 6678