//===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
// and generates target-independent LLVM-IR.
// The vectorizer uses the TargetTransformInfo analysis to estimate the costs
// of instructions in order to estimate the profitability of vectorization.
//
// The loop vectorizer combines consecutive loop iterations into a single
// 'wide' iteration. After this transformation the index is incremented
// by the SIMD vector width, and not by one (see the sketch following this
// header comment).
//
// This pass has four parts:
// 1. The main loop pass that drives the different parts.
// 2. LoopVectorizationLegality - A unit that checks for the legality
//    of the vectorization.
// 3. InnerLoopVectorizer - A unit that performs the actual
//    widening of instructions.
// 4. LoopVectorizationCostModel - A unit that checks for the profitability
//    of vectorization. It decides on the optimal vector width, which
//    can be one, if vectorization is not profitable.
//
// There is a development effort going on to migrate the loop vectorizer to the
// VPlan infrastructure and to introduce outer loop vectorization support (see
// docs/Proposal/VectorizationPlan.rst and
// http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html). For this
// purpose, we temporarily introduced the VPlan-native vectorization path: an
// alternative vectorization path that is natively implemented on top of the
// VPlan infrastructure. See EnableVPlanNativePath for enabling.
//
//===----------------------------------------------------------------------===//
//
// The reduction-variable vectorization is based on the paper:
//  D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
//
// Variable uniformity checks are inspired by:
//  Karrenberg, R. and Hack, S. Whole Function Vectorization.
//
// The interleaved access vectorization is based on the paper:
//  Dorit Nuzman, Ira Rosen and Ayal Zaks. Auto-Vectorization of Interleaved
//  Data for SIMD
//
// Other ideas/concepts are from:
//  A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
//
//  S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua. An Evaluation of
//  Vectorizing Compilers.
//
//===----------------------------------------------------------------------===//
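
// Illustrative sketch (conceptual only; the C-like notation is made up for
// illustration and does not appear in the pass): given a loop such as
//
//   for (int i = 0; i < n; ++i)
//     a[i] = b[i] + c[i];
//
// vectorizing with VF = 4 conceptually yields
//
//   int i = 0;
//   for (; i + 4 <= n; i += 4)          // one 'wide' iteration per 4 elements
//     a[i:i+3] = b[i:i+3] + c[i:i+3];   // 4-wide vector loads, add, store
//   for (; i < n; ++i)                  // scalar epilogue for the remainder
//     a[i] = b[i] + c[i];
//
// where a[i:i+3] denotes a 4-element vector access. The pass itself emits
// target-independent LLVM-IR rather than C.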

#include "llvm/Transforms/Vectorize/LoopVectorize.h"
#include "LoopVectorizationPlanner.h"
#include "VPRecipeBuilder.h"
#include "VPlan.h"
#include "VPlanHCFGBuilder.h"
#include "VPlanTransforms.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/DenseMapInfo.h"
#include "llvm/ADT/Hashing.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Twine.h"
#include "llvm/ADT/iterator_range.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/BasicAliasAnalysis.h"
#include "llvm/Analysis/BlockFrequencyInfo.h"
#include "llvm/Analysis/CFG.h"
#include "llvm/Analysis/CodeMetrics.h"
#include "llvm/Analysis/DemandedBits.h"
#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/LoopAccessAnalysis.h"
#include "llvm/Analysis/LoopAnalysisManager.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/LoopIterator.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/ProfileSummaryInfo.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/CFG.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DebugInfo.h"
#include "llvm/IR/DebugInfoMetadata.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/Metadata.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Operator.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Use.h"
#include "llvm/IR/User.h"
#include "llvm/IR/Value.h"
#include "llvm/IR/ValueHandle.h"
#include "llvm/IR/Verifier.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/InstructionCost.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/InjectTLIMappings.h"
#include "llvm/Transforms/Utils/LoopSimplify.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include "llvm/Transforms/Utils/LoopVersioning.h"
#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
#include "llvm/Transforms/Utils/SizeOpts.h"
#include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
#include <algorithm>
#include <cassert>
#include <cmath>
#include <cstdint>
#include <functional>
#include <iterator>
#include <limits>
#include <map>
#include <memory>
#include <string>
#include <tuple>
#include <utility>

using namespace llvm;

#define LV_NAME "loop-vectorize"
#define DEBUG_TYPE LV_NAME

#ifndef NDEBUG
const char VerboseDebug[] = DEBUG_TYPE "-verbose";
#endif

/// @{
/// Metadata attribute names
const char LLVMLoopVectorizeFollowupAll[] = "llvm.loop.vectorize.followup_all";
const char LLVMLoopVectorizeFollowupVectorized[] =
    "llvm.loop.vectorize.followup_vectorized";
const char LLVMLoopVectorizeFollowupEpilogue[] =
    "llvm.loop.vectorize.followup_epilogue";
/// @}

STATISTIC(LoopsVectorized, "Number of loops vectorized");
STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");
STATISTIC(LoopsEpilogueVectorized, "Number of epilogues vectorized");

static cl::opt<bool> EnableEpilogueVectorization(
    "enable-epilogue-vectorization", cl::init(true), cl::Hidden,
    cl::desc("Enable vectorization of epilogue loops."));

static cl::opt<unsigned> EpilogueVectorizationForceVF(
    "epilogue-vectorization-force-VF", cl::init(1), cl::Hidden,
    cl::desc("When epilogue vectorization is enabled, and a value greater than "
             "1 is specified, forces the given VF for all applicable epilogue "
             "loops."));

static cl::opt<unsigned> EpilogueVectorizationMinVF(
    "epilogue-vectorization-minimum-VF", cl::init(16), cl::Hidden,
    cl::desc("Only loops with vectorization factor equal to or larger than "
             "the specified value are considered for epilogue vectorization."));

/// Loops with a known constant trip count below this number are vectorized
/// only if no scalar iteration overheads are incurred.
static cl::opt<unsigned> TinyTripCountVectorThreshold(
    "vectorizer-min-trip-count", cl::init(16), cl::Hidden,
    cl::desc("Loops with a constant trip count that is smaller than this "
             "value are vectorized only if no scalar iteration overheads "
             "are incurred."));

static cl::opt<unsigned> VectorizeMemoryCheckThreshold(
    "vectorize-memory-check-threshold", cl::init(128), cl::Hidden,
    cl::desc("The maximum allowed number of runtime memory checks"));

// Option prefer-predicate-over-epilogue indicates that an epilogue is
// undesired, that predication is preferred, and this lists all options. I.e.,
// the vectorizer will try to fold the tail-loop (epilogue) into the vector
// body and predicate the instructions accordingly. If tail-folding fails,
// there are different fallback strategies depending on these values:
namespace PreferPredicateTy {
enum Option {
  ScalarEpilogue = 0,
  PredicateElseScalarEpilogue,
  PredicateOrDontVectorize
};
} // namespace PreferPredicateTy

static cl::opt<PreferPredicateTy::Option> PreferPredicateOverEpilogue(
    "prefer-predicate-over-epilogue",
    cl::init(PreferPredicateTy::ScalarEpilogue),
    cl::Hidden,
    cl::desc("Tail-folding and predication preferences over creating a scalar "
             "epilogue loop."),
    cl::values(clEnumValN(PreferPredicateTy::ScalarEpilogue,
                          "scalar-epilogue",
                          "Don't tail-predicate loops, create scalar epilogue"),
               clEnumValN(PreferPredicateTy::PredicateElseScalarEpilogue,
                          "predicate-else-scalar-epilogue",
                          "prefer tail-folding, create scalar epilogue if tail "
                          "folding fails."),
               clEnumValN(PreferPredicateTy::PredicateOrDontVectorize,
                          "predicate-dont-vectorize",
                          "prefer tail-folding, don't attempt vectorization if "
                          "tail-folding fails.")));

static cl::opt<TailFoldingStyle> ForceTailFoldingStyle(
    "force-tail-folding-style", cl::desc("Force the tail folding style"),
    cl::init(TailFoldingStyle::None),
    cl::values(
        clEnumValN(TailFoldingStyle::None, "none", "Disable tail folding"),
        clEnumValN(
            TailFoldingStyle::Data, "data",
            "Create lane mask for data only, using active.lane.mask intrinsic"),
        clEnumValN(TailFoldingStyle::DataWithoutLaneMask,
                   "data-without-lane-mask",
                   "Create lane mask with compare/stepvector"),
        clEnumValN(TailFoldingStyle::DataAndControlFlow, "data-and-control",
                   "Create lane mask using active.lane.mask intrinsic, and use "
                   "it for both data and control flow"),
        clEnumValN(
            TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck,
            "data-and-control-without-rt-check",
            "Similar to data-and-control, but remove the runtime check")));

static cl::opt<bool> MaximizeBandwidth(
    "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
    cl::desc("Maximize bandwidth when selecting vectorization factor which "
             "will be determined by the smallest type in loop."));

static cl::opt<bool> EnableInterleavedMemAccesses(
    "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden,
    cl::desc("Enable vectorization on interleaved memory accesses in a loop"));
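
// Illustrative sketch (conceptual only, not taken from this file): an
// interleaved access group arises when strided accesses jointly cover
// consecutive memory, e.g. a loop reading both fields of a struct:
//
//   struct Complex { float Re, Im; };
//   for (int i = 0; i < n; ++i)
//     Sum += C[i].Re * C[i].Im;
//
// The loads of C[i].Re and C[i].Im form an interleave group with factor 2;
// they can be vectorized as a single wide load of 2 * VF floats followed by
// shuffles that de-interleave the even (Re) and odd (Im) lanes.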

/// An interleave-group may need masking if it resides in a block that needs
/// predication, or in order to mask away gaps.
static cl::opt<bool> EnableMaskedInterleavedMemAccesses(
    "enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden,
    cl::desc("Enable vectorization on masked interleaved memory accesses in a "
             "loop"));

static cl::opt<unsigned> TinyTripCountInterleaveThreshold(
    "tiny-trip-count-interleave-threshold", cl::init(128), cl::Hidden,
    cl::desc("We don't interleave loops with an estimated constant trip count "
             "below this number"));

static cl::opt<unsigned> ForceTargetNumScalarRegs(
    "force-target-num-scalar-regs", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's number of scalar registers."));

static cl::opt<unsigned> ForceTargetNumVectorRegs(
    "force-target-num-vector-regs", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's number of vector registers."));

static cl::opt<unsigned> ForceTargetMaxScalarInterleaveFactor(
    "force-target-max-scalar-interleave", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's max interleave factor for "
             "scalar loops."));

static cl::opt<unsigned> ForceTargetMaxVectorInterleaveFactor(
    "force-target-max-vector-interleave", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's max interleave factor for "
             "vectorized loops."));

static cl::opt<unsigned> ForceTargetInstructionCost(
    "force-target-instruction-cost", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's expected cost for "
             "an instruction to a single constant value. Mostly "
             "useful for getting consistent testing."));

static cl::opt<bool> ForceTargetSupportsScalableVectors(
    "force-target-supports-scalable-vectors", cl::init(false), cl::Hidden,
    cl::desc(
        "Pretend that scalable vectors are supported, even if the target does "
        "not support them. This flag should only be used for testing."));

static cl::opt<unsigned> SmallLoopCost(
    "small-loop-cost", cl::init(20), cl::Hidden,
    cl::desc(
        "The cost of a loop that is considered 'small' by the interleaver."));

static cl::opt<bool> LoopVectorizeWithBlockFrequency(
    "loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden,
    cl::desc("Enable the use of the block frequency analysis to access PGO "
             "heuristics minimizing code growth in cold regions and being more "
             "aggressive in hot regions."));

// Runtime interleave loops for load/store throughput.
static cl::opt<bool> EnableLoadStoreRuntimeInterleave(
    "enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden,
    cl::desc(
        "Enable runtime interleaving until load/store ports are saturated"));

/// Interleave small loops with scalar reductions.
static cl::opt<bool> InterleaveSmallLoopScalarReduction(
    "interleave-small-loop-scalar-reduction", cl::init(false), cl::Hidden,
    cl::desc("Enable interleaving for loops with small iteration counts that "
             "contain scalar reductions to expose ILP."));
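
// The Force* options above are testing knobs; like other cl::opt flags in
// this file they can be set on the 'opt' command line, e.g. (hypothetical
// invocation, adjust to your setup):
//
//   opt -passes=loop-vectorize -force-target-num-vector-regs=32 \
//       -force-target-instruction-cost=1 -S in.ll -o out.ll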

/// The number of stores in a loop that are allowed to need predication.
static cl::opt<unsigned> NumberOfStoresToPredicate(
    "vectorize-num-stores-pred", cl::init(1), cl::Hidden,
    cl::desc("Max number of stores to be predicated behind an if."));

static cl::opt<bool> EnableIndVarRegisterHeur(
    "enable-ind-var-reg-heur", cl::init(true), cl::Hidden,
    cl::desc("Count the induction variable only once when interleaving"));

static cl::opt<bool> EnableCondStoresVectorization(
    "enable-cond-stores-vec", cl::init(true), cl::Hidden,
    cl::desc("Enable if predication of stores during vectorization."));

static cl::opt<unsigned> MaxNestedScalarReductionIC(
    "max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden,
    cl::desc("The maximum interleave count to use when interleaving a scalar "
             "reduction in a nested loop."));

static cl::opt<bool>
    PreferInLoopReductions("prefer-inloop-reductions", cl::init(false),
                           cl::Hidden,
                           cl::desc("Prefer in-loop vector reductions, "
                                    "overriding the target's preference."));

static cl::opt<bool> ForceOrderedReductions(
    "force-ordered-reductions", cl::init(false), cl::Hidden,
    cl::desc("Enable the vectorization of loops with in-order (strict) "
             "FP reductions"));

static cl::opt<bool> PreferPredicatedReductionSelect(
    "prefer-predicated-reduction-select", cl::init(false), cl::Hidden,
    cl::desc(
        "Prefer predicating a reduction operation over an after loop select."));

namespace llvm {
cl::opt<bool> EnableVPlanNativePath(
    "enable-vplan-native-path", cl::Hidden,
    cl::desc("Enable VPlan-native vectorization path with "
             "support for outer loop vectorization."));
} // namespace llvm

// This flag enables the stress testing of the VPlan H-CFG construction in the
// VPlan-native vectorization path. It must be used in conjunction with
// -enable-vplan-native-path. -vplan-verify-hcfg can also be used to enable the
// verification of the H-CFGs built.
static cl::opt<bool> VPlanBuildStressTest(
    "vplan-build-stress-test", cl::init(false), cl::Hidden,
    cl::desc(
        "Build VPlan for every supported loop nest in the function and bail "
        "out right after the build (stress test the VPlan H-CFG construction "
        "in the VPlan-native vectorization path)."));

cl::opt<bool> llvm::EnableLoopInterleaving(
    "interleave-loops", cl::init(true), cl::Hidden,
    cl::desc("Enable loop interleaving in Loop vectorization passes"));
cl::opt<bool> llvm::EnableLoopVectorization(
    "vectorize-loops", cl::init(true), cl::Hidden,
    cl::desc("Run the Loop vectorization passes"));

static cl::opt<bool> PrintVPlansInDotFormat(
    "vplan-print-in-dot-format", cl::Hidden,
    cl::desc("Use dot format instead of plain text when dumping VPlans"));

static cl::opt<cl::boolOrDefault> ForceSafeDivisor(
    "force-widen-divrem-via-safe-divisor", cl::Hidden,
    cl::desc(
        "Override cost based safe divisor widening for div/rem instructions"));

/// A helper function that returns true if the given type is irregular. The
/// type is irregular if its allocated size doesn't equal the store size of an
/// element of the corresponding vector type.
static bool hasIrregularType(Type *Ty, const DataLayout &DL) {
  // Determine if an array of N elements of type Ty is "bitcast compatible"
  // with a <N x Ty> vector.
  // This is only true if there is no padding between the array elements.
  return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty);
}
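
// Illustrative example (assumed data layout): on a typical x86-64 target,
// x86_fp80 has a type size of 80 bits but an alloc size of 128 bits, so it is
// "irregular" in the sense above; i32 (32-bit size, 32-bit alloc size) is not.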

/// A helper function that returns the reciprocal of the block probability of
/// predicated blocks. If we return X, we are assuming the predicated block
/// will execute once for every X iterations of the loop header.
///
/// TODO: We should use actual block probability here, if available. Currently,
/// we always assume predicated blocks have a 50% chance of executing.
static unsigned getReciprocalPredBlockProb() { return 2; }

/// A helper function that returns an integer or floating-point constant with
/// value C.
static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) {
  return Ty->isIntegerTy() ? ConstantInt::getSigned(Ty, C)
                           : ConstantFP::get(Ty, C);
}

/// Returns "best known" trip count for the specified loop \p L as defined by
/// the following procedure:
///   1) Returns exact trip count if it is known.
///   2) Returns expected trip count according to profile data if any.
///   3) Returns upper bound estimate if it is known.
///   4) Returns std::nullopt if all of the above failed.
static std::optional<unsigned> getSmallBestKnownTC(ScalarEvolution &SE,
                                                   Loop *L) {
  // Check if exact trip count is known.
  if (unsigned ExpectedTC = SE.getSmallConstantTripCount(L))
    return ExpectedTC;

  // Check if there is an expected trip count available from profile data.
  if (LoopVectorizeWithBlockFrequency)
    if (auto EstimatedTC = getLoopEstimatedTripCount(L))
      return *EstimatedTC;

  // Check if upper bound estimate is known.
  if (unsigned ExpectedTC = SE.getSmallConstantMaxTripCount(L))
    return ExpectedTC;

  return std::nullopt;
}

/// Return a vector containing interleaved elements from multiple
/// smaller input vectors.
static Value *interleaveVectors(IRBuilderBase &Builder, ArrayRef<Value *> Vals,
                                const Twine &Name) {
  unsigned Factor = Vals.size();
  assert(Factor > 1 && "Tried to interleave invalid number of vectors");

  VectorType *VecTy = cast<VectorType>(Vals[0]->getType());
#ifndef NDEBUG
  for (Value *Val : Vals)
    assert(Val->getType() == VecTy && "Tried to interleave mismatched types");
#endif

  // Scalable vectors cannot use arbitrary shufflevectors (only splats), so
  // must use intrinsics to interleave.
  if (VecTy->isScalableTy()) {
    VectorType *WideVecTy = VectorType::getDoubleElementsVectorType(VecTy);
    return Builder.CreateIntrinsic(
        WideVecTy, Intrinsic::experimental_vector_interleave2, Vals,
        /*FMFSource=*/nullptr, Name);
  }

  // Fixed length. Start by concatenating all vectors into a wide vector.
  Value *WideVec = concatenateVectors(Builder, Vals);

  // Interleave the elements into the wide vector.
  const unsigned NumElts = VecTy->getElementCount().getFixedValue();
  return Builder.CreateShuffleVector(
      WideVec, createInterleaveMask(NumElts, Factor), Name);
}
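
// Illustrative example (fixed-width case): interleaving the two vectors
// <A0, A1> and <B0, B1> first concatenates them into <A0, A1, B0, B1> and
// then applies the interleave mask <0, 2, 1, 3>, producing <A0, B0, A1, B1>.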

namespace {
// Forward declare GeneratedRTChecks.
class GeneratedRTChecks;

using SCEV2ValueTy = DenseMap<const SCEV *, Value *>;
} // namespace

namespace llvm {

AnalysisKey ShouldRunExtraVectorPasses::Key;

/// InnerLoopVectorizer vectorizes loops which contain only one basic
/// block to a specified vectorization factor (VF).
/// This class performs the widening of scalars into vectors, or multiple
/// scalars. This class also implements the following features:
/// * It inserts an epilogue loop for handling loops that don't have iteration
///   counts that are known to be a multiple of the vectorization factor.
/// * It handles the code generation for reduction variables.
/// * Scalarization (implementation using scalars) of un-vectorizable
///   instructions.
/// InnerLoopVectorizer does not perform any vectorization-legality
/// checks, and relies on the caller to check for the different legality
/// aspects. The InnerLoopVectorizer relies on the
/// LoopVectorizationLegality class to provide information about the induction
/// and reduction variables that were found for a given vectorization factor.
class InnerLoopVectorizer {
public:
  InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
                      LoopInfo *LI, DominatorTree *DT,
                      const TargetLibraryInfo *TLI,
                      const TargetTransformInfo *TTI, AssumptionCache *AC,
                      OptimizationRemarkEmitter *ORE, ElementCount VecWidth,
                      ElementCount MinProfitableTripCount,
                      unsigned UnrollFactor, LoopVectorizationLegality *LVL,
                      LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
                      ProfileSummaryInfo *PSI, GeneratedRTChecks &RTChecks)
      : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
        AC(AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor),
        Builder(PSE.getSE()->getContext()), Legal(LVL), Cost(CM), BFI(BFI),
        PSI(PSI), RTChecks(RTChecks) {
    // Query this against the original loop and save it here because the
    // profile of the original loop header may change as the transformation
    // happens.
    OptForSizeBasedOnProfile = llvm::shouldOptimizeForSize(
        OrigLoop->getHeader(), PSI, BFI, PGSOQueryType::IRPass);

    if (MinProfitableTripCount.isZero())
      this->MinProfitableTripCount = VecWidth;
    else
      this->MinProfitableTripCount = MinProfitableTripCount;
  }

  virtual ~InnerLoopVectorizer() = default;

  /// Create a new empty loop that will contain vectorized instructions later
  /// on, while the old loop will be used as the scalar remainder. Control flow
  /// is generated around the vectorized (and scalar epilogue) loops consisting
  /// of various checks and bypasses. Return the pre-header block of the new
  /// loop and the start value for the canonical induction, if it is != 0. The
  /// latter is the case when vectorizing the epilogue loop. In the case of
  /// epilogue vectorization, this function is overridden to handle the more
  /// complex control flow around the loops. \p ExpandedSCEVs is used to
  /// look up SCEV expansions for expressions needed during skeleton creation.
  virtual std::pair<BasicBlock *, Value *>
  createVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs);

  /// Fix the vectorized code, taking care of header phi's, live-outs, and
  /// more.
  void fixVectorizedLoop(VPTransformState &State, VPlan &Plan);

  // Return true if any runtime check is added.
  bool areSafetyChecksAdded() { return AddedSafetyChecks; }

  /// A type for vectorized values in the new loop. Each value from the
  /// original loop, when vectorized, is represented by UF vector values in the
  /// new unrolled loop, where UF is the unroll factor.
  using VectorParts = SmallVector<Value *, 2>;

  /// A helper function to scalarize a single Instruction in the innermost loop.
  /// Generates a sequence of scalar instances for each lane between \p MinLane
  /// and \p MaxLane, times each part between \p MinPart and \p MaxPart,
  /// inclusive. Uses the VPValue operands from \p RepRecipe instead of \p
  /// Instr's operands.
  void scalarizeInstruction(const Instruction *Instr,
                            VPReplicateRecipe *RepRecipe,
                            const VPIteration &Instance,
                            VPTransformState &State);

  /// Construct the vector value of a scalarized value \p V one lane at a time.
  void packScalarIntoVectorValue(VPValue *Def, const VPIteration &Instance,
                                 VPTransformState &State);

  /// Try to vectorize interleaved access group \p Group with the base address
  /// given in \p Addr, optionally masking the vector operations if \p
  /// BlockInMask is non-null. Use \p State to translate given VPValues to IR
  /// values in the vectorized loop.
  void vectorizeInterleaveGroup(const InterleaveGroup<Instruction> *Group,
                                ArrayRef<VPValue *> VPDefs,
                                VPTransformState &State, VPValue *Addr,
                                ArrayRef<VPValue *> StoredValues,
                                VPValue *BlockInMask, bool NeedsMaskForGaps);

  /// Fix the non-induction PHIs in \p Plan.
  void fixNonInductionPHIs(VPlan &Plan, VPTransformState &State);

  /// Returns true if the reordering of FP operations is not allowed, but we
  /// are able to vectorize with strict in-order reductions for the given
  /// RdxDesc.
  bool useOrderedReductions(const RecurrenceDescriptor &RdxDesc);

  // Returns the resume value (bc.merge.rdx) for a reduction as
  // generated by fixReduction.
  PHINode *getReductionResumeValue(const RecurrenceDescriptor &RdxDesc);

  /// Create a new phi node for the induction variable \p OrigPhi to resume
  /// iteration count in the scalar epilogue, from where the vectorized loop
  /// left off. \p Step is the SCEV-expanded induction step to use. In cases
  /// where the loop skeleton is more complicated (i.e., epilogue
  /// vectorization) and the resume values can come from an additional bypass
  /// block, the \p AdditionalBypass pair provides information about the
  /// bypass block and the end value on the edge from bypass to this loop.
  PHINode *createInductionResumeValue(
      PHINode *OrigPhi, const InductionDescriptor &ID, Value *Step,
      ArrayRef<BasicBlock *> BypassBlocks,
      std::pair<BasicBlock *, Value *> AdditionalBypass = {nullptr, nullptr});

  /// Returns the original loop trip count.
  Value *getTripCount() const { return TripCount; }

  /// Used to set the trip count after ILV's construction and after the
  /// preheader block has been executed. Note that this always holds the trip
  /// count of the original loop for both main loop and epilogue vectorization.
  void setTripCount(Value *TC) { TripCount = TC; }

protected:
  friend class LoopVectorizationPlanner;

  /// A small list of PHINodes.
  using PhiVector = SmallVector<PHINode *, 4>;

  /// A type for scalarized values in the new loop. Each value from the
  /// original loop, when scalarized, is represented by UF x VF scalar values
  /// in the new unrolled loop, where UF is the unroll factor and VF is the
  /// vectorization factor.
  using ScalarParts = SmallVector<SmallVector<Value *, 4>, 2>;

  /// Set up the values of the IVs correctly when exiting the vector loop.
  void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II,
                    Value *VectorTripCount, Value *EndValue,
                    BasicBlock *MiddleBlock, BasicBlock *VectorHeader,
                    VPlan &Plan, VPTransformState &State);

  /// Handle all cross-iteration phis in the header.
  void fixCrossIterationPHIs(VPTransformState &State);

  /// Create the exit value of first order recurrences in the middle block and
  /// update their users.
  void fixFixedOrderRecurrence(VPFirstOrderRecurrencePHIRecipe *PhiR,
                               VPTransformState &State);

  /// Create code for the loop exit value of the reduction.
  void fixReduction(VPReductionPHIRecipe *Phi, VPTransformState &State);

  /// Iteratively sink the scalarized operands of a predicated instruction into
  /// the block that was created for it.
  void sinkScalarOperands(Instruction *PredInst);

  /// Shrinks vector element sizes to the smallest bitwidth they can be legally
  /// represented as.
  void truncateToMinimalBitwidths(VPTransformState &State);

  /// Returns (and creates if needed) the trip count of the widened loop.
  Value *getOrCreateVectorTripCount(BasicBlock *InsertBlock);

  /// Returns a bitcasted value to the requested vector type.
  /// Also handles bitcasts of vector<float> <-> vector<pointer> types.
  Value *createBitOrPointerCast(Value *V, VectorType *DstVTy,
                                const DataLayout &DL);

  /// Emit a bypass check to see if the vector trip count is zero, including if
  /// it overflows.
  void emitIterationCountCheck(BasicBlock *Bypass);

  /// Emit a bypass check to see if all of the SCEV assumptions we've
  /// had to make are correct. Returns the block containing the checks or
  /// nullptr if no checks have been added.
  BasicBlock *emitSCEVChecks(BasicBlock *Bypass);

  /// Emit bypass checks to check any memory assumptions we may have made.
  /// Returns the block containing the checks or nullptr if no checks have been
  /// added.
  BasicBlock *emitMemRuntimeChecks(BasicBlock *Bypass);

  /// Emit basic blocks (prefixed with \p Prefix) for the iteration check,
  /// vector loop preheader, middle block and scalar preheader.
  void createVectorLoopSkeleton(StringRef Prefix);

  /// Create new phi nodes for the induction variables to resume iteration
  /// count in the scalar epilogue, from where the vectorized loop left off.
  /// In cases where the loop skeleton is more complicated (e.g. epilogue
  /// vectorization) and the resume values can come from an additional bypass
  /// block, the \p AdditionalBypass pair provides information about the bypass
  /// block and the end value on the edge from bypass to this loop.
  void createInductionResumeValues(
      const SCEV2ValueTy &ExpandedSCEVs,
      std::pair<BasicBlock *, Value *> AdditionalBypass = {nullptr, nullptr});

  /// Complete the loop skeleton by adding debug MDs, creating appropriate
  /// conditional branches in the middle block, preparing the builder and
  /// running the verifier. Return the preheader of the completed vector loop.
  BasicBlock *completeLoopSkeleton();

  /// Collect poison-generating recipes that may generate a poison value that
  /// is used after vectorization, even when their operands are not poison.
  /// Those recipes meet the following conditions:
  /// * Contribute to the address computation of a recipe generating a widen
  ///   memory load/store (VPWidenMemoryInstructionRecipe or
  ///   VPInterleaveRecipe).
  /// * Such a widen memory load/store has at least one underlying Instruction
  ///   that is in a basic block that needs predication and after vectorization
  ///   the generated instruction won't be predicated.
  void collectPoisonGeneratingRecipes(VPTransformState &State);

  /// Allow subclasses to override and print debug traces before/after vplan
  /// execution, when trace information is requested.
  virtual void printDebugTracesAtStart(){};
  virtual void printDebugTracesAtEnd(){};

  /// The original loop.
  Loop *OrigLoop;

  /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies
  /// dynamic knowledge to simplify SCEV expressions and converts them to a
  /// more usable form.
  PredicatedScalarEvolution &PSE;

  /// Loop Info.
  LoopInfo *LI;

  /// Dominator Tree.
  DominatorTree *DT;

  /// Target Library Info.
  const TargetLibraryInfo *TLI;

  /// Target Transform Info.
  const TargetTransformInfo *TTI;

  /// Assumption Cache.
  AssumptionCache *AC;

  /// Interface to emit optimization remarks.
  OptimizationRemarkEmitter *ORE;

  /// The vectorization SIMD factor to use. Each vector will have this many
  /// vector elements.
  ElementCount VF;

  ElementCount MinProfitableTripCount;

  /// The vectorization unroll factor to use. Each scalar is vectorized to this
  /// many different vector instructions.
  unsigned UF;

  /// The builder that we use.
  IRBuilder<> Builder;

  // --- Vectorization state ---

  /// The vector-loop preheader.
  BasicBlock *LoopVectorPreHeader;

  /// The scalar-loop preheader.
  BasicBlock *LoopScalarPreHeader;

  /// Middle Block between the vector and the scalar.
  BasicBlock *LoopMiddleBlock;

  /// The unique ExitBlock of the scalar loop if one exists. Note that
  /// there can be multiple exiting edges reaching this block.
  BasicBlock *LoopExitBlock;

  /// The scalar loop body.
  BasicBlock *LoopScalarBody;

  /// A list of all bypass blocks. The first block is the entry of the loop.
  SmallVector<BasicBlock *, 4> LoopBypassBlocks;

  /// Store instructions that were predicated.
  SmallVector<Instruction *, 4> PredicatedInstructions;

  /// Trip count of the original loop.
  Value *TripCount = nullptr;

  /// Trip count of the widened loop (TripCount - TripCount % (VF*UF)).
  Value *VectorTripCount = nullptr;

  /// The legality analysis.
  LoopVectorizationLegality *Legal;

  /// The profitability analysis.
  LoopVectorizationCostModel *Cost;

  // Record whether runtime checks are added.
  bool AddedSafetyChecks = false;

  // Holds the end values for each induction variable. We save the end values
  // so we can later fix-up the external users of the induction variables.
  DenseMap<PHINode *, Value *> IVEndValues;

  /// BFI and PSI are used to check for profile guided size optimizations.
  BlockFrequencyInfo *BFI;
  ProfileSummaryInfo *PSI;

  // Whether this loop should be optimized for size based on profile guided
  // size optimizations.
  bool OptForSizeBasedOnProfile;

  /// Structure to hold information about generated runtime checks, responsible
  /// for cleaning the checks, if vectorization turns out unprofitable.
  GeneratedRTChecks &RTChecks;

  // Holds the resume values for reductions in the loops, used to set the
  // correct start value of reduction PHIs when vectorizing the epilogue.
  SmallMapVector<const RecurrenceDescriptor *, PHINode *, 4>
      ReductionResumeValues;
};

class InnerLoopUnroller : public InnerLoopVectorizer {
public:
  InnerLoopUnroller(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
                    LoopInfo *LI, DominatorTree *DT,
                    const TargetLibraryInfo *TLI,
                    const TargetTransformInfo *TTI, AssumptionCache *AC,
                    OptimizationRemarkEmitter *ORE, unsigned UnrollFactor,
                    LoopVectorizationLegality *LVL,
                    LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
                    ProfileSummaryInfo *PSI, GeneratedRTChecks &Check)
      : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
                            ElementCount::getFixed(1),
                            ElementCount::getFixed(1), UnrollFactor, LVL, CM,
                            BFI, PSI, Check) {}
};

/// Encapsulate information regarding vectorization of a loop and its epilogue.
/// This information is meant to be updated and used across two stages of
/// epilogue vectorization.
struct EpilogueLoopVectorizationInfo {
  ElementCount MainLoopVF = ElementCount::getFixed(0);
  unsigned MainLoopUF = 0;
  ElementCount EpilogueVF = ElementCount::getFixed(0);
  unsigned EpilogueUF = 0;
  BasicBlock *MainLoopIterationCountCheck = nullptr;
  BasicBlock *EpilogueIterationCountCheck = nullptr;
  BasicBlock *SCEVSafetyCheck = nullptr;
  BasicBlock *MemSafetyCheck = nullptr;
  Value *TripCount = nullptr;
  Value *VectorTripCount = nullptr;

  EpilogueLoopVectorizationInfo(ElementCount MVF, unsigned MUF,
                                ElementCount EVF, unsigned EUF)
      : MainLoopVF(MVF), MainLoopUF(MUF), EpilogueVF(EVF), EpilogueUF(EUF) {
    assert(EUF == 1 &&
           "A high UF for the epilogue loop is likely not beneficial.");
  }
};

/// An extension of the inner loop vectorizer that creates a skeleton for a
/// vectorized loop that has its epilogue (residual) also vectorized.
/// The idea is to run the vplan on a given loop twice, first to set up the
/// skeleton and vectorize the main loop, and second to complete the skeleton
/// from the first step and vectorize the epilogue. This is achieved by
/// deriving two concrete strategy classes from this base class and invoking
/// them in succession from the loop vectorizer planner.
class InnerLoopAndEpilogueVectorizer : public InnerLoopVectorizer {
public:
  InnerLoopAndEpilogueVectorizer(
      Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
      DominatorTree *DT, const TargetLibraryInfo *TLI,
      const TargetTransformInfo *TTI, AssumptionCache *AC,
      OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
      LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
      BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
      GeneratedRTChecks &Checks)
      : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
                            EPI.MainLoopVF, EPI.MainLoopVF, EPI.MainLoopUF, LVL,
                            CM, BFI, PSI, Checks),
        EPI(EPI) {}

  // Override this function to handle the more complex control flow around the
  // three loops.
  std::pair<BasicBlock *, Value *> createVectorizedLoopSkeleton(
      const SCEV2ValueTy &ExpandedSCEVs) final {
    return createEpilogueVectorizedLoopSkeleton(ExpandedSCEVs);
  }

  /// The interface for creating a vectorized skeleton using one of two
  /// different strategies, each corresponding to one execution of the vplan
  /// as described above.
  virtual std::pair<BasicBlock *, Value *>
  createEpilogueVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs) = 0;

  /// Holds and updates state information required to vectorize the main loop
  /// and its epilogue in two separate passes. This setup helps us avoid
  /// regenerating and recomputing runtime safety checks. It also helps us to
  /// shorten the iteration-count-check path length for the cases where the
  /// iteration count of the loop is so small that the main vector loop is
  /// completely skipped.
  EpilogueLoopVectorizationInfo &EPI;
};

/// A specialized derived class of inner loop vectorizer that performs
/// vectorization of *main* loops in the process of vectorizing loops and their
/// epilogues.
class EpilogueVectorizerMainLoop : public InnerLoopAndEpilogueVectorizer {
public:
  EpilogueVectorizerMainLoop(
      Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
      DominatorTree *DT, const TargetLibraryInfo *TLI,
      const TargetTransformInfo *TTI, AssumptionCache *AC,
      OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
      LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
      BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
      GeneratedRTChecks &Check)
      : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
                                       EPI, LVL, CM, BFI, PSI, Check) {}
  /// Implements the interface for creating a vectorized skeleton using the
  /// *main loop* strategy (i.e. the first pass of vplan execution).
  std::pair<BasicBlock *, Value *>
  createEpilogueVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs) final;

protected:
  /// Emits an iteration count bypass check once for the main loop (when \p
  /// ForEpilogue is false) and once for the epilogue loop (when \p
  /// ForEpilogue is true).
  BasicBlock *emitIterationCountCheck(BasicBlock *Bypass, bool ForEpilogue);
  void printDebugTracesAtStart() override;
  void printDebugTracesAtEnd() override;
};

// A specialized derived class of inner loop vectorizer that performs
// vectorization of *epilogue* loops in the process of vectorizing loops and
// their epilogues.
class EpilogueVectorizerEpilogueLoop : public InnerLoopAndEpilogueVectorizer {
public:
  EpilogueVectorizerEpilogueLoop(
      Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
      DominatorTree *DT, const TargetLibraryInfo *TLI,
      const TargetTransformInfo *TTI, AssumptionCache *AC,
      OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
      LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
      BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
      GeneratedRTChecks &Checks)
      : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
                                       EPI, LVL, CM, BFI, PSI, Checks) {
    TripCount = EPI.TripCount;
  }
  /// Implements the interface for creating a vectorized skeleton using the
  /// *epilogue loop* strategy (i.e. the second pass of vplan execution).
  std::pair<BasicBlock *, Value *>
  createEpilogueVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs) final;

protected:
  /// Emits an iteration count bypass check after the main vector loop has
  /// finished to see if there are any iterations left to execute by either
  /// the vector epilogue or the scalar epilogue.
  BasicBlock *emitMinimumVectorEpilogueIterCountCheck(BasicBlock *Bypass,
                                                      BasicBlock *Insert);
  void printDebugTracesAtStart() override;
  void printDebugTracesAtEnd() override;
};
} // end namespace llvm

/// Look for a meaningful debug location on the instruction or its
/// operands.
static Instruction *getDebugLocFromInstOrOperands(Instruction *I) {
  if (!I)
    return I;

  DebugLoc Empty;
  if (I->getDebugLoc() != Empty)
    return I;

  for (Use &Op : I->operands()) {
    if (Instruction *OpInst = dyn_cast<Instruction>(Op))
      if (OpInst->getDebugLoc() != Empty)
        return OpInst;
  }

  return I;
}

/// Write a \p DebugMsg about vectorization to the debug output stream. If \p I
/// is passed, the message relates to that particular instruction.
#ifndef NDEBUG
static void debugVectorizationMessage(const StringRef Prefix,
                                      const StringRef DebugMsg,
                                      Instruction *I) {
  dbgs() << "LV: " << Prefix << DebugMsg;
  if (I != nullptr)
    dbgs() << " " << *I;
  else
    dbgs() << '.';
  dbgs() << '\n';
}
#endif

/// Create an analysis remark that explains why vectorization failed.
///
/// \p PassName is the name of the pass (e.g. can be AlwaysPrint). \p
/// RemarkName is the identifier for the remark. If \p I is passed it is an
/// instruction that prevents vectorization. Otherwise \p TheLoop is used for
/// the location of the remark. \return the remark object that can be
/// streamed to.
static OptimizationRemarkAnalysis
createLVAnalysis(const char *PassName, StringRef RemarkName, Loop *TheLoop,
                 Instruction *I) {
  Value *CodeRegion = TheLoop->getHeader();
  DebugLoc DL = TheLoop->getStartLoc();

  if (I) {
    CodeRegion = I->getParent();
    // If there is no debug location attached to the instruction, revert back
    // to using the loop's.
    if (I->getDebugLoc())
      DL = I->getDebugLoc();
  }

  return OptimizationRemarkAnalysis(PassName, RemarkName, DL, CodeRegion);
}

namespace llvm {

/// Return a value for Step multiplied by VF.
Value *createStepForVF(IRBuilderBase &B, Type *Ty, ElementCount VF,
                       int64_t Step) {
  assert(Ty->isIntegerTy() && "Expected an integer step");
  return B.CreateElementCount(Ty, VF.multiplyCoefficientBy(Step));
}
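
// Illustrative example: with an i64 type and Step = 2, a fixed VF of 4 yields
// the constant i64 8, while a scalable VF of <vscale x 4> yields a value
// equivalent to 8 * vscale (materialized via the llvm.vscale intrinsic).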

/// Return the runtime value for VF.
Value *getRuntimeVF(IRBuilderBase &B, Type *Ty, ElementCount VF) {
  return B.CreateElementCount(Ty, VF);
}

const SCEV *createTripCountSCEV(Type *IdxTy, PredicatedScalarEvolution &PSE,
                                Loop *OrigLoop) {
  const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount();
  assert(!isa<SCEVCouldNotCompute>(BackedgeTakenCount) && "Invalid loop count");

  ScalarEvolution &SE = *PSE.getSE();
  return SE.getTripCountFromExitCount(BackedgeTakenCount, IdxTy, OrigLoop);
}

static Value *getRuntimeVFAsFloat(IRBuilderBase &B, Type *FTy,
                                  ElementCount VF) {
  assert(FTy->isFloatingPointTy() && "Expected floating point type!");
  Type *IntTy = IntegerType::get(FTy->getContext(), FTy->getScalarSizeInBits());
  Value *RuntimeVF = getRuntimeVF(B, IntTy, VF);
  return B.CreateUIToFP(RuntimeVF, FTy);
}

void reportVectorizationFailure(const StringRef DebugMsg,
                                const StringRef OREMsg, const StringRef ORETag,
                                OptimizationRemarkEmitter *ORE, Loop *TheLoop,
                                Instruction *I) {
  LLVM_DEBUG(debugVectorizationMessage("Not vectorizing: ", DebugMsg, I));
  LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
  ORE->emit(
      createLVAnalysis(Hints.vectorizeAnalysisPassName(), ORETag, TheLoop, I)
      << "loop not vectorized: " << OREMsg);
}

void reportVectorizationInfo(const StringRef Msg, const StringRef ORETag,
                             OptimizationRemarkEmitter *ORE, Loop *TheLoop,
                             Instruction *I) {
  LLVM_DEBUG(debugVectorizationMessage("", Msg, I));
  LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
  ORE->emit(
      createLVAnalysis(Hints.vectorizeAnalysisPassName(), ORETag, TheLoop, I)
      << Msg);
}

} // end namespace llvm

#ifndef NDEBUG
/// \return string containing a file name and a line # for the given loop.
static std::string getDebugLocString(const Loop *L) {
  std::string Result;
  if (L) {
    raw_string_ostream OS(Result);
    if (const DebugLoc LoopDbgLoc = L->getStartLoc())
      LoopDbgLoc.print(OS);
    else
      // Just print the module name.
      OS << L->getHeader()->getParent()->getParent()->getModuleIdentifier();
    OS.flush();
  }
  return Result;
}
#endif

void InnerLoopVectorizer::collectPoisonGeneratingRecipes(
    VPTransformState &State) {
  // Collect recipes in the backward slice of `Root` that may generate a poison
  // value that is used after vectorization.
  SmallPtrSet<VPRecipeBase *, 16> Visited;
  auto collectPoisonGeneratingInstrsInBackwardSlice([&](VPRecipeBase *Root) {
    SmallVector<VPRecipeBase *, 16> Worklist;
    Worklist.push_back(Root);

    // Traverse the backward slice of Root through its use-def chain.
    while (!Worklist.empty()) {
      VPRecipeBase *CurRec = Worklist.back();
      Worklist.pop_back();

      if (!Visited.insert(CurRec).second)
        continue;

      // Prune search if we find another recipe generating a widen memory
      // instruction. Widen memory instructions involved in address computation
      // will lead to gather/scatter instructions, which don't need to be
      // handled.
      if (isa<VPWidenMemoryInstructionRecipe>(CurRec) ||
          isa<VPInterleaveRecipe>(CurRec) ||
          isa<VPScalarIVStepsRecipe>(CurRec) ||
          isa<VPCanonicalIVPHIRecipe>(CurRec) ||
          isa<VPActiveLaneMaskPHIRecipe>(CurRec))
        continue;

      // This recipe contributes to the address computation of a widen
      // load/store. If the underlying instruction has poison-generating flags,
      // drop them directly.
      if (auto *RecWithFlags = dyn_cast<VPRecipeWithIRFlags>(CurRec)) {
        RecWithFlags->dropPoisonGeneratingFlags();
      } else {
        Instruction *Instr = CurRec->getUnderlyingInstr();
        (void)Instr;
        assert((!Instr || !Instr->hasPoisonGeneratingFlags()) &&
               "found instruction with poison generating flags not covered by "
               "VPRecipeWithIRFlags");
      }

      // Add new definitions to the worklist.
      for (VPValue *operand : CurRec->operands())
        if (VPRecipeBase *OpDef = operand->getDefiningRecipe())
          Worklist.push_back(OpDef);
    }
  });

  // Traverse all the recipes in the VPlan and collect the poison-generating
  // recipes in the backward slice starting at the address of a
  // VPWidenMemoryInstructionRecipe or VPInterleaveRecipe.
  auto Iter = vp_depth_first_deep(State.Plan->getEntry());
  for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) {
    for (VPRecipeBase &Recipe : *VPBB) {
      if (auto *WidenRec = dyn_cast<VPWidenMemoryInstructionRecipe>(&Recipe)) {
        Instruction &UnderlyingInstr = WidenRec->getIngredient();
        VPRecipeBase *AddrDef = WidenRec->getAddr()->getDefiningRecipe();
        if (AddrDef && WidenRec->isConsecutive() &&
            Legal->blockNeedsPredication(UnderlyingInstr.getParent()))
          collectPoisonGeneratingInstrsInBackwardSlice(AddrDef);
      } else if (auto *InterleaveRec = dyn_cast<VPInterleaveRecipe>(&Recipe)) {
        VPRecipeBase *AddrDef = InterleaveRec->getAddr()->getDefiningRecipe();
        if (AddrDef) {
          // Check if any member of the interleave group needs predication.
          const InterleaveGroup<Instruction> *InterGroup =
              InterleaveRec->getInterleaveGroup();
          bool NeedPredication = false;
          for (int I = 0, NumMembers = InterGroup->getNumMembers();
               I < NumMembers; ++I) {
            Instruction *Member = InterGroup->getMember(I);
            if (Member)
              NeedPredication |=
                  Legal->blockNeedsPredication(Member->getParent());
          }

          if (NeedPredication)
            collectPoisonGeneratingInstrsInBackwardSlice(AddrDef);
        }
      }
    }
  }
}
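
// Illustrative example (conceptual only): consider a predicated block
//
//   if (Cond[i]) { X = *(P + i); }   // address P + i computed 'inbounds'
//
// If the load is widened into an unpredicated wide load, the address is
// computed for all lanes, including lanes where Cond[i] is false. Any
// poison-generating flags on that address computation (e.g. 'inbounds' on a
// getelementptr, or 'nuw'/'nsw' on the index arithmetic) must be dropped,
// which is what the routine above does.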

PHINode *InnerLoopVectorizer::getReductionResumeValue(
    const RecurrenceDescriptor &RdxDesc) {
  auto It = ReductionResumeValues.find(&RdxDesc);
  assert(It != ReductionResumeValues.end() &&
         "Expected to find a resume value for the reduction.");
  return It->second;
}

namespace llvm {

// Loop vectorization cost-model hints how the scalar epilogue loop should be
// lowered.
enum ScalarEpilogueLowering {

  // The default: allowing scalar epilogues.
  CM_ScalarEpilogueAllowed,

  // Vectorization with OptForSize: don't allow epilogues.
  CM_ScalarEpilogueNotAllowedOptSize,

  // A special case of vectorization with OptForSize: loops with a very small
  // trip count are considered for vectorization under OptForSize, thereby
  // making sure the cost of their loop body is dominant, free of runtime
  // guards and scalar iteration overheads.
  CM_ScalarEpilogueNotAllowedLowTripLoop,

  // Loop hint predicate indicating an epilogue is undesired.
  CM_ScalarEpilogueNotNeededUsePredicate,

  // Directive indicating we must either tail fold or not vectorize.
  CM_ScalarEpilogueNotAllowedUsePredicate
};

using InstructionVFPair = std::pair<Instruction *, ElementCount>;

/// LoopVectorizationCostModel - estimates the expected speedups due to
/// vectorization.
/// In many cases vectorization is not profitable. This can happen because of
/// a number of reasons. In this class we mainly attempt to predict the
/// expected speedup/slowdowns due to the supported instruction set. We use the
/// TargetTransformInfo to query the different backends for the cost of
/// different operations.
class LoopVectorizationCostModel {
public:
  LoopVectorizationCostModel(ScalarEpilogueLowering SEL, Loop *L,
                             PredicatedScalarEvolution &PSE, LoopInfo *LI,
                             LoopVectorizationLegality *Legal,
                             const TargetTransformInfo &TTI,
                             const TargetLibraryInfo *TLI, DemandedBits *DB,
                             AssumptionCache *AC,
                             OptimizationRemarkEmitter *ORE, const Function *F,
                             const LoopVectorizeHints *Hints,
                             InterleavedAccessInfo &IAI)
      : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal),
        TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F),
        Hints(Hints), InterleaveInfo(IAI) {}

  /// \return An upper bound for the vectorization factors (both fixed and
  /// scalable). If the factors are 0, vectorization and interleaving should be
  /// avoided up front.
  FixedScalableVFPair computeMaxVF(ElementCount UserVF, unsigned UserIC);

  /// \return True if runtime checks are required for vectorization, and false
  /// otherwise.
  bool runtimeChecksRequired();

  /// Setup cost-based decisions for user vectorization factor.
  /// \return true if the UserVF is a feasible VF to be chosen.
  bool selectUserVectorizationFactor(ElementCount UserVF) {
    collectUniformsAndScalars(UserVF);
    collectInstsToScalarize(UserVF);
    return expectedCost(UserVF).first.isValid();
  }

  /// \return The size (in bits) of the smallest and widest types in the code
  /// that needs to be vectorized. We ignore values that remain scalar such as
  /// 64 bit loop indices.
  std::pair<unsigned, unsigned> getSmallestAndWidestTypes();

  /// \return The desired interleave count.
  /// If interleave count has been specified by metadata it will be returned.
  /// Otherwise, the interleave count is computed and returned. VF and LoopCost
  /// are the selected vectorization factor and the cost of the selected VF.
  unsigned selectInterleaveCount(ElementCount VF, InstructionCost LoopCost);

  /// Memory access instruction may be vectorized in more than one way.
  /// Form of instruction after vectorization depends on cost.
  /// This function takes cost-based decisions for Load/Store instructions
  /// and collects them in a map. This decisions map is used for building
  /// the lists of loop-uniform and loop-scalar instructions.
  /// The calculated cost is saved with widening decision in order to
  /// avoid redundant calculations.
  void setCostBasedWideningDecision(ElementCount VF);

  /// A struct that represents some properties of the register usage
  /// of a loop.
  struct RegisterUsage {
    /// Holds the number of loop invariant values that are used in the loop.
    /// The key is ClassID of target-provided register class.
    SmallMapVector<unsigned, unsigned, 4> LoopInvariantRegs;
    /// Holds the maximum number of concurrent live intervals in the loop.
    /// The key is ClassID of target-provided register class.
    SmallMapVector<unsigned, unsigned, 4> MaxLocalUsers;
  };

  /// \return Returns information about the register usages of the loop for the
  /// given vectorization factors.
  SmallVector<RegisterUsage, 8>
  calculateRegisterUsage(ArrayRef<ElementCount> VFs);

  /// Collect values we want to ignore in the cost model.
  void collectValuesToIgnore();

  /// Collect all element types in the loop for which widening is needed.
  void collectElementTypesForWidening();

  /// Split reductions into those that happen in the loop, and those that
  /// happen outside. In loop reductions are collected into
  /// InLoopReductionChains.
  void collectInLoopReductions();

  /// Returns true if we should use strict in-order reductions for the given
  /// RdxDesc. This is true if the -enable-strict-reductions flag is passed,
  /// the IsOrdered flag of RdxDesc is set and we do not allow reordering
  /// of FP operations.
  bool useOrderedReductions(const RecurrenceDescriptor &RdxDesc) const {
    return !Hints->allowReordering() && RdxDesc.isOrdered();
  }

  /// \returns The smallest bitwidth each instruction can be represented with.
  /// The vector equivalents of these instructions should be truncated to this
  /// type.
  const MapVector<Instruction *, uint64_t> &getMinimalBitwidths() const {
    return MinBWs;
  }

  /// \returns True if it is more profitable to scalarize instruction \p I for
  /// vectorization factor \p VF.
  bool isProfitableToScalarize(Instruction *I, ElementCount VF) const {
    assert(VF.isVector() &&
           "Profitable to scalarize relevant only for VF > 1.");

    // Cost model is not run in the VPlan-native path - return conservative
    // result until this changes.
    if (EnableVPlanNativePath)
      return false;

    auto Scalars = InstsToScalarize.find(VF);
    assert(Scalars != InstsToScalarize.end() &&
           "VF not yet analyzed for scalarization profitability");
    return Scalars->second.contains(I);
  }

  /// Returns true if \p I is known to be uniform after vectorization.
  bool isUniformAfterVectorization(Instruction *I, ElementCount VF) const {
    // Pseudo probe needs to be duplicated for each unrolled iteration and
    // vector lane so that profiled loop trip count can be accurately
    // accumulated instead of being under counted.
    if (isa<PseudoProbeInst>(I))
      return false;

    if (VF.isScalar())
      return true;

    // Cost model is not run in the VPlan-native path - return conservative
    // result until this changes.
    if (EnableVPlanNativePath)
      return false;

    auto UniformsPerVF = Uniforms.find(VF);
    assert(UniformsPerVF != Uniforms.end() &&
           "VF not yet analyzed for uniformity");
    return UniformsPerVF->second.count(I);
  }
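
  // Illustrative example (assumed behavior): the address computation feeding
  // a consecutive load such as 'a[i]' is typically uniform after
  // vectorization, since only the first lane's address is needed to form the
  // wide load; it is therefore kept scalar rather than widened per lane.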
1331 bool isScalarAfterVectorization(Instruction *I, ElementCount VF) const { 1332 if (VF.isScalar()) 1333 return true; 1334 1335 // Cost model is not run in the VPlan-native path - return conservative 1336 // result until this changes. 1337 if (EnableVPlanNativePath) 1338 return false; 1339 1340 auto ScalarsPerVF = Scalars.find(VF); 1341 assert(ScalarsPerVF != Scalars.end() && 1342 "Scalar values are not calculated for VF"); 1343 return ScalarsPerVF->second.count(I); 1344 } 1345 1346 /// \returns True if instruction \p I can be truncated to a smaller bitwidth 1347 /// for vectorization factor \p VF. 1348 bool canTruncateToMinimalBitwidth(Instruction *I, ElementCount VF) const { 1349 return VF.isVector() && MinBWs.contains(I) && 1350 !isProfitableToScalarize(I, VF) && 1351 !isScalarAfterVectorization(I, VF); 1352 } 1353 1354 /// Decision that was taken during cost calculation for memory instruction. 1355 enum InstWidening { 1356 CM_Unknown, 1357 CM_Widen, // For consecutive accesses with stride +1. 1358 CM_Widen_Reverse, // For consecutive accesses with stride -1. 1359 CM_Interleave, 1360 CM_GatherScatter, 1361 CM_Scalarize 1362 }; 1363 1364 /// Save vectorization decision \p W and \p Cost taken by the cost model for 1365 /// instruction \p I and vector width \p VF. 1366 void setWideningDecision(Instruction *I, ElementCount VF, InstWidening W, 1367 InstructionCost Cost) { 1368 assert(VF.isVector() && "Expected VF >=2"); 1369 WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost); 1370 } 1371 1372 /// Save vectorization decision \p W and \p Cost taken by the cost model for 1373 /// interleaving group \p Grp and vector width \p VF. 1374 void setWideningDecision(const InterleaveGroup<Instruction> *Grp, 1375 ElementCount VF, InstWidening W, 1376 InstructionCost Cost) { 1377 assert(VF.isVector() && "Expected VF >=2"); 1378 /// Broadcast this decision to all instructions inside the group. 1379 /// But the cost will be assigned to one instruction only. 1380 for (unsigned i = 0; i < Grp->getFactor(); ++i) { 1381 if (auto *I = Grp->getMember(i)) { 1382 if (Grp->getInsertPos() == I) 1383 WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost); 1384 else 1385 WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, 0); 1386 } 1387 } 1388 } 1389 1390 /// Return the cost model decision for the given instruction \p I and vector 1391 /// width \p VF. Return CM_Unknown if this instruction did not pass 1392 /// through the cost modeling. 1393 InstWidening getWideningDecision(Instruction *I, ElementCount VF) const { 1394 assert(VF.isVector() && "Expected VF to be a vector VF"); 1395 // Cost model is not run in the VPlan-native path - return conservative 1396 // result until this changes. 1397 if (EnableVPlanNativePath) 1398 return CM_GatherScatter; 1399 1400 std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF); 1401 auto Itr = WideningDecisions.find(InstOnVF); 1402 if (Itr == WideningDecisions.end()) 1403 return CM_Unknown; 1404 return Itr->second.first; 1405 } 1406 1407 /// Return the vectorization cost for the given instruction \p I and vector 1408 /// width \p VF.
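/// Note that for a member of an interleave group the recorded cost follows
/// the convention of setWideningDecision above: the insert-position member
/// carries the whole group's cost and the remaining members report 0.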
1409 InstructionCost getWideningCost(Instruction *I, ElementCount VF) { 1410 assert(VF.isVector() && "Expected VF >=2"); 1411 std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF); 1412 assert(WideningDecisions.contains(InstOnVF) && 1413 "The cost is not calculated"); 1414 return WideningDecisions[InstOnVF].second; 1415 } 1416 1417 /// Return True if instruction \p I is an optimizable truncate whose operand 1418 /// is an induction variable. Such a truncate will be removed by adding a new 1419 /// induction variable with the destination type. 1420 bool isOptimizableIVTruncate(Instruction *I, ElementCount VF) { 1421 // If the instruction is not a truncate, return false. 1422 auto *Trunc = dyn_cast<TruncInst>(I); 1423 if (!Trunc) 1424 return false; 1425 1426 // Get the source and destination types of the truncate. 1427 Type *SrcTy = ToVectorTy(cast<CastInst>(I)->getSrcTy(), VF); 1428 Type *DestTy = ToVectorTy(cast<CastInst>(I)->getDestTy(), VF); 1429 1430 // If the truncate is free for the given types, return false. Replacing a 1431 // free truncate with an induction variable would add an induction variable 1432 // update instruction to each iteration of the loop. We exclude from this 1433 // check the primary induction variable since it will need an update 1434 // instruction regardless. 1435 Value *Op = Trunc->getOperand(0); 1436 if (Op != Legal->getPrimaryInduction() && TTI.isTruncateFree(SrcTy, DestTy)) 1437 return false; 1438 1439 // If the truncated value is not an induction variable, return false. 1440 return Legal->isInductionPhi(Op); 1441 } 1442 1443 /// Collects the instructions to scalarize for each predicated instruction in 1444 /// the loop. 1445 void collectInstsToScalarize(ElementCount VF); 1446 1447 /// Collect Uniform and Scalar values for the given \p VF. 1448 /// The sets depend on CM decision for Load/Store instructions 1449 /// that may be vectorized as interleave, gather-scatter or scalarized. 1450 void collectUniformsAndScalars(ElementCount VF) { 1451 // Do the analysis once. 1452 if (VF.isScalar() || Uniforms.contains(VF)) 1453 return; 1454 setCostBasedWideningDecision(VF); 1455 collectLoopUniforms(VF); 1456 collectLoopScalars(VF); 1457 } 1458 1459 /// Returns true if the target machine supports masked store operation 1460 /// for the given \p DataType and kind of access to \p Ptr. 1461 bool isLegalMaskedStore(Type *DataType, Value *Ptr, Align Alignment) const { 1462 return Legal->isConsecutivePtr(DataType, Ptr) && 1463 TTI.isLegalMaskedStore(DataType, Alignment); 1464 } 1465 1466 /// Returns true if the target machine supports masked load operation 1467 /// for the given \p DataType and kind of access to \p Ptr. 1468 bool isLegalMaskedLoad(Type *DataType, Value *Ptr, Align Alignment) const { 1469 return Legal->isConsecutivePtr(DataType, Ptr) && 1470 TTI.isLegalMaskedLoad(DataType, Alignment); 1471 } 1472 1473 /// Returns true if the target machine can represent \p V as a masked gather 1474 /// or scatter operation. 
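/// E.g., a load of A[3 * i] is not consecutive and cannot be widened into a
/// plain vector load, but it may still be vectorized as a gather of
/// <VF x i32> elements if the target reports masked gathers as legal for
/// that type and alignment (i32 is an illustrative element type).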
1475 bool isLegalGatherOrScatter(Value *V, ElementCount VF) { 1476 bool LI = isa<LoadInst>(V); 1477 bool SI = isa<StoreInst>(V); 1478 if (!LI && !SI) 1479 return false; 1480 auto *Ty = getLoadStoreType(V); 1481 Align Align = getLoadStoreAlignment(V); 1482 if (VF.isVector()) 1483 Ty = VectorType::get(Ty, VF); 1484 return (LI && TTI.isLegalMaskedGather(Ty, Align)) || 1485 (SI && TTI.isLegalMaskedScatter(Ty, Align)); 1486 } 1487 1488 /// Returns true if the target machine supports all of the reduction 1489 /// variables found for the given VF. 1490 bool canVectorizeReductions(ElementCount VF) const { 1491 return (all_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool { 1492 const RecurrenceDescriptor &RdxDesc = Reduction.second; 1493 return TTI.isLegalToVectorizeReduction(RdxDesc, VF); 1494 })); 1495 } 1496 1497 /// Given costs for both strategies, return true if the scalar predication 1498 /// lowering should be used for div/rem. This incorporates an override 1499 /// option so it is not simply a cost comparison. 1500 bool isDivRemScalarWithPredication(InstructionCost ScalarCost, 1501 InstructionCost SafeDivisorCost) const { 1502 switch (ForceSafeDivisor) { 1503 case cl::BOU_UNSET: 1504 return ScalarCost < SafeDivisorCost; 1505 case cl::BOU_TRUE: 1506 return false; 1507 case cl::BOU_FALSE: 1508 return true; 1509 }; 1510 llvm_unreachable("impossible case value"); 1511 } 1512 1513 /// Returns true if \p I is an instruction which requires predication and 1514 /// for which our chosen predication strategy is scalarization (i.e. we 1515 /// don't have an alternate strategy such as masking available). 1516 /// \p VF is the vectorization factor that will be used to vectorize \p I. 1517 bool isScalarWithPredication(Instruction *I, ElementCount VF) const; 1518 1519 /// Returns true if \p I is an instruction that needs to be predicated 1520 /// at runtime. The result is independent of the predication mechanism. 1521 /// Superset of instructions that return true for isScalarWithPredication. 1522 bool isPredicatedInst(Instruction *I) const; 1523 1524 /// Return the costs for our two available strategies for lowering a 1525 /// div/rem operation which requires speculating at least one lane. 1526 /// First result is for scalarization (will be invalid for scalable 1527 /// vectors); second is for the safe-divisor strategy. 1528 std::pair<InstructionCost, InstructionCost> 1529 getDivRemSpeculationCost(Instruction *I, 1530 ElementCount VF) const; 1531 1532 /// Returns true if \p I is a memory instruction with consecutive memory 1533 /// access that can be widened. 1534 bool memoryInstructionCanBeWidened(Instruction *I, ElementCount VF); 1535 1536 /// Returns true if \p I is a memory instruction in an interleaved-group 1537 /// of memory accesses that can be vectorized with wide vector loads/stores 1538 /// and shuffles. 1539 bool interleavedAccessCanBeWidened(Instruction *I, ElementCount VF); 1540 1541 /// Check if \p Instr belongs to any interleaved access group. 1542 bool isAccessInterleaved(Instruction *Instr) { 1543 return InterleaveInfo.isInterleaved(Instr); 1544 } 1545 1546 /// Get the interleaved access group that \p Instr belongs to. 1547 const InterleaveGroup<Instruction> * 1548 getInterleavedAccessGroup(Instruction *Instr) { 1549 return InterleaveInfo.getInterleaveGroup(Instr); 1550 } 1551 1552 /// Returns true if we're required to use a scalar epilogue for at least 1553 /// the final iteration of the original loop. 
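/// For example, a loop that may exit from a block other than the latch has
/// to run its exiting iteration in scalar form, and an interleave group with
/// a gap in its trailing accesses must not read past the end of the
/// underlying data, so the final iteration(s) are peeled into a scalar
/// epilogue.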
1554 bool requiresScalarEpilogue(bool IsVectorizing) const { 1555 if (!isScalarEpilogueAllowed()) 1556 return false; 1557 // If we might exit from anywhere but the latch, must run the exiting 1558 // iteration in scalar form. 1559 if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) 1560 return true; 1561 return IsVectorizing && InterleaveInfo.requiresScalarEpilogue(); 1562 } 1563 1564 /// Returns true if we're required to use a scalar epilogue for at least 1565 /// the final iteration of the original loop for all VFs in \p Range. 1566 /// A scalar epilogue must either be required for all VFs in \p Range or for 1567 /// none. 1568 bool requiresScalarEpilogue(VFRange Range) const { 1569 auto RequiresScalarEpilogue = [this](ElementCount VF) { 1570 return requiresScalarEpilogue(VF.isVector()); 1571 }; 1572 bool IsRequired = all_of(Range, RequiresScalarEpilogue); 1573 assert( 1574 (IsRequired || none_of(Range, RequiresScalarEpilogue)) && 1575 "all VFs in range must agree on whether a scalar epilogue is required"); 1576 return IsRequired; 1577 } 1578 1579 /// Returns true if a scalar epilogue is not allowed due to optsize or a 1580 /// loop hint annotation. 1581 bool isScalarEpilogueAllowed() const { 1582 return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed; 1583 } 1584 1585 /// Returns the TailFoldingStyle that is best for the current loop. 1586 TailFoldingStyle 1587 getTailFoldingStyle(bool IVUpdateMayOverflow = true) const { 1588 if (!CanFoldTailByMasking) 1589 return TailFoldingStyle::None; 1590 1591 if (ForceTailFoldingStyle.getNumOccurrences()) 1592 return ForceTailFoldingStyle; 1593 1594 return TTI.getPreferredTailFoldingStyle(IVUpdateMayOverflow); 1595 } 1596 1597 /// Returns true if all loop blocks should be masked to fold the tail of the loop. 1598 bool foldTailByMasking() const { 1599 return getTailFoldingStyle() != TailFoldingStyle::None; 1600 } 1601 1602 /// Returns true if the instructions in this block require predication 1603 /// for any reason, e.g. because tail folding now requires a predicate 1604 /// or because the block in the original loop was predicated. 1605 bool blockNeedsPredicationForAnyReason(BasicBlock *BB) const { 1606 return foldTailByMasking() || Legal->blockNeedsPredication(BB); 1607 } 1608 1609 /// A SmallMapVector to store the InLoop reduction op chains, mapping phi 1610 /// nodes to the chain of instructions representing the reductions. Uses a 1611 /// MapVector to ensure deterministic iteration order. 1612 using ReductionChainMap = 1613 SmallMapVector<PHINode *, SmallVector<Instruction *, 4>, 4>; 1614 1615 /// Return the chain of instructions representing an inloop reduction. 1616 const ReductionChainMap &getInLoopReductionChains() const { 1617 return InLoopReductionChains; 1618 } 1619 1620 /// Returns true if the Phi is part of an inloop reduction. 1621 bool isInLoopReduction(PHINode *Phi) const { 1622 return InLoopReductionChains.count(Phi); 1623 } 1624 1625 /// Estimate cost of an intrinsic call instruction CI if it were vectorized 1626 /// with factor VF. Return the cost of the instruction, including 1627 /// scalarization overhead if it's needed. 1628 InstructionCost getVectorIntrinsicCost(CallInst *CI, ElementCount VF) const; 1629 1630 /// Estimate cost of a call instruction CI if it were vectorized with factor 1631 /// VF. Return the cost of the instruction, including scalarization overhead 1632 /// if it's needed. The flag NeedToScalarize shows if the call needs to be 1633 /// scalarized - 1634 /// i.e.
either vector version isn't available, or is too expensive. 1635 InstructionCost getVectorCallCost(CallInst *CI, ElementCount VF, 1636 Function **Variant, 1637 bool *NeedsMask = nullptr) const; 1638 1639 /// Invalidates decisions already taken by the cost model. 1640 void invalidateCostModelingDecisions() { 1641 WideningDecisions.clear(); 1642 Uniforms.clear(); 1643 Scalars.clear(); 1644 } 1645 1646 /// The vectorization cost is a combination of the cost itself and a boolean 1647 /// indicating whether any of the contributing operations will actually 1648 /// operate on vector values after type legalization in the backend. If this 1649 /// latter value is false, then all operations will be scalarized (i.e. no 1650 /// vectorization has actually taken place). 1651 using VectorizationCostTy = std::pair<InstructionCost, bool>; 1652 1653 /// Returns the expected execution cost. The unit of the cost does 1654 /// not matter because we use the 'cost' units to compare different 1655 /// vector widths. The cost that is returned is *not* normalized by 1656 /// the factor width. If \p Invalid is not nullptr, this function 1657 /// will add a pair(Instruction*, ElementCount) to \p Invalid for 1658 /// each instruction that has an Invalid cost for the given VF. 1659 VectorizationCostTy 1660 expectedCost(ElementCount VF, 1661 SmallVectorImpl<InstructionVFPair> *Invalid = nullptr); 1662 1663 bool hasPredStores() const { return NumPredStores > 0; } 1664 1665 /// Returns true if epilogue vectorization is considered profitable, and 1666 /// false otherwise. 1667 /// \p VF is the vectorization factor chosen for the original loop. 1668 bool isEpilogueVectorizationProfitable(const ElementCount VF) const; 1669 1670 private: 1671 unsigned NumPredStores = 0; 1672 1673 /// \return An upper bound for the vectorization factors for both 1674 /// fixed and scalable vectorization, where the minimum-known number of 1675 /// elements is a power-of-2 larger than zero. If scalable vectorization is 1676 /// disabled or unsupported, then the scalable part will be equal to 1677 /// ElementCount::getScalable(0). 1678 FixedScalableVFPair computeFeasibleMaxVF(unsigned ConstTripCount, 1679 ElementCount UserVF, 1680 bool FoldTailByMasking); 1681 1682 /// \return the maximized element count based on the targets vector 1683 /// registers and the loop trip-count, but limited to a maximum safe VF. 1684 /// This is a helper function of computeFeasibleMaxVF. 1685 ElementCount getMaximizedVFForTarget(unsigned ConstTripCount, 1686 unsigned SmallestType, 1687 unsigned WidestType, 1688 ElementCount MaxSafeVF, 1689 bool FoldTailByMasking); 1690 1691 /// \return the maximum legal scalable VF, based on the safe max number 1692 /// of elements. 1693 ElementCount getMaxLegalScalableVF(unsigned MaxSafeElements); 1694 1695 /// Returns the execution time cost of an instruction for a given vector 1696 /// width. Vector width of one means scalar. 1697 VectorizationCostTy getInstructionCost(Instruction *I, ElementCount VF); 1698 1699 /// The cost-computation logic from getInstructionCost which provides 1700 /// the vector type as an output parameter. 1701 InstructionCost getInstructionCost(Instruction *I, ElementCount VF, 1702 Type *&VectorTy); 1703 1704 /// Return the cost of instructions in an inloop reduction pattern, if I is 1705 /// part of that pattern. 
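/// For example (illustrative), an integer add reduction whose in-loop chain
/// is
///   %sum = phi i32 [ 0, %ph ], [ %add, %loop ]
///   %add = add i32 %sum, %val
/// may be costed as a single in-loop reduction step per iteration rather
/// than as a widened add plus a separate horizontal reduction after the loop.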
1706 std::optional<InstructionCost> 1707 getReductionPatternCost(Instruction *I, ElementCount VF, Type *VectorTy, 1708 TTI::TargetCostKind CostKind); 1709 1710 /// Calculate vectorization cost of memory instruction \p I. 1711 InstructionCost getMemoryInstructionCost(Instruction *I, ElementCount VF); 1712 1713 /// The cost computation for scalarized memory instruction. 1714 InstructionCost getMemInstScalarizationCost(Instruction *I, ElementCount VF); 1715 1716 /// The cost computation for interleaving group of memory instructions. 1717 InstructionCost getInterleaveGroupCost(Instruction *I, ElementCount VF); 1718 1719 /// The cost computation for Gather/Scatter instruction. 1720 InstructionCost getGatherScatterCost(Instruction *I, ElementCount VF); 1721 1722 /// The cost computation for widening instruction \p I with consecutive 1723 /// memory access. 1724 InstructionCost getConsecutiveMemOpCost(Instruction *I, ElementCount VF); 1725 1726 /// The cost calculation for Load/Store instruction \p I with uniform pointer - 1727 /// Load: scalar load + broadcast. 1728 /// Store: scalar store + (loop invariant value stored? 0 : extract of last 1729 /// element) 1730 InstructionCost getUniformMemOpCost(Instruction *I, ElementCount VF); 1731 1732 /// Estimate the overhead of scalarizing an instruction. This is a 1733 /// convenience wrapper for the type-based getScalarizationOverhead API. 1734 InstructionCost getScalarizationOverhead(Instruction *I, ElementCount VF, 1735 TTI::TargetCostKind CostKind) const; 1736 1737 /// Returns true if an artificially high cost for emulated masked memrefs 1738 /// should be used. 1739 bool useEmulatedMaskMemRefHack(Instruction *I, ElementCount VF); 1740 1741 /// Map of scalar integer values to the smallest bitwidth they can be legally 1742 /// represented as. The vector equivalents of these values should be truncated 1743 /// to this type. 1744 MapVector<Instruction *, uint64_t> MinBWs; 1745 1746 /// A type representing the costs for instructions if they were to be 1747 /// scalarized rather than vectorized. The entries are Instruction-Cost 1748 /// pairs. 1749 using ScalarCostsTy = DenseMap<Instruction *, InstructionCost>; 1750 1751 /// A set containing all BasicBlocks that are known to present after 1752 /// vectorization as a predicated block. 1753 DenseMap<ElementCount, SmallPtrSet<BasicBlock *, 4>> 1754 PredicatedBBsAfterVectorization; 1755 1756 /// Records whether it is allowed to have the original scalar loop execute at 1757 /// least once. This may be needed as a fallback loop in case runtime 1758 /// aliasing/dependence checks fail, or to handle the tail/remainder 1759 /// iterations when the trip count is unknown or doesn't divide by the VF, 1760 /// or as a peel-loop to handle gaps in interleave-groups. 1761 /// Under optsize and when the trip count is very small we don't allow any 1762 /// iterations to execute in the scalar loop. 1763 ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed; 1764 1765 /// All blocks of loop are to be masked to fold tail of scalar iterations. 1766 bool CanFoldTailByMasking = false; 1767 1768 /// A map holding scalar costs for different vectorization factors. The 1769 /// presence of a cost for an instruction in the mapping indicates that the 1770 /// instruction will be scalarized when vectorizing with the associated 1771 /// vectorization factor. The entries are VF-ScalarCostTy pairs. 
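/// E.g. (hypothetical cost values), an entry { VF=4 -> { %udiv -> 6 } }
/// means the udiv will be replicated as four scalar operations when
/// vectorizing with VF=4, at an estimated total scalar cost of 6.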
1772 DenseMap<ElementCount, ScalarCostsTy> InstsToScalarize; 1773 1774 /// Holds the instructions known to be uniform after vectorization. 1775 /// The data is collected per VF. 1776 DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Uniforms; 1777 1778 /// Holds the instructions known to be scalar after vectorization. 1779 /// The data is collected per VF. 1780 DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Scalars; 1781 1782 /// Holds the instructions (address computations) that are forced to be 1783 /// scalarized. 1784 DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> ForcedScalars; 1785 1786 /// PHINodes of the reductions that should be expanded in-loop along with 1787 /// their associated chains of reduction operations, in program order from top 1788 /// (PHI) to bottom 1789 ReductionChainMap InLoopReductionChains; 1790 1791 /// A Map of inloop reduction operations and their immediate chain operand. 1792 /// FIXME: This can be removed once reductions can be costed correctly in 1793 /// vplan. This was added to allow quick lookup to the inloop operations, 1794 /// without having to loop through InLoopReductionChains. 1795 DenseMap<Instruction *, Instruction *> InLoopReductionImmediateChains; 1796 1797 /// Returns the expected difference in cost from scalarizing the expression 1798 /// feeding a predicated instruction \p PredInst. The instructions to 1799 /// scalarize and their scalar costs are collected in \p ScalarCosts. A 1800 /// non-negative return value implies the expression will be scalarized. 1801 /// Currently, only single-use chains are considered for scalarization. 1802 InstructionCost computePredInstDiscount(Instruction *PredInst, 1803 ScalarCostsTy &ScalarCosts, 1804 ElementCount VF); 1805 1806 /// Collect the instructions that are uniform after vectorization. An 1807 /// instruction is uniform if we represent it with a single scalar value in 1808 /// the vectorized loop corresponding to each vector iteration. Examples of 1809 /// uniform instructions include pointer operands of consecutive or 1810 /// interleaved memory accesses. Note that although uniformity implies an 1811 /// instruction will be scalar, the reverse is not true. In general, a 1812 /// scalarized instruction will be represented by VF scalar values in the 1813 /// vectorized loop, each corresponding to an iteration of the original 1814 /// scalar loop. 1815 void collectLoopUniforms(ElementCount VF); 1816 1817 /// Collect the instructions that are scalar after vectorization. An 1818 /// instruction is scalar if it is known to be uniform or will be scalarized 1819 /// during vectorization. collectLoopScalars should only add non-uniform nodes 1820 /// to the list if they are used by a load/store instruction that is marked as 1821 /// CM_Scalarize. Non-uniform scalarized instructions will be represented by 1822 /// VF values in the vectorized loop, each corresponding to an iteration of 1823 /// the original scalar loop. 1824 void collectLoopScalars(ElementCount VF); 1825 1826 /// Keeps cost model vectorization decision and cost for instructions. 1827 /// Right now it is used for memory instructions only. 1828 using DecisionList = DenseMap<std::pair<Instruction *, ElementCount>, 1829 std::pair<InstWidening, InstructionCost>>; 1830 1831 DecisionList WideningDecisions; 1832 1833 /// Returns true if \p V is expected to be vectorized and it needs to be 1834 /// extracted. 
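/// E.g., if a widened add feeds a store that will be scalarized, each lane
/// of the add's vector result has to be extracted to form the scalar store
/// operands, and that extraction overhead is charged to the scalarized user.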
1835 bool needsExtract(Value *V, ElementCount VF) const { 1836 Instruction *I = dyn_cast<Instruction>(V); 1837 if (VF.isScalar() || !I || !TheLoop->contains(I) || 1838 TheLoop->isLoopInvariant(I)) 1839 return false; 1840 1841 // Assume we can vectorize V (and hence we need extraction) if the 1842 // scalars are not computed yet. This can happen, because it is called 1843 // via getScalarizationOverhead from setCostBasedWideningDecision, before 1844 // the scalars are collected. That should be a safe assumption in most 1845 // cases, because we check if the operands have vectorizable types 1846 // beforehand in LoopVectorizationLegality. 1847 return !Scalars.contains(VF) || !isScalarAfterVectorization(I, VF); 1848 }; 1849 1850 /// Returns a range containing only operands needing to be extracted. 1851 SmallVector<Value *, 4> filterExtractingOperands(Instruction::op_range Ops, 1852 ElementCount VF) const { 1853 return SmallVector<Value *, 4>(make_filter_range( 1854 Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); })); 1855 } 1856 1857 public: 1858 /// The loop that we evaluate. 1859 Loop *TheLoop; 1860 1861 /// Predicated scalar evolution analysis. 1862 PredicatedScalarEvolution &PSE; 1863 1864 /// Loop Info analysis. 1865 LoopInfo *LI; 1866 1867 /// Vectorization legality. 1868 LoopVectorizationLegality *Legal; 1869 1870 /// Vector target information. 1871 const TargetTransformInfo &TTI; 1872 1873 /// Target Library Info. 1874 const TargetLibraryInfo *TLI; 1875 1876 /// Demanded bits analysis. 1877 DemandedBits *DB; 1878 1879 /// Assumption cache. 1880 AssumptionCache *AC; 1881 1882 /// Interface to emit optimization remarks. 1883 OptimizationRemarkEmitter *ORE; 1884 1885 const Function *TheFunction; 1886 1887 /// Loop Vectorize Hint. 1888 const LoopVectorizeHints *Hints; 1889 1890 /// The interleave access information contains groups of interleaved accesses 1891 /// with the same stride and close to each other. 1892 InterleavedAccessInfo &InterleaveInfo; 1893 1894 /// Values to ignore in the cost model. 1895 SmallPtrSet<const Value *, 16> ValuesToIgnore; 1896 1897 /// Values to ignore in the cost model when VF > 1. 1898 SmallPtrSet<const Value *, 16> VecValuesToIgnore; 1899 1900 /// All element types found in the loop. 1901 SmallPtrSet<Type *, 16> ElementTypesInLoop; 1902 }; 1903 } // end namespace llvm 1904 1905 namespace { 1906 /// Helper struct to manage generating runtime checks for vectorization. 1907 /// 1908 /// The runtime checks are created up-front in temporary blocks to allow better 1909 /// estimating the cost and un-linked from the existing IR. After deciding to 1910 /// vectorize, the checks are moved back. If deciding not to vectorize, the 1911 /// temporary blocks are completely removed. 1912 class GeneratedRTChecks { 1913 /// Basic block which contains the generated SCEV checks, if any. 1914 BasicBlock *SCEVCheckBlock = nullptr; 1915 1916 /// The value representing the result of the generated SCEV checks. If it is 1917 /// nullptr, either no SCEV checks have been generated or they have been used. 1918 Value *SCEVCheckCond = nullptr; 1919 1920 /// Basic block which contains the generated memory runtime checks, if any. 1921 BasicBlock *MemCheckBlock = nullptr; 1922 1923 /// The value representing the result of the generated memory runtime checks. 1924 /// If it is nullptr, either no memory runtime checks have been generated or 1925 /// they have been used. 
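/// A typical memory runtime check verifies that accessed ranges such as
/// [A, A + N) and [B, B + N) do not overlap; the individual pairwise checks
/// are combined into this single i1 condition, and if it evaluates to true
/// at runtime the vector loop is bypassed in favor of the scalar loop.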
1926 Value *MemRuntimeCheckCond = nullptr; 1927 1928 DominatorTree *DT; 1929 LoopInfo *LI; 1930 TargetTransformInfo *TTI; 1931 1932 SCEVExpander SCEVExp; 1933 SCEVExpander MemCheckExp; 1934 1935 bool CostTooHigh = false; 1936 1937 public: 1938 GeneratedRTChecks(ScalarEvolution &SE, DominatorTree *DT, LoopInfo *LI, 1939 TargetTransformInfo *TTI, const DataLayout &DL) 1940 : DT(DT), LI(LI), TTI(TTI), SCEVExp(SE, DL, "scev.check"), 1941 MemCheckExp(SE, DL, "scev.check") {} 1942 1943 /// Generate runtime checks in SCEVCheckBlock and MemCheckBlock, so we can 1944 /// accurately estimate the cost of the runtime checks. The blocks are 1945 /// un-linked from the IR and is added back during vector code generation. If 1946 /// there is no vector code generation, the check blocks are removed 1947 /// completely. 1948 void Create(Loop *L, const LoopAccessInfo &LAI, 1949 const SCEVPredicate &UnionPred, ElementCount VF, unsigned IC) { 1950 1951 // Hard cutoff to limit compile-time increase in case a very large number of 1952 // runtime checks needs to be generated. 1953 // TODO: Skip cutoff if the loop is guaranteed to execute, e.g. due to 1954 // profile info. 1955 CostTooHigh = 1956 LAI.getNumRuntimePointerChecks() > VectorizeMemoryCheckThreshold; 1957 if (CostTooHigh) 1958 return; 1959 1960 BasicBlock *LoopHeader = L->getHeader(); 1961 BasicBlock *Preheader = L->getLoopPreheader(); 1962 1963 // Use SplitBlock to create blocks for SCEV & memory runtime checks to 1964 // ensure the blocks are properly added to LoopInfo & DominatorTree. Those 1965 // may be used by SCEVExpander. The blocks will be un-linked from their 1966 // predecessors and removed from LI & DT at the end of the function. 1967 if (!UnionPred.isAlwaysTrue()) { 1968 SCEVCheckBlock = SplitBlock(Preheader, Preheader->getTerminator(), DT, LI, 1969 nullptr, "vector.scevcheck"); 1970 1971 SCEVCheckCond = SCEVExp.expandCodeForPredicate( 1972 &UnionPred, SCEVCheckBlock->getTerminator()); 1973 } 1974 1975 const auto &RtPtrChecking = *LAI.getRuntimePointerChecking(); 1976 if (RtPtrChecking.Need) { 1977 auto *Pred = SCEVCheckBlock ? SCEVCheckBlock : Preheader; 1978 MemCheckBlock = SplitBlock(Pred, Pred->getTerminator(), DT, LI, nullptr, 1979 "vector.memcheck"); 1980 1981 auto DiffChecks = RtPtrChecking.getDiffChecks(); 1982 if (DiffChecks) { 1983 Value *RuntimeVF = nullptr; 1984 MemRuntimeCheckCond = addDiffRuntimeChecks( 1985 MemCheckBlock->getTerminator(), *DiffChecks, MemCheckExp, 1986 [VF, &RuntimeVF](IRBuilderBase &B, unsigned Bits) { 1987 if (!RuntimeVF) 1988 RuntimeVF = getRuntimeVF(B, B.getIntNTy(Bits), VF); 1989 return RuntimeVF; 1990 }, 1991 IC); 1992 } else { 1993 MemRuntimeCheckCond = 1994 addRuntimeChecks(MemCheckBlock->getTerminator(), L, 1995 RtPtrChecking.getChecks(), MemCheckExp); 1996 } 1997 assert(MemRuntimeCheckCond && 1998 "no RT checks generated although RtPtrChecking " 1999 "claimed checks are required"); 2000 } 2001 2002 if (!MemCheckBlock && !SCEVCheckBlock) 2003 return; 2004 2005 // Unhook the temporary block with the checks, update various places 2006 // accordingly. 
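// Concretely: branches to the check blocks are redirected back to the
// preheader, each check block's branch is moved into the preheader and
// replaced by a placeholder 'unreachable' terminator, and the blocks are
// removed from LI and the DominatorTree so the loop header is once again
// immediately dominated by the preheader.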
2007 if (SCEVCheckBlock) 2008 SCEVCheckBlock->replaceAllUsesWith(Preheader); 2009 if (MemCheckBlock) 2010 MemCheckBlock->replaceAllUsesWith(Preheader); 2011 2012 if (SCEVCheckBlock) { 2013 SCEVCheckBlock->getTerminator()->moveBefore(Preheader->getTerminator()); 2014 new UnreachableInst(Preheader->getContext(), SCEVCheckBlock); 2015 Preheader->getTerminator()->eraseFromParent(); 2016 } 2017 if (MemCheckBlock) { 2018 MemCheckBlock->getTerminator()->moveBefore(Preheader->getTerminator()); 2019 new UnreachableInst(Preheader->getContext(), MemCheckBlock); 2020 Preheader->getTerminator()->eraseFromParent(); 2021 } 2022 2023 DT->changeImmediateDominator(LoopHeader, Preheader); 2024 if (MemCheckBlock) { 2025 DT->eraseNode(MemCheckBlock); 2026 LI->removeBlock(MemCheckBlock); 2027 } 2028 if (SCEVCheckBlock) { 2029 DT->eraseNode(SCEVCheckBlock); 2030 LI->removeBlock(SCEVCheckBlock); 2031 } 2032 } 2033 2034 InstructionCost getCost() { 2035 if (SCEVCheckBlock || MemCheckBlock) 2036 LLVM_DEBUG(dbgs() << "Calculating cost of runtime checks:\n"); 2037 2038 if (CostTooHigh) { 2039 InstructionCost Cost; 2040 Cost.setInvalid(); 2041 LLVM_DEBUG(dbgs() << " number of checks exceeded threshold\n"); 2042 return Cost; 2043 } 2044 2045 InstructionCost RTCheckCost = 0; 2046 if (SCEVCheckBlock) 2047 for (Instruction &I : *SCEVCheckBlock) { 2048 if (SCEVCheckBlock->getTerminator() == &I) 2049 continue; 2050 InstructionCost C = 2051 TTI->getInstructionCost(&I, TTI::TCK_RecipThroughput); 2052 LLVM_DEBUG(dbgs() << " " << C << " for " << I << "\n"); 2053 RTCheckCost += C; 2054 } 2055 if (MemCheckBlock) 2056 for (Instruction &I : *MemCheckBlock) { 2057 if (MemCheckBlock->getTerminator() == &I) 2058 continue; 2059 InstructionCost C = 2060 TTI->getInstructionCost(&I, TTI::TCK_RecipThroughput); 2061 LLVM_DEBUG(dbgs() << " " << C << " for " << I << "\n"); 2062 RTCheckCost += C; 2063 } 2064 2065 if (SCEVCheckBlock || MemCheckBlock) 2066 LLVM_DEBUG(dbgs() << "Total cost of runtime checks: " << RTCheckCost 2067 << "\n"); 2068 2069 return RTCheckCost; 2070 } 2071 2072 /// Remove the created SCEV & memory runtime check blocks & instructions, if 2073 /// unused. 2074 ~GeneratedRTChecks() { 2075 SCEVExpanderCleaner SCEVCleaner(SCEVExp); 2076 SCEVExpanderCleaner MemCheckCleaner(MemCheckExp); 2077 if (!SCEVCheckCond) 2078 SCEVCleaner.markResultUsed(); 2079 2080 if (!MemRuntimeCheckCond) 2081 MemCheckCleaner.markResultUsed(); 2082 2083 if (MemRuntimeCheckCond) { 2084 auto &SE = *MemCheckExp.getSE(); 2085 // Memory runtime check generation creates compares that use expanded 2086 // values. Remove them before running the SCEVExpanderCleaners. 2087 for (auto &I : make_early_inc_range(reverse(*MemCheckBlock))) { 2088 if (MemCheckExp.isInsertedInstruction(&I)) 2089 continue; 2090 SE.forgetValue(&I); 2091 I.eraseFromParent(); 2092 } 2093 } 2094 MemCheckCleaner.cleanup(); 2095 SCEVCleaner.cleanup(); 2096 2097 if (SCEVCheckCond) 2098 SCEVCheckBlock->eraseFromParent(); 2099 if (MemRuntimeCheckCond) 2100 MemCheckBlock->eraseFromParent(); 2101 } 2102 2103 /// Adds the generated SCEVCheckBlock before \p LoopVectorPreHeader and 2104 /// adjusts the branches to branch to the vector preheader or \p Bypass, 2105 /// depending on the generated condition. 2106 BasicBlock *emitSCEVChecks(BasicBlock *Bypass, 2107 BasicBlock *LoopVectorPreHeader, 2108 BasicBlock *LoopExitBlock) { 2109 if (!SCEVCheckCond) 2110 return nullptr; 2111 2112 Value *Cond = SCEVCheckCond; 2113 // Mark the check as used, to prevent it from being removed during cleanup. 
2114 SCEVCheckCond = nullptr; 2115 if (auto *C = dyn_cast<ConstantInt>(Cond)) 2116 if (C->isZero()) 2117 return nullptr; 2118 2119 auto *Pred = LoopVectorPreHeader->getSinglePredecessor(); 2120 2121 BranchInst::Create(LoopVectorPreHeader, SCEVCheckBlock); 2122 // Create new preheader for vector loop. 2123 if (auto *PL = LI->getLoopFor(LoopVectorPreHeader)) 2124 PL->addBasicBlockToLoop(SCEVCheckBlock, *LI); 2125 2126 SCEVCheckBlock->getTerminator()->eraseFromParent(); 2127 SCEVCheckBlock->moveBefore(LoopVectorPreHeader); 2128 Pred->getTerminator()->replaceSuccessorWith(LoopVectorPreHeader, 2129 SCEVCheckBlock); 2130 2131 DT->addNewBlock(SCEVCheckBlock, Pred); 2132 DT->changeImmediateDominator(LoopVectorPreHeader, SCEVCheckBlock); 2133 2134 ReplaceInstWithInst(SCEVCheckBlock->getTerminator(), 2135 BranchInst::Create(Bypass, LoopVectorPreHeader, Cond)); 2136 return SCEVCheckBlock; 2137 } 2138 2139 /// Adds the generated MemCheckBlock before \p LoopVectorPreHeader and adjusts 2140 /// the branches to branch to the vector preheader or \p Bypass, depending on 2141 /// the generated condition. 2142 BasicBlock *emitMemRuntimeChecks(BasicBlock *Bypass, 2143 BasicBlock *LoopVectorPreHeader) { 2144 // Check if we generated code that checks in runtime if arrays overlap. 2145 if (!MemRuntimeCheckCond) 2146 return nullptr; 2147 2148 auto *Pred = LoopVectorPreHeader->getSinglePredecessor(); 2149 Pred->getTerminator()->replaceSuccessorWith(LoopVectorPreHeader, 2150 MemCheckBlock); 2151 2152 DT->addNewBlock(MemCheckBlock, Pred); 2153 DT->changeImmediateDominator(LoopVectorPreHeader, MemCheckBlock); 2154 MemCheckBlock->moveBefore(LoopVectorPreHeader); 2155 2156 if (auto *PL = LI->getLoopFor(LoopVectorPreHeader)) 2157 PL->addBasicBlockToLoop(MemCheckBlock, *LI); 2158 2159 ReplaceInstWithInst( 2160 MemCheckBlock->getTerminator(), 2161 BranchInst::Create(Bypass, LoopVectorPreHeader, MemRuntimeCheckCond)); 2162 MemCheckBlock->getTerminator()->setDebugLoc( 2163 Pred->getTerminator()->getDebugLoc()); 2164 2165 // Mark the check as used, to prevent it from being removed during cleanup. 2166 MemRuntimeCheckCond = nullptr; 2167 return MemCheckBlock; 2168 } 2169 }; 2170 } // namespace 2171 2172 static bool useActiveLaneMask(TailFoldingStyle Style) { 2173 return Style == TailFoldingStyle::Data || 2174 Style == TailFoldingStyle::DataAndControlFlow || 2175 Style == TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck; 2176 } 2177 2178 static bool useActiveLaneMaskForControlFlow(TailFoldingStyle Style) { 2179 return Style == TailFoldingStyle::DataAndControlFlow || 2180 Style == TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck; 2181 } 2182 2183 // Return true if \p OuterLp is an outer loop annotated with hints for explicit 2184 // vectorization. The loop needs to be annotated with #pragma omp simd 2185 // simdlen(#) or #pragma clang vectorize(enable) vectorize_width(#). If the 2186 // vector length information is not provided, vectorization is not considered 2187 // explicit. Interleave hints are not allowed either. These limitations will be 2188 // relaxed in the future. 2189 // Please, note that we are currently forced to abuse the pragma 'clang 2190 // vectorize' semantics. This pragma provides *auto-vectorization hints* 2191 // (i.e., LV must check that vectorization is legal) whereas pragma 'omp simd' 2192 // provides *explicit vectorization hints* (LV can bypass legal checks and 2193 // assume that vectorization is legal). 
However, both hints are implemented 2194 // using the same metadata (llvm.loop.vectorize, processed by 2195 // LoopVectorizeHints). This will be fixed in the future when the native IR 2196 // representation for pragma 'omp simd' is introduced. 2197 static bool isExplicitVecOuterLoop(Loop *OuterLp, 2198 OptimizationRemarkEmitter *ORE) { 2199 assert(!OuterLp->isInnermost() && "This is not an outer loop"); 2200 LoopVectorizeHints Hints(OuterLp, true /*DisableInterleaving*/, *ORE); 2201 2202 // Only outer loops with an explicit vectorization hint are supported. 2203 // Unannotated outer loops are ignored. 2204 if (Hints.getForce() == LoopVectorizeHints::FK_Undefined) 2205 return false; 2206 2207 Function *Fn = OuterLp->getHeader()->getParent(); 2208 if (!Hints.allowVectorization(Fn, OuterLp, 2209 true /*VectorizeOnlyWhenForced*/)) { 2210 LLVM_DEBUG(dbgs() << "LV: Loop hints prevent outer loop vectorization.\n"); 2211 return false; 2212 } 2213 2214 if (Hints.getInterleave() > 1) { 2215 // TODO: Interleave support is future work. 2216 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for " 2217 "outer loops.\n"); 2218 Hints.emitRemarkWithHints(); 2219 return false; 2220 } 2221 2222 return true; 2223 } 2224 2225 static void collectSupportedLoops(Loop &L, LoopInfo *LI, 2226 OptimizationRemarkEmitter *ORE, 2227 SmallVectorImpl<Loop *> &V) { 2228 // Collect inner loops and outer loops without irreducible control flow. For 2229 // now, only collect outer loops that have explicit vectorization hints. If we 2230 // are stress testing the VPlan H-CFG construction, we collect the outermost 2231 // loop of every loop nest. 2232 if (L.isInnermost() || VPlanBuildStressTest || 2233 (EnableVPlanNativePath && isExplicitVecOuterLoop(&L, ORE))) { 2234 LoopBlocksRPO RPOT(&L); 2235 RPOT.perform(LI); 2236 if (!containsIrreducibleCFG<const BasicBlock *>(RPOT, *LI)) { 2237 V.push_back(&L); 2238 // TODO: Collect inner loops inside marked outer loops in case 2239 // vectorization fails for the outer loop. Do not invoke 2240 // 'containsIrreducibleCFG' again for inner loops when the outer loop is 2241 // already known to be reducible. We can use an inherited attribute for 2242 // that. 2243 return; 2244 } 2245 } 2246 for (Loop *InnerL : L) 2247 collectSupportedLoops(*InnerL, LI, ORE, V); 2248 } 2249 2250 //===----------------------------------------------------------------------===// 2251 // Implementation of LoopVectorizationLegality, InnerLoopVectorizer and 2252 // LoopVectorizationCostModel and LoopVectorizationPlanner. 2253 //===----------------------------------------------------------------------===// 2254 2255 /// This function adds 2256 /// (StartIdx * Step, (StartIdx + 1) * Step, (StartIdx + 2) * Step, ...) 2257 /// to each vector element of Val. The sequence starts at StartIndex. 2258 /// \p Opcode is relevant for FP induction variable. 2259 static Value *getStepVector(Value *Val, Value *StartIdx, Value *Step, 2260 Instruction::BinaryOps BinOp, ElementCount VF, 2261 IRBuilderBase &Builder) { 2262 assert(VF.isVector() && "only vector VFs are supported"); 2263 2264 // Create and check the types. 
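// (For reference, with illustrative values: Val = <4 x i32> %val,
// StartIdx = 4 and Step = 2, the code below produces %val + <8, 10, 12, 14>,
// i.e. the step vector for the second unrolled part of a VF=4 loop with
// step 2.)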
2265 auto *ValVTy = cast<VectorType>(Val->getType()); 2266 ElementCount VLen = ValVTy->getElementCount(); 2267 2268 Type *STy = Val->getType()->getScalarType(); 2269 assert((STy->isIntegerTy() || STy->isFloatingPointTy()) && 2270 "Induction Step must be an integer or FP"); 2271 assert(Step->getType() == STy && "Step has wrong type"); 2272 2273 SmallVector<Constant *, 8> Indices; 2274 2275 // Create a vector of consecutive numbers from zero to VF. 2276 VectorType *InitVecValVTy = ValVTy; 2277 if (STy->isFloatingPointTy()) { 2278 Type *InitVecValSTy = 2279 IntegerType::get(STy->getContext(), STy->getScalarSizeInBits()); 2280 InitVecValVTy = VectorType::get(InitVecValSTy, VLen); 2281 } 2282 Value *InitVec = Builder.CreateStepVector(InitVecValVTy); 2283 2284 // Splat the StartIdx 2285 Value *StartIdxSplat = Builder.CreateVectorSplat(VLen, StartIdx); 2286 2287 if (STy->isIntegerTy()) { 2288 InitVec = Builder.CreateAdd(InitVec, StartIdxSplat); 2289 Step = Builder.CreateVectorSplat(VLen, Step); 2290 assert(Step->getType() == Val->getType() && "Invalid step vec"); 2291 // FIXME: The newly created binary instructions should contain nsw/nuw 2292 // flags, which can be found from the original scalar operations. 2293 Step = Builder.CreateMul(InitVec, Step); 2294 return Builder.CreateAdd(Val, Step, "induction"); 2295 } 2296 2297 // Floating point induction. 2298 assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) && 2299 "Binary Opcode should be specified for FP induction"); 2300 InitVec = Builder.CreateUIToFP(InitVec, ValVTy); 2301 InitVec = Builder.CreateFAdd(InitVec, StartIdxSplat); 2302 2303 Step = Builder.CreateVectorSplat(VLen, Step); 2304 Value *MulOp = Builder.CreateFMul(InitVec, Step); 2305 return Builder.CreateBinOp(BinOp, Val, MulOp, "induction"); 2306 } 2307 2308 /// Compute scalar induction steps. \p ScalarIV is the scalar induction 2309 /// variable on which to base the steps, \p Step is the size of the step. 2310 static void buildScalarSteps(Value *ScalarIV, Value *Step, 2311 const InductionDescriptor &ID, VPValue *Def, 2312 VPTransformState &State) { 2313 IRBuilderBase &Builder = State.Builder; 2314 2315 // Ensure step has the same type as that of scalar IV. 2316 Type *ScalarIVTy = ScalarIV->getType()->getScalarType(); 2317 if (ScalarIVTy != Step->getType()) { 2318 // TODO: Also use VPDerivedIVRecipe when only the step needs truncating, to 2319 // avoid separate truncate here. 2320 assert(Step->getType()->isIntegerTy() && 2321 "Truncation requires an integer step"); 2322 Step = State.Builder.CreateTrunc(Step, ScalarIVTy); 2323 } 2324 2325 // We build scalar steps for both integer and floating-point induction 2326 // variables. Here, we determine the kind of arithmetic we will perform. 2327 Instruction::BinaryOps AddOp; 2328 Instruction::BinaryOps MulOp; 2329 if (ScalarIVTy->isIntegerTy()) { 2330 AddOp = Instruction::Add; 2331 MulOp = Instruction::Mul; 2332 } else { 2333 AddOp = ID.getInductionOpcode(); 2334 MulOp = Instruction::FMul; 2335 } 2336 2337 // Determine the number of scalars we need to generate for each unroll 2338 // iteration. 2339 bool FirstLaneOnly = vputils::onlyFirstLaneUsed(Def); 2340 // Compute the scalar steps and save the results in State. 
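// E.g., for a fixed VF of 4 and UF of 2 (illustrative), this produces up to
// eight scalar values per definition: lane L of part P receives
//   ScalarIV + (P * 4 + L) * Step
// unless only the first lane is demanded, in which case one value per part
// suffices.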
2341 Type *IntStepTy = IntegerType::get(ScalarIVTy->getContext(), 2342 ScalarIVTy->getScalarSizeInBits()); 2343 Type *VecIVTy = nullptr; 2344 Value *UnitStepVec = nullptr, *SplatStep = nullptr, *SplatIV = nullptr; 2345 if (!FirstLaneOnly && State.VF.isScalable()) { 2346 VecIVTy = VectorType::get(ScalarIVTy, State.VF); 2347 UnitStepVec = 2348 Builder.CreateStepVector(VectorType::get(IntStepTy, State.VF)); 2349 SplatStep = Builder.CreateVectorSplat(State.VF, Step); 2350 SplatIV = Builder.CreateVectorSplat(State.VF, ScalarIV); 2351 } 2352 2353 unsigned StartPart = 0; 2354 unsigned EndPart = State.UF; 2355 unsigned StartLane = 0; 2356 unsigned EndLane = FirstLaneOnly ? 1 : State.VF.getKnownMinValue(); 2357 if (State.Instance) { 2358 StartPart = State.Instance->Part; 2359 EndPart = StartPart + 1; 2360 StartLane = State.Instance->Lane.getKnownLane(); 2361 EndLane = StartLane + 1; 2362 } 2363 for (unsigned Part = StartPart; Part < EndPart; ++Part) { 2364 Value *StartIdx0 = createStepForVF(Builder, IntStepTy, State.VF, Part); 2365 2366 if (!FirstLaneOnly && State.VF.isScalable()) { 2367 auto *SplatStartIdx = Builder.CreateVectorSplat(State.VF, StartIdx0); 2368 auto *InitVec = Builder.CreateAdd(SplatStartIdx, UnitStepVec); 2369 if (ScalarIVTy->isFloatingPointTy()) 2370 InitVec = Builder.CreateSIToFP(InitVec, VecIVTy); 2371 auto *Mul = Builder.CreateBinOp(MulOp, InitVec, SplatStep); 2372 auto *Add = Builder.CreateBinOp(AddOp, SplatIV, Mul); 2373 State.set(Def, Add, Part); 2374 // It's useful to record the lane values too for the known minimum number 2375 // of elements so we do those below. This improves the code quality when 2376 // trying to extract the first element, for example. 2377 } 2378 2379 if (ScalarIVTy->isFloatingPointTy()) 2380 StartIdx0 = Builder.CreateSIToFP(StartIdx0, ScalarIVTy); 2381 2382 for (unsigned Lane = StartLane; Lane < EndLane; ++Lane) { 2383 Value *StartIdx = Builder.CreateBinOp( 2384 AddOp, StartIdx0, getSignedIntOrFpConstant(ScalarIVTy, Lane)); 2385 // The step returned by `createStepForVF` is a runtime-evaluated value 2386 // when VF is scalable. Otherwise, it should be folded into a Constant. 2387 assert((State.VF.isScalable() || isa<Constant>(StartIdx)) && 2388 "Expected StartIdx to be folded to a constant when VF is not " 2389 "scalable"); 2390 auto *Mul = Builder.CreateBinOp(MulOp, StartIdx, Step); 2391 auto *Add = Builder.CreateBinOp(AddOp, ScalarIV, Mul); 2392 State.set(Def, Add, VPIteration(Part, Lane)); 2393 } 2394 } 2395 } 2396 2397 /// Compute the transformed value of Index at offset StartValue using step 2398 /// StepValue. 2399 /// For integer induction, returns StartValue + Index * StepValue. 2400 /// For pointer induction, returns StartValue[Index * StepValue]. 2401 /// FIXME: The newly created binary instructions should contain nsw/nuw 2402 /// flags, which can be found from the original scalar operations. 2403 static Value *emitTransformedIndex(IRBuilderBase &B, Value *Index, 2404 Value *StartValue, Value *Step, 2405 const InductionDescriptor &ID) { 2406 Type *StepTy = Step->getType(); 2407 Value *CastedIndex = StepTy->isIntegerTy() 2408 ? B.CreateSExtOrTrunc(Index, StepTy) 2409 : B.CreateCast(Instruction::SIToFP, Index, StepTy); 2410 if (CastedIndex != Index) { 2411 CastedIndex->setName(CastedIndex->getName() + ".cast"); 2412 Index = CastedIndex; 2413 } 2414 2415 // Note: the IR at this point is broken. We cannot use SE to create any new 2416 // SCEV and then expand it, hoping that SCEV's simplification will give us 2417 // a more optimal code. 
Unfortunately, attempt of doing so on invalid IR may 2418 // lead to various SCEV crashes. So all we can do is to use builder and rely 2419 // on InstCombine for future simplifications. Here we handle some trivial 2420 // cases only. 2421 auto CreateAdd = [&B](Value *X, Value *Y) { 2422 assert(X->getType() == Y->getType() && "Types don't match!"); 2423 if (auto *CX = dyn_cast<ConstantInt>(X)) 2424 if (CX->isZero()) 2425 return Y; 2426 if (auto *CY = dyn_cast<ConstantInt>(Y)) 2427 if (CY->isZero()) 2428 return X; 2429 return B.CreateAdd(X, Y); 2430 }; 2431 2432 // We allow X to be a vector type, in which case Y will potentially be 2433 // splatted into a vector with the same element count. 2434 auto CreateMul = [&B](Value *X, Value *Y) { 2435 assert(X->getType()->getScalarType() == Y->getType() && 2436 "Types don't match!"); 2437 if (auto *CX = dyn_cast<ConstantInt>(X)) 2438 if (CX->isOne()) 2439 return Y; 2440 if (auto *CY = dyn_cast<ConstantInt>(Y)) 2441 if (CY->isOne()) 2442 return X; 2443 VectorType *XVTy = dyn_cast<VectorType>(X->getType()); 2444 if (XVTy && !isa<VectorType>(Y->getType())) 2445 Y = B.CreateVectorSplat(XVTy->getElementCount(), Y); 2446 return B.CreateMul(X, Y); 2447 }; 2448 2449 switch (ID.getKind()) { 2450 case InductionDescriptor::IK_IntInduction: { 2451 assert(!isa<VectorType>(Index->getType()) && 2452 "Vector indices not supported for integer inductions yet"); 2453 assert(Index->getType() == StartValue->getType() && 2454 "Index type does not match StartValue type"); 2455 if (isa<ConstantInt>(Step) && cast<ConstantInt>(Step)->isMinusOne()) 2456 return B.CreateSub(StartValue, Index); 2457 auto *Offset = CreateMul(Index, Step); 2458 return CreateAdd(StartValue, Offset); 2459 } 2460 case InductionDescriptor::IK_PtrInduction: { 2461 return B.CreateGEP(B.getInt8Ty(), StartValue, CreateMul(Index, Step)); 2462 } 2463 case InductionDescriptor::IK_FpInduction: { 2464 assert(!isa<VectorType>(Index->getType()) && 2465 "Vector indices not supported for FP inductions yet"); 2466 assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value"); 2467 auto InductionBinOp = ID.getInductionBinOp(); 2468 assert(InductionBinOp && 2469 (InductionBinOp->getOpcode() == Instruction::FAdd || 2470 InductionBinOp->getOpcode() == Instruction::FSub) && 2471 "Original bin op should be defined for FP induction"); 2472 2473 Value *MulExp = B.CreateFMul(Step, Index); 2474 return B.CreateBinOp(InductionBinOp->getOpcode(), StartValue, MulExp, 2475 "induction"); 2476 } 2477 case InductionDescriptor::IK_NoInduction: 2478 return nullptr; 2479 } 2480 llvm_unreachable("invalid enum"); 2481 } 2482 2483 std::optional<unsigned> getMaxVScale(const Function &F, 2484 const TargetTransformInfo &TTI) { 2485 if (std::optional<unsigned> MaxVScale = TTI.getMaxVScale()) 2486 return MaxVScale; 2487 2488 if (F.hasFnAttribute(Attribute::VScaleRange)) 2489 return F.getFnAttribute(Attribute::VScaleRange).getVScaleRangeMax(); 2490 2491 return std::nullopt; 2492 } 2493 2494 /// For the given VF and UF and maximum trip count computed for the loop, return 2495 /// whether the induction variable might overflow in the vectorized loop. If not, 2496 /// then we know a runtime overflow check always evaluates to false and can be 2497 /// removed. 2498 static bool isIndvarOverflowCheckKnownFalse( 2499 const LoopVectorizationCostModel *Cost, 2500 ElementCount VF, std::optional<unsigned> UF = std::nullopt) { 2501 // Always be conservative if we don't know the exact unroll factor. 2502 unsigned MaxUF = UF ? 
*UF : Cost->TTI.getMaxInterleaveFactor(VF); 2503 2504 Type *IdxTy = Cost->Legal->getWidestInductionType(); 2505 APInt MaxUIntTripCount = cast<IntegerType>(IdxTy)->getMask(); 2506 2507 // We know the runtime overflow check is known false iff the (max) trip-count 2508 // is known and (max) trip-count + (VF * UF) does not overflow in the type of 2509 // the vector loop induction variable. 2510 if (unsigned TC = 2511 Cost->PSE.getSE()->getSmallConstantMaxTripCount(Cost->TheLoop)) { 2512 uint64_t MaxVF = VF.getKnownMinValue(); 2513 if (VF.isScalable()) { 2514 std::optional<unsigned> MaxVScale = 2515 getMaxVScale(*Cost->TheFunction, Cost->TTI); 2516 if (!MaxVScale) 2517 return false; 2518 MaxVF *= *MaxVScale; 2519 } 2520 2521 return (MaxUIntTripCount - TC).ugt(MaxVF * MaxUF); 2522 } 2523 2524 return false; 2525 } 2526 2527 void InnerLoopVectorizer::packScalarIntoVectorValue(VPValue *Def, 2528 const VPIteration &Instance, 2529 VPTransformState &State) { 2530 Value *ScalarInst = State.get(Def, Instance); 2531 Value *VectorValue = State.get(Def, Instance.Part); 2532 VectorValue = Builder.CreateInsertElement( 2533 VectorValue, ScalarInst, 2534 Instance.Lane.getAsRuntimeExpr(State.Builder, VF)); 2535 State.set(Def, VectorValue, Instance.Part); 2536 } 2537 2538 // Return whether we allow using masked interleave-groups (for dealing with 2539 // strided loads/stores that reside in predicated blocks, or for dealing 2540 // with gaps). 2541 static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) { 2542 // If an override option has been passed in for interleaved accesses, use it. 2543 if (EnableMaskedInterleavedMemAccesses.getNumOccurrences() > 0) 2544 return EnableMaskedInterleavedMemAccesses; 2545 2546 return TTI.enableMaskedInterleavedAccessVectorization(); 2547 } 2548 2549 // Try to vectorize the interleave group that \p Instr belongs to. 2550 // 2551 // E.g. Translate following interleaved load group (factor = 3): 2552 // for (i = 0; i < N; i+=3) { 2553 // R = Pic[i]; // Member of index 0 2554 // G = Pic[i+1]; // Member of index 1 2555 // B = Pic[i+2]; // Member of index 2 2556 // ... // do something to R, G, B 2557 // } 2558 // To: 2559 // %wide.vec = load <12 x i32> ; Read 4 tuples of R,G,B 2560 // %R.vec = shuffle %wide.vec, poison, <0, 3, 6, 9> ; R elements 2561 // %G.vec = shuffle %wide.vec, poison, <1, 4, 7, 10> ; G elements 2562 // %B.vec = shuffle %wide.vec, poison, <2, 5, 8, 11> ; B elements 2563 // 2564 // Or translate following interleaved store group (factor = 3): 2565 // for (i = 0; i < N; i+=3) { 2566 // ... do something to R, G, B 2567 // Pic[i] = R; // Member of index 0 2568 // Pic[i+1] = G; // Member of index 1 2569 // Pic[i+2] = B; // Member of index 2 2570 // } 2571 // To: 2572 // %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7> 2573 // %B_U.vec = shuffle %B.vec, poison, <0, 1, 2, 3, u, u, u, u> 2574 // %interleaved.vec = shuffle %R_G.vec, %B_U.vec, 2575 // <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11> ; Interleave R,G,B elements 2576 // store <12 x i32> %interleaved.vec ; Write 4 tuples of R,G,B 2577 void InnerLoopVectorizer::vectorizeInterleaveGroup( 2578 const InterleaveGroup<Instruction> *Group, ArrayRef<VPValue *> VPDefs, 2579 VPTransformState &State, VPValue *Addr, ArrayRef<VPValue *> StoredValues, 2580 VPValue *BlockInMask, bool NeedsMaskForGaps) { 2581 Instruction *Instr = Group->getInsertPos(); 2582 const DataLayout &DL = Instr->getModule()->getDataLayout(); 2583 2584 // Prepare for the vector type of the interleaved load/store. 
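// E.g., a factor-3 group of i32 accesses vectorized with VF = 4 uses a
// single wide <12 x i32> vector per unroll part (illustrative types).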
2585 Type *ScalarTy = getLoadStoreType(Instr); 2586 unsigned InterleaveFactor = Group->getFactor(); 2587 auto *VecTy = VectorType::get(ScalarTy, VF * InterleaveFactor); 2588 2589 // Prepare for the new pointers. 2590 SmallVector<Value *, 2> AddrParts; 2591 unsigned Index = Group->getIndex(Instr); 2592 2593 // TODO: extend the masked interleaved-group support to reversed access. 2594 assert((!BlockInMask || !Group->isReverse()) && 2595 "Reversed masked interleave-group not supported."); 2596 2597 Value *Idx; 2598 // If the group is reverse, adjust the index to refer to the last vector lane 2599 // instead of the first. We adjust the index from the first vector lane, 2600 // rather than directly getting the pointer for lane VF - 1, because the 2601 // pointer operand of the interleaved access is supposed to be uniform. For 2602 // uniform instructions, we're only required to generate a value for the 2603 // first vector lane in each unroll iteration. 2604 if (Group->isReverse()) { 2605 Value *RuntimeVF = getRuntimeVF(Builder, Builder.getInt32Ty(), VF); 2606 Idx = Builder.CreateSub(RuntimeVF, Builder.getInt32(1)); 2607 Idx = Builder.CreateMul(Idx, Builder.getInt32(Group->getFactor())); 2608 Idx = Builder.CreateAdd(Idx, Builder.getInt32(Index)); 2609 Idx = Builder.CreateNeg(Idx); 2610 } else 2611 Idx = Builder.getInt32(-Index); 2612 2613 for (unsigned Part = 0; Part < UF; Part++) { 2614 Value *AddrPart = State.get(Addr, VPIteration(Part, 0)); 2615 State.setDebugLocFromInst(AddrPart); 2616 2617 // Notice current instruction could be any index. Need to adjust the address 2618 // to the member of index 0. 2619 // 2620 // E.g. a = A[i+1]; // Member of index 1 (Current instruction) 2621 // b = A[i]; // Member of index 0 2622 // Current pointer is pointed to A[i+1], adjust it to A[i]. 2623 // 2624 // E.g. A[i+1] = a; // Member of index 1 2625 // A[i] = b; // Member of index 0 2626 // A[i+2] = c; // Member of index 2 (Current instruction) 2627 // Current pointer is pointed to A[i+2], adjust it to A[i]. 2628 2629 bool InBounds = false; 2630 if (auto *gep = dyn_cast<GetElementPtrInst>(AddrPart->stripPointerCasts())) 2631 InBounds = gep->isInBounds(); 2632 AddrPart = Builder.CreateGEP(ScalarTy, AddrPart, Idx, "", InBounds); 2633 2634 // Cast to the vector pointer type. 
2635 unsigned AddressSpace = AddrPart->getType()->getPointerAddressSpace(); 2636 Type *PtrTy = VecTy->getPointerTo(AddressSpace); 2637 AddrParts.push_back(Builder.CreateBitCast(AddrPart, PtrTy)); 2638 } 2639 2640 State.setDebugLocFromInst(Instr); 2641 Value *PoisonVec = PoisonValue::get(VecTy); 2642 2643 auto CreateGroupMask = [this, &BlockInMask, &State, &InterleaveFactor]( 2644 unsigned Part, Value *MaskForGaps) -> Value * { 2645 if (VF.isScalable()) { 2646 assert(!MaskForGaps && "Interleaved groups with gaps are not supported."); 2647 assert(InterleaveFactor == 2 && 2648 "Unsupported deinterleave factor for scalable vectors"); 2649 auto *BlockInMaskPart = State.get(BlockInMask, Part); 2650 SmallVector<Value *, 2> Ops = {BlockInMaskPart, BlockInMaskPart}; 2651 auto *MaskTy = 2652 VectorType::get(Builder.getInt1Ty(), VF.getKnownMinValue() * 2, true); 2653 return Builder.CreateIntrinsic( 2654 MaskTy, Intrinsic::experimental_vector_interleave2, Ops, 2655 /*FMFSource=*/nullptr, "interleaved.mask"); 2656 } 2657 2658 if (!BlockInMask) 2659 return MaskForGaps; 2660 2661 Value *BlockInMaskPart = State.get(BlockInMask, Part); 2662 Value *ShuffledMask = Builder.CreateShuffleVector( 2663 BlockInMaskPart, 2664 createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()), 2665 "interleaved.mask"); 2666 return MaskForGaps ? Builder.CreateBinOp(Instruction::And, ShuffledMask, 2667 MaskForGaps) 2668 : ShuffledMask; 2669 }; 2670 2671 // Vectorize the interleaved load group. 2672 if (isa<LoadInst>(Instr)) { 2673 Value *MaskForGaps = nullptr; 2674 if (NeedsMaskForGaps) { 2675 MaskForGaps = 2676 createBitMaskForGaps(Builder, VF.getKnownMinValue(), *Group); 2677 assert(MaskForGaps && "Mask for Gaps is required but it is null"); 2678 } 2679 2680 // For each unroll part, create a wide load for the group. 2681 SmallVector<Value *, 2> NewLoads; 2682 for (unsigned Part = 0; Part < UF; Part++) { 2683 Instruction *NewLoad; 2684 if (BlockInMask || MaskForGaps) { 2685 assert(useMaskedInterleavedAccesses(*TTI) && 2686 "masked interleaved groups are not allowed."); 2687 Value *GroupMask = CreateGroupMask(Part, MaskForGaps); 2688 NewLoad = 2689 Builder.CreateMaskedLoad(VecTy, AddrParts[Part], Group->getAlign(), 2690 GroupMask, PoisonVec, "wide.masked.vec"); 2691 } 2692 else 2693 NewLoad = Builder.CreateAlignedLoad(VecTy, AddrParts[Part], 2694 Group->getAlign(), "wide.vec"); 2695 Group->addMetadata(NewLoad); 2696 NewLoads.push_back(NewLoad); 2697 } 2698 2699 if (VecTy->isScalableTy()) { 2700 assert(InterleaveFactor == 2 && 2701 "Unsupported deinterleave factor for scalable vectors"); 2702 2703 for (unsigned Part = 0; Part < UF; ++Part) { 2704 // Scalable vectors cannot use arbitrary shufflevectors (only splats), 2705 // so must use intrinsics to deinterleave. 2706 Value *DI = Builder.CreateIntrinsic( 2707 Intrinsic::experimental_vector_deinterleave2, VecTy, NewLoads[Part], 2708 /*FMFSource=*/nullptr, "strided.vec"); 2709 unsigned J = 0; 2710 for (unsigned I = 0; I < InterleaveFactor; ++I) { 2711 Instruction *Member = Group->getMember(I); 2712 2713 if (!Member) 2714 continue; 2715 2716 Value *StridedVec = Builder.CreateExtractValue(DI, I); 2717 // If this member has different type, cast the result type. 
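// E.g., if the group's wide load was created with element type i32 but
// this member loads float, the extracted lanes are bit- or pointer-cast to
// <VF x float> here (illustrative member types).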
2718 if (Member->getType() != ScalarTy) { 2719 VectorType *OtherVTy = VectorType::get(Member->getType(), VF); 2720 StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL); 2721 } 2722 2723 if (Group->isReverse()) 2724 StridedVec = Builder.CreateVectorReverse(StridedVec, "reverse"); 2725 2726 State.set(VPDefs[J], StridedVec, Part); 2727 ++J; 2728 } 2729 } 2730 2731 return; 2732 } 2733 2734 // For each member in the group, shuffle out the appropriate data from the 2735 // wide loads. 2736 unsigned J = 0; 2737 for (unsigned I = 0; I < InterleaveFactor; ++I) { 2738 Instruction *Member = Group->getMember(I); 2739 2740 // Skip the gaps in the group. 2741 if (!Member) 2742 continue; 2743 2744 auto StrideMask = 2745 createStrideMask(I, InterleaveFactor, VF.getKnownMinValue()); 2746 for (unsigned Part = 0; Part < UF; Part++) { 2747 Value *StridedVec = Builder.CreateShuffleVector( 2748 NewLoads[Part], StrideMask, "strided.vec"); 2749 2750 // If this member has different type, cast the result type. 2751 if (Member->getType() != ScalarTy) { 2752 assert(!VF.isScalable() && "VF is assumed to be non scalable."); 2753 VectorType *OtherVTy = VectorType::get(Member->getType(), VF); 2754 StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL); 2755 } 2756 2757 if (Group->isReverse()) 2758 StridedVec = Builder.CreateVectorReverse(StridedVec, "reverse"); 2759 2760 State.set(VPDefs[J], StridedVec, Part); 2761 } 2762 ++J; 2763 } 2764 return; 2765 } 2766 2767 // The sub vector type for current instruction. 2768 auto *SubVT = VectorType::get(ScalarTy, VF); 2769 2770 // Vectorize the interleaved store group. 2771 Value *MaskForGaps = 2772 createBitMaskForGaps(Builder, VF.getKnownMinValue(), *Group); 2773 assert((!MaskForGaps || useMaskedInterleavedAccesses(*TTI)) && 2774 "masked interleaved groups are not allowed."); 2775 assert((!MaskForGaps || !VF.isScalable()) && 2776 "masking gaps for scalable vectors is not yet supported."); 2777 for (unsigned Part = 0; Part < UF; Part++) { 2778 // Collect the stored vector from each member. 2779 SmallVector<Value *, 4> StoredVecs; 2780 unsigned StoredIdx = 0; 2781 for (unsigned i = 0; i < InterleaveFactor; i++) { 2782 assert((Group->getMember(i) || MaskForGaps) && 2783 "Fail to get a member from an interleaved store group"); 2784 Instruction *Member = Group->getMember(i); 2785 2786 // Skip the gaps in the group. 2787 if (!Member) { 2788 Value *Undef = PoisonValue::get(SubVT); 2789 StoredVecs.push_back(Undef); 2790 continue; 2791 } 2792 2793 Value *StoredVec = State.get(StoredValues[StoredIdx], Part); 2794 ++StoredIdx; 2795 2796 if (Group->isReverse()) 2797 StoredVec = Builder.CreateVectorReverse(StoredVec, "reverse"); 2798 2799 // If this member has different type, cast it to a unified type. 2800 2801 if (StoredVec->getType() != SubVT) 2802 StoredVec = createBitOrPointerCast(StoredVec, SubVT, DL); 2803 2804 StoredVecs.push_back(StoredVec); 2805 } 2806 2807 // Interleave all the smaller vectors into one wider vector. 
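// E.g. (an illustrative sketch, not tied to a particular test) for a group
// with factor 3 and VF = 4, the member vectors
//   {a0 a1 a2 a3}, {b0 b1 b2 b3}, {c0 c1 c2 c3}
// are combined into the single wide vector
//   {a0 b0 c0 a1 b1 c1 a2 b2 c2 a3 b3 c3}
// which is then written back with one (possibly masked) wide store below.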
2808 Value *IVec = interleaveVectors(Builder, StoredVecs, "interleaved.vec"); 2809 Instruction *NewStoreInstr; 2810 if (BlockInMask || MaskForGaps) { 2811 Value *GroupMask = CreateGroupMask(Part, MaskForGaps); 2812 NewStoreInstr = Builder.CreateMaskedStore(IVec, AddrParts[Part], 2813 Group->getAlign(), GroupMask); 2814 } else 2815 NewStoreInstr = 2816 Builder.CreateAlignedStore(IVec, AddrParts[Part], Group->getAlign()); 2817 2818 Group->addMetadata(NewStoreInstr); 2819 } 2820 } 2821 2822 void InnerLoopVectorizer::scalarizeInstruction(const Instruction *Instr, 2823 VPReplicateRecipe *RepRecipe, 2824 const VPIteration &Instance, 2825 VPTransformState &State) { 2826 assert(!Instr->getType()->isAggregateType() && "Can't handle vectors"); 2827 2828 // llvm.experimental.noalias.scope.decl intrinsics must only be duplicated for 2829 // the first lane and part. 2830 if (isa<NoAliasScopeDeclInst>(Instr)) 2831 if (!Instance.isFirstIteration()) 2832 return; 2833 2834 // Does this instruction return a value? 2835 bool IsVoidRetTy = Instr->getType()->isVoidTy(); 2836 2837 Instruction *Cloned = Instr->clone(); 2838 if (!IsVoidRetTy) 2839 Cloned->setName(Instr->getName() + ".cloned"); 2840 2841 RepRecipe->setFlags(Cloned); 2842 2843 if (Instr->getDebugLoc()) 2844 State.setDebugLocFromInst(Instr); 2845 2846 // Replace the operands of the cloned instruction with their scalar 2847 // equivalents in the new loop. 2848 for (const auto &I : enumerate(RepRecipe->operands())) { 2849 auto InputInstance = Instance; 2850 VPValue *Operand = I.value(); 2851 if (vputils::isUniformAfterVectorization(Operand)) 2852 InputInstance.Lane = VPLane::getFirstLane(); 2853 Cloned->setOperand(I.index(), State.get(Operand, InputInstance)); 2854 } 2855 State.addNewMetadata(Cloned, Instr); 2856 2857 // Place the cloned scalar in the new loop. 2858 State.Builder.Insert(Cloned); 2859 2860 State.set(RepRecipe, Cloned, Instance); 2861 2862 // If we just cloned a new assumption, add it to the assumption cache. 2863 if (auto *II = dyn_cast<AssumeInst>(Cloned)) 2864 AC->registerAssumption(II); 2865 2866 // End if-block. 2867 bool IfPredicateInstr = RepRecipe->getParent()->getParent()->isReplicator(); 2868 if (IfPredicateInstr) 2869 PredicatedInstructions.push_back(Cloned); 2870 } 2871 2872 Value * 2873 InnerLoopVectorizer::getOrCreateVectorTripCount(BasicBlock *InsertBlock) { 2874 if (VectorTripCount) 2875 return VectorTripCount; 2876 2877 Value *TC = getTripCount(); 2878 IRBuilder<> Builder(InsertBlock->getTerminator()); 2879 2880 Type *Ty = TC->getType(); 2881 // This is where we can make the step a runtime constant. 2882 Value *Step = createStepForVF(Builder, Ty, VF, UF); 2883 2884 // If the tail is to be folded by masking, round the number of iterations N 2885 // up to a multiple of Step instead of rounding down. This is done by first 2886 // adding Step-1 and then rounding down. Note that it's ok if this addition 2887 // overflows: the vector induction variable will eventually wrap to zero given 2888 // that it starts at zero and its Step is a power of two; the loop will then 2889 // exit, with the last early-exit vector comparison also producing all-true. 2890 // For scalable vectors the VF is not guaranteed to be a power of 2, but this 2891 // is accounted for in emitIterationCountCheck that adds an overflow check.
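// As a purely illustrative example: with N = 10 and Step = VF * UF = 8, the
// rounded-up count is 10 + 7 = 17, the remainder computed below is
// 17 % 8 = 1, and the vector trip count becomes 17 - 1 = 16, i.e. two masked
// vector iterations cover all 10 scalar iterations.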
2892 if (Cost->foldTailByMasking()) { 2893 assert(isPowerOf2_32(VF.getKnownMinValue() * UF) && 2894 "VF*UF must be a power of 2 when folding tail by masking"); 2895 Value *NumLanes = getRuntimeVF(Builder, Ty, VF * UF); 2896 TC = Builder.CreateAdd( 2897 TC, Builder.CreateSub(NumLanes, ConstantInt::get(Ty, 1)), "n.rnd.up"); 2898 } 2899 2900 // Now we need to generate the expression for the part of the loop that the 2901 // vectorized body will execute. This is equal to N - (N % Step) if scalar 2902 // iterations are not required for correctness, or N - Step, otherwise. Step 2903 // is equal to the vectorization factor (number of SIMD elements) times the 2904 // unroll factor (number of SIMD instructions). 2905 Value *R = Builder.CreateURem(TC, Step, "n.mod.vf"); 2906 2907 // There are cases where we *must* run at least one iteration in the remainder 2908 // loop. See the cost model for when this can happen. If the step evenly 2909 // divides the trip count, we set the remainder to be equal to the step. If 2910 // the step does not evenly divide the trip count, no adjustment is necessary 2911 // since there will already be scalar iterations. Note that the minimum 2912 // iterations check ensures that N >= Step. 2913 if (Cost->requiresScalarEpilogue(VF.isVector())) { 2914 auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0)); 2915 R = Builder.CreateSelect(IsZero, Step, R); 2916 } 2917 2918 VectorTripCount = Builder.CreateSub(TC, R, "n.vec"); 2919 2920 return VectorTripCount; 2921 } 2922 2923 Value *InnerLoopVectorizer::createBitOrPointerCast(Value *V, VectorType *DstVTy, 2924 const DataLayout &DL) { 2925 // Verify that V is a vector type with same number of elements as DstVTy. 2926 auto *DstFVTy = cast<VectorType>(DstVTy); 2927 auto VF = DstFVTy->getElementCount(); 2928 auto *SrcVecTy = cast<VectorType>(V->getType()); 2929 assert(VF == SrcVecTy->getElementCount() && "Vector dimensions do not match"); 2930 Type *SrcElemTy = SrcVecTy->getElementType(); 2931 Type *DstElemTy = DstFVTy->getElementType(); 2932 assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) && 2933 "Vector elements must have same size"); 2934 2935 // Do a direct cast if element types are castable. 2936 if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) { 2937 return Builder.CreateBitOrPointerCast(V, DstFVTy); 2938 } 2939 // V cannot be directly casted to desired vector type. 2940 // May happen when V is a floating point vector but DstVTy is a vector of 2941 // pointers or vice-versa. Handle this using a two-step bitcast using an 2942 // intermediate Integer type for the bitcast i.e. Ptr <-> Int <-> Float. 2943 assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) && 2944 "Only one type should be a pointer type"); 2945 assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) && 2946 "Only one type should be a floating point type"); 2947 Type *IntTy = 2948 IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy)); 2949 auto *VecIntTy = VectorType::get(IntTy, VF); 2950 Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy); 2951 return Builder.CreateBitOrPointerCast(CastVal, DstFVTy); 2952 } 2953 2954 void InnerLoopVectorizer::emitIterationCountCheck(BasicBlock *Bypass) { 2955 Value *Count = getTripCount(); 2956 // Reuse existing vector loop preheader for TC checks. 2957 // Note that new preheader block is generated for vector loop. 
2958 BasicBlock *const TCCheckBlock = LoopVectorPreHeader; 2959 IRBuilder<> Builder(TCCheckBlock->getTerminator()); 2960 2961 // Generate code to check if the loop's trip count is less than VF * UF, or 2962 // equal to it in case a scalar epilogue is required; this implies that the 2963 // vector trip count is zero. This check also covers the case where adding one 2964 // to the backedge-taken count overflowed leading to an incorrect trip count 2965 // of zero. In this case we will also jump to the scalar loop. 2966 auto P = Cost->requiresScalarEpilogue(VF.isVector()) ? ICmpInst::ICMP_ULE 2967 : ICmpInst::ICMP_ULT; 2968 2969 // If tail is to be folded, vector loop takes care of all iterations. 2970 Type *CountTy = Count->getType(); 2971 Value *CheckMinIters = Builder.getFalse(); 2972 auto CreateStep = [&]() -> Value * { 2973 // Create step with max(MinProTripCount, UF * VF). 2974 if (UF * VF.getKnownMinValue() >= MinProfitableTripCount.getKnownMinValue()) 2975 return createStepForVF(Builder, CountTy, VF, UF); 2976 2977 Value *MinProfTC = 2978 createStepForVF(Builder, CountTy, MinProfitableTripCount, 1); 2979 if (!VF.isScalable()) 2980 return MinProfTC; 2981 return Builder.CreateBinaryIntrinsic( 2982 Intrinsic::umax, MinProfTC, createStepForVF(Builder, CountTy, VF, UF)); 2983 }; 2984 2985 TailFoldingStyle Style = Cost->getTailFoldingStyle(); 2986 if (Style == TailFoldingStyle::None) 2987 CheckMinIters = 2988 Builder.CreateICmp(P, Count, CreateStep(), "min.iters.check"); 2989 else if (VF.isScalable() && 2990 !isIndvarOverflowCheckKnownFalse(Cost, VF, UF) && 2991 Style != TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck) { 2992 // vscale is not necessarily a power-of-2, which means we cannot guarantee 2993 // an overflow to zero when updating induction variables and so an 2994 // additional overflow check is required before entering the vector loop. 2995 2996 // Get the maximum unsigned value for the type. 2997 Value *MaxUIntTripCount = 2998 ConstantInt::get(CountTy, cast<IntegerType>(CountTy)->getMask()); 2999 Value *LHS = Builder.CreateSub(MaxUIntTripCount, Count); 3000 3001 // Don't execute the vector loop if (UMax - n) < (VF * UF). 3002 CheckMinIters = Builder.CreateICmp(ICmpInst::ICMP_ULT, LHS, CreateStep()); 3003 } 3004 3005 // Create new preheader for vector loop. 3006 LoopVectorPreHeader = 3007 SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), DT, LI, nullptr, 3008 "vector.ph"); 3009 3010 assert(DT->properlyDominates(DT->getNode(TCCheckBlock), 3011 DT->getNode(Bypass)->getIDom()) && 3012 "TC check is expected to dominate Bypass"); 3013 3014 // Update dominator for Bypass & LoopExit (if needed). 3015 DT->changeImmediateDominator(Bypass, TCCheckBlock); 3016 if (!Cost->requiresScalarEpilogue(VF.isVector())) 3017 // If there is an epilogue which must run, there's no edge from the 3018 // middle block to exit blocks and thus no need to update the immediate 3019 // dominator of the exit blocks. 
3020 DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock); 3021 3022 ReplaceInstWithInst( 3023 TCCheckBlock->getTerminator(), 3024 BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters)); 3025 LoopBypassBlocks.push_back(TCCheckBlock); 3026 } 3027 3028 BasicBlock *InnerLoopVectorizer::emitSCEVChecks(BasicBlock *Bypass) { 3029 BasicBlock *const SCEVCheckBlock = 3030 RTChecks.emitSCEVChecks(Bypass, LoopVectorPreHeader, LoopExitBlock); 3031 if (!SCEVCheckBlock) 3032 return nullptr; 3033 3034 assert(!(SCEVCheckBlock->getParent()->hasOptSize() || 3035 (OptForSizeBasedOnProfile && 3036 Cost->Hints->getForce() != LoopVectorizeHints::FK_Enabled)) && 3037 "Cannot SCEV check stride or overflow when optimizing for size"); 3038 3039 3040 // Update dominator only if this is first RT check. 3041 if (LoopBypassBlocks.empty()) { 3042 DT->changeImmediateDominator(Bypass, SCEVCheckBlock); 3043 if (!Cost->requiresScalarEpilogue(VF.isVector())) 3044 // If there is an epilogue which must run, there's no edge from the 3045 // middle block to exit blocks and thus no need to update the immediate 3046 // dominator of the exit blocks. 3047 DT->changeImmediateDominator(LoopExitBlock, SCEVCheckBlock); 3048 } 3049 3050 LoopBypassBlocks.push_back(SCEVCheckBlock); 3051 AddedSafetyChecks = true; 3052 return SCEVCheckBlock; 3053 } 3054 3055 BasicBlock *InnerLoopVectorizer::emitMemRuntimeChecks(BasicBlock *Bypass) { 3056 // VPlan-native path does not do any analysis for runtime checks currently. 3057 if (EnableVPlanNativePath) 3058 return nullptr; 3059 3060 BasicBlock *const MemCheckBlock = 3061 RTChecks.emitMemRuntimeChecks(Bypass, LoopVectorPreHeader); 3062 3063 // Check if we generated code that checks in runtime if arrays overlap. We put 3064 // the checks into a separate block to make the more common case of few 3065 // elements faster. 3066 if (!MemCheckBlock) 3067 return nullptr; 3068 3069 if (MemCheckBlock->getParent()->hasOptSize() || OptForSizeBasedOnProfile) { 3070 assert(Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled && 3071 "Cannot emit memory checks when optimizing for size, unless forced " 3072 "to vectorize."); 3073 ORE->emit([&]() { 3074 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationCodeSize", 3075 OrigLoop->getStartLoc(), 3076 OrigLoop->getHeader()) 3077 << "Code-size may be reduced by not forcing " 3078 "vectorization, or by source-code modifications " 3079 "eliminating the need for runtime checks " 3080 "(e.g., adding 'restrict')."; 3081 }); 3082 } 3083 3084 LoopBypassBlocks.push_back(MemCheckBlock); 3085 3086 AddedSafetyChecks = true; 3087 3088 return MemCheckBlock; 3089 } 3090 3091 void InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) { 3092 LoopScalarBody = OrigLoop->getHeader(); 3093 LoopVectorPreHeader = OrigLoop->getLoopPreheader(); 3094 assert(LoopVectorPreHeader && "Invalid loop structure"); 3095 LoopExitBlock = OrigLoop->getUniqueExitBlock(); // may be nullptr 3096 assert((LoopExitBlock || Cost->requiresScalarEpilogue(VF.isVector())) && 3097 "multiple exit loop without required epilogue?"); 3098 3099 LoopMiddleBlock = 3100 SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT, 3101 LI, nullptr, Twine(Prefix) + "middle.block"); 3102 LoopScalarPreHeader = 3103 SplitBlock(LoopMiddleBlock, LoopMiddleBlock->getTerminator(), DT, LI, 3104 nullptr, Twine(Prefix) + "scalar.ph"); 3105 3106 auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator(); 3107 3108 // Set up the middle block terminator. 
Two cases: 3109 // 1) If we know that we must execute the scalar epilogue, emit an 3110 // unconditional branch. 3111 // 2) Otherwise, we must have a single unique exit block (due to how we 3112 // implement the multiple exit case). In this case, set up a conditional 3113 // branch from the middle block to the loop scalar preheader, and the 3114 // exit block. completeLoopSkeleton will update the condition to use an 3115 // iteration check, if required, to decide whether to execute the remainder. 3116 BranchInst *BrInst = 3117 Cost->requiresScalarEpilogue(VF.isVector()) 3118 ? BranchInst::Create(LoopScalarPreHeader) 3119 : BranchInst::Create(LoopExitBlock, LoopScalarPreHeader, 3120 Builder.getTrue()); 3121 BrInst->setDebugLoc(ScalarLatchTerm->getDebugLoc()); 3122 ReplaceInstWithInst(LoopMiddleBlock->getTerminator(), BrInst); 3123 3124 // Update dominator for loop exit. During skeleton creation, only the vector 3125 // pre-header and the middle block are created. The vector loop is entirely 3126 // created during VPlan execution. 3127 if (!Cost->requiresScalarEpilogue(VF.isVector())) 3128 // If there is an epilogue which must run, there's no edge from the 3129 // middle block to exit blocks and thus no need to update the immediate 3130 // dominator of the exit blocks. 3131 DT->changeImmediateDominator(LoopExitBlock, LoopMiddleBlock); 3132 } 3133 3134 PHINode *InnerLoopVectorizer::createInductionResumeValue( 3135 PHINode *OrigPhi, const InductionDescriptor &II, Value *Step, 3136 ArrayRef<BasicBlock *> BypassBlocks, 3137 std::pair<BasicBlock *, Value *> AdditionalBypass) { 3138 Value *VectorTripCount = getOrCreateVectorTripCount(LoopVectorPreHeader); 3139 assert(VectorTripCount && "Expected valid arguments"); 3140 3141 Instruction *OldInduction = Legal->getPrimaryInduction(); 3142 Value *&EndValue = IVEndValues[OrigPhi]; 3143 Value *EndValueFromAdditionalBypass = AdditionalBypass.second; 3144 if (OrigPhi == OldInduction) { 3145 // We know what the end value is. 3146 EndValue = VectorTripCount; 3147 } else { 3148 IRBuilder<> B(LoopVectorPreHeader->getTerminator()); 3149 3150 // Fast-math-flags propagate from the original induction instruction. 3151 if (II.getInductionBinOp() && isa<FPMathOperator>(II.getInductionBinOp())) 3152 B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags()); 3153 3154 EndValue = 3155 emitTransformedIndex(B, VectorTripCount, II.getStartValue(), Step, II); 3156 EndValue->setName("ind.end"); 3157 3158 // Compute the end value for the additional bypass (if applicable). 3159 if (AdditionalBypass.first) { 3160 B.SetInsertPoint(&(*AdditionalBypass.first->getFirstInsertionPt())); 3161 EndValueFromAdditionalBypass = emitTransformedIndex( 3162 B, AdditionalBypass.second, II.getStartValue(), Step, II); 3163 EndValueFromAdditionalBypass->setName("ind.end"); 3164 } 3165 } 3166 3167 // Create phi nodes to merge from the backedge-taken check block. 3168 PHINode *BCResumeVal = PHINode::Create(OrigPhi->getType(), 3, "bc.resume.val", 3169 LoopScalarPreHeader->getTerminator()); 3170 // Copy original phi DL over to the new one. 3171 BCResumeVal->setDebugLoc(OrigPhi->getDebugLoc()); 3172 3173 // The new PHI merges the original incoming value, in case of a bypass, 3174 // or the value at the end of the vectorized loop. 3175 BCResumeVal->addIncoming(EndValue, LoopMiddleBlock); 3176 3177 // Fix the scalar body counter (PHI node). 3178 // The old induction's phi node in the scalar body needs the truncated 3179 // value.
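// The resulting phi typically has a shape like the following (an
// illustrative sketch; the incoming bypass blocks depend on which runtime
// checks were emitted, and %end.val / %start.val stand for the values
// computed above):
//   %bc.resume.val = phi i64 [ %end.val, %middle.block ],
//                            [ %start.val, %bypass.block ], ...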
3180 for (BasicBlock *BB : BypassBlocks) 3181 BCResumeVal->addIncoming(II.getStartValue(), BB); 3182 3183 if (AdditionalBypass.first) 3184 BCResumeVal->setIncomingValueForBlock(AdditionalBypass.first, 3185 EndValueFromAdditionalBypass); 3186 return BCResumeVal; 3187 } 3188 3189 /// Return the expanded step for \p ID using \p ExpandedSCEVs to look up SCEV 3190 /// expansion results. 3191 static Value *getExpandedStep(const InductionDescriptor &ID, 3192 const SCEV2ValueTy &ExpandedSCEVs) { 3193 const SCEV *Step = ID.getStep(); 3194 if (auto *C = dyn_cast<SCEVConstant>(Step)) 3195 return C->getValue(); 3196 if (auto *U = dyn_cast<SCEVUnknown>(Step)) 3197 return U->getValue(); 3198 auto I = ExpandedSCEVs.find(Step); 3199 assert(I != ExpandedSCEVs.end() && "SCEV must be expanded at this point"); 3200 return I->second; 3201 } 3202 3203 void InnerLoopVectorizer::createInductionResumeValues( 3204 const SCEV2ValueTy &ExpandedSCEVs, 3205 std::pair<BasicBlock *, Value *> AdditionalBypass) { 3206 assert(((AdditionalBypass.first && AdditionalBypass.second) || 3207 (!AdditionalBypass.first && !AdditionalBypass.second)) && 3208 "Inconsistent information about additional bypass."); 3209 // We are going to resume the execution of the scalar loop. 3210 // Go over all of the induction variables that we found and fix the 3211 // PHIs that are left in the scalar version of the loop. 3212 // The starting values of PHI nodes depend on the counter of the last 3213 // iteration in the vectorized loop. 3214 // If we come from a bypass edge then we need to start from the original 3215 // start value. 3216 for (const auto &InductionEntry : Legal->getInductionVars()) { 3217 PHINode *OrigPhi = InductionEntry.first; 3218 const InductionDescriptor &II = InductionEntry.second; 3219 PHINode *BCResumeVal = createInductionResumeValue( 3220 OrigPhi, II, getExpandedStep(II, ExpandedSCEVs), LoopBypassBlocks, 3221 AdditionalBypass); 3222 OrigPhi->setIncomingValueForBlock(LoopScalarPreHeader, BCResumeVal); 3223 } 3224 } 3225 3226 BasicBlock *InnerLoopVectorizer::completeLoopSkeleton() { 3227 // The trip counts should be cached by now. 3228 Value *Count = getTripCount(); 3229 Value *VectorTripCount = getOrCreateVectorTripCount(LoopVectorPreHeader); 3230 3231 auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator(); 3232 3233 // Add a check in the middle block to see if we have completed 3234 // all of the iterations in the first vector loop. Three cases: 3235 // 1) If we require a scalar epilogue, there is no conditional branch as 3236 // we unconditionally branch to the scalar preheader. Do nothing. 3237 // 2) If (N - N%VF) == N, then we *don't* need to run the remainder. 3238 // Thus if tail is to be folded, we know we don't need to run the 3239 // remainder and we can use the previous value for the condition (true). 3240 // 3) Otherwise, construct a runtime check. 3241 if (!Cost->requiresScalarEpilogue(VF.isVector()) && 3242 !Cost->foldTailByMasking()) { 3243 Instruction *CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, 3244 Count, VectorTripCount, "cmp.n", 3245 LoopMiddleBlock->getTerminator()); 3246 3247 // Here we use the same DebugLoc as the scalar loop latch terminator instead 3248 // of the corresponding compare because they may have ended up with 3249 // different line numbers and we want to avoid awkward line stepping while 3250 // debugging. Eg. if the compare has got a line number inside the loop. 
3251 CmpN->setDebugLoc(ScalarLatchTerm->getDebugLoc()); 3252 cast<BranchInst>(LoopMiddleBlock->getTerminator())->setCondition(CmpN); 3253 } 3254 3255 #ifdef EXPENSIVE_CHECKS 3256 assert(DT->verify(DominatorTree::VerificationLevel::Fast)); 3257 #endif 3258 3259 return LoopVectorPreHeader; 3260 } 3261 3262 std::pair<BasicBlock *, Value *> 3263 InnerLoopVectorizer::createVectorizedLoopSkeleton( 3264 const SCEV2ValueTy &ExpandedSCEVs) { 3265 /* 3266 In this function we generate a new loop. The new loop will contain 3267 the vectorized instructions while the old loop will continue to run the 3268 scalar remainder. 3269 3270 [ ] <-- old preheader - loop iteration number check and SCEVs in Plan's 3271 / | preheader are expanded here. Eventually all required SCEV 3272 / | expansion should happen here. 3273 / v 3274 | [ ] <-- vector loop bypass (may consist of multiple blocks). 3275 | / | 3276 | / v 3277 || [ ] <-- vector pre header. 3278 |/ | 3279 | v 3280 | [ ] \ 3281 | [ ]_| <-- vector loop (created during VPlan execution). 3282 | | 3283 | v 3284 \ -[ ] <--- middle-block. 3285 \/ | 3286 /\ v 3287 | ->[ ] <--- new preheader. 3288 | | 3289 (opt) v <-- edge from middle to exit iff epilogue is not required. 3290 | [ ] \ 3291 | [ ]_| <-- old scalar loop to handle remainder (scalar epilogue). 3292 \ | 3293 \ v 3294 >[ ] <-- exit block(s). 3295 ... 3296 */ 3297 3298 // Create an empty vector loop, and prepare basic blocks for the runtime 3299 // checks. 3300 createVectorLoopSkeleton(""); 3301 3302 // Now, compare the new count to zero. If it is zero skip the vector loop and 3303 // jump to the scalar loop. This check also covers the case where the 3304 // backedge-taken count is uint##_max: adding one to it will overflow leading 3305 // to an incorrect trip count of zero. In this (rare) case we will also jump 3306 // to the scalar loop. 3307 emitIterationCountCheck(LoopScalarPreHeader); 3308 3309 // Generate the code to check any assumptions that we've made for SCEV 3310 // expressions. 3311 emitSCEVChecks(LoopScalarPreHeader); 3312 3313 // Generate the code that checks in runtime if arrays overlap. We put the 3314 // checks into a separate block to make the more common case of few elements 3315 // faster. 3316 emitMemRuntimeChecks(LoopScalarPreHeader); 3317 3318 // Emit phis for the new starting index of the scalar loop. 3319 createInductionResumeValues(ExpandedSCEVs); 3320 3321 return {completeLoopSkeleton(), nullptr}; 3322 } 3323 3324 // Fix up external users of the induction variable. At this point, we are 3325 // in LCSSA form, with all external PHIs that use the IV having one input value, 3326 // coming from the remainder loop. We need those PHIs to also have a correct 3327 // value for the IV when arriving directly from the middle block. 3328 void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi, 3329 const InductionDescriptor &II, 3330 Value *VectorTripCount, Value *EndValue, 3331 BasicBlock *MiddleBlock, 3332 BasicBlock *VectorHeader, VPlan &Plan, 3333 VPTransformState &State) { 3334 // There are two kinds of external IV usages - those that use the value 3335 // computed in the last iteration (the PHI) and those that use the penultimate 3336 // value (the value that feeds into the phi from the loop latch). 3337 // We allow both, but they, obviously, have different values. 
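// As a small, hypothetical illustration: for
//   for (i = 0; i != n; ++i) { ... }
// with the middle block branching straight to the exit (no scalar remainder
// runs), an LCSSA phi of the latch increment must observe the final count n,
// while an LCSSA phi of the header phi i itself must observe n - 1; the
// former is the EndValue passed in and the latter is recomputed below from
// the vector trip count.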
3338 3339 assert(OrigLoop->getUniqueExitBlock() && "Expected a single exit block"); 3340 3341 DenseMap<Value *, Value *> MissingVals; 3342 3343 // An external user of the last iteration's value should see the value that 3344 // the remainder loop uses to initialize its own IV. 3345 Value *PostInc = OrigPhi->getIncomingValueForBlock(OrigLoop->getLoopLatch()); 3346 for (User *U : PostInc->users()) { 3347 Instruction *UI = cast<Instruction>(U); 3348 if (!OrigLoop->contains(UI)) { 3349 assert(isa<PHINode>(UI) && "Expected LCSSA form"); 3350 MissingVals[UI] = EndValue; 3351 } 3352 } 3353 3354 // An external user of the penultimate value needs to see EndValue - Step. 3355 // The simplest way to get this is to recompute it from the constituent SCEVs, 3356 // that is Start + (Step * (CRD - 1)). 3357 for (User *U : OrigPhi->users()) { 3358 auto *UI = cast<Instruction>(U); 3359 if (!OrigLoop->contains(UI)) { 3360 assert(isa<PHINode>(UI) && "Expected LCSSA form"); 3361 IRBuilder<> B(MiddleBlock->getTerminator()); 3362 3363 // Fast-math-flags propagate from the original induction instruction. 3364 if (II.getInductionBinOp() && isa<FPMathOperator>(II.getInductionBinOp())) 3365 B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags()); 3366 3367 Value *CountMinusOne = B.CreateSub( 3368 VectorTripCount, ConstantInt::get(VectorTripCount->getType(), 1)); 3369 CountMinusOne->setName("cmo"); 3370 3371 VPValue *StepVPV = Plan.getSCEVExpansion(II.getStep()); 3372 assert(StepVPV && "step must have been expanded during VPlan execution"); 3373 Value *Step = StepVPV->isLiveIn() ? StepVPV->getLiveInIRValue() 3374 : State.get(StepVPV, {0, 0}); 3375 Value *Escape = 3376 emitTransformedIndex(B, CountMinusOne, II.getStartValue(), Step, II); 3377 Escape->setName("ind.escape"); 3378 MissingVals[UI] = Escape; 3379 } 3380 } 3381 3382 for (auto &I : MissingVals) { 3383 PHINode *PHI = cast<PHINode>(I.first); 3384 // One corner case we have to handle is two IVs "chasing" each other, 3385 // that is %IV2 = phi [...], [ %IV1, %latch ] 3386 // In this case, if IV1 has an external use, we need to avoid adding both 3387 // "last value of IV1" and "penultimate value of IV2". So, verify that we 3388 // don't already have an incoming value for the middle block. 3389 if (PHI->getBasicBlockIndex(MiddleBlock) == -1) { 3390 PHI->addIncoming(I.second, MiddleBlock); 3391 Plan.removeLiveOut(PHI); 3392 } 3393 } 3394 } 3395 3396 namespace { 3397 3398 struct CSEDenseMapInfo { 3399 static bool canHandle(const Instruction *I) { 3400 return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) || 3401 isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I); 3402 } 3403 3404 static inline Instruction *getEmptyKey() { 3405 return DenseMapInfo<Instruction *>::getEmptyKey(); 3406 } 3407 3408 static inline Instruction *getTombstoneKey() { 3409 return DenseMapInfo<Instruction *>::getTombstoneKey(); 3410 } 3411 3412 static unsigned getHashValue(const Instruction *I) { 3413 assert(canHandle(I) && "Unknown instruction!"); 3414 return hash_combine(I->getOpcode(), hash_combine_range(I->value_op_begin(), 3415 I->value_op_end())); 3416 } 3417 3418 static bool isEqual(const Instruction *LHS, const Instruction *RHS) { 3419 if (LHS == getEmptyKey() || RHS == getEmptyKey() || 3420 LHS == getTombstoneKey() || RHS == getTombstoneKey()) 3421 return LHS == RHS; 3422 return LHS->isIdenticalTo(RHS); 3423 } 3424 }; 3425 3426 } // end anonymous namespace 3427 3428 /// Perform CSE of induction variable instructions.
3429 static void cse(BasicBlock *BB) { 3430 // Perform simple cse. 3431 SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap; 3432 for (Instruction &In : llvm::make_early_inc_range(*BB)) { 3433 if (!CSEDenseMapInfo::canHandle(&In)) 3434 continue; 3435 3436 // Check if we can replace this instruction with any of the 3437 // visited instructions. 3438 if (Instruction *V = CSEMap.lookup(&In)) { 3439 In.replaceAllUsesWith(V); 3440 In.eraseFromParent(); 3441 continue; 3442 } 3443 3444 CSEMap[&In] = &In; 3445 } 3446 } 3447 3448 InstructionCost LoopVectorizationCostModel::getVectorCallCost( 3449 CallInst *CI, ElementCount VF, Function **Variant, bool *NeedsMask) const { 3450 Function *F = CI->getCalledFunction(); 3451 Type *ScalarRetTy = CI->getType(); 3452 SmallVector<Type *, 4> Tys, ScalarTys; 3453 bool MaskRequired = Legal->isMaskRequired(CI); 3454 for (auto &ArgOp : CI->args()) 3455 ScalarTys.push_back(ArgOp->getType()); 3456 3457 // Estimate cost of scalarized vector call. The source operands are assumed 3458 // to be vectors, so we need to extract individual elements from there, 3459 // execute VF scalar calls, and then gather the result into the vector return 3460 // value. 3461 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 3462 InstructionCost ScalarCallCost = 3463 TTI.getCallInstrCost(F, ScalarRetTy, ScalarTys, CostKind); 3464 if (VF.isScalar()) 3465 return ScalarCallCost; 3466 3467 // Compute corresponding vector type for return value and arguments. 3468 Type *RetTy = ToVectorTy(ScalarRetTy, VF); 3469 for (Type *ScalarTy : ScalarTys) 3470 Tys.push_back(ToVectorTy(ScalarTy, VF)); 3471 3472 // Compute costs of unpacking argument values for the scalar calls and 3473 // packing the return values to a vector. 3474 InstructionCost ScalarizationCost = 3475 getScalarizationOverhead(CI, VF, CostKind); 3476 3477 InstructionCost Cost = 3478 ScalarCallCost * VF.getKnownMinValue() + ScalarizationCost; 3479 3480 // If we can't emit a vector call for this function, then the currently found 3481 // cost is the cost we need to return. 3482 InstructionCost MaskCost = 0; 3483 VFShape Shape = VFShape::get(*CI, VF, MaskRequired); 3484 if (NeedsMask) 3485 *NeedsMask = MaskRequired; 3486 Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape); 3487 // If we want an unmasked vector function but can't find one matching the VF, 3488 // maybe we can find vector function that does use a mask and synthesize 3489 // an all-true mask. 3490 if (!VecFunc && !MaskRequired) { 3491 Shape = VFShape::get(*CI, VF, /*HasGlobalPred=*/true); 3492 VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape); 3493 // If we found one, add in the cost of creating a mask 3494 if (VecFunc) { 3495 if (NeedsMask) 3496 *NeedsMask = true; 3497 MaskCost = TTI.getShuffleCost( 3498 TargetTransformInfo::SK_Broadcast, 3499 VectorType::get( 3500 IntegerType::getInt1Ty(VecFunc->getFunctionType()->getContext()), 3501 VF)); 3502 } 3503 } 3504 3505 // We don't support masked function calls yet, but we can scalarize a 3506 // masked call with branches (unless VF is scalable). 3507 if (!TLI || CI->isNoBuiltin() || !VecFunc) 3508 return VF.isScalable() ? InstructionCost::getInvalid() : Cost; 3509 3510 // If the corresponding vector cost is cheaper, return its cost. 
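// In shorthand (an informal restatement of the surrounding logic, not a
// separate cost model):
//   ScalarizedCost = ScalarCallCost * VF + ScalarizationOverhead
//   VectorCost     = VectorCallCost + MaskCost (if a mask must be created)
// When a suitable vector function exists, the cheaper of the two is returned,
// and *Variant points at the vector function only when it wins.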
3511 InstructionCost VectorCallCost = 3512 TTI.getCallInstrCost(nullptr, RetTy, Tys, CostKind) + MaskCost; 3513 if (VectorCallCost < Cost) { 3514 *Variant = VecFunc; 3515 Cost = VectorCallCost; 3516 } 3517 return Cost; 3518 } 3519 3520 static Type *MaybeVectorizeType(Type *Elt, ElementCount VF) { 3521 if (VF.isScalar() || (!Elt->isIntOrPtrTy() && !Elt->isFloatingPointTy())) 3522 return Elt; 3523 return VectorType::get(Elt, VF); 3524 } 3525 3526 InstructionCost 3527 LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI, 3528 ElementCount VF) const { 3529 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 3530 assert(ID && "Expected intrinsic call!"); 3531 Type *RetTy = MaybeVectorizeType(CI->getType(), VF); 3532 FastMathFlags FMF; 3533 if (auto *FPMO = dyn_cast<FPMathOperator>(CI)) 3534 FMF = FPMO->getFastMathFlags(); 3535 3536 SmallVector<const Value *> Arguments(CI->args()); 3537 FunctionType *FTy = CI->getCalledFunction()->getFunctionType(); 3538 SmallVector<Type *> ParamTys; 3539 std::transform(FTy->param_begin(), FTy->param_end(), 3540 std::back_inserter(ParamTys), 3541 [&](Type *Ty) { return MaybeVectorizeType(Ty, VF); }); 3542 3543 IntrinsicCostAttributes CostAttrs(ID, RetTy, Arguments, ParamTys, FMF, 3544 dyn_cast<IntrinsicInst>(CI)); 3545 return TTI.getIntrinsicInstrCost(CostAttrs, 3546 TargetTransformInfo::TCK_RecipThroughput); 3547 } 3548 3549 static Type *smallestIntegerVectorType(Type *T1, Type *T2) { 3550 auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType()); 3551 auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType()); 3552 return I1->getBitWidth() < I2->getBitWidth() ? T1 : T2; 3553 } 3554 3555 static Type *largestIntegerVectorType(Type *T1, Type *T2) { 3556 auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType()); 3557 auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType()); 3558 return I1->getBitWidth() > I2->getBitWidth() ? T1 : T2; 3559 } 3560 3561 void InnerLoopVectorizer::truncateToMinimalBitwidths(VPTransformState &State) { 3562 // For every instruction `I` in MinBWs, truncate the operands, create a 3563 // truncated version of `I` and reextend its result. InstCombine runs 3564 // later and will remove any ext/trunc pairs. 3565 SmallPtrSet<Value *, 4> Erased; 3566 for (const auto &KV : Cost->getMinimalBitwidths()) { 3567 // If the value wasn't vectorized, we must maintain the original scalar 3568 // type. The absence of the value from State indicates that it 3569 // wasn't vectorized. 3570 // FIXME: Should not rely on getVPValue at this point. 3571 VPValue *Def = State.Plan->getVPValue(KV.first, true); 3572 if (!State.hasAnyVectorValue(Def)) 3573 continue; 3574 for (unsigned Part = 0; Part < UF; ++Part) { 3575 Value *I = State.get(Def, Part); 3576 if (Erased.count(I) || I->use_empty() || !isa<Instruction>(I)) 3577 continue; 3578 Type *OriginalTy = I->getType(); 3579 Type *ScalarTruncatedTy = 3580 IntegerType::get(OriginalTy->getContext(), KV.second); 3581 auto *TruncatedTy = VectorType::get( 3582 ScalarTruncatedTy, cast<VectorType>(OriginalTy)->getElementCount()); 3583 if (TruncatedTy == OriginalTy) 3584 continue; 3585 3586 IRBuilder<> B(cast<Instruction>(I)); 3587 auto ShrinkOperand = [&](Value *V) -> Value * { 3588 if (auto *ZI = dyn_cast<ZExtInst>(V)) 3589 if (ZI->getSrcTy() == TruncatedTy) 3590 return ZI->getOperand(0); 3591 return B.CreateZExtOrTrunc(V, TruncatedTy); 3592 }; 3593 3594 // The actual instruction modification depends on the instruction type, 3595 // unfortunately. 
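// E.g. (an illustrative sketch) for an i32 add whose minimal bit width is 8,
// the rewrite below produces roughly:
//   %a.trunc = trunc <VF x i32> %a to <VF x i8>
//   %b.trunc = trunc <VF x i32> %b to <VF x i8>
//   %narrow  = add <VF x i8> %a.trunc, %b.trunc
//   %widened = zext <VF x i8> %narrow to <VF x i32>
// and InstCombine later removes any redundant trunc/zext pairs.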
3596 Value *NewI = nullptr; 3597 if (auto *BO = dyn_cast<BinaryOperator>(I)) { 3598 NewI = B.CreateBinOp(BO->getOpcode(), ShrinkOperand(BO->getOperand(0)), 3599 ShrinkOperand(BO->getOperand(1))); 3600 3601 // Any wrapping introduced by shrinking this operation shouldn't be 3602 // considered undefined behavior. So, we can't unconditionally copy 3603 // arithmetic wrapping flags to NewI. 3604 cast<BinaryOperator>(NewI)->copyIRFlags(I, /*IncludeWrapFlags=*/false); 3605 } else if (auto *CI = dyn_cast<ICmpInst>(I)) { 3606 NewI = 3607 B.CreateICmp(CI->getPredicate(), ShrinkOperand(CI->getOperand(0)), 3608 ShrinkOperand(CI->getOperand(1))); 3609 } else if (auto *SI = dyn_cast<SelectInst>(I)) { 3610 NewI = B.CreateSelect(SI->getCondition(), 3611 ShrinkOperand(SI->getTrueValue()), 3612 ShrinkOperand(SI->getFalseValue())); 3613 } else if (auto *CI = dyn_cast<CastInst>(I)) { 3614 switch (CI->getOpcode()) { 3615 default: 3616 llvm_unreachable("Unhandled cast!"); 3617 case Instruction::Trunc: 3618 NewI = ShrinkOperand(CI->getOperand(0)); 3619 break; 3620 case Instruction::SExt: 3621 NewI = B.CreateSExtOrTrunc( 3622 CI->getOperand(0), 3623 smallestIntegerVectorType(OriginalTy, TruncatedTy)); 3624 break; 3625 case Instruction::ZExt: 3626 NewI = B.CreateZExtOrTrunc( 3627 CI->getOperand(0), 3628 smallestIntegerVectorType(OriginalTy, TruncatedTy)); 3629 break; 3630 } 3631 } else if (auto *SI = dyn_cast<ShuffleVectorInst>(I)) { 3632 auto Elements0 = 3633 cast<VectorType>(SI->getOperand(0)->getType())->getElementCount(); 3634 auto *O0 = B.CreateZExtOrTrunc( 3635 SI->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements0)); 3636 auto Elements1 = 3637 cast<VectorType>(SI->getOperand(1)->getType())->getElementCount(); 3638 auto *O1 = B.CreateZExtOrTrunc( 3639 SI->getOperand(1), VectorType::get(ScalarTruncatedTy, Elements1)); 3640 3641 NewI = B.CreateShuffleVector(O0, O1, SI->getShuffleMask()); 3642 } else if (isa<LoadInst>(I) || isa<PHINode>(I)) { 3643 // Don't do anything with the operands, just extend the result. 3644 continue; 3645 } else if (auto *IE = dyn_cast<InsertElementInst>(I)) { 3646 auto Elements = 3647 cast<VectorType>(IE->getOperand(0)->getType())->getElementCount(); 3648 auto *O0 = B.CreateZExtOrTrunc( 3649 IE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements)); 3650 auto *O1 = B.CreateZExtOrTrunc(IE->getOperand(1), ScalarTruncatedTy); 3651 NewI = B.CreateInsertElement(O0, O1, IE->getOperand(2)); 3652 } else if (auto *EE = dyn_cast<ExtractElementInst>(I)) { 3653 auto Elements = 3654 cast<VectorType>(EE->getOperand(0)->getType())->getElementCount(); 3655 auto *O0 = B.CreateZExtOrTrunc( 3656 EE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements)); 3657 NewI = B.CreateExtractElement(O0, EE->getOperand(2)); 3658 } else { 3659 // If we don't know what to do, be conservative and don't do anything. 3660 continue; 3661 } 3662 3663 // Lastly, extend the result. 3664 NewI->takeName(cast<Instruction>(I)); 3665 Value *Res = B.CreateZExtOrTrunc(NewI, OriginalTy); 3666 I->replaceAllUsesWith(Res); 3667 cast<Instruction>(I)->eraseFromParent(); 3668 Erased.insert(I); 3669 State.reset(Def, Res, Part); 3670 } 3671 } 3672 3673 // We'll have created a bunch of ZExts that are now parentless. Clean up. 3674 for (const auto &KV : Cost->getMinimalBitwidths()) { 3675 // If the value wasn't vectorized, we must maintain the original scalar 3676 // type. The absence of the value from State indicates that it 3677 // wasn't vectorized. 3678 // FIXME: Should not rely on getVPValue at this point. 
3679 VPValue *Def = State.Plan->getVPValue(KV.first, true); 3680 if (!State.hasAnyVectorValue(Def)) 3681 continue; 3682 for (unsigned Part = 0; Part < UF; ++Part) { 3683 Value *I = State.get(Def, Part); 3684 ZExtInst *Inst = dyn_cast<ZExtInst>(I); 3685 if (Inst && Inst->use_empty()) { 3686 Value *NewI = Inst->getOperand(0); 3687 Inst->eraseFromParent(); 3688 State.reset(Def, NewI, Part); 3689 } 3690 } 3691 } 3692 } 3693 3694 void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State, 3695 VPlan &Plan) { 3696 // Insert truncates and extends for any truncated instructions as hints to 3697 // InstCombine. 3698 if (VF.isVector()) 3699 truncateToMinimalBitwidths(State); 3700 3701 // Fix widened non-induction PHIs by setting up the PHI operands. 3702 if (EnableVPlanNativePath) 3703 fixNonInductionPHIs(Plan, State); 3704 3705 // At this point every instruction in the original loop is widened to a 3706 // vector form. Now we need to fix the recurrences in the loop. These PHI 3707 // nodes are currently empty because we did not want to introduce cycles. 3708 // This is the second stage of vectorizing recurrences. 3709 fixCrossIterationPHIs(State); 3710 3711 // Forget the original basic block. 3712 PSE.getSE()->forgetLoop(OrigLoop); 3713 3714 // After vectorization, the exit blocks of the original loop will have 3715 // additional predecessors. Invalidate SCEVs for the exit phis in case SE 3716 // looked through single-entry phis. 3717 SmallVector<BasicBlock *> ExitBlocks; 3718 OrigLoop->getExitBlocks(ExitBlocks); 3719 for (BasicBlock *Exit : ExitBlocks) 3720 for (PHINode &PN : Exit->phis()) 3721 PSE.getSE()->forgetValue(&PN); 3722 3723 VPBasicBlock *LatchVPBB = Plan.getVectorLoopRegion()->getExitingBasicBlock(); 3724 Loop *VectorLoop = LI->getLoopFor(State.CFG.VPBB2IRBB[LatchVPBB]); 3725 if (Cost->requiresScalarEpilogue(VF.isVector())) { 3726 // No edge from the middle block to the unique exit block has been inserted 3727 // and there is nothing to fix from vector loop; phis should have incoming 3728 // from scalar loop only. 3729 } else { 3730 // TODO: Check VPLiveOuts to see if IV users need fixing instead of checking 3731 // the cost model. 3732 3733 // If we inserted an edge from the middle block to the unique exit block, 3734 // update uses outside the loop (phis) to account for the newly inserted 3735 // edge. 3736 3737 // Fix-up external users of the induction variables. 3738 for (const auto &Entry : Legal->getInductionVars()) 3739 fixupIVUsers(Entry.first, Entry.second, 3740 getOrCreateVectorTripCount(VectorLoop->getLoopPreheader()), 3741 IVEndValues[Entry.first], LoopMiddleBlock, 3742 VectorLoop->getHeader(), Plan, State); 3743 } 3744 3745 // Fix LCSSA phis not already fixed earlier. Extracts may need to be generated 3746 // in the exit block, so update the builder. 3747 State.Builder.SetInsertPoint(State.CFG.ExitBB->getFirstNonPHI()); 3748 for (const auto &KV : Plan.getLiveOuts()) 3749 KV.second->fixPhi(Plan, State); 3750 3751 for (Instruction *PI : PredicatedInstructions) 3752 sinkScalarOperands(&*PI); 3753 3754 // Remove redundant induction instructions. 3755 cse(VectorLoop->getHeader()); 3756 3757 // Set/update profile weights for the vector and remainder loops as original 3758 // loop iterations are now distributed among them. Note that original loop 3759 // represented by LoopScalarBody becomes remainder loop after vectorization. 
3760 // 3761 // For cases like foldTailByMasking() and requiresScalarEpilogue() we may 3762 // end up with a slightly skewed result, but that should be OK since the 3763 // profile is not inherently precise anyway. Note also possible bypass of 3764 // vector code caused by legality checks is ignored, assigning all the weight 3765 // to the vector loop, optimistically. 3766 // 3767 // For scalable vectorization we can't know at compile time how many iterations 3768 // of the loop are handled in one vector iteration, so instead assume a pessimistic 3769 // vscale of '1'. 3770 setProfileInfoAfterUnrolling(LI->getLoopFor(LoopScalarBody), VectorLoop, 3771 LI->getLoopFor(LoopScalarBody), 3772 VF.getKnownMinValue() * UF); 3773 } 3774 3775 void InnerLoopVectorizer::fixCrossIterationPHIs(VPTransformState &State) { 3776 // In order to support recurrences we need to be able to vectorize Phi nodes. 3777 // Phi nodes have cycles, so we need to vectorize them in two stages. This is 3778 // stage #2: We now need to fix the recurrences by adding incoming edges to 3779 // the currently empty PHI nodes. At this point every instruction in the 3780 // original loop is widened to a vector form so we can use them to construct 3781 // the incoming edges. 3782 VPBasicBlock *Header = 3783 State.Plan->getVectorLoopRegion()->getEntryBasicBlock(); 3784 3785 // Gather all VPReductionPHIRecipes and sort them so that intermediate stores 3786 // sunk outside of the loop keep the same order as they had in the 3787 // original loop. 3788 SmallVector<VPReductionPHIRecipe *> ReductionPHIList; 3789 for (VPRecipeBase &R : Header->phis()) { 3790 if (auto *ReductionPhi = dyn_cast<VPReductionPHIRecipe>(&R)) 3791 ReductionPHIList.emplace_back(ReductionPhi); 3792 } 3793 stable_sort(ReductionPHIList, [this](const VPReductionPHIRecipe *R1, 3794 const VPReductionPHIRecipe *R2) { 3795 auto *IS1 = R1->getRecurrenceDescriptor().IntermediateStore; 3796 auto *IS2 = R2->getRecurrenceDescriptor().IntermediateStore; 3797 3798 // If neither of the recipes has an intermediate store, keep the order the 3799 // same. 3800 if (!IS1 && !IS2) 3801 return false; 3802 3803 // If only one of the recipes has an intermediate store, then move it 3804 // towards the beginning of the list. 3805 if (IS1 && !IS2) 3806 return true; 3807 3808 if (!IS1 && IS2) 3809 return false; 3810 3811 // If both recipes have an intermediate store, then the recipe with the 3812 // later store should be processed earlier. So it should go to the beginning 3813 // of the list. 3814 return DT->dominates(IS2, IS1); 3815 }); 3816 3817 for (VPReductionPHIRecipe *ReductionPhi : ReductionPHIList) 3818 fixReduction(ReductionPhi, State); 3819 3820 for (VPRecipeBase &R : Header->phis()) { 3821 if (auto *FOR = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&R)) 3822 fixFixedOrderRecurrence(FOR, State); 3823 } 3824 } 3825 3826 void InnerLoopVectorizer::fixFixedOrderRecurrence( 3827 VPFirstOrderRecurrencePHIRecipe *PhiR, VPTransformState &State) { 3828 // This is the second phase of vectorizing first-order recurrences. An 3829 // overview of the transformation is described below. Suppose we have the 3830 // following loop. 3831 // 3832 // for (int i = 0; i < n; ++i) 3833 // b[i] = a[i] - a[i - 1]; 3834 // 3835 // There is a first-order recurrence on "a".
For this loop, the shorthand 3836 // scalar IR looks like: 3837 // 3838 // scalar.ph: 3839 // s_init = a[-1] 3840 // br scalar.body 3841 // 3842 // scalar.body: 3843 // i = phi [0, scalar.ph], [i+1, scalar.body] 3844 // s1 = phi [s_init, scalar.ph], [s2, scalar.body] 3845 // s2 = a[i] 3846 // b[i] = s2 - s1 3847 // br cond, scalar.body, ... 3848 // 3849 // In this example, s1 is a recurrence because its value depends on the 3850 // previous iteration. In the first phase of vectorization, we created a 3851 // vector phi v1 for s1. We now complete the vectorization and produce the 3852 // shorthand vector IR shown below (for VF = 4, UF = 1). 3853 // 3854 // vector.ph: 3855 // v_init = vector(..., ..., ..., a[-1]) 3856 // br vector.body 3857 // 3858 // vector.body 3859 // i = phi [0, vector.ph], [i+4, vector.body] 3860 // v1 = phi [v_init, vector.ph], [v2, vector.body] 3861 // v2 = a[i, i+1, i+2, i+3]; 3862 // v3 = vector(v1(3), v2(0, 1, 2)) 3863 // b[i, i+1, i+2, i+3] = v2 - v3 3864 // br cond, vector.body, middle.block 3865 // 3866 // middle.block: 3867 // x = v2(3) 3868 // br scalar.ph 3869 // 3870 // scalar.ph: 3871 // s_init = phi [x, middle.block], [a[-1], otherwise] 3872 // br scalar.body 3873 // 3874 // After the vector loop completes execution, we extract the next value of 3875 // the recurrence (x) to use as the initial value in the scalar loop. 3876 3877 // Extract the last vector element in the middle block. This will be the 3878 // initial value for the recurrence when jumping to the scalar loop. 3879 VPValue *PreviousDef = PhiR->getBackedgeValue(); 3880 Value *Incoming = State.get(PreviousDef, UF - 1); 3881 auto *ExtractForScalar = Incoming; 3882 auto *IdxTy = Builder.getInt32Ty(); 3883 Value *RuntimeVF = nullptr; 3884 if (VF.isVector()) { 3885 auto *One = ConstantInt::get(IdxTy, 1); 3886 Builder.SetInsertPoint(LoopMiddleBlock->getTerminator()); 3887 RuntimeVF = getRuntimeVF(Builder, IdxTy, VF); 3888 auto *LastIdx = Builder.CreateSub(RuntimeVF, One); 3889 ExtractForScalar = 3890 Builder.CreateExtractElement(Incoming, LastIdx, "vector.recur.extract"); 3891 } 3892 3893 auto RecurSplice = cast<VPInstruction>(*PhiR->user_begin()); 3894 assert(PhiR->getNumUsers() == 1 && 3895 RecurSplice->getOpcode() == 3896 VPInstruction::FirstOrderRecurrenceSplice && 3897 "recurrence phi must have a single user: FirstOrderRecurrenceSplice"); 3898 SmallVector<VPLiveOut *> LiveOuts; 3899 for (VPUser *U : RecurSplice->users()) 3900 if (auto *LiveOut = dyn_cast<VPLiveOut>(U)) 3901 LiveOuts.push_back(LiveOut); 3902 3903 if (!LiveOuts.empty()) { 3904 // Extract the second last element in the middle block if the 3905 // Phi is used outside the loop. We need to extract the phi itself 3906 // and not the last element (the phi update in the current iteration). This 3907 // will be the value when jumping to the exit block from the 3908 // LoopMiddleBlock, when the scalar loop is not run at all. 3909 Value *ExtractForPhiUsedOutsideLoop = nullptr; 3910 if (VF.isVector()) { 3911 auto *Idx = Builder.CreateSub(RuntimeVF, ConstantInt::get(IdxTy, 2)); 3912 ExtractForPhiUsedOutsideLoop = Builder.CreateExtractElement( 3913 Incoming, Idx, "vector.recur.extract.for.phi"); 3914 } else { 3915 assert(UF > 1 && "VF and UF cannot both be 1"); 3916 // When the loop is unrolled without vectorizing, initialize 3917 // ExtractForPhiUsedOutsideLoop with the value just prior to the unrolled 3918 // value of `Incoming`. This is analogous to the vectorized case above: 3919 // extracting the second last element when VF > 1.
3920 ExtractForPhiUsedOutsideLoop = State.get(PreviousDef, UF - 2); 3921 } 3922 3923 for (VPLiveOut *LiveOut : LiveOuts) { 3924 assert(!Cost->requiresScalarEpilogue(VF.isVector())); 3925 PHINode *LCSSAPhi = LiveOut->getPhi(); 3926 LCSSAPhi->addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock); 3927 State.Plan->removeLiveOut(LCSSAPhi); 3928 } 3929 } 3930 3931 // Fix the initial value of the original recurrence in the scalar loop. 3932 Builder.SetInsertPoint(&*LoopScalarPreHeader->begin()); 3933 PHINode *Phi = cast<PHINode>(PhiR->getUnderlyingValue()); 3934 auto *Start = Builder.CreatePHI(Phi->getType(), 2, "scalar.recur.init"); 3935 auto *ScalarInit = PhiR->getStartValue()->getLiveInIRValue(); 3936 for (auto *BB : predecessors(LoopScalarPreHeader)) { 3937 auto *Incoming = BB == LoopMiddleBlock ? ExtractForScalar : ScalarInit; 3938 Start->addIncoming(Incoming, BB); 3939 } 3940 3941 Phi->setIncomingValueForBlock(LoopScalarPreHeader, Start); 3942 Phi->setName("scalar.recur"); 3943 } 3944 3945 void InnerLoopVectorizer::fixReduction(VPReductionPHIRecipe *PhiR, 3946 VPTransformState &State) { 3947 PHINode *OrigPhi = cast<PHINode>(PhiR->getUnderlyingValue()); 3948 // Get its reduction variable descriptor. 3949 assert(Legal->isReductionVariable(OrigPhi) && 3950 "Unable to find the reduction variable"); 3951 const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor(); 3952 3953 RecurKind RK = RdxDesc.getRecurrenceKind(); 3954 TrackingVH<Value> ReductionStartValue = RdxDesc.getRecurrenceStartValue(); 3955 Instruction *LoopExitInst = RdxDesc.getLoopExitInstr(); 3956 State.setDebugLocFromInst(ReductionStartValue); 3957 3958 VPValue *LoopExitInstDef = PhiR->getBackedgeValue(); 3959 // This is the vector-clone of the value that leaves the loop. 3960 Type *VecTy = State.get(LoopExitInstDef, 0)->getType(); 3961 3962 // Before each round, move the insertion point right between 3963 // the PHIs and the values we are going to write. 3964 // This allows us to write both PHINodes and the extractelement 3965 // instructions. 3966 Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt()); 3967 3968 State.setDebugLocFromInst(LoopExitInst); 3969 3970 Type *PhiTy = OrigPhi->getType(); 3971 3972 VPBasicBlock *LatchVPBB = 3973 PhiR->getParent()->getEnclosingLoopRegion()->getExitingBasicBlock(); 3974 BasicBlock *VectorLoopLatch = State.CFG.VPBB2IRBB[LatchVPBB]; 3975 // If tail is folded by masking, the vector value to leave the loop should be 3976 // a Select choosing between the vectorized LoopExitInst and vectorized Phi, 3977 // instead of the former. For an inloop reduction the reduction will already 3978 // be predicated, and does not need to be handled here.
3979 if (Cost->foldTailByMasking() && !PhiR->isInLoop()) { 3980 for (unsigned Part = 0; Part < UF; ++Part) { 3981 Value *VecLoopExitInst = State.get(LoopExitInstDef, Part); 3982 SelectInst *Sel = nullptr; 3983 for (User *U : VecLoopExitInst->users()) { 3984 if (isa<SelectInst>(U)) { 3985 assert(!Sel && "Reduction exit feeding two selects"); 3986 Sel = cast<SelectInst>(U); 3987 } else 3988 assert(isa<PHINode>(U) && "Reduction exit must feed Phi's or select"); 3989 } 3990 assert(Sel && "Reduction exit feeds no select"); 3991 State.reset(LoopExitInstDef, Sel, Part); 3992 3993 if (isa<FPMathOperator>(Sel)) 3994 Sel->setFastMathFlags(RdxDesc.getFastMathFlags()); 3995 3996 // If the target can create a predicated operator for the reduction at no 3997 // extra cost in the loop (for example a predicated vadd), it can be 3998 // cheaper for the select to remain in the loop than be sunk out of it, 3999 // and so use the select value for the phi instead of the old 4000 // LoopExitValue. 4001 if (PreferPredicatedReductionSelect || 4002 TTI->preferPredicatedReductionSelect( 4003 RdxDesc.getOpcode(), PhiTy, 4004 TargetTransformInfo::ReductionFlags())) { 4005 auto *VecRdxPhi = 4006 cast<PHINode>(State.get(PhiR, Part)); 4007 VecRdxPhi->setIncomingValueForBlock(VectorLoopLatch, Sel); 4008 } 4009 } 4010 } 4011 4012 // If the vector reduction can be performed in a smaller type, we truncate 4013 // then extend the loop exit value to enable InstCombine to evaluate the 4014 // entire expression in the smaller type. 4015 if (VF.isVector() && PhiTy != RdxDesc.getRecurrenceType()) { 4016 assert(!PhiR->isInLoop() && "Unexpected truncated inloop reduction!"); 4017 Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), VF); 4018 Builder.SetInsertPoint(VectorLoopLatch->getTerminator()); 4019 VectorParts RdxParts(UF); 4020 for (unsigned Part = 0; Part < UF; ++Part) { 4021 RdxParts[Part] = State.get(LoopExitInstDef, Part); 4022 Value *Trunc = Builder.CreateTrunc(RdxParts[Part], RdxVecTy); 4023 Value *Extnd = RdxDesc.isSigned() ? Builder.CreateSExt(Trunc, VecTy) 4024 : Builder.CreateZExt(Trunc, VecTy); 4025 for (User *U : llvm::make_early_inc_range(RdxParts[Part]->users())) 4026 if (U != Trunc) { 4027 U->replaceUsesOfWith(RdxParts[Part], Extnd); 4028 RdxParts[Part] = Extnd; 4029 } 4030 } 4031 Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt()); 4032 for (unsigned Part = 0; Part < UF; ++Part) { 4033 RdxParts[Part] = Builder.CreateTrunc(RdxParts[Part], RdxVecTy); 4034 State.reset(LoopExitInstDef, RdxParts[Part], Part); 4035 } 4036 } 4037 4038 // Reduce all of the unrolled parts into a single vector. 4039 Value *ReducedPartRdx = State.get(LoopExitInstDef, 0); 4040 unsigned Op = RecurrenceDescriptor::getOpcode(RK); 4041 4042 // The middle block terminator has already been assigned a DebugLoc here (the 4043 // OrigLoop's single latch terminator). We want the whole middle block to 4044 // appear to execute on this line because: (a) it is all compiler generated, 4045 // (b) these instructions are always executed after evaluating the latch 4046 // conditional branch, and (c) other passes may add new predecessors which 4047 // terminate on this line. This is the easiest way to ensure we don't 4048 // accidentally cause an extra step back into the loop while debugging. 4049 State.setDebugLocFromInst(LoopMiddleBlock->getTerminator()); 4050 if (PhiR->isOrdered()) 4051 ReducedPartRdx = State.get(LoopExitInstDef, UF - 1); 4052 else { 4053 // Floating-point operations should have some FMF to enable the reduction. 
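// E.g. (illustrative) with UF = 4, parts 1..3 are folded one at a time into
// the running value that starts as part 0, yielding a single vector that is
// reduced to a scalar further below for non-inloop reductions.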
4054 IRBuilderBase::FastMathFlagGuard FMFG(Builder); 4055 Builder.setFastMathFlags(RdxDesc.getFastMathFlags()); 4056 for (unsigned Part = 1; Part < UF; ++Part) { 4057 Value *RdxPart = State.get(LoopExitInstDef, Part); 4058 if (Op != Instruction::ICmp && Op != Instruction::FCmp) { 4059 ReducedPartRdx = Builder.CreateBinOp( 4060 (Instruction::BinaryOps)Op, RdxPart, ReducedPartRdx, "bin.rdx"); 4061 } else if (RecurrenceDescriptor::isSelectCmpRecurrenceKind(RK)) 4062 ReducedPartRdx = createSelectCmpOp(Builder, ReductionStartValue, RK, 4063 ReducedPartRdx, RdxPart); 4064 else 4065 ReducedPartRdx = createMinMaxOp(Builder, RK, ReducedPartRdx, RdxPart); 4066 } 4067 } 4068 4069 // Create the reduction after the loop. Note that inloop reductions create the 4070 // target reduction in the loop using a Reduction recipe. 4071 if (VF.isVector() && !PhiR->isInLoop()) { 4072 ReducedPartRdx = 4073 createTargetReduction(Builder, TTI, RdxDesc, ReducedPartRdx, OrigPhi); 4074 // If the reduction can be performed in a smaller type, we need to extend 4075 // the reduction to the wider type before we branch to the original loop. 4076 if (PhiTy != RdxDesc.getRecurrenceType()) 4077 ReducedPartRdx = RdxDesc.isSigned() 4078 ? Builder.CreateSExt(ReducedPartRdx, PhiTy) 4079 : Builder.CreateZExt(ReducedPartRdx, PhiTy); 4080 } 4081 4082 PHINode *ResumePhi = 4083 dyn_cast<PHINode>(PhiR->getStartValue()->getUnderlyingValue()); 4084 4085 // Create a phi node that merges control-flow from the backedge-taken check 4086 // block and the middle block. 4087 PHINode *BCBlockPhi = PHINode::Create(PhiTy, 2, "bc.merge.rdx", 4088 LoopScalarPreHeader->getTerminator()); 4089 4090 // If we are fixing reductions in the epilogue loop then we should already 4091 // have created a bc.merge.rdx Phi after the main vector body. Ensure that 4092 // we carry over the incoming values correctly. 4093 for (auto *Incoming : predecessors(LoopScalarPreHeader)) { 4094 if (Incoming == LoopMiddleBlock) 4095 BCBlockPhi->addIncoming(ReducedPartRdx, Incoming); 4096 else if (ResumePhi && llvm::is_contained(ResumePhi->blocks(), Incoming)) 4097 BCBlockPhi->addIncoming(ResumePhi->getIncomingValueForBlock(Incoming), 4098 Incoming); 4099 else 4100 BCBlockPhi->addIncoming(ReductionStartValue, Incoming); 4101 } 4102 4103 // Set the resume value for this reduction 4104 ReductionResumeValues.insert({&RdxDesc, BCBlockPhi}); 4105 4106 // If there were stores of the reduction value to a uniform memory address 4107 // inside the loop, create the final store here. 4108 if (StoreInst *SI = RdxDesc.IntermediateStore) { 4109 StoreInst *NewSI = 4110 Builder.CreateStore(ReducedPartRdx, SI->getPointerOperand()); 4111 propagateMetadata(NewSI, SI); 4112 4113 // If the reduction value is used in other places, 4114 // then let the code below create PHI's for that. 4115 } 4116 4117 // Now, we need to fix the users of the reduction variable 4118 // inside and outside of the scalar remainder loop. 4119 4120 // We know that the loop is in LCSSA form. We need to update the PHI nodes 4121 // in the exit blocks. See comment on analogous loop in 4122 // fixFixedOrderRecurrence for a more complete explaination of the logic. 
4123 if (!Cost->requiresScalarEpilogue(VF.isVector()))
4124 for (PHINode &LCSSAPhi : LoopExitBlock->phis())
4125 if (llvm::is_contained(LCSSAPhi.incoming_values(), LoopExitInst)) {
4126 LCSSAPhi.addIncoming(ReducedPartRdx, LoopMiddleBlock);
4127 State.Plan->removeLiveOut(&LCSSAPhi);
4128 }
4129
4130 // Fix the scalar loop reduction variable with the incoming reduction sum
4131 // from the vector body and from the backedge value.
4132 int IncomingEdgeBlockIdx =
4133 OrigPhi->getBasicBlockIndex(OrigLoop->getLoopLatch());
4134 assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index");
4135 // Pick the other block.
4136 int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1);
4137 OrigPhi->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi);
4138 OrigPhi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst);
4139 }
4140
4141 void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) {
4142 // The basic block and loop containing the predicated instruction.
4143 auto *PredBB = PredInst->getParent();
4144 auto *VectorLoop = LI->getLoopFor(PredBB);
4145
4146 // Initialize a worklist with the operands of the predicated instruction.
4147 SetVector<Value *> Worklist(PredInst->op_begin(), PredInst->op_end());
4148
4149 // Holds instructions that we need to analyze again. An instruction may be
4150 // reanalyzed if we don't yet know if we can sink it or not.
4151 SmallVector<Instruction *, 8> InstsToReanalyze;
4152
4153 // Returns true if a given use occurs in the predicated block. Phi nodes use
4154 // their operands in their corresponding predecessor blocks.
4155 auto isBlockOfUsePredicated = [&](Use &U) -> bool {
4156 auto *I = cast<Instruction>(U.getUser());
4157 BasicBlock *BB = I->getParent();
4158 if (auto *Phi = dyn_cast<PHINode>(I))
4159 BB = Phi->getIncomingBlock(
4160 PHINode::getIncomingValueNumForOperand(U.getOperandNo()));
4161 return BB == PredBB;
4162 };
4163
4164 // Iteratively sink the scalarized operands of the predicated instruction
4165 // into the block we created for it. When an instruction is sunk, its
4166 // operands are then added to the worklist. The algorithm ends after one pass
4167 // through the worklist doesn't sink a single instruction.
4168 bool Changed;
4169 do {
4170 // Add the instructions that need to be reanalyzed to the worklist, and
4171 // reset the changed indicator.
4172 Worklist.insert(InstsToReanalyze.begin(), InstsToReanalyze.end());
4173 InstsToReanalyze.clear();
4174 Changed = false;
4175
4176 while (!Worklist.empty()) {
4177 auto *I = dyn_cast<Instruction>(Worklist.pop_back_val());
4178
4179 // We can't sink an instruction if it is a phi node, is not in the loop,
4180 // may have side effects or may read from memory.
4181 // TODO: Could do more granular checking to allow sinking a load past non-store instructions.
4182 if (!I || isa<PHINode>(I) || !VectorLoop->contains(I) ||
4183 I->mayHaveSideEffects() || I->mayReadFromMemory())
4184 continue;
4185
4186 // If the instruction is already in PredBB, check if we can sink its
4187 // operands. In that case, VPlan's sinkScalarOperands() succeeded in
4188 // sinking the scalar instruction I, hence it appears in PredBB; but it
4189 // may have failed to sink I's operands (recursively), which we try
4190 // (again) here.
4191 if (I->getParent() == PredBB) {
4192 Worklist.insert(I->op_begin(), I->op_end());
4193 continue;
4194 }
4195
4196 // It's legal to sink the instruction if all its uses occur in the
4197 // predicated block.
Otherwise, there's nothing to do yet, and we may 4198 // need to reanalyze the instruction. 4199 if (!llvm::all_of(I->uses(), isBlockOfUsePredicated)) { 4200 InstsToReanalyze.push_back(I); 4201 continue; 4202 } 4203 4204 // Move the instruction to the beginning of the predicated block, and add 4205 // it's operands to the worklist. 4206 I->moveBefore(&*PredBB->getFirstInsertionPt()); 4207 Worklist.insert(I->op_begin(), I->op_end()); 4208 4209 // The sinking may have enabled other instructions to be sunk, so we will 4210 // need to iterate. 4211 Changed = true; 4212 } 4213 } while (Changed); 4214 } 4215 4216 void InnerLoopVectorizer::fixNonInductionPHIs(VPlan &Plan, 4217 VPTransformState &State) { 4218 auto Iter = vp_depth_first_deep(Plan.getEntry()); 4219 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) { 4220 for (VPRecipeBase &P : VPBB->phis()) { 4221 VPWidenPHIRecipe *VPPhi = dyn_cast<VPWidenPHIRecipe>(&P); 4222 if (!VPPhi) 4223 continue; 4224 PHINode *NewPhi = cast<PHINode>(State.get(VPPhi, 0)); 4225 // Make sure the builder has a valid insert point. 4226 Builder.SetInsertPoint(NewPhi); 4227 for (unsigned i = 0; i < VPPhi->getNumOperands(); ++i) { 4228 VPValue *Inc = VPPhi->getIncomingValue(i); 4229 VPBasicBlock *VPBB = VPPhi->getIncomingBlock(i); 4230 NewPhi->addIncoming(State.get(Inc, 0), State.CFG.VPBB2IRBB[VPBB]); 4231 } 4232 } 4233 } 4234 } 4235 4236 bool InnerLoopVectorizer::useOrderedReductions( 4237 const RecurrenceDescriptor &RdxDesc) { 4238 return Cost->useOrderedReductions(RdxDesc); 4239 } 4240 4241 void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) { 4242 // We should not collect Scalars more than once per VF. Right now, this 4243 // function is called from collectUniformsAndScalars(), which already does 4244 // this check. Collecting Scalars for VF=1 does not make any sense. 4245 assert(VF.isVector() && !Scalars.contains(VF) && 4246 "This function should not be visited twice for the same VF"); 4247 4248 // This avoids any chances of creating a REPLICATE recipe during planning 4249 // since that would result in generation of scalarized code during execution, 4250 // which is not supported for scalable vectors. 4251 if (VF.isScalable()) { 4252 Scalars[VF].insert(Uniforms[VF].begin(), Uniforms[VF].end()); 4253 return; 4254 } 4255 4256 SmallSetVector<Instruction *, 8> Worklist; 4257 4258 // These sets are used to seed the analysis with pointers used by memory 4259 // accesses that will remain scalar. 4260 SmallSetVector<Instruction *, 8> ScalarPtrs; 4261 SmallPtrSet<Instruction *, 8> PossibleNonScalarPtrs; 4262 auto *Latch = TheLoop->getLoopLatch(); 4263 4264 // A helper that returns true if the use of Ptr by MemAccess will be scalar. 4265 // The pointer operands of loads and stores will be scalar as long as the 4266 // memory access is not a gather or scatter operation. The value operand of a 4267 // store will remain scalar if the store is scalarized. 
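// For instance (hypothetical example): given "store i32 %v, ptr %p", the
// pointer %p is a scalar use unless the store becomes a scatter
// (CM_GatherScatter), whereas the value %v is a scalar use only if the store
// itself is scalarized (CM_Scalarize).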
4268 auto isScalarUse = [&](Instruction *MemAccess, Value *Ptr) {
4269 InstWidening WideningDecision = getWideningDecision(MemAccess, VF);
4270 assert(WideningDecision != CM_Unknown &&
4271 "Widening decision should be ready at this moment");
4272 if (auto *Store = dyn_cast<StoreInst>(MemAccess))
4273 if (Ptr == Store->getValueOperand())
4274 return WideningDecision == CM_Scalarize;
4275 assert(Ptr == getLoadStorePointerOperand(MemAccess) &&
4276 "Ptr is neither a value nor a pointer operand");
4277 return WideningDecision != CM_GatherScatter;
4278 };
4279
4280 // A helper that returns true if the given value is a bitcast or
4281 // getelementptr instruction contained in the loop.
4282 auto isLoopVaryingBitCastOrGEP = [&](Value *V) {
4283 return ((isa<BitCastInst>(V) && V->getType()->isPointerTy()) ||
4284 isa<GetElementPtrInst>(V)) &&
4285 !TheLoop->isLoopInvariant(V);
4286 };
4287
4288 // A helper that evaluates a memory access's use of a pointer. If the use will
4289 // be a scalar use and the pointer is only used by memory accesses, we place
4290 // the pointer in ScalarPtrs. Otherwise, the pointer is placed in
4291 // PossibleNonScalarPtrs.
4292 auto evaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) {
4293 // We only care about bitcast and getelementptr instructions contained in
4294 // the loop.
4295 if (!isLoopVaryingBitCastOrGEP(Ptr))
4296 return;
4297
4298 // If the pointer has already been identified as scalar (e.g., if it was
4299 // also identified as uniform), there's nothing to do.
4300 auto *I = cast<Instruction>(Ptr);
4301 if (Worklist.count(I))
4302 return;
4303
4304 // If the use of the pointer will be a scalar use, and all users of the
4305 // pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise,
4306 // place the pointer in PossibleNonScalarPtrs.
4307 if (isScalarUse(MemAccess, Ptr) && llvm::all_of(I->users(), [&](User *U) {
4308 return isa<LoadInst>(U) || isa<StoreInst>(U);
4309 }))
4310 ScalarPtrs.insert(I);
4311 else
4312 PossibleNonScalarPtrs.insert(I);
4313 };
4314
4315 // We seed the scalars analysis with two classes of instructions: (1)
4316 // instructions marked uniform-after-vectorization and (2) bitcast,
4317 // getelementptr and (pointer) phi instructions used by memory accesses
4318 // requiring a scalar use.
4319 //
4320 // (1) Add to the worklist all instructions that have been identified as
4321 // uniform-after-vectorization.
4322 Worklist.insert(Uniforms[VF].begin(), Uniforms[VF].end());
4323
4324 // (2) Add to the worklist all bitcast and getelementptr instructions used by
4325 // memory accesses requiring a scalar use. The pointer operands of loads and
4326 // stores will be scalar as long as the memory access is not a gather or
4327 // scatter operation. The value operand of a store will remain scalar if the
4328 // store is scalarized.
4329 for (auto *BB : TheLoop->blocks())
4330 for (auto &I : *BB) {
4331 if (auto *Load = dyn_cast<LoadInst>(&I)) {
4332 evaluatePtrUse(Load, Load->getPointerOperand());
4333 } else if (auto *Store = dyn_cast<StoreInst>(&I)) {
4334 evaluatePtrUse(Store, Store->getPointerOperand());
4335 evaluatePtrUse(Store, Store->getValueOperand());
4336 }
4337 }
4338 for (auto *I : ScalarPtrs)
4339 if (!PossibleNonScalarPtrs.count(I)) {
4340 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n");
4341 Worklist.insert(I);
4342 }
4343
4344 // Insert the forced scalars.
4345 // FIXME: Currently VPWidenPHIRecipe() often creates a dead vector 4346 // induction variable when the PHI user is scalarized. 4347 auto ForcedScalar = ForcedScalars.find(VF); 4348 if (ForcedScalar != ForcedScalars.end()) 4349 for (auto *I : ForcedScalar->second) { 4350 LLVM_DEBUG(dbgs() << "LV: Found (forced) scalar instruction: " << *I << "\n"); 4351 Worklist.insert(I); 4352 } 4353 4354 // Expand the worklist by looking through any bitcasts and getelementptr 4355 // instructions we've already identified as scalar. This is similar to the 4356 // expansion step in collectLoopUniforms(); however, here we're only 4357 // expanding to include additional bitcasts and getelementptr instructions. 4358 unsigned Idx = 0; 4359 while (Idx != Worklist.size()) { 4360 Instruction *Dst = Worklist[Idx++]; 4361 if (!isLoopVaryingBitCastOrGEP(Dst->getOperand(0))) 4362 continue; 4363 auto *Src = cast<Instruction>(Dst->getOperand(0)); 4364 if (llvm::all_of(Src->users(), [&](User *U) -> bool { 4365 auto *J = cast<Instruction>(U); 4366 return !TheLoop->contains(J) || Worklist.count(J) || 4367 ((isa<LoadInst>(J) || isa<StoreInst>(J)) && 4368 isScalarUse(J, Src)); 4369 })) { 4370 Worklist.insert(Src); 4371 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n"); 4372 } 4373 } 4374 4375 // An induction variable will remain scalar if all users of the induction 4376 // variable and induction variable update remain scalar. 4377 for (const auto &Induction : Legal->getInductionVars()) { 4378 auto *Ind = Induction.first; 4379 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); 4380 4381 // If tail-folding is applied, the primary induction variable will be used 4382 // to feed a vector compare. 4383 if (Ind == Legal->getPrimaryInduction() && foldTailByMasking()) 4384 continue; 4385 4386 // Returns true if \p Indvar is a pointer induction that is used directly by 4387 // load/store instruction \p I. 4388 auto IsDirectLoadStoreFromPtrIndvar = [&](Instruction *Indvar, 4389 Instruction *I) { 4390 return Induction.second.getKind() == 4391 InductionDescriptor::IK_PtrInduction && 4392 (isa<LoadInst>(I) || isa<StoreInst>(I)) && 4393 Indvar == getLoadStorePointerOperand(I) && isScalarUse(I, Indvar); 4394 }; 4395 4396 // Determine if all users of the induction variable are scalar after 4397 // vectorization. 4398 auto ScalarInd = llvm::all_of(Ind->users(), [&](User *U) -> bool { 4399 auto *I = cast<Instruction>(U); 4400 return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) || 4401 IsDirectLoadStoreFromPtrIndvar(Ind, I); 4402 }); 4403 if (!ScalarInd) 4404 continue; 4405 4406 // Determine if all users of the induction variable update instruction are 4407 // scalar after vectorization. 4408 auto ScalarIndUpdate = 4409 llvm::all_of(IndUpdate->users(), [&](User *U) -> bool { 4410 auto *I = cast<Instruction>(U); 4411 return I == Ind || !TheLoop->contains(I) || Worklist.count(I) || 4412 IsDirectLoadStoreFromPtrIndvar(IndUpdate, I); 4413 }); 4414 if (!ScalarIndUpdate) 4415 continue; 4416 4417 // The induction variable and its update instruction will remain scalar. 
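// A typical case (illustrative): a pointer induction that is only used as the
// address of consecutive, widened loads/stores and by its own increment stays
// scalar, so no wide pointer PHI needs to be materialized.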
4418 Worklist.insert(Ind);
4419 Worklist.insert(IndUpdate);
4420 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n");
4421 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate
4422 << "\n");
4423 }
4424
4425 Scalars[VF].insert(Worklist.begin(), Worklist.end());
4426 }
4427
4428 bool LoopVectorizationCostModel::isScalarWithPredication(
4429 Instruction *I, ElementCount VF) const {
4430 if (!isPredicatedInst(I))
4431 return false;
4432
4433 // Do we have a non-scalar lowering for this predicated
4434 // instruction? No - it is scalar with predication.
4435 switch(I->getOpcode()) {
4436 default:
4437 return true;
4438 case Instruction::Call:
4439 return !VFDatabase::hasMaskedVariant(*(cast<CallInst>(I)), VF);
4440 case Instruction::Load:
4441 case Instruction::Store: {
4442 auto *Ptr = getLoadStorePointerOperand(I);
4443 auto *Ty = getLoadStoreType(I);
4444 Type *VTy = Ty;
4445 if (VF.isVector())
4446 VTy = VectorType::get(Ty, VF);
4447 const Align Alignment = getLoadStoreAlignment(I);
4448 return isa<LoadInst>(I) ? !(isLegalMaskedLoad(Ty, Ptr, Alignment) ||
4449 TTI.isLegalMaskedGather(VTy, Alignment))
4450 : !(isLegalMaskedStore(Ty, Ptr, Alignment) ||
4451 TTI.isLegalMaskedScatter(VTy, Alignment));
4452 }
4453 case Instruction::UDiv:
4454 case Instruction::SDiv:
4455 case Instruction::SRem:
4456 case Instruction::URem: {
4457 // We have the option to use the safe-divisor idiom to avoid predication.
4458 // The cost-based decision here will always select safe-divisor for
4459 // scalable vectors as scalarization isn't legal.
4460 const auto [ScalarCost, SafeDivisorCost] = getDivRemSpeculationCost(I, VF);
4461 return isDivRemScalarWithPredication(ScalarCost, SafeDivisorCost);
4462 }
4463 }
4464 }
4465
4466 bool LoopVectorizationCostModel::isPredicatedInst(Instruction *I) const {
4467 if (!blockNeedsPredicationForAnyReason(I->getParent()))
4468 return false;
4469
4470 // Can we prove this instruction is safe to unconditionally execute?
4471 // If not, we must use some form of predication.
4472 switch(I->getOpcode()) {
4473 default:
4474 return false;
4475 case Instruction::Load:
4476 case Instruction::Store: {
4477 if (!Legal->isMaskRequired(I))
4478 return false;
4479 // When we know the load's address is loop invariant and the instruction
4480 // in the original scalar loop was unconditionally executed then we
4481 // don't need to mark it as a predicated instruction. Tail folding may
4482 // introduce additional predication, but we're guaranteed to always have
4483 // at least one active lane. We call Legal->blockNeedsPredication here
4484 // because it doesn't query tail-folding. For stores, we need to prove
4485 // both speculation safety (which follows from the same argument as
4486 // loads) and that the value being stored is correct. The easiest form
4487 // of the latter is to require that all values stored are the same.
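// For example (hypothetical): a load from a loop-invariant address that
// executed unconditionally in the original scalar loop can stay unmasked even
// under tail folding, since at least one lane is always active.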
4488 if (Legal->isInvariant(getLoadStorePointerOperand(I)) && 4489 (isa<LoadInst>(I) || 4490 (isa<StoreInst>(I) && 4491 TheLoop->isLoopInvariant(cast<StoreInst>(I)->getValueOperand()))) && 4492 !Legal->blockNeedsPredication(I->getParent())) 4493 return false; 4494 return true; 4495 } 4496 case Instruction::UDiv: 4497 case Instruction::SDiv: 4498 case Instruction::SRem: 4499 case Instruction::URem: 4500 // TODO: We can use the loop-preheader as context point here and get 4501 // context sensitive reasoning 4502 return !isSafeToSpeculativelyExecute(I); 4503 case Instruction::Call: 4504 return Legal->isMaskRequired(I); 4505 } 4506 } 4507 4508 std::pair<InstructionCost, InstructionCost> 4509 LoopVectorizationCostModel::getDivRemSpeculationCost(Instruction *I, 4510 ElementCount VF) const { 4511 assert(I->getOpcode() == Instruction::UDiv || 4512 I->getOpcode() == Instruction::SDiv || 4513 I->getOpcode() == Instruction::SRem || 4514 I->getOpcode() == Instruction::URem); 4515 assert(!isSafeToSpeculativelyExecute(I)); 4516 4517 const TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 4518 4519 // Scalarization isn't legal for scalable vector types 4520 InstructionCost ScalarizationCost = InstructionCost::getInvalid(); 4521 if (!VF.isScalable()) { 4522 // Get the scalarization cost and scale this amount by the probability of 4523 // executing the predicated block. If the instruction is not predicated, 4524 // we fall through to the next case. 4525 ScalarizationCost = 0; 4526 4527 // These instructions have a non-void type, so account for the phi nodes 4528 // that we will create. This cost is likely to be zero. The phi node 4529 // cost, if any, should be scaled by the block probability because it 4530 // models a copy at the end of each predicated block. 4531 ScalarizationCost += VF.getKnownMinValue() * 4532 TTI.getCFInstrCost(Instruction::PHI, CostKind); 4533 4534 // The cost of the non-predicated instruction. 4535 ScalarizationCost += VF.getKnownMinValue() * 4536 TTI.getArithmeticInstrCost(I->getOpcode(), I->getType(), CostKind); 4537 4538 // The cost of insertelement and extractelement instructions needed for 4539 // scalarization. 4540 ScalarizationCost += getScalarizationOverhead(I, VF, CostKind); 4541 4542 // Scale the cost by the probability of executing the predicated blocks. 4543 // This assumes the predicated block for each vector lane is equally 4544 // likely. 4545 ScalarizationCost = ScalarizationCost / getReciprocalPredBlockProb(); 4546 } 4547 InstructionCost SafeDivisorCost = 0; 4548 4549 auto *VecTy = ToVectorTy(I->getType(), VF); 4550 4551 // The cost of the select guard to ensure all lanes are well defined 4552 // after we speculate above any internal control flow. 4553 SafeDivisorCost += TTI.getCmpSelInstrCost( 4554 Instruction::Select, VecTy, 4555 ToVectorTy(Type::getInt1Ty(I->getContext()), VF), 4556 CmpInst::BAD_ICMP_PREDICATE, CostKind); 4557 4558 // Certain instructions can be cheaper to vectorize if they have a constant 4559 // second vector operand. One example of this are shifts on x86. 
4560 Value *Op2 = I->getOperand(1); 4561 auto Op2Info = TTI.getOperandInfo(Op2); 4562 if (Op2Info.Kind == TargetTransformInfo::OK_AnyValue && 4563 Legal->isInvariant(Op2)) 4564 Op2Info.Kind = TargetTransformInfo::OK_UniformValue; 4565 4566 SmallVector<const Value *, 4> Operands(I->operand_values()); 4567 SafeDivisorCost += TTI.getArithmeticInstrCost( 4568 I->getOpcode(), VecTy, CostKind, 4569 {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None}, 4570 Op2Info, Operands, I); 4571 return {ScalarizationCost, SafeDivisorCost}; 4572 } 4573 4574 bool LoopVectorizationCostModel::interleavedAccessCanBeWidened( 4575 Instruction *I, ElementCount VF) { 4576 assert(isAccessInterleaved(I) && "Expecting interleaved access."); 4577 assert(getWideningDecision(I, VF) == CM_Unknown && 4578 "Decision should not be set yet."); 4579 auto *Group = getInterleavedAccessGroup(I); 4580 assert(Group && "Must have a group."); 4581 4582 // If the instruction's allocated size doesn't equal it's type size, it 4583 // requires padding and will be scalarized. 4584 auto &DL = I->getModule()->getDataLayout(); 4585 auto *ScalarTy = getLoadStoreType(I); 4586 if (hasIrregularType(ScalarTy, DL)) 4587 return false; 4588 4589 // If the group involves a non-integral pointer, we may not be able to 4590 // losslessly cast all values to a common type. 4591 unsigned InterleaveFactor = Group->getFactor(); 4592 bool ScalarNI = DL.isNonIntegralPointerType(ScalarTy); 4593 for (unsigned i = 0; i < InterleaveFactor; i++) { 4594 Instruction *Member = Group->getMember(i); 4595 if (!Member) 4596 continue; 4597 auto *MemberTy = getLoadStoreType(Member); 4598 bool MemberNI = DL.isNonIntegralPointerType(MemberTy); 4599 // Don't coerce non-integral pointers to integers or vice versa. 4600 if (MemberNI != ScalarNI) { 4601 // TODO: Consider adding special nullptr value case here 4602 return false; 4603 } else if (MemberNI && ScalarNI && 4604 ScalarTy->getPointerAddressSpace() != 4605 MemberTy->getPointerAddressSpace()) { 4606 return false; 4607 } 4608 } 4609 4610 // Check if masking is required. 4611 // A Group may need masking for one of two reasons: it resides in a block that 4612 // needs predication, or it was decided to use masking to deal with gaps 4613 // (either a gap at the end of a load-access that may result in a speculative 4614 // load, or any gaps in a store-access). 4615 bool PredicatedAccessRequiresMasking = 4616 blockNeedsPredicationForAnyReason(I->getParent()) && 4617 Legal->isMaskRequired(I); 4618 bool LoadAccessWithGapsRequiresEpilogMasking = 4619 isa<LoadInst>(I) && Group->requiresScalarEpilogue() && 4620 !isScalarEpilogueAllowed(); 4621 bool StoreAccessWithGapsRequiresMasking = 4622 isa<StoreInst>(I) && (Group->getNumMembers() < Group->getFactor()); 4623 if (!PredicatedAccessRequiresMasking && 4624 !LoadAccessWithGapsRequiresEpilogMasking && 4625 !StoreAccessWithGapsRequiresMasking) 4626 return true; 4627 4628 // If masked interleaving is required, we expect that the user/target had 4629 // enabled it, because otherwise it either wouldn't have been created or 4630 // it should have been invalidated by the CostModel. 4631 assert(useMaskedInterleavedAccesses(TTI) && 4632 "Masked interleave-groups for predicated accesses are not enabled."); 4633 4634 if (Group->isReverse()) 4635 return false; 4636 4637 auto *Ty = getLoadStoreType(I); 4638 const Align Alignment = getLoadStoreAlignment(I); 4639 return isa<LoadInst>(I) ? 
TTI.isLegalMaskedLoad(Ty, Alignment) 4640 : TTI.isLegalMaskedStore(Ty, Alignment); 4641 } 4642 4643 bool LoopVectorizationCostModel::memoryInstructionCanBeWidened( 4644 Instruction *I, ElementCount VF) { 4645 // Get and ensure we have a valid memory instruction. 4646 assert((isa<LoadInst, StoreInst>(I)) && "Invalid memory instruction"); 4647 4648 auto *Ptr = getLoadStorePointerOperand(I); 4649 auto *ScalarTy = getLoadStoreType(I); 4650 4651 // In order to be widened, the pointer should be consecutive, first of all. 4652 if (!Legal->isConsecutivePtr(ScalarTy, Ptr)) 4653 return false; 4654 4655 // If the instruction is a store located in a predicated block, it will be 4656 // scalarized. 4657 if (isScalarWithPredication(I, VF)) 4658 return false; 4659 4660 // If the instruction's allocated size doesn't equal it's type size, it 4661 // requires padding and will be scalarized. 4662 auto &DL = I->getModule()->getDataLayout(); 4663 if (hasIrregularType(ScalarTy, DL)) 4664 return false; 4665 4666 return true; 4667 } 4668 4669 void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) { 4670 // We should not collect Uniforms more than once per VF. Right now, 4671 // this function is called from collectUniformsAndScalars(), which 4672 // already does this check. Collecting Uniforms for VF=1 does not make any 4673 // sense. 4674 4675 assert(VF.isVector() && !Uniforms.contains(VF) && 4676 "This function should not be visited twice for the same VF"); 4677 4678 // Visit the list of Uniforms. If we'll not find any uniform value, we'll 4679 // not analyze again. Uniforms.count(VF) will return 1. 4680 Uniforms[VF].clear(); 4681 4682 // We now know that the loop is vectorizable! 4683 // Collect instructions inside the loop that will remain uniform after 4684 // vectorization. 4685 4686 // Global values, params and instructions outside of current loop are out of 4687 // scope. 4688 auto isOutOfScope = [&](Value *V) -> bool { 4689 Instruction *I = dyn_cast<Instruction>(V); 4690 return (!I || !TheLoop->contains(I)); 4691 }; 4692 4693 // Worklist containing uniform instructions demanding lane 0. 4694 SetVector<Instruction *> Worklist; 4695 BasicBlock *Latch = TheLoop->getLoopLatch(); 4696 4697 // Add uniform instructions demanding lane 0 to the worklist. Instructions 4698 // that are scalar with predication must not be considered uniform after 4699 // vectorization, because that would create an erroneous replicating region 4700 // where only a single instance out of VF should be formed. 4701 // TODO: optimize such seldom cases if found important, see PR40816. 4702 auto addToWorklistIfAllowed = [&](Instruction *I) -> void { 4703 if (isOutOfScope(I)) { 4704 LLVM_DEBUG(dbgs() << "LV: Found not uniform due to scope: " 4705 << *I << "\n"); 4706 return; 4707 } 4708 if (isScalarWithPredication(I, VF)) { 4709 LLVM_DEBUG(dbgs() << "LV: Found not uniform being ScalarWithPredication: " 4710 << *I << "\n"); 4711 return; 4712 } 4713 LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *I << "\n"); 4714 Worklist.insert(I); 4715 }; 4716 4717 // Start with the conditional branch. If the branch condition is an 4718 // instruction contained in the loop that is only used by the branch, it is 4719 // uniform. 
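// E.g. (illustrative IR) a latch compare such as
//   %exitcond = icmp eq i64 %iv.next, %n
//   br i1 %exitcond, label %exit, label %loop
// only feeds the branch, so after vectorization only lane 0 of it is demanded
// and it can be treated as uniform.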
4720 auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0)); 4721 if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse()) 4722 addToWorklistIfAllowed(Cmp); 4723 4724 auto PrevVF = VF.divideCoefficientBy(2); 4725 // Return true if all lanes perform the same memory operation, and we can 4726 // thus chose to execute only one. 4727 auto isUniformMemOpUse = [&](Instruction *I) { 4728 // If the value was already known to not be uniform for the previous 4729 // (smaller VF), it cannot be uniform for the larger VF. 4730 if (PrevVF.isVector()) { 4731 auto Iter = Uniforms.find(PrevVF); 4732 if (Iter != Uniforms.end() && !Iter->second.contains(I)) 4733 return false; 4734 } 4735 if (!Legal->isUniformMemOp(*I, VF)) 4736 return false; 4737 if (isa<LoadInst>(I)) 4738 // Loading the same address always produces the same result - at least 4739 // assuming aliasing and ordering which have already been checked. 4740 return true; 4741 // Storing the same value on every iteration. 4742 return TheLoop->isLoopInvariant(cast<StoreInst>(I)->getValueOperand()); 4743 }; 4744 4745 auto isUniformDecision = [&](Instruction *I, ElementCount VF) { 4746 InstWidening WideningDecision = getWideningDecision(I, VF); 4747 assert(WideningDecision != CM_Unknown && 4748 "Widening decision should be ready at this moment"); 4749 4750 if (isUniformMemOpUse(I)) 4751 return true; 4752 4753 return (WideningDecision == CM_Widen || 4754 WideningDecision == CM_Widen_Reverse || 4755 WideningDecision == CM_Interleave); 4756 }; 4757 4758 // Returns true if Ptr is the pointer operand of a memory access instruction 4759 // I, I is known to not require scalarization, and the pointer is not also 4760 // stored. 4761 auto isVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool { 4762 if (isa<StoreInst>(I) && I->getOperand(0) == Ptr) 4763 return false; 4764 return getLoadStorePointerOperand(I) == Ptr && 4765 (isUniformDecision(I, VF) || Legal->isInvariant(Ptr)); 4766 }; 4767 4768 // Holds a list of values which are known to have at least one uniform use. 4769 // Note that there may be other uses which aren't uniform. A "uniform use" 4770 // here is something which only demands lane 0 of the unrolled iterations; 4771 // it does not imply that all lanes produce the same value (e.g. this is not 4772 // the usual meaning of uniform) 4773 SetVector<Value *> HasUniformUse; 4774 4775 // Scan the loop for instructions which are either a) known to have only 4776 // lane 0 demanded or b) are uses which demand only lane 0 of their operand. 4777 for (auto *BB : TheLoop->blocks()) 4778 for (auto &I : *BB) { 4779 if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(&I)) { 4780 switch (II->getIntrinsicID()) { 4781 case Intrinsic::sideeffect: 4782 case Intrinsic::experimental_noalias_scope_decl: 4783 case Intrinsic::assume: 4784 case Intrinsic::lifetime_start: 4785 case Intrinsic::lifetime_end: 4786 if (TheLoop->hasLoopInvariantOperands(&I)) 4787 addToWorklistIfAllowed(&I); 4788 break; 4789 default: 4790 break; 4791 } 4792 } 4793 4794 // ExtractValue instructions must be uniform, because the operands are 4795 // known to be loop-invariant. 4796 if (auto *EVI = dyn_cast<ExtractValueInst>(&I)) { 4797 assert(isOutOfScope(EVI->getAggregateOperand()) && 4798 "Expected aggregate value to be loop invariant"); 4799 addToWorklistIfAllowed(EVI); 4800 continue; 4801 } 4802 4803 // If there's no pointer operand, there's nothing to do. 
4804 auto *Ptr = getLoadStorePointerOperand(&I); 4805 if (!Ptr) 4806 continue; 4807 4808 if (isUniformMemOpUse(&I)) 4809 addToWorklistIfAllowed(&I); 4810 4811 if (isVectorizedMemAccessUse(&I, Ptr)) 4812 HasUniformUse.insert(Ptr); 4813 } 4814 4815 // Add to the worklist any operands which have *only* uniform (e.g. lane 0 4816 // demanding) users. Since loops are assumed to be in LCSSA form, this 4817 // disallows uses outside the loop as well. 4818 for (auto *V : HasUniformUse) { 4819 if (isOutOfScope(V)) 4820 continue; 4821 auto *I = cast<Instruction>(V); 4822 auto UsersAreMemAccesses = 4823 llvm::all_of(I->users(), [&](User *U) -> bool { 4824 return isVectorizedMemAccessUse(cast<Instruction>(U), V); 4825 }); 4826 if (UsersAreMemAccesses) 4827 addToWorklistIfAllowed(I); 4828 } 4829 4830 // Expand Worklist in topological order: whenever a new instruction 4831 // is added , its users should be already inside Worklist. It ensures 4832 // a uniform instruction will only be used by uniform instructions. 4833 unsigned idx = 0; 4834 while (idx != Worklist.size()) { 4835 Instruction *I = Worklist[idx++]; 4836 4837 for (auto *OV : I->operand_values()) { 4838 // isOutOfScope operands cannot be uniform instructions. 4839 if (isOutOfScope(OV)) 4840 continue; 4841 // First order recurrence Phi's should typically be considered 4842 // non-uniform. 4843 auto *OP = dyn_cast<PHINode>(OV); 4844 if (OP && Legal->isFixedOrderRecurrence(OP)) 4845 continue; 4846 // If all the users of the operand are uniform, then add the 4847 // operand into the uniform worklist. 4848 auto *OI = cast<Instruction>(OV); 4849 if (llvm::all_of(OI->users(), [&](User *U) -> bool { 4850 auto *J = cast<Instruction>(U); 4851 return Worklist.count(J) || isVectorizedMemAccessUse(J, OI); 4852 })) 4853 addToWorklistIfAllowed(OI); 4854 } 4855 } 4856 4857 // For an instruction to be added into Worklist above, all its users inside 4858 // the loop should also be in Worklist. However, this condition cannot be 4859 // true for phi nodes that form a cyclic dependence. We must process phi 4860 // nodes separately. An induction variable will remain uniform if all users 4861 // of the induction variable and induction variable update remain uniform. 4862 // The code below handles both pointer and non-pointer induction variables. 4863 for (const auto &Induction : Legal->getInductionVars()) { 4864 auto *Ind = Induction.first; 4865 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); 4866 4867 // Determine if all users of the induction variable are uniform after 4868 // vectorization. 4869 auto UniformInd = llvm::all_of(Ind->users(), [&](User *U) -> bool { 4870 auto *I = cast<Instruction>(U); 4871 return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) || 4872 isVectorizedMemAccessUse(I, Ind); 4873 }); 4874 if (!UniformInd) 4875 continue; 4876 4877 // Determine if all users of the induction variable update instruction are 4878 // uniform after vectorization. 4879 auto UniformIndUpdate = 4880 llvm::all_of(IndUpdate->users(), [&](User *U) -> bool { 4881 auto *I = cast<Instruction>(U); 4882 return I == Ind || !TheLoop->contains(I) || Worklist.count(I) || 4883 isVectorizedMemAccessUse(I, IndUpdate); 4884 }); 4885 if (!UniformIndUpdate) 4886 continue; 4887 4888 // The induction variable and its update instruction will remain uniform. 
4889 addToWorklistIfAllowed(Ind); 4890 addToWorklistIfAllowed(IndUpdate); 4891 } 4892 4893 Uniforms[VF].insert(Worklist.begin(), Worklist.end()); 4894 } 4895 4896 bool LoopVectorizationCostModel::runtimeChecksRequired() { 4897 LLVM_DEBUG(dbgs() << "LV: Performing code size checks.\n"); 4898 4899 if (Legal->getRuntimePointerChecking()->Need) { 4900 reportVectorizationFailure("Runtime ptr check is required with -Os/-Oz", 4901 "runtime pointer checks needed. Enable vectorization of this " 4902 "loop with '#pragma clang loop vectorize(enable)' when " 4903 "compiling with -Os/-Oz", 4904 "CantVersionLoopWithOptForSize", ORE, TheLoop); 4905 return true; 4906 } 4907 4908 if (!PSE.getPredicate().isAlwaysTrue()) { 4909 reportVectorizationFailure("Runtime SCEV check is required with -Os/-Oz", 4910 "runtime SCEV checks needed. Enable vectorization of this " 4911 "loop with '#pragma clang loop vectorize(enable)' when " 4912 "compiling with -Os/-Oz", 4913 "CantVersionLoopWithOptForSize", ORE, TheLoop); 4914 return true; 4915 } 4916 4917 // FIXME: Avoid specializing for stride==1 instead of bailing out. 4918 if (!Legal->getLAI()->getSymbolicStrides().empty()) { 4919 reportVectorizationFailure("Runtime stride check for small trip count", 4920 "runtime stride == 1 checks needed. Enable vectorization of " 4921 "this loop without such check by compiling with -Os/-Oz", 4922 "CantVersionLoopWithOptForSize", ORE, TheLoop); 4923 return true; 4924 } 4925 4926 return false; 4927 } 4928 4929 ElementCount 4930 LoopVectorizationCostModel::getMaxLegalScalableVF(unsigned MaxSafeElements) { 4931 if (!TTI.supportsScalableVectors() && !ForceTargetSupportsScalableVectors) 4932 return ElementCount::getScalable(0); 4933 4934 if (Hints->isScalableVectorizationDisabled()) { 4935 reportVectorizationInfo("Scalable vectorization is explicitly disabled", 4936 "ScalableVectorizationDisabled", ORE, TheLoop); 4937 return ElementCount::getScalable(0); 4938 } 4939 4940 LLVM_DEBUG(dbgs() << "LV: Scalable vectorization is available\n"); 4941 4942 auto MaxScalableVF = ElementCount::getScalable( 4943 std::numeric_limits<ElementCount::ScalarTy>::max()); 4944 4945 // Test that the loop-vectorizer can legalize all operations for this MaxVF. 4946 // FIXME: While for scalable vectors this is currently sufficient, this should 4947 // be replaced by a more detailed mechanism that filters out specific VFs, 4948 // instead of invalidating vectorization for a whole set of VFs based on the 4949 // MaxVF. 4950 4951 // Disable scalable vectorization if the loop contains unsupported reductions. 4952 if (!canVectorizeReductions(MaxScalableVF)) { 4953 reportVectorizationInfo( 4954 "Scalable vectorization not supported for the reduction " 4955 "operations found in this loop.", 4956 "ScalableVFUnfeasible", ORE, TheLoop); 4957 return ElementCount::getScalable(0); 4958 } 4959 4960 // Disable scalable vectorization if the loop contains any instructions 4961 // with element types not supported for scalable vectors. 4962 if (any_of(ElementTypesInLoop, [&](Type *Ty) { 4963 return !Ty->isVoidTy() && 4964 !this->TTI.isElementTypeLegalForScalableVector(Ty); 4965 })) { 4966 reportVectorizationInfo("Scalable vectorization is not supported " 4967 "for all element types found in this loop.", 4968 "ScalableVFUnfeasible", ORE, TheLoop); 4969 return ElementCount::getScalable(0); 4970 } 4971 4972 if (Legal->isSafeForAnyVectorWidth()) 4973 return MaxScalableVF; 4974 4975 // Limit MaxScalableVF by the maximum safe dependence distance. 
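// Worked example (assumed numbers): with MaxSafeElements = 32 and a known
// maximum vscale of 16, the clamp below yields MaxScalableVF = vscale x 2; if
// no maximum vscale is known, scalable vectorization is given up on.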
4976 if (std::optional<unsigned> MaxVScale = getMaxVScale(*TheFunction, TTI)) 4977 MaxScalableVF = ElementCount::getScalable(MaxSafeElements / *MaxVScale); 4978 else 4979 MaxScalableVF = ElementCount::getScalable(0); 4980 4981 if (!MaxScalableVF) 4982 reportVectorizationInfo( 4983 "Max legal vector width too small, scalable vectorization " 4984 "unfeasible.", 4985 "ScalableVFUnfeasible", ORE, TheLoop); 4986 4987 return MaxScalableVF; 4988 } 4989 4990 FixedScalableVFPair LoopVectorizationCostModel::computeFeasibleMaxVF( 4991 unsigned ConstTripCount, ElementCount UserVF, bool FoldTailByMasking) { 4992 MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI); 4993 unsigned SmallestType, WidestType; 4994 std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes(); 4995 4996 // Get the maximum safe dependence distance in bits computed by LAA. 4997 // It is computed by MaxVF * sizeOf(type) * 8, where type is taken from 4998 // the memory accesses that is most restrictive (involved in the smallest 4999 // dependence distance). 5000 unsigned MaxSafeElements = 5001 llvm::bit_floor(Legal->getMaxSafeVectorWidthInBits() / WidestType); 5002 5003 auto MaxSafeFixedVF = ElementCount::getFixed(MaxSafeElements); 5004 auto MaxSafeScalableVF = getMaxLegalScalableVF(MaxSafeElements); 5005 5006 LLVM_DEBUG(dbgs() << "LV: The max safe fixed VF is: " << MaxSafeFixedVF 5007 << ".\n"); 5008 LLVM_DEBUG(dbgs() << "LV: The max safe scalable VF is: " << MaxSafeScalableVF 5009 << ".\n"); 5010 5011 // First analyze the UserVF, fall back if the UserVF should be ignored. 5012 if (UserVF) { 5013 auto MaxSafeUserVF = 5014 UserVF.isScalable() ? MaxSafeScalableVF : MaxSafeFixedVF; 5015 5016 if (ElementCount::isKnownLE(UserVF, MaxSafeUserVF)) { 5017 // If `VF=vscale x N` is safe, then so is `VF=N` 5018 if (UserVF.isScalable()) 5019 return FixedScalableVFPair( 5020 ElementCount::getFixed(UserVF.getKnownMinValue()), UserVF); 5021 else 5022 return UserVF; 5023 } 5024 5025 assert(ElementCount::isKnownGT(UserVF, MaxSafeUserVF)); 5026 5027 // Only clamp if the UserVF is not scalable. If the UserVF is scalable, it 5028 // is better to ignore the hint and let the compiler choose a suitable VF. 5029 if (!UserVF.isScalable()) { 5030 LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF 5031 << " is unsafe, clamping to max safe VF=" 5032 << MaxSafeFixedVF << ".\n"); 5033 ORE->emit([&]() { 5034 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor", 5035 TheLoop->getStartLoc(), 5036 TheLoop->getHeader()) 5037 << "User-specified vectorization factor " 5038 << ore::NV("UserVectorizationFactor", UserVF) 5039 << " is unsafe, clamping to maximum safe vectorization factor " 5040 << ore::NV("VectorizationFactor", MaxSafeFixedVF); 5041 }); 5042 return MaxSafeFixedVF; 5043 } 5044 5045 if (!TTI.supportsScalableVectors() && !ForceTargetSupportsScalableVectors) { 5046 LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF 5047 << " is ignored because scalable vectors are not " 5048 "available.\n"); 5049 ORE->emit([&]() { 5050 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor", 5051 TheLoop->getStartLoc(), 5052 TheLoop->getHeader()) 5053 << "User-specified vectorization factor " 5054 << ore::NV("UserVectorizationFactor", UserVF) 5055 << " is ignored because the target does not support scalable " 5056 "vectors. The compiler will pick a more suitable value."; 5057 }); 5058 } else { 5059 LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF 5060 << " is unsafe. 
Ignoring scalable UserVF.\n"); 5061 ORE->emit([&]() { 5062 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor", 5063 TheLoop->getStartLoc(), 5064 TheLoop->getHeader()) 5065 << "User-specified vectorization factor " 5066 << ore::NV("UserVectorizationFactor", UserVF) 5067 << " is unsafe. Ignoring the hint to let the compiler pick a " 5068 "more suitable value."; 5069 }); 5070 } 5071 } 5072 5073 LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType 5074 << " / " << WidestType << " bits.\n"); 5075 5076 FixedScalableVFPair Result(ElementCount::getFixed(1), 5077 ElementCount::getScalable(0)); 5078 if (auto MaxVF = 5079 getMaximizedVFForTarget(ConstTripCount, SmallestType, WidestType, 5080 MaxSafeFixedVF, FoldTailByMasking)) 5081 Result.FixedVF = MaxVF; 5082 5083 if (auto MaxVF = 5084 getMaximizedVFForTarget(ConstTripCount, SmallestType, WidestType, 5085 MaxSafeScalableVF, FoldTailByMasking)) 5086 if (MaxVF.isScalable()) { 5087 Result.ScalableVF = MaxVF; 5088 LLVM_DEBUG(dbgs() << "LV: Found feasible scalable VF = " << MaxVF 5089 << "\n"); 5090 } 5091 5092 return Result; 5093 } 5094 5095 FixedScalableVFPair 5096 LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) { 5097 if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) { 5098 // TODO: It may by useful to do since it's still likely to be dynamically 5099 // uniform if the target can skip. 5100 reportVectorizationFailure( 5101 "Not inserting runtime ptr check for divergent target", 5102 "runtime pointer checks needed. Not enabled for divergent target", 5103 "CantVersionLoopWithDivergentTarget", ORE, TheLoop); 5104 return FixedScalableVFPair::getNone(); 5105 } 5106 5107 unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop); 5108 LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n'); 5109 if (TC == 1) { 5110 reportVectorizationFailure("Single iteration (non) loop", 5111 "loop trip count is one, irrelevant for vectorization", 5112 "SingleIterationLoop", ORE, TheLoop); 5113 return FixedScalableVFPair::getNone(); 5114 } 5115 5116 switch (ScalarEpilogueStatus) { 5117 case CM_ScalarEpilogueAllowed: 5118 return computeFeasibleMaxVF(TC, UserVF, false); 5119 case CM_ScalarEpilogueNotAllowedUsePredicate: 5120 [[fallthrough]]; 5121 case CM_ScalarEpilogueNotNeededUsePredicate: 5122 LLVM_DEBUG( 5123 dbgs() << "LV: vector predicate hint/switch found.\n" 5124 << "LV: Not allowing scalar epilogue, creating predicated " 5125 << "vector loop.\n"); 5126 break; 5127 case CM_ScalarEpilogueNotAllowedLowTripLoop: 5128 // fallthrough as a special case of OptForSize 5129 case CM_ScalarEpilogueNotAllowedOptSize: 5130 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedOptSize) 5131 LLVM_DEBUG( 5132 dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n"); 5133 else 5134 LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to low trip " 5135 << "count.\n"); 5136 5137 // Bail if runtime checks are required, which are not good when optimising 5138 // for size. 5139 if (runtimeChecksRequired()) 5140 return FixedScalableVFPair::getNone(); 5141 5142 break; 5143 } 5144 5145 // The only loops we can vectorize without a scalar epilogue, are loops with 5146 // a bottom-test and a single exiting block. We'd have to handle the fact 5147 // that not every instruction executes on the last iteration. This will 5148 // require a lane mask which varies through the vector loop body. 
(TODO)
5149 if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) {
5150 // If there was a tail-folding hint/switch, but we can't fold the tail by
5151 // masking, fall back to a vectorization with a scalar epilogue.
5152 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) {
5153 LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a "
5154 "scalar epilogue instead.\n");
5155 ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
5156 return computeFeasibleMaxVF(TC, UserVF, false);
5157 }
5158 return FixedScalableVFPair::getNone();
5159 }
5160
5161 // Now try the tail folding
5162
5163 // Invalidate interleave groups that require an epilogue if we can't mask
5164 // the interleave-group.
5165 if (!useMaskedInterleavedAccesses(TTI)) {
5166 assert(WideningDecisions.empty() && Uniforms.empty() && Scalars.empty() &&
5167 "No decisions should have been taken at this point");
5168 // Note: There is no need to invalidate any cost modeling decisions here, as
5169 // none were taken so far.
5170 InterleaveInfo.invalidateGroupsRequiringScalarEpilogue();
5171 }
5172
5173 FixedScalableVFPair MaxFactors = computeFeasibleMaxVF(TC, UserVF, true);
5174
5175 // Avoid tail folding if the trip count is known to be a multiple of any VF
5176 // we choose.
5177 std::optional<unsigned> MaxPowerOf2RuntimeVF =
5178 MaxFactors.FixedVF.getFixedValue();
5179 if (MaxFactors.ScalableVF) {
5180 std::optional<unsigned> MaxVScale = getMaxVScale(*TheFunction, TTI);
5181 if (MaxVScale && TTI.isVScaleKnownToBeAPowerOfTwo()) {
5182 MaxPowerOf2RuntimeVF = std::max<unsigned>(
5183 *MaxPowerOf2RuntimeVF,
5184 *MaxVScale * MaxFactors.ScalableVF.getKnownMinValue());
5185 } else
5186 MaxPowerOf2RuntimeVF = std::nullopt; // Stick with tail-folding for now.
5187 }
5188
5189 if (MaxPowerOf2RuntimeVF && *MaxPowerOf2RuntimeVF > 0) {
5190 assert((UserVF.isNonZero() || isPowerOf2_32(*MaxPowerOf2RuntimeVF)) &&
5191 "MaxFixedVF must be a power of 2");
5192 unsigned MaxVFtimesIC =
5193 UserIC ? *MaxPowerOf2RuntimeVF * UserIC : *MaxPowerOf2RuntimeVF;
5194 ScalarEvolution *SE = PSE.getSE();
5195 const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount();
5196 const SCEV *ExitCount = SE->getAddExpr(
5197 BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType()));
5198 const SCEV *Rem = SE->getURemExpr(
5199 SE->applyLoopGuards(ExitCount, TheLoop),
5200 SE->getConstant(BackedgeTakenCount->getType(), MaxVFtimesIC));
5201 if (Rem->isZero()) {
5202 // Accept MaxFixedVF if we do not have a tail.
5203 LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n");
5204 return MaxFactors;
5205 }
5206 }
5207
5208 // If we don't know the precise trip count, or if the trip count that we
5209 // found modulo the vectorization factor is not zero, try to fold the tail
5210 // by masking.
5211 // FIXME: look for a smaller MaxVF that does divide TC rather than masking.
5212 if (Legal->prepareToFoldTailByMasking()) {
5213 CanFoldTailByMasking = true;
5214 return MaxFactors;
5215 }
5216
5217 // If there was a tail-folding hint/switch, but we can't fold the tail by
5218 // masking, fall back to a vectorization with a scalar epilogue.
5219 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) { 5220 LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a " 5221 "scalar epilogue instead.\n"); 5222 ScalarEpilogueStatus = CM_ScalarEpilogueAllowed; 5223 return MaxFactors; 5224 } 5225 5226 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedUsePredicate) { 5227 LLVM_DEBUG(dbgs() << "LV: Can't fold tail by masking: don't vectorize\n"); 5228 return FixedScalableVFPair::getNone(); 5229 } 5230 5231 if (TC == 0) { 5232 reportVectorizationFailure( 5233 "Unable to calculate the loop count due to complex control flow", 5234 "unable to calculate the loop count due to complex control flow", 5235 "UnknownLoopCountComplexCFG", ORE, TheLoop); 5236 return FixedScalableVFPair::getNone(); 5237 } 5238 5239 reportVectorizationFailure( 5240 "Cannot optimize for size and vectorize at the same time.", 5241 "cannot optimize for size and vectorize at the same time. " 5242 "Enable vectorization of this loop with '#pragma clang loop " 5243 "vectorize(enable)' when compiling with -Os/-Oz", 5244 "NoTailLoopWithOptForSize", ORE, TheLoop); 5245 return FixedScalableVFPair::getNone(); 5246 } 5247 5248 ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget( 5249 unsigned ConstTripCount, unsigned SmallestType, unsigned WidestType, 5250 ElementCount MaxSafeVF, bool FoldTailByMasking) { 5251 bool ComputeScalableMaxVF = MaxSafeVF.isScalable(); 5252 const TypeSize WidestRegister = TTI.getRegisterBitWidth( 5253 ComputeScalableMaxVF ? TargetTransformInfo::RGK_ScalableVector 5254 : TargetTransformInfo::RGK_FixedWidthVector); 5255 5256 // Convenience function to return the minimum of two ElementCounts. 5257 auto MinVF = [](const ElementCount &LHS, const ElementCount &RHS) { 5258 assert((LHS.isScalable() == RHS.isScalable()) && 5259 "Scalable flags must match"); 5260 return ElementCount::isKnownLT(LHS, RHS) ? LHS : RHS; 5261 }; 5262 5263 // Ensure MaxVF is a power of 2; the dependence distance bound may not be. 5264 // Note that both WidestRegister and WidestType may not be a powers of 2. 5265 auto MaxVectorElementCount = ElementCount::get( 5266 llvm::bit_floor(WidestRegister.getKnownMinValue() / WidestType), 5267 ComputeScalableMaxVF); 5268 MaxVectorElementCount = MinVF(MaxVectorElementCount, MaxSafeVF); 5269 LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: " 5270 << (MaxVectorElementCount * WidestType) << " bits.\n"); 5271 5272 if (!MaxVectorElementCount) { 5273 LLVM_DEBUG(dbgs() << "LV: The target has no " 5274 << (ComputeScalableMaxVF ? "scalable" : "fixed") 5275 << " vector registers.\n"); 5276 return ElementCount::getFixed(1); 5277 } 5278 5279 unsigned WidestRegisterMinEC = MaxVectorElementCount.getKnownMinValue(); 5280 if (MaxVectorElementCount.isScalable() && 5281 TheFunction->hasFnAttribute(Attribute::VScaleRange)) { 5282 auto Attr = TheFunction->getFnAttribute(Attribute::VScaleRange); 5283 auto Min = Attr.getVScaleRangeMin(); 5284 WidestRegisterMinEC *= Min; 5285 } 5286 5287 // When a scalar epilogue is required, at least one iteration of the scalar 5288 // loop has to execute. Adjust ConstTripCount accordingly to avoid picking a 5289 // max VF that results in a dead vector loop. 
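// For example (assumed numbers): with a constant trip count of 8 and a
// required scalar epilogue, only 7 iterations are available to the vector
// loop, so without this adjustment a VF of 8 could be chosen and the vector
// loop would never execute.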
5290 if (ConstTripCount > 0 && requiresScalarEpilogue(true)) 5291 ConstTripCount -= 1; 5292 5293 if (ConstTripCount && ConstTripCount <= WidestRegisterMinEC && 5294 (!FoldTailByMasking || isPowerOf2_32(ConstTripCount))) { 5295 // If loop trip count (TC) is known at compile time there is no point in 5296 // choosing VF greater than TC (as done in the loop below). Select maximum 5297 // power of two which doesn't exceed TC. 5298 // If MaxVectorElementCount is scalable, we only fall back on a fixed VF 5299 // when the TC is less than or equal to the known number of lanes. 5300 auto ClampedConstTripCount = llvm::bit_floor(ConstTripCount); 5301 LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to maximum power of two not " 5302 "exceeding the constant trip count: " 5303 << ClampedConstTripCount << "\n"); 5304 return ElementCount::getFixed(ClampedConstTripCount); 5305 } 5306 5307 TargetTransformInfo::RegisterKind RegKind = 5308 ComputeScalableMaxVF ? TargetTransformInfo::RGK_ScalableVector 5309 : TargetTransformInfo::RGK_FixedWidthVector; 5310 ElementCount MaxVF = MaxVectorElementCount; 5311 if (MaximizeBandwidth || (MaximizeBandwidth.getNumOccurrences() == 0 && 5312 TTI.shouldMaximizeVectorBandwidth(RegKind))) { 5313 auto MaxVectorElementCountMaxBW = ElementCount::get( 5314 llvm::bit_floor(WidestRegister.getKnownMinValue() / SmallestType), 5315 ComputeScalableMaxVF); 5316 MaxVectorElementCountMaxBW = MinVF(MaxVectorElementCountMaxBW, MaxSafeVF); 5317 5318 // Collect all viable vectorization factors larger than the default MaxVF 5319 // (i.e. MaxVectorElementCount). 5320 SmallVector<ElementCount, 8> VFs; 5321 for (ElementCount VS = MaxVectorElementCount * 2; 5322 ElementCount::isKnownLE(VS, MaxVectorElementCountMaxBW); VS *= 2) 5323 VFs.push_back(VS); 5324 5325 // For each VF calculate its register usage. 5326 auto RUs = calculateRegisterUsage(VFs); 5327 5328 // Select the largest VF which doesn't require more registers than existing 5329 // ones. 5330 for (int i = RUs.size() - 1; i >= 0; --i) { 5331 bool Selected = true; 5332 for (auto &pair : RUs[i].MaxLocalUsers) { 5333 unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first); 5334 if (pair.second > TargetNumRegisters) 5335 Selected = false; 5336 } 5337 if (Selected) { 5338 MaxVF = VFs[i]; 5339 break; 5340 } 5341 } 5342 if (ElementCount MinVF = 5343 TTI.getMinimumVF(SmallestType, ComputeScalableMaxVF)) { 5344 if (ElementCount::isKnownLT(MaxVF, MinVF)) { 5345 LLVM_DEBUG(dbgs() << "LV: Overriding calculated MaxVF(" << MaxVF 5346 << ") with target's minimum: " << MinVF << '\n'); 5347 MaxVF = MinVF; 5348 } 5349 } 5350 5351 // Invalidate any widening decisions we might have made, in case the loop 5352 // requires prediction (decided later), but we have already made some 5353 // load/store widening decisions. 5354 invalidateCostModelingDecisions(); 5355 } 5356 return MaxVF; 5357 } 5358 5359 /// Convenience function that returns the value of vscale_range iff 5360 /// vscale_range.min == vscale_range.max or otherwise returns the value 5361 /// returned by the corresponding TTI method. 
5362 static std::optional<unsigned> 5363 getVScaleForTuning(const Loop *L, const TargetTransformInfo &TTI) { 5364 const Function *Fn = L->getHeader()->getParent(); 5365 if (Fn->hasFnAttribute(Attribute::VScaleRange)) { 5366 auto Attr = Fn->getFnAttribute(Attribute::VScaleRange); 5367 auto Min = Attr.getVScaleRangeMin(); 5368 auto Max = Attr.getVScaleRangeMax(); 5369 if (Max && Min == Max) 5370 return Max; 5371 } 5372 5373 return TTI.getVScaleForTuning(); 5374 } 5375 5376 bool LoopVectorizationPlanner::isMoreProfitable( 5377 const VectorizationFactor &A, const VectorizationFactor &B) const { 5378 InstructionCost CostA = A.Cost; 5379 InstructionCost CostB = B.Cost; 5380 5381 unsigned MaxTripCount = PSE.getSE()->getSmallConstantMaxTripCount(OrigLoop); 5382 5383 if (!A.Width.isScalable() && !B.Width.isScalable() && MaxTripCount) { 5384 // If the trip count is a known (possibly small) constant, the trip count 5385 // will be rounded up to an integer number of iterations under 5386 // FoldTailByMasking. The total cost in that case will be 5387 // VecCost*ceil(TripCount/VF). When not folding the tail, the total 5388 // cost will be VecCost*floor(TC/VF) + ScalarCost*(TC%VF). There will be 5389 // some extra overheads, but for the purpose of comparing the costs of 5390 // different VFs we can use this to compare the total loop-body cost 5391 // expected after vectorization. 5392 auto GetCostForTC = [MaxTripCount, this](unsigned VF, 5393 InstructionCost VectorCost, 5394 InstructionCost ScalarCost) { 5395 return CM.foldTailByMasking() ? VectorCost * divideCeil(MaxTripCount, VF) 5396 : VectorCost * (MaxTripCount / VF) + 5397 ScalarCost * (MaxTripCount % VF); 5398 }; 5399 auto RTCostA = GetCostForTC(A.Width.getFixedValue(), CostA, A.ScalarCost); 5400 auto RTCostB = GetCostForTC(B.Width.getFixedValue(), CostB, B.ScalarCost); 5401 5402 return RTCostA < RTCostB; 5403 } 5404 5405 // Improve estimate for the vector width if it is scalable. 5406 unsigned EstimatedWidthA = A.Width.getKnownMinValue(); 5407 unsigned EstimatedWidthB = B.Width.getKnownMinValue(); 5408 if (std::optional<unsigned> VScale = getVScaleForTuning(OrigLoop, TTI)) { 5409 if (A.Width.isScalable()) 5410 EstimatedWidthA *= *VScale; 5411 if (B.Width.isScalable()) 5412 EstimatedWidthB *= *VScale; 5413 } 5414 5415 // Assume vscale may be larger than 1 (or the value being tuned for), 5416 // so that scalable vectorization is slightly favorable over fixed-width 5417 // vectorization. 5418 if (A.Width.isScalable() && !B.Width.isScalable()) 5419 return (CostA * B.Width.getFixedValue()) <= (CostB * EstimatedWidthA); 5420 5421 // To avoid the need for FP division: 5422 // (CostA / A.Width) < (CostB / B.Width) 5423 // <=> (CostA * B.Width) < (CostB * A.Width) 5424 return (CostA * EstimatedWidthB) < (CostB * EstimatedWidthA); 5425 } 5426 5427 static void emitInvalidCostRemarks(SmallVector<InstructionVFPair> InvalidCosts, 5428 OptimizationRemarkEmitter *ORE, 5429 Loop *TheLoop) { 5430 if (InvalidCosts.empty()) 5431 return; 5432 5433 // Emit a report of VFs with invalid costs in the loop. 5434 5435 // Group the remarks per instruction, keeping the instruction order from 5436 // InvalidCosts. 5437 std::map<Instruction *, unsigned> Numbering; 5438 unsigned I = 0; 5439 for (auto &Pair : InvalidCosts) 5440 if (!Numbering.count(Pair.first)) 5441 Numbering[Pair.first] = I++; 5442 5443 // Sort the list, first on instruction(number) then on VF. 
5444 sort(InvalidCosts, [&Numbering](InstructionVFPair &A, InstructionVFPair &B) { 5445 if (Numbering[A.first] != Numbering[B.first]) 5446 return Numbering[A.first] < Numbering[B.first]; 5447 ElementCountComparator ECC; 5448 return ECC(A.second, B.second); 5449 }); 5450 5451 // For a list of ordered instruction-vf pairs: 5452 // [(load, vf1), (load, vf2), (store, vf1)] 5453 // Group the instructions together to emit separate remarks for: 5454 // load (vf1, vf2) 5455 // store (vf1) 5456 auto Tail = ArrayRef<InstructionVFPair>(InvalidCosts); 5457 auto Subset = ArrayRef<InstructionVFPair>(); 5458 do { 5459 if (Subset.empty()) 5460 Subset = Tail.take_front(1); 5461 5462 Instruction *I = Subset.front().first; 5463 5464 // If the next instruction is different, or if there are no other pairs, 5465 // emit a remark for the collated subset. e.g. 5466 // [(load, vf1), (load, vf2))] 5467 // to emit: 5468 // remark: invalid costs for 'load' at VF=(vf, vf2) 5469 if (Subset == Tail || Tail[Subset.size()].first != I) { 5470 std::string OutString; 5471 raw_string_ostream OS(OutString); 5472 assert(!Subset.empty() && "Unexpected empty range"); 5473 OS << "Instruction with invalid costs prevented vectorization at VF=("; 5474 for (const auto &Pair : Subset) 5475 OS << (Pair.second == Subset.front().second ? "" : ", ") << Pair.second; 5476 OS << "):"; 5477 if (auto *CI = dyn_cast<CallInst>(I)) 5478 OS << " call to " << CI->getCalledFunction()->getName(); 5479 else 5480 OS << " " << I->getOpcodeName(); 5481 OS.flush(); 5482 reportVectorizationInfo(OutString, "InvalidCost", ORE, TheLoop, I); 5483 Tail = Tail.drop_front(Subset.size()); 5484 Subset = {}; 5485 } else 5486 // Grow the subset by one element 5487 Subset = Tail.take_front(Subset.size() + 1); 5488 } while (!Tail.empty()); 5489 } 5490 5491 VectorizationFactor LoopVectorizationPlanner::selectVectorizationFactor( 5492 const ElementCountSet &VFCandidates) { 5493 InstructionCost ExpectedCost = 5494 CM.expectedCost(ElementCount::getFixed(1)).first; 5495 LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << ExpectedCost << ".\n"); 5496 assert(ExpectedCost.isValid() && "Unexpected invalid cost for scalar loop"); 5497 assert(VFCandidates.count(ElementCount::getFixed(1)) && 5498 "Expected Scalar VF to be a candidate"); 5499 5500 const VectorizationFactor ScalarCost(ElementCount::getFixed(1), ExpectedCost, 5501 ExpectedCost); 5502 VectorizationFactor ChosenFactor = ScalarCost; 5503 5504 bool ForceVectorization = Hints.getForce() == LoopVectorizeHints::FK_Enabled; 5505 if (ForceVectorization && VFCandidates.size() > 1) { 5506 // Ignore scalar width, because the user explicitly wants vectorization. 5507 // Initialize cost to max so that VF = 2 is, at least, chosen during cost 5508 // evaluation. 5509 ChosenFactor.Cost = InstructionCost::getMax(); 5510 } 5511 5512 SmallVector<InstructionVFPair> InvalidCosts; 5513 for (const auto &i : VFCandidates) { 5514 // The cost for scalar VF=1 is already calculated, so ignore it. 5515 if (i.isScalar()) 5516 continue; 5517 5518 LoopVectorizationCostModel::VectorizationCostTy C = 5519 CM.expectedCost(i, &InvalidCosts); 5520 VectorizationFactor Candidate(i, C.first, ScalarCost.ScalarCost); 5521 5522 #ifndef NDEBUG 5523 unsigned AssumedMinimumVscale = 1; 5524 if (std::optional<unsigned> VScale = getVScaleForTuning(OrigLoop, TTI)) 5525 AssumedMinimumVscale = *VScale; 5526 unsigned Width = 5527 Candidate.Width.isScalable() 5528 ? 
Candidate.Width.getKnownMinValue() * AssumedMinimumVscale 5529 : Candidate.Width.getFixedValue(); 5530 LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << i 5531 << " costs: " << (Candidate.Cost / Width)); 5532 if (i.isScalable()) 5533 LLVM_DEBUG(dbgs() << " (assuming a minimum vscale of " 5534 << AssumedMinimumVscale << ")"); 5535 LLVM_DEBUG(dbgs() << ".\n"); 5536 #endif 5537 5538 if (!C.second && !ForceVectorization) { 5539 LLVM_DEBUG( 5540 dbgs() << "LV: Not considering vector loop of width " << i 5541 << " because it will not generate any vector instructions.\n"); 5542 continue; 5543 } 5544 5545 // If profitable add it to ProfitableVF list. 5546 if (isMoreProfitable(Candidate, ScalarCost)) 5547 ProfitableVFs.push_back(Candidate); 5548 5549 if (isMoreProfitable(Candidate, ChosenFactor)) 5550 ChosenFactor = Candidate; 5551 } 5552 5553 emitInvalidCostRemarks(InvalidCosts, ORE, OrigLoop); 5554 5555 if (!EnableCondStoresVectorization && CM.hasPredStores()) { 5556 reportVectorizationFailure( 5557 "There are conditional stores.", 5558 "store that is conditionally executed prevents vectorization", 5559 "ConditionalStore", ORE, OrigLoop); 5560 ChosenFactor = ScalarCost; 5561 } 5562 5563 LLVM_DEBUG(if (ForceVectorization && !ChosenFactor.Width.isScalar() && 5564 !isMoreProfitable(ChosenFactor, ScalarCost)) dbgs() 5565 << "LV: Vectorization seems to be not beneficial, " 5566 << "but was forced by a user.\n"); 5567 LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << ChosenFactor.Width << ".\n"); 5568 return ChosenFactor; 5569 } 5570 5571 bool LoopVectorizationPlanner::isCandidateForEpilogueVectorization( 5572 ElementCount VF) const { 5573 // Cross iteration phis such as reductions need special handling and are 5574 // currently unsupported. 5575 if (any_of(OrigLoop->getHeader()->phis(), 5576 [&](PHINode &Phi) { return Legal->isFixedOrderRecurrence(&Phi); })) 5577 return false; 5578 5579 // Phis with uses outside of the loop require special handling and are 5580 // currently unsupported. 5581 for (const auto &Entry : Legal->getInductionVars()) { 5582 // Look for uses of the value of the induction at the last iteration. 5583 Value *PostInc = 5584 Entry.first->getIncomingValueForBlock(OrigLoop->getLoopLatch()); 5585 for (User *U : PostInc->users()) 5586 if (!OrigLoop->contains(cast<Instruction>(U))) 5587 return false; 5588 // Look for uses of penultimate value of the induction. 5589 for (User *U : Entry.first->users()) 5590 if (!OrigLoop->contains(cast<Instruction>(U))) 5591 return false; 5592 } 5593 5594 // Epilogue vectorization code has not been auditted to ensure it handles 5595 // non-latch exits properly. It may be fine, but it needs auditted and 5596 // tested. 5597 if (OrigLoop->getExitingBlock() != OrigLoop->getLoopLatch()) 5598 return false; 5599 5600 return true; 5601 } 5602 5603 bool LoopVectorizationCostModel::isEpilogueVectorizationProfitable( 5604 const ElementCount VF) const { 5605 // FIXME: We need a much better cost-model to take different parameters such 5606 // as register pressure, code size increase and cost of extra branches into 5607 // account. For now we apply a very crude heuristic and only consider loops 5608 // with vectorization factors larger than a certain value. 5609 5610 // Allow the target to opt out entirely. 5611 if (!TTI.preferEpilogueVectorization()) 5612 return false; 5613 5614 // We also consider epilogue vectorization unprofitable for targets that don't 5615 // consider interleaving beneficial (eg. MVE). 
5616 if (TTI.getMaxInterleaveFactor(VF) <= 1) 5617 return false; 5618 5619 unsigned Multiplier = 1; 5620 if (VF.isScalable()) 5621 Multiplier = getVScaleForTuning(TheLoop, TTI).value_or(1); 5622 if ((Multiplier * VF.getKnownMinValue()) >= EpilogueVectorizationMinVF) 5623 return true; 5624 return false; 5625 } 5626 5627 VectorizationFactor LoopVectorizationPlanner::selectEpilogueVectorizationFactor( 5628 const ElementCount MainLoopVF, unsigned IC) { 5629 VectorizationFactor Result = VectorizationFactor::Disabled(); 5630 if (!EnableEpilogueVectorization) { 5631 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is disabled.\n"); 5632 return Result; 5633 } 5634 5635 if (!CM.isScalarEpilogueAllowed()) { 5636 LLVM_DEBUG(dbgs() << "LEV: Unable to vectorize epilogue because no " 5637 "epilogue is allowed.\n"); 5638 return Result; 5639 } 5640 5641 // Not really a cost consideration, but check for unsupported cases here to 5642 // simplify the logic. 5643 if (!isCandidateForEpilogueVectorization(MainLoopVF)) { 5644 LLVM_DEBUG(dbgs() << "LEV: Unable to vectorize epilogue because the loop " 5645 "is not a supported candidate.\n"); 5646 return Result; 5647 } 5648 5649 if (EpilogueVectorizationForceVF > 1) { 5650 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization factor is forced.\n"); 5651 ElementCount ForcedEC = ElementCount::getFixed(EpilogueVectorizationForceVF); 5652 if (hasPlanWithVF(ForcedEC)) 5653 return {ForcedEC, 0, 0}; 5654 else { 5655 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization forced factor is not " 5656 "viable.\n"); 5657 return Result; 5658 } 5659 } 5660 5661 if (OrigLoop->getHeader()->getParent()->hasOptSize() || 5662 OrigLoop->getHeader()->getParent()->hasMinSize()) { 5663 LLVM_DEBUG( 5664 dbgs() << "LEV: Epilogue vectorization skipped due to opt for size.\n"); 5665 return Result; 5666 } 5667 5668 if (!CM.isEpilogueVectorizationProfitable(MainLoopVF)) { 5669 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is not profitable for " 5670 "this loop\n"); 5671 return Result; 5672 } 5673 5674 // If MainLoopVF = vscale x 2, and vscale is expected to be 4, then we know 5675 // the main loop handles 8 lanes per iteration. We could still benefit from 5676 // vectorizing the epilogue loop with VF=4. 5677 ElementCount EstimatedRuntimeVF = MainLoopVF; 5678 if (MainLoopVF.isScalable()) { 5679 EstimatedRuntimeVF = ElementCount::getFixed(MainLoopVF.getKnownMinValue()); 5680 if (std::optional<unsigned> VScale = getVScaleForTuning(OrigLoop, TTI)) 5681 EstimatedRuntimeVF *= *VScale; 5682 } 5683 5684 ScalarEvolution &SE = *PSE.getSE(); 5685 Type *TCType = Legal->getWidestInductionType(); 5686 const SCEV *RemainingIterations = nullptr; 5687 for (auto &NextVF : ProfitableVFs) { 5688 // Skip candidate VFs without a corresponding VPlan. 5689 if (!hasPlanWithVF(NextVF.Width)) 5690 continue; 5691 5692 // Skip candidate VFs with widths >= the estimate runtime VF (scalable 5693 // vectors) or the VF of the main loop (fixed vectors). 5694 if ((!NextVF.Width.isScalable() && MainLoopVF.isScalable() && 5695 ElementCount::isKnownGE(NextVF.Width, EstimatedRuntimeVF)) || 5696 ElementCount::isKnownGE(NextVF.Width, MainLoopVF)) 5697 continue; 5698 5699 // If NextVF is greater than the number of remaining iterations, the 5700 // epilogue loop would be dead. Skip such factors. 5701 if (!MainLoopVF.isScalable() && !NextVF.Width.isScalable()) { 5702 // TODO: extend to support scalable VFs. 
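      // Illustrative fixed-width example (made-up numbers): with a trip count
      // of 130, MainLoopVF = 32 and IC = 4 the main loop steps by 128 and
      // leaves 130 % 128 = 2 iterations, so candidate epilogue VFs of 16, 8
      // or 4 would be dead and are skipped, while VF = 2 remains viable.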
5703 if (!RemainingIterations) { 5704 const SCEV *TC = createTripCountSCEV(TCType, PSE, OrigLoop); 5705 RemainingIterations = SE.getURemExpr( 5706 TC, SE.getConstant(TCType, MainLoopVF.getKnownMinValue() * IC)); 5707 } 5708 if (SE.isKnownPredicate( 5709 CmpInst::ICMP_UGT, 5710 SE.getConstant(TCType, NextVF.Width.getKnownMinValue()), 5711 RemainingIterations)) 5712 continue; 5713 } 5714 5715 if (Result.Width.isScalar() || isMoreProfitable(NextVF, Result)) 5716 Result = NextVF; 5717 } 5718 5719 if (Result != VectorizationFactor::Disabled()) 5720 LLVM_DEBUG(dbgs() << "LEV: Vectorizing epilogue loop with VF = " 5721 << Result.Width << "\n"); 5722 return Result; 5723 } 5724 5725 std::pair<unsigned, unsigned> 5726 LoopVectorizationCostModel::getSmallestAndWidestTypes() { 5727 unsigned MinWidth = -1U; 5728 unsigned MaxWidth = 8; 5729 const DataLayout &DL = TheFunction->getParent()->getDataLayout(); 5730 // For in-loop reductions, no element types are added to ElementTypesInLoop 5731 // if there are no loads/stores in the loop. In this case, check through the 5732 // reduction variables to determine the maximum width. 5733 if (ElementTypesInLoop.empty() && !Legal->getReductionVars().empty()) { 5734 // Reset MaxWidth so that we can find the smallest type used by recurrences 5735 // in the loop. 5736 MaxWidth = -1U; 5737 for (const auto &PhiDescriptorPair : Legal->getReductionVars()) { 5738 const RecurrenceDescriptor &RdxDesc = PhiDescriptorPair.second; 5739 // When finding the min width used by the recurrence we need to account 5740 // for casts on the input operands of the recurrence. 5741 MaxWidth = std::min<unsigned>( 5742 MaxWidth, std::min<unsigned>( 5743 RdxDesc.getMinWidthCastToRecurrenceTypeInBits(), 5744 RdxDesc.getRecurrenceType()->getScalarSizeInBits())); 5745 } 5746 } else { 5747 for (Type *T : ElementTypesInLoop) { 5748 MinWidth = std::min<unsigned>( 5749 MinWidth, DL.getTypeSizeInBits(T->getScalarType()).getFixedValue()); 5750 MaxWidth = std::max<unsigned>( 5751 MaxWidth, DL.getTypeSizeInBits(T->getScalarType()).getFixedValue()); 5752 } 5753 } 5754 return {MinWidth, MaxWidth}; 5755 } 5756 5757 void LoopVectorizationCostModel::collectElementTypesForWidening() { 5758 ElementTypesInLoop.clear(); 5759 // For each block. 5760 for (BasicBlock *BB : TheLoop->blocks()) { 5761 // For each instruction in the loop. 5762 for (Instruction &I : BB->instructionsWithoutDebug()) { 5763 Type *T = I.getType(); 5764 5765 // Skip ignored values. 5766 if (ValuesToIgnore.count(&I)) 5767 continue; 5768 5769 // Only examine Loads, Stores and PHINodes. 5770 if (!isa<LoadInst>(I) && !isa<StoreInst>(I) && !isa<PHINode>(I)) 5771 continue; 5772 5773 // Examine PHI nodes that are reduction variables. Update the type to 5774 // account for the recurrence type. 5775 if (auto *PN = dyn_cast<PHINode>(&I)) { 5776 if (!Legal->isReductionVariable(PN)) 5777 continue; 5778 const RecurrenceDescriptor &RdxDesc = 5779 Legal->getReductionVars().find(PN)->second; 5780 if (PreferInLoopReductions || useOrderedReductions(RdxDesc) || 5781 TTI.preferInLoopReduction(RdxDesc.getOpcode(), 5782 RdxDesc.getRecurrenceType(), 5783 TargetTransformInfo::ReductionFlags())) 5784 continue; 5785 T = RdxDesc.getRecurrenceType(); 5786 } 5787 5788 // Examine the stored values. 
5789 if (auto *ST = dyn_cast<StoreInst>(&I)) 5790 T = ST->getValueOperand()->getType(); 5791 5792 assert(T->isSized() && 5793 "Expected the load/store/recurrence type to be sized"); 5794 5795 ElementTypesInLoop.insert(T); 5796 } 5797 } 5798 } 5799 5800 unsigned 5801 LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF, 5802 InstructionCost LoopCost) { 5803 // -- The interleave heuristics -- 5804 // We interleave the loop in order to expose ILP and reduce the loop overhead. 5805 // There are many micro-architectural considerations that we can't predict 5806 // at this level. For example, frontend pressure (on decode or fetch) due to 5807 // code size, or the number and capabilities of the execution ports. 5808 // 5809 // We use the following heuristics to select the interleave count: 5810 // 1. If the code has reductions, then we interleave to break the cross 5811 // iteration dependency. 5812 // 2. If the loop is really small, then we interleave to reduce the loop 5813 // overhead. 5814 // 3. We don't interleave if we think that we will spill registers to memory 5815 // due to the increased register pressure. 5816 5817 if (!isScalarEpilogueAllowed()) 5818 return 1; 5819 5820 // We used the distance for the interleave count. 5821 if (!Legal->isSafeForAnyVectorWidth()) 5822 return 1; 5823 5824 auto BestKnownTC = getSmallBestKnownTC(*PSE.getSE(), TheLoop); 5825 const bool HasReductions = !Legal->getReductionVars().empty(); 5826 // Do not interleave loops with a relatively small known or estimated trip 5827 // count. But we will interleave when InterleaveSmallLoopScalarReduction is 5828 // enabled, and the code has scalar reductions(HasReductions && VF = 1), 5829 // because with the above conditions interleaving can expose ILP and break 5830 // cross iteration dependences for reductions. 5831 if (BestKnownTC && (*BestKnownTC < TinyTripCountInterleaveThreshold) && 5832 !(InterleaveSmallLoopScalarReduction && HasReductions && VF.isScalar())) 5833 return 1; 5834 5835 // If we did not calculate the cost for VF (because the user selected the VF) 5836 // then we calculate the cost of VF here. 5837 if (LoopCost == 0) { 5838 LoopCost = expectedCost(VF).first; 5839 assert(LoopCost.isValid() && "Expected to have chosen a VF with valid cost"); 5840 5841 // Loop body is free and there is no need for interleaving. 5842 if (LoopCost == 0) 5843 return 1; 5844 } 5845 5846 RegisterUsage R = calculateRegisterUsage({VF})[0]; 5847 // We divide by these constants so assume that we have at least one 5848 // instruction that uses at least one register. 5849 for (auto& pair : R.MaxLocalUsers) { 5850 pair.second = std::max(pair.second, 1U); 5851 } 5852 5853 // We calculate the interleave count using the following formula. 5854 // Subtract the number of loop invariants from the number of available 5855 // registers. These registers are used by all of the interleaved instances. 5856 // Next, divide the remaining registers by the number of registers that is 5857 // required by the loop, in order to estimate how many parallel instances 5858 // fit without causing spills. All of this is rounded down if necessary to be 5859 // a power of two. We want power of two interleave count to simplify any 5860 // addressing operations or alignment considerations. 5861 // We also want power of two interleave counts to ensure that the induction 5862 // variable of the vector loop wraps to zero, when tail is folded by masking; 5863 // this currently happens when OptForSize, in which case IC is set to 1 above. 
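  // As a made-up illustration of the formula below: with 32 registers in a
  // class, 2 of them tied up by loop-invariant values and at most 5 values
  // live at once, the loop could be interleaved
  // bit_floor((32 - 2) / 5) = bit_floor(6) = 4 times before spilling is
  // expected.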
5864 unsigned IC = UINT_MAX; 5865 5866 for (auto& pair : R.MaxLocalUsers) { 5867 unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first); 5868 LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters 5869 << " registers of " 5870 << TTI.getRegisterClassName(pair.first) << " register class\n"); 5871 if (VF.isScalar()) { 5872 if (ForceTargetNumScalarRegs.getNumOccurrences() > 0) 5873 TargetNumRegisters = ForceTargetNumScalarRegs; 5874 } else { 5875 if (ForceTargetNumVectorRegs.getNumOccurrences() > 0) 5876 TargetNumRegisters = ForceTargetNumVectorRegs; 5877 } 5878 unsigned MaxLocalUsers = pair.second; 5879 unsigned LoopInvariantRegs = 0; 5880 if (R.LoopInvariantRegs.find(pair.first) != R.LoopInvariantRegs.end()) 5881 LoopInvariantRegs = R.LoopInvariantRegs[pair.first]; 5882 5883 unsigned TmpIC = llvm::bit_floor((TargetNumRegisters - LoopInvariantRegs) / 5884 MaxLocalUsers); 5885 // Don't count the induction variable as interleaved. 5886 if (EnableIndVarRegisterHeur) { 5887 TmpIC = llvm::bit_floor((TargetNumRegisters - LoopInvariantRegs - 1) / 5888 std::max(1U, (MaxLocalUsers - 1))); 5889 } 5890 5891 IC = std::min(IC, TmpIC); 5892 } 5893 5894 // Clamp the interleave ranges to reasonable counts. 5895 unsigned MaxInterleaveCount = TTI.getMaxInterleaveFactor(VF); 5896 5897 // Check if the user has overridden the max. 5898 if (VF.isScalar()) { 5899 if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0) 5900 MaxInterleaveCount = ForceTargetMaxScalarInterleaveFactor; 5901 } else { 5902 if (ForceTargetMaxVectorInterleaveFactor.getNumOccurrences() > 0) 5903 MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor; 5904 } 5905 5906 // If trip count is known or estimated compile time constant, limit the 5907 // interleave count to be less than the trip count divided by VF, provided it 5908 // is at least 1. 5909 // 5910 // For scalable vectors we can't know if interleaving is beneficial. It may 5911 // not be beneficial for small loops if none of the lanes in the second vector 5912 // iterations is enabled. However, for larger loops, there is likely to be a 5913 // similar benefit as for fixed-width vectors. For now, we choose to leave 5914 // the InterleaveCount as if vscale is '1', although if some information about 5915 // the vector is known (e.g. min vector size), we can make a better decision. 5916 if (BestKnownTC) { 5917 MaxInterleaveCount = 5918 std::min(*BestKnownTC / VF.getKnownMinValue(), MaxInterleaveCount); 5919 // Make sure MaxInterleaveCount is greater than 0. 5920 MaxInterleaveCount = std::max(1u, MaxInterleaveCount); 5921 } 5922 5923 assert(MaxInterleaveCount > 0 && 5924 "Maximum interleave count must be greater than 0"); 5925 5926 // Clamp the calculated IC to be between the 1 and the max interleave count 5927 // that the target and trip count allows. 5928 if (IC > MaxInterleaveCount) 5929 IC = MaxInterleaveCount; 5930 else 5931 // Make sure IC is greater than 0. 5932 IC = std::max(1u, IC); 5933 5934 assert(IC > 0 && "Interleave count must be greater than 0."); 5935 5936 // Interleave if we vectorized this loop and there is a reduction that could 5937 // benefit from interleaving. 5938 if (VF.isVector() && HasReductions) { 5939 LLVM_DEBUG(dbgs() << "LV: Interleaving because of reductions.\n"); 5940 return IC; 5941 } 5942 5943 // For any scalar loop that either requires runtime checks or predication we 5944 // are better off leaving this to the unroller. 
Note that if we've already 5945 // vectorized the loop we will have done the runtime check and so interleaving 5946 // won't require further checks. 5947 bool ScalarInterleavingRequiresPredication = 5948 (VF.isScalar() && any_of(TheLoop->blocks(), [this](BasicBlock *BB) { 5949 return Legal->blockNeedsPredication(BB); 5950 })); 5951 bool ScalarInterleavingRequiresRuntimePointerCheck = 5952 (VF.isScalar() && Legal->getRuntimePointerChecking()->Need); 5953 5954 // We want to interleave small loops in order to reduce the loop overhead and 5955 // potentially expose ILP opportunities. 5956 LLVM_DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n' 5957 << "LV: IC is " << IC << '\n' 5958 << "LV: VF is " << VF << '\n'); 5959 const bool AggressivelyInterleaveReductions = 5960 TTI.enableAggressiveInterleaving(HasReductions); 5961 if (!ScalarInterleavingRequiresRuntimePointerCheck && 5962 !ScalarInterleavingRequiresPredication && LoopCost < SmallLoopCost) { 5963 // We assume that the cost overhead is 1 and we use the cost model 5964 // to estimate the cost of the loop and interleave until the cost of the 5965 // loop overhead is about 5% of the cost of the loop. 5966 unsigned SmallIC = std::min(IC, (unsigned)llvm::bit_floor<uint64_t>( 5967 SmallLoopCost / *LoopCost.getValue())); 5968 5969 // Interleave until store/load ports (estimated by max interleave count) are 5970 // saturated. 5971 unsigned NumStores = Legal->getNumStores(); 5972 unsigned NumLoads = Legal->getNumLoads(); 5973 unsigned StoresIC = IC / (NumStores ? NumStores : 1); 5974 unsigned LoadsIC = IC / (NumLoads ? NumLoads : 1); 5975 5976 // There is little point in interleaving for reductions containing selects 5977 // and compares when VF=1 since it may just create more overhead than it's 5978 // worth for loops with small trip counts. This is because we still have to 5979 // do the final reduction after the loop. 5980 bool HasSelectCmpReductions = 5981 HasReductions && 5982 any_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool { 5983 const RecurrenceDescriptor &RdxDesc = Reduction.second; 5984 return RecurrenceDescriptor::isSelectCmpRecurrenceKind( 5985 RdxDesc.getRecurrenceKind()); 5986 }); 5987 if (HasSelectCmpReductions) { 5988 LLVM_DEBUG(dbgs() << "LV: Not interleaving select-cmp reductions.\n"); 5989 return 1; 5990 } 5991 5992 // If we have a scalar reduction (vector reductions are already dealt with 5993 // by this point), we can increase the critical path length if the loop 5994 // we're interleaving is inside another loop. For tree-wise reductions 5995 // set the limit to 2, and for ordered reductions it's best to disable 5996 // interleaving entirely. 
5997 if (HasReductions && TheLoop->getLoopDepth() > 1) { 5998 bool HasOrderedReductions = 5999 any_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool { 6000 const RecurrenceDescriptor &RdxDesc = Reduction.second; 6001 return RdxDesc.isOrdered(); 6002 }); 6003 if (HasOrderedReductions) { 6004 LLVM_DEBUG( 6005 dbgs() << "LV: Not interleaving scalar ordered reductions.\n"); 6006 return 1; 6007 } 6008 6009 unsigned F = static_cast<unsigned>(MaxNestedScalarReductionIC); 6010 SmallIC = std::min(SmallIC, F); 6011 StoresIC = std::min(StoresIC, F); 6012 LoadsIC = std::min(LoadsIC, F); 6013 } 6014 6015 if (EnableLoadStoreRuntimeInterleave && 6016 std::max(StoresIC, LoadsIC) > SmallIC) { 6017 LLVM_DEBUG( 6018 dbgs() << "LV: Interleaving to saturate store or load ports.\n"); 6019 return std::max(StoresIC, LoadsIC); 6020 } 6021 6022 // If there are scalar reductions and TTI has enabled aggressive 6023 // interleaving for reductions, we will interleave to expose ILP. 6024 if (InterleaveSmallLoopScalarReduction && VF.isScalar() && 6025 AggressivelyInterleaveReductions) { 6026 LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n"); 6027 // Interleave no less than SmallIC but not as aggressive as the normal IC 6028 // to satisfy the rare situation when resources are too limited. 6029 return std::max(IC / 2, SmallIC); 6030 } else { 6031 LLVM_DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n"); 6032 return SmallIC; 6033 } 6034 } 6035 6036 // Interleave if this is a large loop (small loops are already dealt with by 6037 // this point) that could benefit from interleaving. 6038 if (AggressivelyInterleaveReductions) { 6039 LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n"); 6040 return IC; 6041 } 6042 6043 LLVM_DEBUG(dbgs() << "LV: Not Interleaving.\n"); 6044 return 1; 6045 } 6046 6047 SmallVector<LoopVectorizationCostModel::RegisterUsage, 8> 6048 LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<ElementCount> VFs) { 6049 // This function calculates the register usage by measuring the highest number 6050 // of values that are alive at a single location. Obviously, this is a very 6051 // rough estimation. We scan the loop in a topological order in order and 6052 // assign a number to each instruction. We use RPO to ensure that defs are 6053 // met before their users. We assume that each instruction that has in-loop 6054 // users starts an interval. We record every time that an in-loop value is 6055 // used, so we have a list of the first and last occurrences of each 6056 // instruction. Next, we transpose this data structure into a multi map that 6057 // holds the list of intervals that *end* at a specific location. This multi 6058 // map allows us to perform a linear search. We scan the instructions linearly 6059 // and record each time that a new interval starts, by placing it in a set. 6060 // If we find this value in the multi-map then we remove it from the set. 6061 // The max register usage is the maximum size of the set. 6062 // We also search for instructions that are defined outside the loop, but are 6063 // used inside the loop. We need this number separately from the max-interval 6064 // usage number because when we unroll, loop-invariant values do not take 6065 // more register. 6066 LoopBlocksDFS DFS(TheLoop); 6067 DFS.perform(LI); 6068 6069 RegisterUsage RU; 6070 6071 // Each 'key' in the map opens a new interval. The values 6072 // of the map are the index of the 'last seen' usage of the 6073 // instruction that is the key. 
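  // A compressed sketch of the bookkeeping below (ignoring register classes,
  // per-VF scaling and the ignore-lists the real code handles):
  //   for each instruction I, visited in RPO index order:
  //     erase from OpenIntervals every value whose last use is at I;
  //     MaxUsage = max(MaxUsage, OpenIntervals.size());
  //     insert I into OpenIntervals;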
6074 using IntervalMap = DenseMap<Instruction *, unsigned>; 6075 6076 // Maps instruction to its index. 6077 SmallVector<Instruction *, 64> IdxToInstr; 6078 // Marks the end of each interval. 6079 IntervalMap EndPoint; 6080 // Saves the list of instruction indices that are used in the loop. 6081 SmallPtrSet<Instruction *, 8> Ends; 6082 // Saves the list of values that are used in the loop but are defined outside 6083 // the loop (not including non-instruction values such as arguments and 6084 // constants). 6085 SmallSetVector<Instruction *, 8> LoopInvariants; 6086 6087 for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) { 6088 for (Instruction &I : BB->instructionsWithoutDebug()) { 6089 IdxToInstr.push_back(&I); 6090 6091 // Save the end location of each USE. 6092 for (Value *U : I.operands()) { 6093 auto *Instr = dyn_cast<Instruction>(U); 6094 6095 // Ignore non-instruction values such as arguments, constants, etc. 6096 // FIXME: Might need some motivation why these values are ignored. If 6097 // for example an argument is used inside the loop it will increase the 6098 // register pressure (so shouldn't we add it to LoopInvariants). 6099 if (!Instr) 6100 continue; 6101 6102 // If this instruction is outside the loop then record it and continue. 6103 if (!TheLoop->contains(Instr)) { 6104 LoopInvariants.insert(Instr); 6105 continue; 6106 } 6107 6108 // Overwrite previous end points. 6109 EndPoint[Instr] = IdxToInstr.size(); 6110 Ends.insert(Instr); 6111 } 6112 } 6113 } 6114 6115 // Saves the list of intervals that end with the index in 'key'. 6116 using InstrList = SmallVector<Instruction *, 2>; 6117 DenseMap<unsigned, InstrList> TransposeEnds; 6118 6119 // Transpose the EndPoints to a list of values that end at each index. 6120 for (auto &Interval : EndPoint) 6121 TransposeEnds[Interval.second].push_back(Interval.first); 6122 6123 SmallPtrSet<Instruction *, 8> OpenIntervals; 6124 SmallVector<RegisterUsage, 8> RUs(VFs.size()); 6125 SmallVector<SmallMapVector<unsigned, unsigned, 4>, 8> MaxUsages(VFs.size()); 6126 6127 LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n"); 6128 6129 const auto &TTICapture = TTI; 6130 auto GetRegUsage = [&TTICapture](Type *Ty, ElementCount VF) -> unsigned { 6131 if (Ty->isTokenTy() || !VectorType::isValidElementType(Ty)) 6132 return 0; 6133 return TTICapture.getRegUsageForType(VectorType::get(Ty, VF)); 6134 }; 6135 6136 for (unsigned int i = 0, s = IdxToInstr.size(); i < s; ++i) { 6137 Instruction *I = IdxToInstr[i]; 6138 6139 // Remove all of the instructions that end at this location. 6140 InstrList &List = TransposeEnds[i]; 6141 for (Instruction *ToRemove : List) 6142 OpenIntervals.erase(ToRemove); 6143 6144 // Ignore instructions that are never used within the loop. 6145 if (!Ends.count(I)) 6146 continue; 6147 6148 // Skip ignored values. 6149 if (ValuesToIgnore.count(I)) 6150 continue; 6151 6152 // For each VF find the maximum usage of registers. 6153 for (unsigned j = 0, e = VFs.size(); j < e; ++j) { 6154 // Count the number of registers used, per register class, given all open 6155 // intervals. 6156 // Note that elements in this SmallMapVector will be default constructed 6157 // as 0. So we can use "RegUsage[ClassID] += n" in the code below even if 6158 // there is no previous entry for ClassID. 
6159 SmallMapVector<unsigned, unsigned, 4> RegUsage; 6160 6161 if (VFs[j].isScalar()) { 6162 for (auto *Inst : OpenIntervals) { 6163 unsigned ClassID = 6164 TTI.getRegisterClassForType(false, Inst->getType()); 6165 // FIXME: The target might use more than one register for the type 6166 // even in the scalar case. 6167 RegUsage[ClassID] += 1; 6168 } 6169 } else { 6170 collectUniformsAndScalars(VFs[j]); 6171 for (auto *Inst : OpenIntervals) { 6172 // Skip ignored values for VF > 1. 6173 if (VecValuesToIgnore.count(Inst)) 6174 continue; 6175 if (isScalarAfterVectorization(Inst, VFs[j])) { 6176 unsigned ClassID = 6177 TTI.getRegisterClassForType(false, Inst->getType()); 6178 // FIXME: The target might use more than one register for the type 6179 // even in the scalar case. 6180 RegUsage[ClassID] += 1; 6181 } else { 6182 unsigned ClassID = 6183 TTI.getRegisterClassForType(true, Inst->getType()); 6184 RegUsage[ClassID] += GetRegUsage(Inst->getType(), VFs[j]); 6185 } 6186 } 6187 } 6188 6189 for (auto& pair : RegUsage) { 6190 auto &Entry = MaxUsages[j][pair.first]; 6191 Entry = std::max(Entry, pair.second); 6192 } 6193 } 6194 6195 LLVM_DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # " 6196 << OpenIntervals.size() << '\n'); 6197 6198 // Add the current instruction to the list of open intervals. 6199 OpenIntervals.insert(I); 6200 } 6201 6202 for (unsigned i = 0, e = VFs.size(); i < e; ++i) { 6203 // Note that elements in this SmallMapVector will be default constructed 6204 // as 0. So we can use "Invariant[ClassID] += n" in the code below even if 6205 // there is no previous entry for ClassID. 6206 SmallMapVector<unsigned, unsigned, 4> Invariant; 6207 6208 for (auto *Inst : LoopInvariants) { 6209 // FIXME: The target might use more than one register for the type 6210 // even in the scalar case. 6211 bool IsScalar = all_of(Inst->users(), [&](User *U) { 6212 auto *I = cast<Instruction>(U); 6213 return TheLoop != LI->getLoopFor(I->getParent()) || 6214 isScalarAfterVectorization(I, VFs[i]); 6215 }); 6216 6217 ElementCount VF = IsScalar ? ElementCount::getFixed(1) : VFs[i]; 6218 unsigned ClassID = 6219 TTI.getRegisterClassForType(VF.isVector(), Inst->getType()); 6220 Invariant[ClassID] += GetRegUsage(Inst->getType(), VF); 6221 } 6222 6223 LLVM_DEBUG({ 6224 dbgs() << "LV(REG): VF = " << VFs[i] << '\n'; 6225 dbgs() << "LV(REG): Found max usage: " << MaxUsages[i].size() 6226 << " item\n"; 6227 for (const auto &pair : MaxUsages[i]) { 6228 dbgs() << "LV(REG): RegisterClass: " 6229 << TTI.getRegisterClassName(pair.first) << ", " << pair.second 6230 << " registers\n"; 6231 } 6232 dbgs() << "LV(REG): Found invariant usage: " << Invariant.size() 6233 << " item\n"; 6234 for (const auto &pair : Invariant) { 6235 dbgs() << "LV(REG): RegisterClass: " 6236 << TTI.getRegisterClassName(pair.first) << ", " << pair.second 6237 << " registers\n"; 6238 } 6239 }); 6240 6241 RU.LoopInvariantRegs = Invariant; 6242 RU.MaxLocalUsers = MaxUsages[i]; 6243 RUs[i] = RU; 6244 } 6245 6246 return RUs; 6247 } 6248 6249 bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I, 6250 ElementCount VF) { 6251 // TODO: Cost model for emulated masked load/store is completely 6252 // broken. This hack guides the cost model to use an artificially 6253 // high enough value to practically disable vectorization with such 6254 // operations, except where previously deployed legality hack allowed 6255 // using very low cost values. 
This is to avoid regressions coming simply 6256 // from moving "masked load/store" check from legality to cost model. 6257 // Masked Load/Gather emulation was previously never allowed. 6258 // Limited number of Masked Store/Scatter emulation was allowed. 6259 assert((isPredicatedInst(I)) && 6260 "Expecting a scalar emulated instruction"); 6261 return isa<LoadInst>(I) || 6262 (isa<StoreInst>(I) && 6263 NumPredStores > NumberOfStoresToPredicate); 6264 } 6265 6266 void LoopVectorizationCostModel::collectInstsToScalarize(ElementCount VF) { 6267 // If we aren't vectorizing the loop, or if we've already collected the 6268 // instructions to scalarize, there's nothing to do. Collection may already 6269 // have occurred if we have a user-selected VF and are now computing the 6270 // expected cost for interleaving. 6271 if (VF.isScalar() || VF.isZero() || InstsToScalarize.contains(VF)) 6272 return; 6273 6274 // Initialize a mapping for VF in InstsToScalalarize. If we find that it's 6275 // not profitable to scalarize any instructions, the presence of VF in the 6276 // map will indicate that we've analyzed it already. 6277 ScalarCostsTy &ScalarCostsVF = InstsToScalarize[VF]; 6278 6279 PredicatedBBsAfterVectorization[VF].clear(); 6280 6281 // Find all the instructions that are scalar with predication in the loop and 6282 // determine if it would be better to not if-convert the blocks they are in. 6283 // If so, we also record the instructions to scalarize. 6284 for (BasicBlock *BB : TheLoop->blocks()) { 6285 if (!blockNeedsPredicationForAnyReason(BB)) 6286 continue; 6287 for (Instruction &I : *BB) 6288 if (isScalarWithPredication(&I, VF)) { 6289 ScalarCostsTy ScalarCosts; 6290 // Do not apply discount if scalable, because that would lead to 6291 // invalid scalarization costs. 6292 // Do not apply discount logic if hacked cost is needed 6293 // for emulated masked memrefs. 6294 if (!VF.isScalable() && !useEmulatedMaskMemRefHack(&I, VF) && 6295 computePredInstDiscount(&I, ScalarCosts, VF) >= 0) 6296 ScalarCostsVF.insert(ScalarCosts.begin(), ScalarCosts.end()); 6297 // Remember that BB will remain after vectorization. 6298 PredicatedBBsAfterVectorization[VF].insert(BB); 6299 } 6300 } 6301 } 6302 6303 InstructionCost LoopVectorizationCostModel::computePredInstDiscount( 6304 Instruction *PredInst, ScalarCostsTy &ScalarCosts, ElementCount VF) { 6305 assert(!isUniformAfterVectorization(PredInst, VF) && 6306 "Instruction marked uniform-after-vectorization will be predicated"); 6307 6308 // Initialize the discount to zero, meaning that the scalar version and the 6309 // vector version cost the same. 6310 InstructionCost Discount = 0; 6311 6312 // Holds instructions to analyze. The instructions we visit are mapped in 6313 // ScalarCosts. Those instructions are the ones that would be scalarized if 6314 // we find that the scalar version costs less. 6315 SmallVector<Instruction *, 8> Worklist; 6316 6317 // Returns true if the given instruction can be scalarized. 6318 auto canBeScalarized = [&](Instruction *I) -> bool { 6319 // We only attempt to scalarize instructions forming a single-use chain 6320 // from the original predicated block that would otherwise be vectorized. 6321 // Although not strictly necessary, we give up on instructions we know will 6322 // already be scalar to avoid traversing chains that are unlikely to be 6323 // beneficial. 
6324 if (!I->hasOneUse() || PredInst->getParent() != I->getParent() || 6325 isScalarAfterVectorization(I, VF)) 6326 return false; 6327 6328 // If the instruction is scalar with predication, it will be analyzed 6329 // separately. We ignore it within the context of PredInst. 6330 if (isScalarWithPredication(I, VF)) 6331 return false; 6332 6333 // If any of the instruction's operands are uniform after vectorization, 6334 // the instruction cannot be scalarized. This prevents, for example, a 6335 // masked load from being scalarized. 6336 // 6337 // We assume we will only emit a value for lane zero of an instruction 6338 // marked uniform after vectorization, rather than VF identical values. 6339 // Thus, if we scalarize an instruction that uses a uniform, we would 6340 // create uses of values corresponding to the lanes we aren't emitting code 6341 // for. This behavior can be changed by allowing getScalarValue to clone 6342 // the lane zero values for uniforms rather than asserting. 6343 for (Use &U : I->operands()) 6344 if (auto *J = dyn_cast<Instruction>(U.get())) 6345 if (isUniformAfterVectorization(J, VF)) 6346 return false; 6347 6348 // Otherwise, we can scalarize the instruction. 6349 return true; 6350 }; 6351 6352 // Compute the expected cost discount from scalarizing the entire expression 6353 // feeding the predicated instruction. We currently only consider expressions 6354 // that are single-use instruction chains. 6355 Worklist.push_back(PredInst); 6356 while (!Worklist.empty()) { 6357 Instruction *I = Worklist.pop_back_val(); 6358 6359 // If we've already analyzed the instruction, there's nothing to do. 6360 if (ScalarCosts.contains(I)) 6361 continue; 6362 6363 // Compute the cost of the vector instruction. Note that this cost already 6364 // includes the scalarization overhead of the predicated instruction. 6365 InstructionCost VectorCost = getInstructionCost(I, VF).first; 6366 6367 // Compute the cost of the scalarized instruction. This cost is the cost of 6368 // the instruction as if it wasn't if-converted and instead remained in the 6369 // predicated block. We will scale this cost by block probability after 6370 // computing the scalarization overhead. 6371 InstructionCost ScalarCost = 6372 VF.getFixedValue() * 6373 getInstructionCost(I, ElementCount::getFixed(1)).first; 6374 6375 // Compute the scalarization overhead of needed insertelement instructions 6376 // and phi nodes. 6377 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 6378 if (isScalarWithPredication(I, VF) && !I->getType()->isVoidTy()) { 6379 ScalarCost += TTI.getScalarizationOverhead( 6380 cast<VectorType>(ToVectorTy(I->getType(), VF)), 6381 APInt::getAllOnes(VF.getFixedValue()), /*Insert*/ true, 6382 /*Extract*/ false, CostKind); 6383 ScalarCost += 6384 VF.getFixedValue() * TTI.getCFInstrCost(Instruction::PHI, CostKind); 6385 } 6386 6387 // Compute the scalarization overhead of needed extractelement 6388 // instructions. For each of the instruction's operands, if the operand can 6389 // be scalarized, add it to the worklist; otherwise, account for the 6390 // overhead. 
6391 for (Use &U : I->operands()) 6392 if (auto *J = dyn_cast<Instruction>(U.get())) { 6393 assert(VectorType::isValidElementType(J->getType()) && 6394 "Instruction has non-scalar type"); 6395 if (canBeScalarized(J)) 6396 Worklist.push_back(J); 6397 else if (needsExtract(J, VF)) { 6398 ScalarCost += TTI.getScalarizationOverhead( 6399 cast<VectorType>(ToVectorTy(J->getType(), VF)), 6400 APInt::getAllOnes(VF.getFixedValue()), /*Insert*/ false, 6401 /*Extract*/ true, CostKind); 6402 } 6403 } 6404 6405 // Scale the total scalar cost by block probability. 6406 ScalarCost /= getReciprocalPredBlockProb(); 6407 6408 // Compute the discount. A non-negative discount means the vector version 6409 // of the instruction costs more, and scalarizing would be beneficial. 6410 Discount += VectorCost - ScalarCost; 6411 ScalarCosts[I] = ScalarCost; 6412 } 6413 6414 return Discount; 6415 } 6416 6417 LoopVectorizationCostModel::VectorizationCostTy 6418 LoopVectorizationCostModel::expectedCost( 6419 ElementCount VF, SmallVectorImpl<InstructionVFPair> *Invalid) { 6420 VectorizationCostTy Cost; 6421 6422 // For each block. 6423 for (BasicBlock *BB : TheLoop->blocks()) { 6424 VectorizationCostTy BlockCost; 6425 6426 // For each instruction in the old loop. 6427 for (Instruction &I : BB->instructionsWithoutDebug()) { 6428 // Skip ignored values. 6429 if (ValuesToIgnore.count(&I) || 6430 (VF.isVector() && VecValuesToIgnore.count(&I))) 6431 continue; 6432 6433 VectorizationCostTy C = getInstructionCost(&I, VF); 6434 6435 // Check if we should override the cost. 6436 if (C.first.isValid() && 6437 ForceTargetInstructionCost.getNumOccurrences() > 0) 6438 C.first = InstructionCost(ForceTargetInstructionCost); 6439 6440 // Keep a list of instructions with invalid costs. 6441 if (Invalid && !C.first.isValid()) 6442 Invalid->emplace_back(&I, VF); 6443 6444 BlockCost.first += C.first; 6445 BlockCost.second |= C.second; 6446 LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C.first 6447 << " for VF " << VF << " For instruction: " << I 6448 << '\n'); 6449 } 6450 6451 // If we are vectorizing a predicated block, it will have been 6452 // if-converted. This means that the block's instructions (aside from 6453 // stores and instructions that may divide by zero) will now be 6454 // unconditionally executed. For the scalar case, we may not always execute 6455 // the predicated block, if it is an if-else block. Thus, scale the block's 6456 // cost by the probability of executing it. blockNeedsPredication from 6457 // Legal is used so as to not include all blocks in tail folded loops. 6458 if (VF.isScalar() && Legal->blockNeedsPredication(BB)) 6459 BlockCost.first /= getReciprocalPredBlockProb(); 6460 6461 Cost.first += BlockCost.first; 6462 Cost.second |= BlockCost.second; 6463 } 6464 6465 return Cost; 6466 } 6467 6468 /// Gets Address Access SCEV after verifying that the access pattern 6469 /// is loop invariant except the induction variable dependence. 6470 /// 6471 /// This SCEV can be sent to the Target in order to estimate the address 6472 /// calculation cost. 6473 static const SCEV *getAddressAccessSCEV( 6474 Value *Ptr, 6475 LoopVectorizationLegality *Legal, 6476 PredicatedScalarEvolution &PSE, 6477 const Loop *TheLoop) { 6478 6479 auto *Gep = dyn_cast<GetElementPtrInst>(Ptr); 6480 if (!Gep) 6481 return nullptr; 6482 6483 // We are looking for a gep with all loop invariant indices except for one 6484 // which should be an induction variable. 
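  // For example, for an access like A[inv0][i][inv1], where i is an induction
  // variable, the GEP is roughly
  //   getelementptr %A, i64 %inv0, i64 %i, i64 %inv1
  // and every index other than %i must be loop invariant for a SCEV to be
  // returned.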
6485 auto SE = PSE.getSE(); 6486 unsigned NumOperands = Gep->getNumOperands(); 6487 for (unsigned i = 1; i < NumOperands; ++i) { 6488 Value *Opd = Gep->getOperand(i); 6489 if (!SE->isLoopInvariant(SE->getSCEV(Opd), TheLoop) && 6490 !Legal->isInductionVariable(Opd)) 6491 return nullptr; 6492 } 6493 6494 // Now we know we have a GEP ptr, %inv, %ind, %inv. return the Ptr SCEV. 6495 return PSE.getSCEV(Ptr); 6496 } 6497 6498 InstructionCost 6499 LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I, 6500 ElementCount VF) { 6501 assert(VF.isVector() && 6502 "Scalarization cost of instruction implies vectorization."); 6503 if (VF.isScalable()) 6504 return InstructionCost::getInvalid(); 6505 6506 Type *ValTy = getLoadStoreType(I); 6507 auto SE = PSE.getSE(); 6508 6509 unsigned AS = getLoadStoreAddressSpace(I); 6510 Value *Ptr = getLoadStorePointerOperand(I); 6511 Type *PtrTy = ToVectorTy(Ptr->getType(), VF); 6512 // NOTE: PtrTy is a vector to signal `TTI::getAddressComputationCost` 6513 // that it is being called from this specific place. 6514 6515 // Figure out whether the access is strided and get the stride value 6516 // if it's known in compile time 6517 const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, Legal, PSE, TheLoop); 6518 6519 // Get the cost of the scalar memory instruction and address computation. 6520 InstructionCost Cost = 6521 VF.getKnownMinValue() * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV); 6522 6523 // Don't pass *I here, since it is scalar but will actually be part of a 6524 // vectorized loop where the user of it is a vectorized instruction. 6525 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 6526 const Align Alignment = getLoadStoreAlignment(I); 6527 Cost += VF.getKnownMinValue() * TTI.getMemoryOpCost(I->getOpcode(), 6528 ValTy->getScalarType(), 6529 Alignment, AS, CostKind); 6530 6531 // Get the overhead of the extractelement and insertelement instructions 6532 // we might create due to scalarization. 6533 Cost += getScalarizationOverhead(I, VF, CostKind); 6534 6535 // If we have a predicated load/store, it will need extra i1 extracts and 6536 // conditional branches, but may not be executed for each vector lane. Scale 6537 // the cost by the probability of executing the predicated block. 6538 if (isPredicatedInst(I)) { 6539 Cost /= getReciprocalPredBlockProb(); 6540 6541 // Add the cost of an i1 extract and a branch 6542 auto *Vec_i1Ty = 6543 VectorType::get(IntegerType::getInt1Ty(ValTy->getContext()), VF); 6544 Cost += TTI.getScalarizationOverhead( 6545 Vec_i1Ty, APInt::getAllOnes(VF.getKnownMinValue()), 6546 /*Insert=*/false, /*Extract=*/true, CostKind); 6547 Cost += TTI.getCFInstrCost(Instruction::Br, CostKind); 6548 6549 if (useEmulatedMaskMemRefHack(I, VF)) 6550 // Artificially setting to a high enough value to practically disable 6551 // vectorization with such operations. 
6552 Cost = 3000000; 6553 } 6554 6555 return Cost; 6556 } 6557 6558 InstructionCost 6559 LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I, 6560 ElementCount VF) { 6561 Type *ValTy = getLoadStoreType(I); 6562 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); 6563 Value *Ptr = getLoadStorePointerOperand(I); 6564 unsigned AS = getLoadStoreAddressSpace(I); 6565 int ConsecutiveStride = Legal->isConsecutivePtr(ValTy, Ptr); 6566 enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 6567 6568 assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) && 6569 "Stride should be 1 or -1 for consecutive memory access"); 6570 const Align Alignment = getLoadStoreAlignment(I); 6571 InstructionCost Cost = 0; 6572 if (Legal->isMaskRequired(I)) { 6573 Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS, 6574 CostKind); 6575 } else { 6576 TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(I->getOperand(0)); 6577 Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS, 6578 CostKind, OpInfo, I); 6579 } 6580 6581 bool Reverse = ConsecutiveStride < 0; 6582 if (Reverse) 6583 Cost += TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 6584 std::nullopt, CostKind, 0); 6585 return Cost; 6586 } 6587 6588 InstructionCost 6589 LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I, 6590 ElementCount VF) { 6591 assert(Legal->isUniformMemOp(*I, VF)); 6592 6593 Type *ValTy = getLoadStoreType(I); 6594 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); 6595 const Align Alignment = getLoadStoreAlignment(I); 6596 unsigned AS = getLoadStoreAddressSpace(I); 6597 enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 6598 if (isa<LoadInst>(I)) { 6599 return TTI.getAddressComputationCost(ValTy) + 6600 TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS, 6601 CostKind) + 6602 TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VectorTy); 6603 } 6604 StoreInst *SI = cast<StoreInst>(I); 6605 6606 bool isLoopInvariantStoreValue = Legal->isInvariant(SI->getValueOperand()); 6607 return TTI.getAddressComputationCost(ValTy) + 6608 TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS, 6609 CostKind) + 6610 (isLoopInvariantStoreValue 6611 ? 
0 6612 : TTI.getVectorInstrCost(Instruction::ExtractElement, VectorTy, 6613 CostKind, VF.getKnownMinValue() - 1)); 6614 } 6615 6616 InstructionCost 6617 LoopVectorizationCostModel::getGatherScatterCost(Instruction *I, 6618 ElementCount VF) { 6619 Type *ValTy = getLoadStoreType(I); 6620 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); 6621 const Align Alignment = getLoadStoreAlignment(I); 6622 const Value *Ptr = getLoadStorePointerOperand(I); 6623 6624 return TTI.getAddressComputationCost(VectorTy) + 6625 TTI.getGatherScatterOpCost( 6626 I->getOpcode(), VectorTy, Ptr, Legal->isMaskRequired(I), Alignment, 6627 TargetTransformInfo::TCK_RecipThroughput, I); 6628 } 6629 6630 InstructionCost 6631 LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I, 6632 ElementCount VF) { 6633 Type *ValTy = getLoadStoreType(I); 6634 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); 6635 unsigned AS = getLoadStoreAddressSpace(I); 6636 enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 6637 6638 auto Group = getInterleavedAccessGroup(I); 6639 assert(Group && "Fail to get an interleaved access group."); 6640 6641 unsigned InterleaveFactor = Group->getFactor(); 6642 auto *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor); 6643 6644 // Holds the indices of existing members in the interleaved group. 6645 SmallVector<unsigned, 4> Indices; 6646 for (unsigned IF = 0; IF < InterleaveFactor; IF++) 6647 if (Group->getMember(IF)) 6648 Indices.push_back(IF); 6649 6650 // Calculate the cost of the whole interleaved group. 6651 bool UseMaskForGaps = 6652 (Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed()) || 6653 (isa<StoreInst>(I) && (Group->getNumMembers() < Group->getFactor())); 6654 InstructionCost Cost = TTI.getInterleavedMemoryOpCost( 6655 I->getOpcode(), WideVecTy, Group->getFactor(), Indices, Group->getAlign(), 6656 AS, CostKind, Legal->isMaskRequired(I), UseMaskForGaps); 6657 6658 if (Group->isReverse()) { 6659 // TODO: Add support for reversed masked interleaved access. 6660 assert(!Legal->isMaskRequired(I) && 6661 "Reverse masked interleaved access not supported."); 6662 Cost += Group->getNumMembers() * 6663 TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 6664 std::nullopt, CostKind, 0); 6665 } 6666 return Cost; 6667 } 6668 6669 std::optional<InstructionCost> 6670 LoopVectorizationCostModel::getReductionPatternCost( 6671 Instruction *I, ElementCount VF, Type *Ty, TTI::TargetCostKind CostKind) { 6672 using namespace llvm::PatternMatch; 6673 // Early exit for no inloop reductions 6674 if (InLoopReductionChains.empty() || VF.isScalar() || !isa<VectorType>(Ty)) 6675 return std::nullopt; 6676 auto *VectorTy = cast<VectorType>(Ty); 6677 6678 // We are looking for a pattern of, and finding the minimal acceptable cost: 6679 // reduce(mul(ext(A), ext(B))) or 6680 // reduce(mul(A, B)) or 6681 // reduce(ext(A)) or 6682 // reduce(A). 6683 // The basic idea is that we walk down the tree to do that, finding the root 6684 // reduction instruction in InLoopReductionImmediateChains. From there we find 6685 // the pattern of mul/ext and test the cost of the entire pattern vs the cost 6686 // of the components. If the reduction cost is lower then we return it for the 6687 // reduction instruction and 0 for the other instructions in the pattern. If 6688 // it is not we return an invalid cost specifying the orignal cost method 6689 // should be used. 
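  // Illustrative example of the matching below (hypothetical IR names): for an
  // in-loop chain such as
  //   %a.ext = sext i8 %a to i32
  //   %b.ext = sext i8 %b to i32
  //   %mul   = mul i32 %a.ext, %b.ext
  //   %sum   = add i32 %rdx.phi, %mul
  // the cost of a single multiply-accumulate reduction
  // (getMulAccReductionCost) is weighed against the summed costs of the
  // extends, the multiply and a plain add reduction; if the former wins, the
  // root add is charged that cost and the other instructions in the matched
  // pattern are charged 0.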
6690 Instruction *RetI = I; 6691 if (match(RetI, m_ZExtOrSExt(m_Value()))) { 6692 if (!RetI->hasOneUser()) 6693 return std::nullopt; 6694 RetI = RetI->user_back(); 6695 } 6696 6697 if (match(RetI, m_OneUse(m_Mul(m_Value(), m_Value()))) && 6698 RetI->user_back()->getOpcode() == Instruction::Add) { 6699 RetI = RetI->user_back(); 6700 } 6701 6702 // Test if the found instruction is a reduction, and if not return an invalid 6703 // cost specifying the parent to use the original cost modelling. 6704 if (!InLoopReductionImmediateChains.count(RetI)) 6705 return std::nullopt; 6706 6707 // Find the reduction this chain is a part of and calculate the basic cost of 6708 // the reduction on its own. 6709 Instruction *LastChain = InLoopReductionImmediateChains[RetI]; 6710 Instruction *ReductionPhi = LastChain; 6711 while (!isa<PHINode>(ReductionPhi)) 6712 ReductionPhi = InLoopReductionImmediateChains[ReductionPhi]; 6713 6714 const RecurrenceDescriptor &RdxDesc = 6715 Legal->getReductionVars().find(cast<PHINode>(ReductionPhi))->second; 6716 6717 InstructionCost BaseCost = TTI.getArithmeticReductionCost( 6718 RdxDesc.getOpcode(), VectorTy, RdxDesc.getFastMathFlags(), CostKind); 6719 6720 // For a call to the llvm.fmuladd intrinsic we need to add the cost of a 6721 // normal fmul instruction to the cost of the fadd reduction. 6722 if (RdxDesc.getRecurrenceKind() == RecurKind::FMulAdd) 6723 BaseCost += 6724 TTI.getArithmeticInstrCost(Instruction::FMul, VectorTy, CostKind); 6725 6726 // If we're using ordered reductions then we can just return the base cost 6727 // here, since getArithmeticReductionCost calculates the full ordered 6728 // reduction cost when FP reassociation is not allowed. 6729 if (useOrderedReductions(RdxDesc)) 6730 return BaseCost; 6731 6732 // Get the operand that was not the reduction chain and match it to one of the 6733 // patterns, returning the better cost if it is found. 6734 Instruction *RedOp = RetI->getOperand(1) == LastChain 6735 ? dyn_cast<Instruction>(RetI->getOperand(0)) 6736 : dyn_cast<Instruction>(RetI->getOperand(1)); 6737 6738 VectorTy = VectorType::get(I->getOperand(0)->getType(), VectorTy); 6739 6740 Instruction *Op0, *Op1; 6741 if (RedOp && RdxDesc.getOpcode() == Instruction::Add && 6742 match(RedOp, 6743 m_ZExtOrSExt(m_Mul(m_Instruction(Op0), m_Instruction(Op1)))) && 6744 match(Op0, m_ZExtOrSExt(m_Value())) && 6745 Op0->getOpcode() == Op1->getOpcode() && 6746 Op0->getOperand(0)->getType() == Op1->getOperand(0)->getType() && 6747 !TheLoop->isLoopInvariant(Op0) && !TheLoop->isLoopInvariant(Op1) && 6748 (Op0->getOpcode() == RedOp->getOpcode() || Op0 == Op1)) { 6749 6750 // Matched reduce.add(ext(mul(ext(A), ext(B))) 6751 // Note that the extend opcodes need to all match, or if A==B they will have 6752 // been converted to zext(mul(sext(A), sext(A))) as it is known positive, 6753 // which is equally fine. 
6754 bool IsUnsigned = isa<ZExtInst>(Op0); 6755 auto *ExtType = VectorType::get(Op0->getOperand(0)->getType(), VectorTy); 6756 auto *MulType = VectorType::get(Op0->getType(), VectorTy); 6757 6758 InstructionCost ExtCost = 6759 TTI.getCastInstrCost(Op0->getOpcode(), MulType, ExtType, 6760 TTI::CastContextHint::None, CostKind, Op0); 6761 InstructionCost MulCost = 6762 TTI.getArithmeticInstrCost(Instruction::Mul, MulType, CostKind); 6763 InstructionCost Ext2Cost = 6764 TTI.getCastInstrCost(RedOp->getOpcode(), VectorTy, MulType, 6765 TTI::CastContextHint::None, CostKind, RedOp); 6766 6767 InstructionCost RedCost = TTI.getMulAccReductionCost( 6768 IsUnsigned, RdxDesc.getRecurrenceType(), ExtType, CostKind); 6769 6770 if (RedCost.isValid() && 6771 RedCost < ExtCost * 2 + MulCost + Ext2Cost + BaseCost) 6772 return I == RetI ? RedCost : 0; 6773 } else if (RedOp && match(RedOp, m_ZExtOrSExt(m_Value())) && 6774 !TheLoop->isLoopInvariant(RedOp)) { 6775 // Matched reduce(ext(A)) 6776 bool IsUnsigned = isa<ZExtInst>(RedOp); 6777 auto *ExtType = VectorType::get(RedOp->getOperand(0)->getType(), VectorTy); 6778 InstructionCost RedCost = TTI.getExtendedReductionCost( 6779 RdxDesc.getOpcode(), IsUnsigned, RdxDesc.getRecurrenceType(), ExtType, 6780 RdxDesc.getFastMathFlags(), CostKind); 6781 6782 InstructionCost ExtCost = 6783 TTI.getCastInstrCost(RedOp->getOpcode(), VectorTy, ExtType, 6784 TTI::CastContextHint::None, CostKind, RedOp); 6785 if (RedCost.isValid() && RedCost < BaseCost + ExtCost) 6786 return I == RetI ? RedCost : 0; 6787 } else if (RedOp && RdxDesc.getOpcode() == Instruction::Add && 6788 match(RedOp, m_Mul(m_Instruction(Op0), m_Instruction(Op1)))) { 6789 if (match(Op0, m_ZExtOrSExt(m_Value())) && 6790 Op0->getOpcode() == Op1->getOpcode() && 6791 !TheLoop->isLoopInvariant(Op0) && !TheLoop->isLoopInvariant(Op1)) { 6792 bool IsUnsigned = isa<ZExtInst>(Op0); 6793 Type *Op0Ty = Op0->getOperand(0)->getType(); 6794 Type *Op1Ty = Op1->getOperand(0)->getType(); 6795 Type *LargestOpTy = 6796 Op0Ty->getIntegerBitWidth() < Op1Ty->getIntegerBitWidth() ? Op1Ty 6797 : Op0Ty; 6798 auto *ExtType = VectorType::get(LargestOpTy, VectorTy); 6799 6800 // Matched reduce.add(mul(ext(A), ext(B))), where the two ext may be of 6801 // different sizes. We take the largest type as the ext to reduce, and add 6802 // the remaining cost as, for example reduce(mul(ext(ext(A)), ext(B))). 6803 InstructionCost ExtCost0 = TTI.getCastInstrCost( 6804 Op0->getOpcode(), VectorTy, VectorType::get(Op0Ty, VectorTy), 6805 TTI::CastContextHint::None, CostKind, Op0); 6806 InstructionCost ExtCost1 = TTI.getCastInstrCost( 6807 Op1->getOpcode(), VectorTy, VectorType::get(Op1Ty, VectorTy), 6808 TTI::CastContextHint::None, CostKind, Op1); 6809 InstructionCost MulCost = 6810 TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind); 6811 6812 InstructionCost RedCost = TTI.getMulAccReductionCost( 6813 IsUnsigned, RdxDesc.getRecurrenceType(), ExtType, CostKind); 6814 InstructionCost ExtraExtCost = 0; 6815 if (Op0Ty != LargestOpTy || Op1Ty != LargestOpTy) { 6816 Instruction *ExtraExtOp = (Op0Ty != LargestOpTy) ? Op0 : Op1; 6817 ExtraExtCost = TTI.getCastInstrCost( 6818 ExtraExtOp->getOpcode(), ExtType, 6819 VectorType::get(ExtraExtOp->getOperand(0)->getType(), VectorTy), 6820 TTI::CastContextHint::None, CostKind, ExtraExtOp); 6821 } 6822 6823 if (RedCost.isValid() && 6824 (RedCost + ExtraExtCost) < (ExtCost0 + ExtCost1 + MulCost + BaseCost)) 6825 return I == RetI ? 
RedCost : 0; 6826 } else if (!match(I, m_ZExtOrSExt(m_Value()))) { 6827 // Matched reduce.add(mul()) 6828 InstructionCost MulCost = 6829 TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind); 6830 6831 InstructionCost RedCost = TTI.getMulAccReductionCost( 6832 true, RdxDesc.getRecurrenceType(), VectorTy, CostKind); 6833 6834 if (RedCost.isValid() && RedCost < MulCost + BaseCost) 6835 return I == RetI ? RedCost : 0; 6836 } 6837 } 6838 6839 return I == RetI ? std::optional<InstructionCost>(BaseCost) : std::nullopt; 6840 } 6841 6842 InstructionCost 6843 LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I, 6844 ElementCount VF) { 6845 // Calculate scalar cost only. Vectorization cost should be ready at this 6846 // moment. 6847 if (VF.isScalar()) { 6848 Type *ValTy = getLoadStoreType(I); 6849 const Align Alignment = getLoadStoreAlignment(I); 6850 unsigned AS = getLoadStoreAddressSpace(I); 6851 6852 TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(I->getOperand(0)); 6853 return TTI.getAddressComputationCost(ValTy) + 6854 TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS, 6855 TTI::TCK_RecipThroughput, OpInfo, I); 6856 } 6857 return getWideningCost(I, VF); 6858 } 6859 6860 LoopVectorizationCostModel::VectorizationCostTy 6861 LoopVectorizationCostModel::getInstructionCost(Instruction *I, 6862 ElementCount VF) { 6863 // If we know that this instruction will remain uniform, check the cost of 6864 // the scalar version. 6865 if (isUniformAfterVectorization(I, VF)) 6866 VF = ElementCount::getFixed(1); 6867 6868 if (VF.isVector() && isProfitableToScalarize(I, VF)) 6869 return VectorizationCostTy(InstsToScalarize[VF][I], false); 6870 6871 // Forced scalars do not have any scalarization overhead. 6872 auto ForcedScalar = ForcedScalars.find(VF); 6873 if (VF.isVector() && ForcedScalar != ForcedScalars.end()) { 6874 auto InstSet = ForcedScalar->second; 6875 if (InstSet.count(I)) 6876 return VectorizationCostTy( 6877 (getInstructionCost(I, ElementCount::getFixed(1)).first * 6878 VF.getKnownMinValue()), 6879 false); 6880 } 6881 6882 Type *VectorTy; 6883 InstructionCost C = getInstructionCost(I, VF, VectorTy); 6884 6885 bool TypeNotScalarized = false; 6886 if (VF.isVector() && VectorTy->isVectorTy()) { 6887 if (unsigned NumParts = TTI.getNumberOfParts(VectorTy)) { 6888 if (VF.isScalable()) 6889 // <vscale x 1 x iN> is assumed to be profitable over iN because 6890 // scalable registers are a distinct register class from scalar ones. 6891 // If we ever find a target which wants to lower scalable vectors 6892 // back to scalars, we'll need to update this code to explicitly 6893 // ask TTI about the register class uses for each part. 6894 TypeNotScalarized = NumParts <= VF.getKnownMinValue(); 6895 else 6896 TypeNotScalarized = NumParts < VF.getKnownMinValue(); 6897 } else 6898 C = InstructionCost::getInvalid(); 6899 } 6900 return VectorizationCostTy(C, TypeNotScalarized); 6901 } 6902 6903 InstructionCost LoopVectorizationCostModel::getScalarizationOverhead( 6904 Instruction *I, ElementCount VF, TTI::TargetCostKind CostKind) const { 6905 6906 // There is no mechanism yet to create a scalable scalarization loop, 6907 // so this is currently Invalid. 
6908 if (VF.isScalable()) 6909 return InstructionCost::getInvalid(); 6910 6911 if (VF.isScalar()) 6912 return 0; 6913 6914 InstructionCost Cost = 0; 6915 Type *RetTy = ToVectorTy(I->getType(), VF); 6916 if (!RetTy->isVoidTy() && 6917 (!isa<LoadInst>(I) || !TTI.supportsEfficientVectorElementLoadStore())) 6918 Cost += TTI.getScalarizationOverhead( 6919 cast<VectorType>(RetTy), APInt::getAllOnes(VF.getKnownMinValue()), 6920 /*Insert*/ true, 6921 /*Extract*/ false, CostKind); 6922 6923 // Some targets keep addresses scalar. 6924 if (isa<LoadInst>(I) && !TTI.prefersVectorizedAddressing()) 6925 return Cost; 6926 6927 // Some targets support efficient element stores. 6928 if (isa<StoreInst>(I) && TTI.supportsEfficientVectorElementLoadStore()) 6929 return Cost; 6930 6931 // Collect operands to consider. 6932 CallInst *CI = dyn_cast<CallInst>(I); 6933 Instruction::op_range Ops = CI ? CI->args() : I->operands(); 6934 6935 // Skip operands that do not require extraction/scalarization and do not incur 6936 // any overhead. 6937 SmallVector<Type *> Tys; 6938 for (auto *V : filterExtractingOperands(Ops, VF)) 6939 Tys.push_back(MaybeVectorizeType(V->getType(), VF)); 6940 return Cost + TTI.getOperandsScalarizationOverhead( 6941 filterExtractingOperands(Ops, VF), Tys, CostKind); 6942 } 6943 6944 void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) { 6945 if (VF.isScalar()) 6946 return; 6947 NumPredStores = 0; 6948 for (BasicBlock *BB : TheLoop->blocks()) { 6949 // For each instruction in the old loop. 6950 for (Instruction &I : *BB) { 6951 Value *Ptr = getLoadStorePointerOperand(&I); 6952 if (!Ptr) 6953 continue; 6954 6955 // TODO: We should generate better code and update the cost model for 6956 // predicated uniform stores. Today they are treated as any other 6957 // predicated store (see added test cases in 6958 // invariant-store-vectorization.ll). 6959 if (isa<StoreInst>(&I) && isScalarWithPredication(&I, VF)) 6960 NumPredStores++; 6961 6962 if (Legal->isUniformMemOp(I, VF)) { 6963 auto isLegalToScalarize = [&]() { 6964 if (!VF.isScalable()) 6965 // Scalarization of fixed length vectors "just works". 6966 return true; 6967 6968 // We have dedicated lowering for unpredicated uniform loads and 6969 // stores. Note that even with tail folding we know that at least 6970 // one lane is active (i.e. generalized predication is not possible 6971 // here), and the logic below depends on this fact. 6972 if (!foldTailByMasking()) 6973 return true; 6974 6975 // For scalable vectors, a uniform memop load is always 6976 // uniform-by-parts and we know how to scalarize that. 6977 if (isa<LoadInst>(I)) 6978 return true; 6979 6980 // A uniform store isn't neccessarily uniform-by-part 6981 // and we can't assume scalarization. 6982 auto &SI = cast<StoreInst>(I); 6983 return TheLoop->isLoopInvariant(SI.getValueOperand()); 6984 }; 6985 6986 const InstructionCost GatherScatterCost = 6987 isLegalGatherOrScatter(&I, VF) ? 6988 getGatherScatterCost(&I, VF) : InstructionCost::getInvalid(); 6989 6990 // Load: Scalar load + broadcast 6991 // Store: Scalar store + isLoopInvariantStoreValue ? 0 : extract 6992 // FIXME: This cost is a significant under-estimate for tail folded 6993 // memory ops. 6994 const InstructionCost ScalarizationCost = isLegalToScalarize() ? 6995 getUniformMemOpCost(&I, VF) : InstructionCost::getInvalid(); 6996 6997 // Choose better solution for the current VF, Note that Invalid 6998 // costs compare as maximumal large. 
If both are invalid, we get 6999 // scalable invalid which signals a failure and a vectorization abort. 7000 if (GatherScatterCost < ScalarizationCost) 7001 setWideningDecision(&I, VF, CM_GatherScatter, GatherScatterCost); 7002 else 7003 setWideningDecision(&I, VF, CM_Scalarize, ScalarizationCost); 7004 continue; 7005 } 7006 7007 // We assume that widening is the best solution when possible. 7008 if (memoryInstructionCanBeWidened(&I, VF)) { 7009 InstructionCost Cost = getConsecutiveMemOpCost(&I, VF); 7010 int ConsecutiveStride = Legal->isConsecutivePtr( 7011 getLoadStoreType(&I), getLoadStorePointerOperand(&I)); 7012 assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) && 7013 "Expected consecutive stride."); 7014 InstWidening Decision = 7015 ConsecutiveStride == 1 ? CM_Widen : CM_Widen_Reverse; 7016 setWideningDecision(&I, VF, Decision, Cost); 7017 continue; 7018 } 7019 7020 // Choose between Interleaving, Gather/Scatter or Scalarization. 7021 InstructionCost InterleaveCost = InstructionCost::getInvalid(); 7022 unsigned NumAccesses = 1; 7023 if (isAccessInterleaved(&I)) { 7024 auto Group = getInterleavedAccessGroup(&I); 7025 assert(Group && "Fail to get an interleaved access group."); 7026 7027 // Make one decision for the whole group. 7028 if (getWideningDecision(&I, VF) != CM_Unknown) 7029 continue; 7030 7031 NumAccesses = Group->getNumMembers(); 7032 if (interleavedAccessCanBeWidened(&I, VF)) 7033 InterleaveCost = getInterleaveGroupCost(&I, VF); 7034 } 7035 7036 InstructionCost GatherScatterCost = 7037 isLegalGatherOrScatter(&I, VF) 7038 ? getGatherScatterCost(&I, VF) * NumAccesses 7039 : InstructionCost::getInvalid(); 7040 7041 InstructionCost ScalarizationCost = 7042 getMemInstScalarizationCost(&I, VF) * NumAccesses; 7043 7044 // Choose better solution for the current VF, 7045 // write down this decision and use it during vectorization. 7046 InstructionCost Cost; 7047 InstWidening Decision; 7048 if (InterleaveCost <= GatherScatterCost && 7049 InterleaveCost < ScalarizationCost) { 7050 Decision = CM_Interleave; 7051 Cost = InterleaveCost; 7052 } else if (GatherScatterCost < ScalarizationCost) { 7053 Decision = CM_GatherScatter; 7054 Cost = GatherScatterCost; 7055 } else { 7056 Decision = CM_Scalarize; 7057 Cost = ScalarizationCost; 7058 } 7059 // If the instructions belongs to an interleave group, the whole group 7060 // receives the same decision. The whole group receives the cost, but 7061 // the cost will actually be assigned to one instruction. 7062 if (auto Group = getInterleavedAccessGroup(&I)) 7063 setWideningDecision(Group, VF, Decision, Cost); 7064 else 7065 setWideningDecision(&I, VF, Decision, Cost); 7066 } 7067 } 7068 7069 // Make sure that any load of address and any other address computation 7070 // remains scalar unless there is gather/scatter support. This avoids 7071 // inevitable extracts into address registers, and also has the benefit of 7072 // activating LSR more, since that pass can't optimize vectorized 7073 // addresses. 7074 if (TTI.prefersVectorizedAddressing()) 7075 return; 7076 7077 // Start with all scalar pointer uses. 7078 SmallPtrSet<Instruction *, 8> AddrDefs; 7079 for (BasicBlock *BB : TheLoop->blocks()) 7080 for (Instruction &I : *BB) { 7081 Instruction *PtrDef = 7082 dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I)); 7083 if (PtrDef && TheLoop->contains(PtrDef) && 7084 getWideningDecision(&I, VF) != CM_GatherScatter) 7085 AddrDefs.insert(PtrDef); 7086 } 7087 7088 // Add all instructions used to generate the addresses. 
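  // Walk backwards from the pointer operands collected above: any same-block,
  // non-PHI instruction feeding an address is pulled into AddrDefs so that the
  // whole address computation stays scalar together with the pointer itself.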
  SmallVector<Instruction *, 4> Worklist;
  append_range(Worklist, AddrDefs);
  while (!Worklist.empty()) {
    Instruction *I = Worklist.pop_back_val();
    for (auto &Op : I->operands())
      if (auto *InstOp = dyn_cast<Instruction>(Op))
        if ((InstOp->getParent() == I->getParent()) && !isa<PHINode>(InstOp) &&
            AddrDefs.insert(InstOp).second)
          Worklist.push_back(InstOp);
  }

  for (auto *I : AddrDefs) {
    if (isa<LoadInst>(I)) {
      // Setting the desired widening decision should ideally be handled by
      // cost functions, but since this involves the task of finding out
      // if the loaded register is involved in an address computation, it is
      // instead changed here when we know this is the case.
      InstWidening Decision = getWideningDecision(I, VF);
      if (Decision == CM_Widen || Decision == CM_Widen_Reverse)
        // Scalarize a widened load of address.
        setWideningDecision(
            I, VF, CM_Scalarize,
            (VF.getKnownMinValue() *
             getMemoryInstructionCost(I, ElementCount::getFixed(1))));
      else if (auto Group = getInterleavedAccessGroup(I)) {
        // Scalarize an interleave group of address loads.
        for (unsigned I = 0; I < Group->getFactor(); ++I) {
          if (Instruction *Member = Group->getMember(I))
            setWideningDecision(
                Member, VF, CM_Scalarize,
                (VF.getKnownMinValue() *
                 getMemoryInstructionCost(Member, ElementCount::getFixed(1))));
        }
      }
    } else
      // Make sure I gets scalarized and a cost estimate without
      // scalarization overhead.
      ForcedScalars[VF].insert(I);
  }
}

InstructionCost
LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF,
                                               Type *&VectorTy) {
  Type *RetTy = I->getType();
  if (canTruncateToMinimalBitwidth(I, VF))
    RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]);
  auto SE = PSE.getSE();
  TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;

  auto hasSingleCopyAfterVectorization = [this](Instruction *I,
                                                ElementCount VF) -> bool {
    if (VF.isScalar())
      return true;

    auto Scalarized = InstsToScalarize.find(VF);
    assert(Scalarized != InstsToScalarize.end() &&
           "VF not yet analyzed for scalarization profitability");
    return !Scalarized->second.count(I) &&
           llvm::all_of(I->users(), [&](User *U) {
             auto *UI = cast<Instruction>(U);
             return !Scalarized->second.count(UI);
           });
  };
  (void) hasSingleCopyAfterVectorization;

  if (isScalarAfterVectorization(I, VF)) {
    // With the exception of GEPs and PHIs, after scalarization there should
    // only be one copy of the instruction generated in the loop. This is
    // because the VF is either 1, or any instructions that need scalarizing
    // have already been dealt with by the time we get here. As a result,
    // it means we don't have to multiply the instruction cost by VF.
    assert(I->getOpcode() == Instruction::GetElementPtr ||
           I->getOpcode() == Instruction::PHI ||
           (I->getOpcode() == Instruction::BitCast &&
            I->getType()->isPointerTy()) ||
           hasSingleCopyAfterVectorization(I, VF));
    VectorTy = RetTy;
  } else
    VectorTy = ToVectorTy(RetTy, VF);

  // TODO: We need to estimate the cost of intrinsic calls.
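  // The switch below queries TTI per opcode. VectorTy was initialized above to
  // the (possibly truncated) scalar type when the instruction remains scalar,
  // or to its widened vector type otherwise; individual cases may refine it
  // further (e.g. compares and memory operations).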
7171 switch (I->getOpcode()) { 7172 case Instruction::GetElementPtr: 7173 // We mark this instruction as zero-cost because the cost of GEPs in 7174 // vectorized code depends on whether the corresponding memory instruction 7175 // is scalarized or not. Therefore, we handle GEPs with the memory 7176 // instruction cost. 7177 return 0; 7178 case Instruction::Br: { 7179 // In cases of scalarized and predicated instructions, there will be VF 7180 // predicated blocks in the vectorized loop. Each branch around these 7181 // blocks requires also an extract of its vector compare i1 element. 7182 bool ScalarPredicatedBB = false; 7183 BranchInst *BI = cast<BranchInst>(I); 7184 if (VF.isVector() && BI->isConditional() && 7185 (PredicatedBBsAfterVectorization[VF].count(BI->getSuccessor(0)) || 7186 PredicatedBBsAfterVectorization[VF].count(BI->getSuccessor(1)))) 7187 ScalarPredicatedBB = true; 7188 7189 if (ScalarPredicatedBB) { 7190 // Not possible to scalarize scalable vector with predicated instructions. 7191 if (VF.isScalable()) 7192 return InstructionCost::getInvalid(); 7193 // Return cost for branches around scalarized and predicated blocks. 7194 auto *Vec_i1Ty = 7195 VectorType::get(IntegerType::getInt1Ty(RetTy->getContext()), VF); 7196 return ( 7197 TTI.getScalarizationOverhead( 7198 Vec_i1Ty, APInt::getAllOnes(VF.getFixedValue()), 7199 /*Insert*/ false, /*Extract*/ true, CostKind) + 7200 (TTI.getCFInstrCost(Instruction::Br, CostKind) * VF.getFixedValue())); 7201 } else if (I->getParent() == TheLoop->getLoopLatch() || VF.isScalar()) 7202 // The back-edge branch will remain, as will all scalar branches. 7203 return TTI.getCFInstrCost(Instruction::Br, CostKind); 7204 else 7205 // This branch will be eliminated by if-conversion. 7206 return 0; 7207 // Note: We currently assume zero cost for an unconditional branch inside 7208 // a predicated block since it will become a fall-through, although we 7209 // may decide in the future to call TTI for all branches. 7210 } 7211 case Instruction::PHI: { 7212 auto *Phi = cast<PHINode>(I); 7213 7214 // First-order recurrences are replaced by vector shuffles inside the loop. 7215 if (VF.isVector() && Legal->isFixedOrderRecurrence(Phi)) { 7216 SmallVector<int> Mask(VF.getKnownMinValue()); 7217 std::iota(Mask.begin(), Mask.end(), VF.getKnownMinValue() - 1); 7218 return TTI.getShuffleCost(TargetTransformInfo::SK_Splice, 7219 cast<VectorType>(VectorTy), Mask, CostKind, 7220 VF.getKnownMinValue() - 1); 7221 } 7222 7223 // Phi nodes in non-header blocks (not inductions, reductions, etc.) are 7224 // converted into select instructions. We require N - 1 selects per phi 7225 // node, where N is the number of incoming values. 7226 if (VF.isVector() && Phi->getParent() != TheLoop->getHeader()) 7227 return (Phi->getNumIncomingValues() - 1) * 7228 TTI.getCmpSelInstrCost( 7229 Instruction::Select, ToVectorTy(Phi->getType(), VF), 7230 ToVectorTy(Type::getInt1Ty(Phi->getContext()), VF), 7231 CmpInst::BAD_ICMP_PREDICATE, CostKind); 7232 7233 return TTI.getCFInstrCost(Instruction::PHI, CostKind); 7234 } 7235 case Instruction::UDiv: 7236 case Instruction::SDiv: 7237 case Instruction::URem: 7238 case Instruction::SRem: 7239 if (VF.isVector() && isPredicatedInst(I)) { 7240 const auto [ScalarCost, SafeDivisorCost] = getDivRemSpeculationCost(I, VF); 7241 return isDivRemScalarWithPredication(ScalarCost, SafeDivisorCost) ? 7242 ScalarCost : SafeDivisorCost; 7243 } 7244 // We've proven all lanes safe to speculate, fall through. 
7245 [[fallthrough]]; 7246 case Instruction::Add: 7247 case Instruction::FAdd: 7248 case Instruction::Sub: 7249 case Instruction::FSub: 7250 case Instruction::Mul: 7251 case Instruction::FMul: 7252 case Instruction::FDiv: 7253 case Instruction::FRem: 7254 case Instruction::Shl: 7255 case Instruction::LShr: 7256 case Instruction::AShr: 7257 case Instruction::And: 7258 case Instruction::Or: 7259 case Instruction::Xor: { 7260 // If we're speculating on the stride being 1, the multiplication may 7261 // fold away. We can generalize this for all operations using the notion 7262 // of neutral elements. (TODO) 7263 if (I->getOpcode() == Instruction::Mul && 7264 (PSE.getSCEV(I->getOperand(0))->isOne() || 7265 PSE.getSCEV(I->getOperand(1))->isOne())) 7266 return 0; 7267 7268 // Detect reduction patterns 7269 if (auto RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind)) 7270 return *RedCost; 7271 7272 // Certain instructions can be cheaper to vectorize if they have a constant 7273 // second vector operand. One example of this are shifts on x86. 7274 Value *Op2 = I->getOperand(1); 7275 auto Op2Info = TTI.getOperandInfo(Op2); 7276 if (Op2Info.Kind == TargetTransformInfo::OK_AnyValue && 7277 Legal->isInvariant(Op2)) 7278 Op2Info.Kind = TargetTransformInfo::OK_UniformValue; 7279 7280 SmallVector<const Value *, 4> Operands(I->operand_values()); 7281 return TTI.getArithmeticInstrCost( 7282 I->getOpcode(), VectorTy, CostKind, 7283 {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None}, 7284 Op2Info, Operands, I); 7285 } 7286 case Instruction::FNeg: { 7287 return TTI.getArithmeticInstrCost( 7288 I->getOpcode(), VectorTy, CostKind, 7289 {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None}, 7290 {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None}, 7291 I->getOperand(0), I); 7292 } 7293 case Instruction::Select: { 7294 SelectInst *SI = cast<SelectInst>(I); 7295 const SCEV *CondSCEV = SE->getSCEV(SI->getCondition()); 7296 bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop)); 7297 7298 const Value *Op0, *Op1; 7299 using namespace llvm::PatternMatch; 7300 if (!ScalarCond && (match(I, m_LogicalAnd(m_Value(Op0), m_Value(Op1))) || 7301 match(I, m_LogicalOr(m_Value(Op0), m_Value(Op1))))) { 7302 // select x, y, false --> x & y 7303 // select x, true, y --> x | y 7304 const auto [Op1VK, Op1VP] = TTI::getOperandInfo(Op0); 7305 const auto [Op2VK, Op2VP] = TTI::getOperandInfo(Op1); 7306 assert(Op0->getType()->getScalarSizeInBits() == 1 && 7307 Op1->getType()->getScalarSizeInBits() == 1); 7308 7309 SmallVector<const Value *, 2> Operands{Op0, Op1}; 7310 return TTI.getArithmeticInstrCost( 7311 match(I, m_LogicalOr()) ? 
Instruction::Or : Instruction::And, VectorTy, 7312 CostKind, {Op1VK, Op1VP}, {Op2VK, Op2VP}, Operands, I); 7313 } 7314 7315 Type *CondTy = SI->getCondition()->getType(); 7316 if (!ScalarCond) 7317 CondTy = VectorType::get(CondTy, VF); 7318 7319 CmpInst::Predicate Pred = CmpInst::BAD_ICMP_PREDICATE; 7320 if (auto *Cmp = dyn_cast<CmpInst>(SI->getCondition())) 7321 Pred = Cmp->getPredicate(); 7322 return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy, Pred, 7323 CostKind, I); 7324 } 7325 case Instruction::ICmp: 7326 case Instruction::FCmp: { 7327 Type *ValTy = I->getOperand(0)->getType(); 7328 Instruction *Op0AsInstruction = dyn_cast<Instruction>(I->getOperand(0)); 7329 if (canTruncateToMinimalBitwidth(Op0AsInstruction, VF)) 7330 ValTy = IntegerType::get(ValTy->getContext(), MinBWs[Op0AsInstruction]); 7331 VectorTy = ToVectorTy(ValTy, VF); 7332 return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, nullptr, 7333 cast<CmpInst>(I)->getPredicate(), CostKind, 7334 I); 7335 } 7336 case Instruction::Store: 7337 case Instruction::Load: { 7338 ElementCount Width = VF; 7339 if (Width.isVector()) { 7340 InstWidening Decision = getWideningDecision(I, Width); 7341 assert(Decision != CM_Unknown && 7342 "CM decision should be taken at this point"); 7343 if (getWideningCost(I, VF) == InstructionCost::getInvalid()) 7344 return InstructionCost::getInvalid(); 7345 if (Decision == CM_Scalarize) 7346 Width = ElementCount::getFixed(1); 7347 } 7348 VectorTy = ToVectorTy(getLoadStoreType(I), Width); 7349 return getMemoryInstructionCost(I, VF); 7350 } 7351 case Instruction::BitCast: 7352 if (I->getType()->isPointerTy()) 7353 return 0; 7354 [[fallthrough]]; 7355 case Instruction::ZExt: 7356 case Instruction::SExt: 7357 case Instruction::FPToUI: 7358 case Instruction::FPToSI: 7359 case Instruction::FPExt: 7360 case Instruction::PtrToInt: 7361 case Instruction::IntToPtr: 7362 case Instruction::SIToFP: 7363 case Instruction::UIToFP: 7364 case Instruction::Trunc: 7365 case Instruction::FPTrunc: { 7366 // Computes the CastContextHint from a Load/Store instruction. 7367 auto ComputeCCH = [&](Instruction *I) -> TTI::CastContextHint { 7368 assert((isa<LoadInst>(I) || isa<StoreInst>(I)) && 7369 "Expected a load or a store!"); 7370 7371 if (VF.isScalar() || !TheLoop->contains(I)) 7372 return TTI::CastContextHint::Normal; 7373 7374 switch (getWideningDecision(I, VF)) { 7375 case LoopVectorizationCostModel::CM_GatherScatter: 7376 return TTI::CastContextHint::GatherScatter; 7377 case LoopVectorizationCostModel::CM_Interleave: 7378 return TTI::CastContextHint::Interleave; 7379 case LoopVectorizationCostModel::CM_Scalarize: 7380 case LoopVectorizationCostModel::CM_Widen: 7381 return Legal->isMaskRequired(I) ? TTI::CastContextHint::Masked 7382 : TTI::CastContextHint::Normal; 7383 case LoopVectorizationCostModel::CM_Widen_Reverse: 7384 return TTI::CastContextHint::Reversed; 7385 case LoopVectorizationCostModel::CM_Unknown: 7386 llvm_unreachable("Instr did not go through cost modelling?"); 7387 } 7388 7389 llvm_unreachable("Unhandled case!"); 7390 }; 7391 7392 unsigned Opcode = I->getOpcode(); 7393 TTI::CastContextHint CCH = TTI::CastContextHint::None; 7394 // For Trunc, the context is the only user, which must be a StoreInst. 7395 if (Opcode == Instruction::Trunc || Opcode == Instruction::FPTrunc) { 7396 if (I->hasOneUse()) 7397 if (StoreInst *Store = dyn_cast<StoreInst>(*I->user_begin())) 7398 CCH = ComputeCCH(Store); 7399 } 7400 // For Z/Sext, the context is the operand, which must be a LoadInst. 
7401 else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt || 7402 Opcode == Instruction::FPExt) { 7403 if (LoadInst *Load = dyn_cast<LoadInst>(I->getOperand(0))) 7404 CCH = ComputeCCH(Load); 7405 } 7406 7407 // We optimize the truncation of induction variables having constant 7408 // integer steps. The cost of these truncations is the same as the scalar 7409 // operation. 7410 if (isOptimizableIVTruncate(I, VF)) { 7411 auto *Trunc = cast<TruncInst>(I); 7412 return TTI.getCastInstrCost(Instruction::Trunc, Trunc->getDestTy(), 7413 Trunc->getSrcTy(), CCH, CostKind, Trunc); 7414 } 7415 7416 // Detect reduction patterns 7417 if (auto RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind)) 7418 return *RedCost; 7419 7420 Type *SrcScalarTy = I->getOperand(0)->getType(); 7421 Type *SrcVecTy = 7422 VectorTy->isVectorTy() ? ToVectorTy(SrcScalarTy, VF) : SrcScalarTy; 7423 if (canTruncateToMinimalBitwidth(I, VF)) { 7424 // This cast is going to be shrunk. This may remove the cast or it might 7425 // turn it into slightly different cast. For example, if MinBW == 16, 7426 // "zext i8 %1 to i32" becomes "zext i8 %1 to i16". 7427 // 7428 // Calculate the modified src and dest types. 7429 Type *MinVecTy = VectorTy; 7430 if (Opcode == Instruction::Trunc) { 7431 SrcVecTy = smallestIntegerVectorType(SrcVecTy, MinVecTy); 7432 VectorTy = 7433 largestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy); 7434 } else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt) { 7435 // Leave SrcVecTy unchanged - we only shrink the destination element 7436 // type. 7437 VectorTy = 7438 smallestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy); 7439 } 7440 } 7441 7442 return TTI.getCastInstrCost(Opcode, VectorTy, SrcVecTy, CCH, CostKind, I); 7443 } 7444 case Instruction::Call: { 7445 if (RecurrenceDescriptor::isFMulAddIntrinsic(I)) 7446 if (auto RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind)) 7447 return *RedCost; 7448 Function *Variant; 7449 CallInst *CI = cast<CallInst>(I); 7450 InstructionCost CallCost = getVectorCallCost(CI, VF, &Variant); 7451 if (getVectorIntrinsicIDForCall(CI, TLI)) { 7452 InstructionCost IntrinsicCost = getVectorIntrinsicCost(CI, VF); 7453 return std::min(CallCost, IntrinsicCost); 7454 } 7455 return CallCost; 7456 } 7457 case Instruction::ExtractValue: 7458 return TTI.getInstructionCost(I, TTI::TCK_RecipThroughput); 7459 case Instruction::Alloca: 7460 // We cannot easily widen alloca to a scalable alloca, as 7461 // the result would need to be a vector of pointers. 7462 if (VF.isScalable()) 7463 return InstructionCost::getInvalid(); 7464 [[fallthrough]]; 7465 default: 7466 // This opcode is unknown. Assume that it is the same as 'mul'. 7467 return TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind); 7468 } // end of switch. 7469 } 7470 7471 void LoopVectorizationCostModel::collectValuesToIgnore() { 7472 // Ignore ephemeral values. 7473 CodeMetrics::collectEphemeralValues(TheLoop, AC, ValuesToIgnore); 7474 7475 // Find all stores to invariant variables. Since they are going to sink 7476 // outside the loop we do not need calculate cost for them. 7477 for (BasicBlock *BB : TheLoop->blocks()) 7478 for (Instruction &I : *BB) { 7479 StoreInst *SI; 7480 if ((SI = dyn_cast<StoreInst>(&I)) && 7481 Legal->isInvariantAddressOfReduction(SI->getPointerOperand())) 7482 ValuesToIgnore.insert(&I); 7483 } 7484 7485 // Ignore type-promoting instructions we identified during reduction 7486 // detection. 
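  // These casts exist only to reconcile types inside a recognized reduction
  // (or, below, induction) chain; they are ignored here so that they are not
  // costed separately from the recurrence they belong to.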
7487 for (const auto &Reduction : Legal->getReductionVars()) { 7488 const RecurrenceDescriptor &RedDes = Reduction.second; 7489 const SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts(); 7490 VecValuesToIgnore.insert(Casts.begin(), Casts.end()); 7491 } 7492 // Ignore type-casting instructions we identified during induction 7493 // detection. 7494 for (const auto &Induction : Legal->getInductionVars()) { 7495 const InductionDescriptor &IndDes = Induction.second; 7496 const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts(); 7497 VecValuesToIgnore.insert(Casts.begin(), Casts.end()); 7498 } 7499 } 7500 7501 void LoopVectorizationCostModel::collectInLoopReductions() { 7502 for (const auto &Reduction : Legal->getReductionVars()) { 7503 PHINode *Phi = Reduction.first; 7504 const RecurrenceDescriptor &RdxDesc = Reduction.second; 7505 7506 // We don't collect reductions that are type promoted (yet). 7507 if (RdxDesc.getRecurrenceType() != Phi->getType()) 7508 continue; 7509 7510 // If the target would prefer this reduction to happen "in-loop", then we 7511 // want to record it as such. 7512 unsigned Opcode = RdxDesc.getOpcode(); 7513 if (!PreferInLoopReductions && !useOrderedReductions(RdxDesc) && 7514 !TTI.preferInLoopReduction(Opcode, Phi->getType(), 7515 TargetTransformInfo::ReductionFlags())) 7516 continue; 7517 7518 // Check that we can correctly put the reductions into the loop, by 7519 // finding the chain of operations that leads from the phi to the loop 7520 // exit value. 7521 SmallVector<Instruction *, 4> ReductionOperations = 7522 RdxDesc.getReductionOpChain(Phi, TheLoop); 7523 bool InLoop = !ReductionOperations.empty(); 7524 if (InLoop) { 7525 InLoopReductionChains[Phi] = ReductionOperations; 7526 // Add the elements to InLoopReductionImmediateChains for cost modelling. 7527 Instruction *LastChain = Phi; 7528 for (auto *I : ReductionOperations) { 7529 InLoopReductionImmediateChains[I] = LastChain; 7530 LastChain = I; 7531 } 7532 } 7533 LLVM_DEBUG(dbgs() << "LV: Using " << (InLoop ? "inloop" : "out of loop") 7534 << " reduction for phi: " << *Phi << "\n"); 7535 } 7536 } 7537 7538 // TODO: we could return a pair of values that specify the max VF and 7539 // min VF, to be used in `buildVPlans(MinVF, MaxVF)` instead of 7540 // `buildVPlans(VF, VF)`. We cannot do it because VPLAN at the moment 7541 // doesn't have a cost model that can choose which plan to execute if 7542 // more than one is generated. 7543 static unsigned determineVPlanVF(const unsigned WidestVectorRegBits, 7544 LoopVectorizationCostModel &CM) { 7545 unsigned WidestType; 7546 std::tie(std::ignore, WidestType) = CM.getSmallestAndWidestTypes(); 7547 return WidestVectorRegBits / WidestType; 7548 } 7549 7550 VectorizationFactor 7551 LoopVectorizationPlanner::planInVPlanNativePath(ElementCount UserVF) { 7552 assert(!UserVF.isScalable() && "scalable vectors not yet supported"); 7553 ElementCount VF = UserVF; 7554 // Outer loop handling: They may require CFG and instruction level 7555 // transformations before even evaluating whether vectorization is profitable. 7556 // Since we cannot modify the incoming IR, we need to build VPlan upfront in 7557 // the vectorization pipeline. 7558 if (!OrigLoop->isInnermost()) { 7559 // If the user doesn't provide a vectorization factor, determine a 7560 // reasonable one. 
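    // determineVPlanVF divides the widest fixed-width vector register size by
    // the widest scalar type used in the loop, e.g. 256-bit registers with a
    // widest type of 32 bits would yield VF = 8 (numbers illustrative only).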
7561 if (UserVF.isZero()) { 7562 VF = ElementCount::getFixed(determineVPlanVF( 7563 TTI.getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector) 7564 .getFixedValue(), 7565 CM)); 7566 LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n"); 7567 7568 // Make sure we have a VF > 1 for stress testing. 7569 if (VPlanBuildStressTest && (VF.isScalar() || VF.isZero())) { 7570 LLVM_DEBUG(dbgs() << "LV: VPlan stress testing: " 7571 << "overriding computed VF.\n"); 7572 VF = ElementCount::getFixed(4); 7573 } 7574 } 7575 assert(EnableVPlanNativePath && "VPlan-native path is not enabled."); 7576 assert(isPowerOf2_32(VF.getKnownMinValue()) && 7577 "VF needs to be a power of two"); 7578 LLVM_DEBUG(dbgs() << "LV: Using " << (!UserVF.isZero() ? "user " : "") 7579 << "VF " << VF << " to build VPlans.\n"); 7580 buildVPlans(VF, VF); 7581 7582 // For VPlan build stress testing, we bail out after VPlan construction. 7583 if (VPlanBuildStressTest) 7584 return VectorizationFactor::Disabled(); 7585 7586 return {VF, 0 /*Cost*/, 0 /* ScalarCost */}; 7587 } 7588 7589 LLVM_DEBUG( 7590 dbgs() << "LV: Not vectorizing. Inner loops aren't supported in the " 7591 "VPlan-native path.\n"); 7592 return VectorizationFactor::Disabled(); 7593 } 7594 7595 std::optional<VectorizationFactor> 7596 LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) { 7597 assert(OrigLoop->isInnermost() && "Inner loop expected."); 7598 CM.collectValuesToIgnore(); 7599 CM.collectElementTypesForWidening(); 7600 7601 FixedScalableVFPair MaxFactors = CM.computeMaxVF(UserVF, UserIC); 7602 if (!MaxFactors) // Cases that should not to be vectorized nor interleaved. 7603 return std::nullopt; 7604 7605 // Invalidate interleave groups if all blocks of loop will be predicated. 7606 if (CM.blockNeedsPredicationForAnyReason(OrigLoop->getHeader()) && 7607 !useMaskedInterleavedAccesses(TTI)) { 7608 LLVM_DEBUG( 7609 dbgs() 7610 << "LV: Invalidate all interleaved groups due to fold-tail by masking " 7611 "which requires masked-interleaved support.\n"); 7612 if (CM.InterleaveInfo.invalidateGroups()) 7613 // Invalidating interleave groups also requires invalidating all decisions 7614 // based on them, which includes widening decisions and uniform and scalar 7615 // values. 7616 CM.invalidateCostModelingDecisions(); 7617 } 7618 7619 ElementCount MaxUserVF = 7620 UserVF.isScalable() ? MaxFactors.ScalableVF : MaxFactors.FixedVF; 7621 bool UserVFIsLegal = ElementCount::isKnownLE(UserVF, MaxUserVF); 7622 if (!UserVF.isZero() && UserVFIsLegal) { 7623 assert(isPowerOf2_32(UserVF.getKnownMinValue()) && 7624 "VF needs to be a power of two"); 7625 // Collect the instructions (and their associated costs) that will be more 7626 // profitable to scalarize. 7627 if (CM.selectUserVectorizationFactor(UserVF)) { 7628 LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n"); 7629 CM.collectInLoopReductions(); 7630 buildVPlansWithVPRecipes(UserVF, UserVF); 7631 if (!hasPlanWithVF(UserVF)) { 7632 LLVM_DEBUG(dbgs() << "LV: No VPlan could be built for " << UserVF 7633 << ".\n"); 7634 return std::nullopt; 7635 } 7636 7637 LLVM_DEBUG(printPlans(dbgs())); 7638 return {{UserVF, 0, 0}}; 7639 } else 7640 reportVectorizationInfo("UserVF ignored because of invalid costs.", 7641 "InvalidCost", ORE, OrigLoop); 7642 } 7643 7644 // Populate the set of Vectorization Factor Candidates. 
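  // The candidates are all powers of two from 1 up to the maximum feasible
  // fixed VF, plus the scalable counterparts when a scalable maximum exists,
  // e.g. {1, 2, 4, 8} and {vscale x 1, vscale x 2, vscale x 4} for maxima of
  // 8 and vscale x 4 respectively (example values only).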
7645 ElementCountSet VFCandidates; 7646 for (auto VF = ElementCount::getFixed(1); 7647 ElementCount::isKnownLE(VF, MaxFactors.FixedVF); VF *= 2) 7648 VFCandidates.insert(VF); 7649 for (auto VF = ElementCount::getScalable(1); 7650 ElementCount::isKnownLE(VF, MaxFactors.ScalableVF); VF *= 2) 7651 VFCandidates.insert(VF); 7652 7653 for (const auto &VF : VFCandidates) { 7654 // Collect Uniform and Scalar instructions after vectorization with VF. 7655 CM.collectUniformsAndScalars(VF); 7656 7657 // Collect the instructions (and their associated costs) that will be more 7658 // profitable to scalarize. 7659 if (VF.isVector()) 7660 CM.collectInstsToScalarize(VF); 7661 } 7662 7663 CM.collectInLoopReductions(); 7664 buildVPlansWithVPRecipes(ElementCount::getFixed(1), MaxFactors.FixedVF); 7665 buildVPlansWithVPRecipes(ElementCount::getScalable(1), MaxFactors.ScalableVF); 7666 7667 LLVM_DEBUG(printPlans(dbgs())); 7668 if (!MaxFactors.hasVector()) 7669 return VectorizationFactor::Disabled(); 7670 7671 // Select the optimal vectorization factor. 7672 VectorizationFactor VF = selectVectorizationFactor(VFCandidates); 7673 assert((VF.Width.isScalar() || VF.ScalarCost > 0) && "when vectorizing, the scalar cost must be non-zero."); 7674 if (!hasPlanWithVF(VF.Width)) { 7675 LLVM_DEBUG(dbgs() << "LV: No VPlan could be built for " << VF.Width 7676 << ".\n"); 7677 return std::nullopt; 7678 } 7679 return VF; 7680 } 7681 7682 VPlan &LoopVectorizationPlanner::getBestPlanFor(ElementCount VF) const { 7683 assert(count_if(VPlans, 7684 [VF](const VPlanPtr &Plan) { return Plan->hasVF(VF); }) == 7685 1 && 7686 "Best VF has not a single VPlan."); 7687 7688 for (const VPlanPtr &Plan : VPlans) { 7689 if (Plan->hasVF(VF)) 7690 return *Plan.get(); 7691 } 7692 llvm_unreachable("No plan found!"); 7693 } 7694 7695 static void AddRuntimeUnrollDisableMetaData(Loop *L) { 7696 SmallVector<Metadata *, 4> MDs; 7697 // Reserve first location for self reference to the LoopID metadata node. 7698 MDs.push_back(nullptr); 7699 bool IsUnrollMetadata = false; 7700 MDNode *LoopID = L->getLoopID(); 7701 if (LoopID) { 7702 // First find existing loop unrolling disable metadata. 7703 for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) { 7704 auto *MD = dyn_cast<MDNode>(LoopID->getOperand(i)); 7705 if (MD) { 7706 const auto *S = dyn_cast<MDString>(MD->getOperand(0)); 7707 IsUnrollMetadata = 7708 S && S->getString().startswith("llvm.loop.unroll.disable"); 7709 } 7710 MDs.push_back(LoopID->getOperand(i)); 7711 } 7712 } 7713 7714 if (!IsUnrollMetadata) { 7715 // Add runtime unroll disable metadata. 7716 LLVMContext &Context = L->getHeader()->getContext(); 7717 SmallVector<Metadata *, 1> DisableOperands; 7718 DisableOperands.push_back( 7719 MDString::get(Context, "llvm.loop.unroll.runtime.disable")); 7720 MDNode *DisableNode = MDNode::get(Context, DisableOperands); 7721 MDs.push_back(DisableNode); 7722 MDNode *NewLoopID = MDNode::get(Context, MDs); 7723 // Set operand 0 to refer to the loop id itself. 
7724 NewLoopID->replaceOperandWith(0, NewLoopID); 7725 L->setLoopID(NewLoopID); 7726 } 7727 } 7728 7729 SCEV2ValueTy LoopVectorizationPlanner::executePlan( 7730 ElementCount BestVF, unsigned BestUF, VPlan &BestVPlan, 7731 InnerLoopVectorizer &ILV, DominatorTree *DT, bool IsEpilogueVectorization, 7732 DenseMap<const SCEV *, Value *> *ExpandedSCEVs) { 7733 assert(BestVPlan.hasVF(BestVF) && 7734 "Trying to execute plan with unsupported VF"); 7735 assert(BestVPlan.hasUF(BestUF) && 7736 "Trying to execute plan with unsupported UF"); 7737 assert( 7738 (IsEpilogueVectorization || !ExpandedSCEVs) && 7739 "expanded SCEVs to reuse can only be used during epilogue vectorization"); 7740 7741 LLVM_DEBUG(dbgs() << "Executing best plan with VF=" << BestVF << ", UF=" << BestUF 7742 << '\n'); 7743 7744 if (!IsEpilogueVectorization) 7745 VPlanTransforms::optimizeForVFAndUF(BestVPlan, BestVF, BestUF, PSE); 7746 7747 // Perform the actual loop transformation. 7748 VPTransformState State{BestVF, BestUF, LI, DT, ILV.Builder, &ILV, &BestVPlan}; 7749 7750 // 0. Generate SCEV-dependent code into the preheader, including TripCount, 7751 // before making any changes to the CFG. 7752 if (!BestVPlan.getPreheader()->empty()) { 7753 State.CFG.PrevBB = OrigLoop->getLoopPreheader(); 7754 State.Builder.SetInsertPoint(OrigLoop->getLoopPreheader()->getTerminator()); 7755 BestVPlan.getPreheader()->execute(&State); 7756 } 7757 if (!ILV.getTripCount()) 7758 ILV.setTripCount(State.get(BestVPlan.getTripCount(), {0, 0})); 7759 else 7760 assert(IsEpilogueVectorization && "should only re-use the existing trip " 7761 "count during epilogue vectorization"); 7762 7763 // 1. Set up the skeleton for vectorization, including vector pre-header and 7764 // middle block. The vector loop is created during VPlan execution. 7765 Value *CanonicalIVStartValue; 7766 std::tie(State.CFG.PrevBB, CanonicalIVStartValue) = 7767 ILV.createVectorizedLoopSkeleton(ExpandedSCEVs ? *ExpandedSCEVs 7768 : State.ExpandedSCEVs); 7769 7770 // Only use noalias metadata when using memory checks guaranteeing no overlap 7771 // across all iterations. 7772 const LoopAccessInfo *LAI = ILV.Legal->getLAI(); 7773 std::unique_ptr<LoopVersioning> LVer = nullptr; 7774 if (LAI && !LAI->getRuntimePointerChecking()->getChecks().empty() && 7775 !LAI->getRuntimePointerChecking()->getDiffChecks()) { 7776 7777 // We currently don't use LoopVersioning for the actual loop cloning but we 7778 // still use it to add the noalias metadata. 7779 // TODO: Find a better way to re-use LoopVersioning functionality to add 7780 // metadata. 7781 LVer = std::make_unique<LoopVersioning>( 7782 *LAI, LAI->getRuntimePointerChecking()->getChecks(), OrigLoop, LI, DT, 7783 PSE.getSE()); 7784 State.LVer = &*LVer; 7785 State.LVer->prepareNoAliasMetadata(); 7786 } 7787 7788 ILV.collectPoisonGeneratingRecipes(State); 7789 7790 ILV.printDebugTracesAtStart(); 7791 7792 //===------------------------------------------------===// 7793 // 7794 // Notice: any optimization or new instruction that go 7795 // into the code below should also be implemented in 7796 // the cost-model. 7797 // 7798 //===------------------------------------------------===// 7799 7800 // 2. Copy and widen instructions from the old loop into the new loop. 
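  // prepareToExecute hands the trip count, the vector trip count and the
  // canonical IV start value to the plan before its recipes are executed.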
7801 BestVPlan.prepareToExecute( 7802 ILV.getTripCount(), ILV.getOrCreateVectorTripCount(nullptr), 7803 CanonicalIVStartValue, State, IsEpilogueVectorization); 7804 7805 BestVPlan.execute(&State); 7806 7807 // Keep all loop hints from the original loop on the vector loop (we'll 7808 // replace the vectorizer-specific hints below). 7809 MDNode *OrigLoopID = OrigLoop->getLoopID(); 7810 7811 std::optional<MDNode *> VectorizedLoopID = 7812 makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll, 7813 LLVMLoopVectorizeFollowupVectorized}); 7814 7815 VPBasicBlock *HeaderVPBB = 7816 BestVPlan.getVectorLoopRegion()->getEntryBasicBlock(); 7817 Loop *L = LI->getLoopFor(State.CFG.VPBB2IRBB[HeaderVPBB]); 7818 if (VectorizedLoopID) 7819 L->setLoopID(*VectorizedLoopID); 7820 else { 7821 // Keep all loop hints from the original loop on the vector loop (we'll 7822 // replace the vectorizer-specific hints below). 7823 if (MDNode *LID = OrigLoop->getLoopID()) 7824 L->setLoopID(LID); 7825 7826 LoopVectorizeHints Hints(L, true, *ORE); 7827 Hints.setAlreadyVectorized(); 7828 } 7829 TargetTransformInfo::UnrollingPreferences UP; 7830 TTI.getUnrollingPreferences(L, *PSE.getSE(), UP, ORE); 7831 if (!UP.UnrollVectorizedLoop || CanonicalIVStartValue) 7832 AddRuntimeUnrollDisableMetaData(L); 7833 7834 // 3. Fix the vectorized code: take care of header phi's, live-outs, 7835 // predication, updating analyses. 7836 ILV.fixVectorizedLoop(State, BestVPlan); 7837 7838 ILV.printDebugTracesAtEnd(); 7839 7840 return State.ExpandedSCEVs; 7841 } 7842 7843 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 7844 void LoopVectorizationPlanner::printPlans(raw_ostream &O) { 7845 for (const auto &Plan : VPlans) 7846 if (PrintVPlansInDotFormat) 7847 Plan->printDOT(O); 7848 else 7849 Plan->print(O); 7850 } 7851 #endif 7852 7853 //===--------------------------------------------------------------------===// 7854 // EpilogueVectorizerMainLoop 7855 //===--------------------------------------------------------------------===// 7856 7857 /// This function is partially responsible for generating the control flow 7858 /// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization. 7859 std::pair<BasicBlock *, Value *> 7860 EpilogueVectorizerMainLoop::createEpilogueVectorizedLoopSkeleton( 7861 const SCEV2ValueTy &ExpandedSCEVs) { 7862 createVectorLoopSkeleton(""); 7863 7864 // Generate the code to check the minimum iteration count of the vector 7865 // epilogue (see below). 7866 EPI.EpilogueIterationCountCheck = 7867 emitIterationCountCheck(LoopScalarPreHeader, true); 7868 EPI.EpilogueIterationCountCheck->setName("iter.check"); 7869 7870 // Generate the code to check any assumptions that we've made for SCEV 7871 // expressions. 7872 EPI.SCEVSafetyCheck = emitSCEVChecks(LoopScalarPreHeader); 7873 7874 // Generate the code that checks at runtime if arrays overlap. We put the 7875 // checks into a separate block to make the more common case of few elements 7876 // faster. 7877 EPI.MemSafetyCheck = emitMemRuntimeChecks(LoopScalarPreHeader); 7878 7879 // Generate the iteration count check for the main loop, *after* the check 7880 // for the epilogue loop, so that the path-length is shorter for the case 7881 // that goes directly through the vector epilogue. The longer-path length for 7882 // the main loop is compensated for, by the gain from vectorizing the larger 7883 // trip count. Note: the branch will get updated later on when we vectorize 7884 // the epilogue. 
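  // The preheader thus ends up with two trip-count checks in sequence: first
  // "too few iterations even for the epilogue VF?" branching straight to the
  // scalar loop, then "too few for the main VF * UF?", whose branch is later
  // retargeted at the vector epilogue.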
7885 EPI.MainLoopIterationCountCheck = 7886 emitIterationCountCheck(LoopScalarPreHeader, false); 7887 7888 // Generate the induction variable. 7889 EPI.VectorTripCount = getOrCreateVectorTripCount(LoopVectorPreHeader); 7890 7891 // Skip induction resume value creation here because they will be created in 7892 // the second pass for the scalar loop. The induction resume values for the 7893 // inductions in the epilogue loop are created before executing the plan for 7894 // the epilogue loop. 7895 7896 return {completeLoopSkeleton(), nullptr}; 7897 } 7898 7899 void EpilogueVectorizerMainLoop::printDebugTracesAtStart() { 7900 LLVM_DEBUG({ 7901 dbgs() << "Create Skeleton for epilogue vectorized loop (first pass)\n" 7902 << "Main Loop VF:" << EPI.MainLoopVF 7903 << ", Main Loop UF:" << EPI.MainLoopUF 7904 << ", Epilogue Loop VF:" << EPI.EpilogueVF 7905 << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n"; 7906 }); 7907 } 7908 7909 void EpilogueVectorizerMainLoop::printDebugTracesAtEnd() { 7910 DEBUG_WITH_TYPE(VerboseDebug, { 7911 dbgs() << "intermediate fn:\n" 7912 << *OrigLoop->getHeader()->getParent() << "\n"; 7913 }); 7914 } 7915 7916 BasicBlock * 7917 EpilogueVectorizerMainLoop::emitIterationCountCheck(BasicBlock *Bypass, 7918 bool ForEpilogue) { 7919 assert(Bypass && "Expected valid bypass basic block."); 7920 ElementCount VFactor = ForEpilogue ? EPI.EpilogueVF : VF; 7921 unsigned UFactor = ForEpilogue ? EPI.EpilogueUF : UF; 7922 Value *Count = getTripCount(); 7923 // Reuse existing vector loop preheader for TC checks. 7924 // Note that new preheader block is generated for vector loop. 7925 BasicBlock *const TCCheckBlock = LoopVectorPreHeader; 7926 IRBuilder<> Builder(TCCheckBlock->getTerminator()); 7927 7928 // Generate code to check if the loop's trip count is less than VF * UF of the 7929 // main vector loop. 7930 auto P = Cost->requiresScalarEpilogue(ForEpilogue ? EPI.EpilogueVF.isVector() 7931 : VF.isVector()) 7932 ? ICmpInst::ICMP_ULE 7933 : ICmpInst::ICMP_ULT; 7934 7935 Value *CheckMinIters = Builder.CreateICmp( 7936 P, Count, createStepForVF(Builder, Count->getType(), VFactor, UFactor), 7937 "min.iters.check"); 7938 7939 if (!ForEpilogue) 7940 TCCheckBlock->setName("vector.main.loop.iter.check"); 7941 7942 // Create new preheader for vector loop. 7943 LoopVectorPreHeader = SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), 7944 DT, LI, nullptr, "vector.ph"); 7945 7946 if (ForEpilogue) { 7947 assert(DT->properlyDominates(DT->getNode(TCCheckBlock), 7948 DT->getNode(Bypass)->getIDom()) && 7949 "TC check is expected to dominate Bypass"); 7950 7951 // Update dominator for Bypass & LoopExit. 7952 DT->changeImmediateDominator(Bypass, TCCheckBlock); 7953 if (!Cost->requiresScalarEpilogue(EPI.EpilogueVF.isVector())) 7954 // For loops with multiple exits, there's no edge from the middle block 7955 // to exit blocks (as the epilogue must run) and thus no need to update 7956 // the immediate dominator of the exit blocks. 7957 DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock); 7958 7959 LoopBypassBlocks.push_back(TCCheckBlock); 7960 7961 // Save the trip count so we don't have to regenerate it in the 7962 // vec.epilog.iter.check. This is safe to do because the trip count 7963 // generated here dominates the vector epilog iter check. 
7964 EPI.TripCount = Count; 7965 } 7966 7967 ReplaceInstWithInst( 7968 TCCheckBlock->getTerminator(), 7969 BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters)); 7970 7971 return TCCheckBlock; 7972 } 7973 7974 //===--------------------------------------------------------------------===// 7975 // EpilogueVectorizerEpilogueLoop 7976 //===--------------------------------------------------------------------===// 7977 7978 /// This function is partially responsible for generating the control flow 7979 /// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization. 7980 std::pair<BasicBlock *, Value *> 7981 EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton( 7982 const SCEV2ValueTy &ExpandedSCEVs) { 7983 createVectorLoopSkeleton("vec.epilog."); 7984 7985 // Now, compare the remaining count and if there aren't enough iterations to 7986 // execute the vectorized epilogue skip to the scalar part. 7987 BasicBlock *VecEpilogueIterationCountCheck = LoopVectorPreHeader; 7988 VecEpilogueIterationCountCheck->setName("vec.epilog.iter.check"); 7989 LoopVectorPreHeader = 7990 SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT, 7991 LI, nullptr, "vec.epilog.ph"); 7992 emitMinimumVectorEpilogueIterCountCheck(LoopScalarPreHeader, 7993 VecEpilogueIterationCountCheck); 7994 7995 // Adjust the control flow taking the state info from the main loop 7996 // vectorization into account. 7997 assert(EPI.MainLoopIterationCountCheck && EPI.EpilogueIterationCountCheck && 7998 "expected this to be saved from the previous pass."); 7999 EPI.MainLoopIterationCountCheck->getTerminator()->replaceUsesOfWith( 8000 VecEpilogueIterationCountCheck, LoopVectorPreHeader); 8001 8002 DT->changeImmediateDominator(LoopVectorPreHeader, 8003 EPI.MainLoopIterationCountCheck); 8004 8005 EPI.EpilogueIterationCountCheck->getTerminator()->replaceUsesOfWith( 8006 VecEpilogueIterationCountCheck, LoopScalarPreHeader); 8007 8008 if (EPI.SCEVSafetyCheck) 8009 EPI.SCEVSafetyCheck->getTerminator()->replaceUsesOfWith( 8010 VecEpilogueIterationCountCheck, LoopScalarPreHeader); 8011 if (EPI.MemSafetyCheck) 8012 EPI.MemSafetyCheck->getTerminator()->replaceUsesOfWith( 8013 VecEpilogueIterationCountCheck, LoopScalarPreHeader); 8014 8015 DT->changeImmediateDominator( 8016 VecEpilogueIterationCountCheck, 8017 VecEpilogueIterationCountCheck->getSinglePredecessor()); 8018 8019 DT->changeImmediateDominator(LoopScalarPreHeader, 8020 EPI.EpilogueIterationCountCheck); 8021 if (!Cost->requiresScalarEpilogue(EPI.EpilogueVF.isVector())) 8022 // If there is an epilogue which must run, there's no edge from the 8023 // middle block to exit blocks and thus no need to update the immediate 8024 // dominator of the exit blocks. 8025 DT->changeImmediateDominator(LoopExitBlock, 8026 EPI.EpilogueIterationCountCheck); 8027 8028 // Keep track of bypass blocks, as they feed start values to the induction and 8029 // reduction phis in the scalar loop preheader. 8030 if (EPI.SCEVSafetyCheck) 8031 LoopBypassBlocks.push_back(EPI.SCEVSafetyCheck); 8032 if (EPI.MemSafetyCheck) 8033 LoopBypassBlocks.push_back(EPI.MemSafetyCheck); 8034 LoopBypassBlocks.push_back(EPI.EpilogueIterationCountCheck); 8035 8036 // The vec.epilog.iter.check block may contain Phi nodes from inductions or 8037 // reductions which merge control-flow from the latch block and the middle 8038 // block. Update the incoming values here and move the Phi into the preheader. 
  SmallVector<PHINode *, 4> PhisInBlock;
  for (PHINode &Phi : VecEpilogueIterationCountCheck->phis())
    PhisInBlock.push_back(&Phi);

  for (PHINode *Phi : PhisInBlock) {
    Phi->moveBefore(LoopVectorPreHeader->getFirstNonPHI());
    Phi->replaceIncomingBlockWith(
        VecEpilogueIterationCountCheck->getSinglePredecessor(),
        VecEpilogueIterationCountCheck);

    // If the phi doesn't have an incoming value from the
    // EpilogueIterationCountCheck, we are done. Otherwise remove the incoming
    // value and also those from other check blocks. This is needed for
    // reduction phis only.
    if (none_of(Phi->blocks(), [&](BasicBlock *IncB) {
          return EPI.EpilogueIterationCountCheck == IncB;
        }))
      continue;
    Phi->removeIncomingValue(EPI.EpilogueIterationCountCheck);
    if (EPI.SCEVSafetyCheck)
      Phi->removeIncomingValue(EPI.SCEVSafetyCheck);
    if (EPI.MemSafetyCheck)
      Phi->removeIncomingValue(EPI.MemSafetyCheck);
  }

  // Generate a resume induction for the vector epilogue and put it in the
  // vector epilogue preheader.
  Type *IdxTy = Legal->getWidestInductionType();
  PHINode *EPResumeVal = PHINode::Create(IdxTy, 2, "vec.epilog.resume.val",
                                         LoopVectorPreHeader->getFirstNonPHI());
  EPResumeVal->addIncoming(EPI.VectorTripCount, VecEpilogueIterationCountCheck);
  EPResumeVal->addIncoming(ConstantInt::get(IdxTy, 0),
                           EPI.MainLoopIterationCountCheck);

  // Generate induction resume values. These variables save the new starting
  // indexes for the scalar loop. They are used to test if there are any tail
  // iterations left once the vector loop has completed.
  // Note that when the vectorized epilogue is skipped due to the iteration
  // count check, the resume value for the induction variable comes from
  // the trip count of the main vector loop, hence passing the
  // AdditionalBypass argument.
  createInductionResumeValues(ExpandedSCEVs,
                              {VecEpilogueIterationCountCheck,
                               EPI.VectorTripCount} /* AdditionalBypass */);

  return {completeLoopSkeleton(), EPResumeVal};
}

BasicBlock *
EpilogueVectorizerEpilogueLoop::emitMinimumVectorEpilogueIterCountCheck(
    BasicBlock *Bypass, BasicBlock *Insert) {

  assert(EPI.TripCount &&
         "Expected trip count to have been saved in the first pass.");
  assert(
      (!isa<Instruction>(EPI.TripCount) ||
       DT->dominates(cast<Instruction>(EPI.TripCount)->getParent(), Insert)) &&
      "saved trip count does not dominate insertion point.");
  Value *TC = EPI.TripCount;
  IRBuilder<> Builder(Insert->getTerminator());
  Value *Count = Builder.CreateSub(TC, EPI.VectorTripCount, "n.vec.remaining");

  // Generate code to check if the loop's trip count is less than VF * UF of
  // the vector epilogue loop.
  auto P = Cost->requiresScalarEpilogue(EPI.EpilogueVF.isVector())
               ?
ICmpInst::ICMP_ULE 8105 : ICmpInst::ICMP_ULT; 8106 8107 Value *CheckMinIters = 8108 Builder.CreateICmp(P, Count, 8109 createStepForVF(Builder, Count->getType(), 8110 EPI.EpilogueVF, EPI.EpilogueUF), 8111 "min.epilog.iters.check"); 8112 8113 ReplaceInstWithInst( 8114 Insert->getTerminator(), 8115 BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters)); 8116 8117 LoopBypassBlocks.push_back(Insert); 8118 return Insert; 8119 } 8120 8121 void EpilogueVectorizerEpilogueLoop::printDebugTracesAtStart() { 8122 LLVM_DEBUG({ 8123 dbgs() << "Create Skeleton for epilogue vectorized loop (second pass)\n" 8124 << "Epilogue Loop VF:" << EPI.EpilogueVF 8125 << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n"; 8126 }); 8127 } 8128 8129 void EpilogueVectorizerEpilogueLoop::printDebugTracesAtEnd() { 8130 DEBUG_WITH_TYPE(VerboseDebug, { 8131 dbgs() << "final fn:\n" << *OrigLoop->getHeader()->getParent() << "\n"; 8132 }); 8133 } 8134 8135 bool LoopVectorizationPlanner::getDecisionAndClampRange( 8136 const std::function<bool(ElementCount)> &Predicate, VFRange &Range) { 8137 assert(!Range.isEmpty() && "Trying to test an empty VF range."); 8138 bool PredicateAtRangeStart = Predicate(Range.Start); 8139 8140 for (ElementCount TmpVF : VFRange(Range.Start * 2, Range.End)) 8141 if (Predicate(TmpVF) != PredicateAtRangeStart) { 8142 Range.End = TmpVF; 8143 break; 8144 } 8145 8146 return PredicateAtRangeStart; 8147 } 8148 8149 /// Build VPlans for the full range of feasible VF's = {\p MinVF, 2 * \p MinVF, 8150 /// 4 * \p MinVF, ..., \p MaxVF} by repeatedly building a VPlan for a sub-range 8151 /// of VF's starting at a given VF and extending it as much as possible. Each 8152 /// vectorization decision can potentially shorten this sub-range during 8153 /// buildVPlan(). 8154 void LoopVectorizationPlanner::buildVPlans(ElementCount MinVF, 8155 ElementCount MaxVF) { 8156 auto MaxVFTimes2 = MaxVF * 2; 8157 for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFTimes2);) { 8158 VFRange SubRange = {VF, MaxVFTimes2}; 8159 VPlans.push_back(buildVPlan(SubRange)); 8160 VF = SubRange.End; 8161 } 8162 } 8163 8164 VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst, 8165 VPlan &Plan) { 8166 assert(is_contained(predecessors(Dst), Src) && "Invalid edge"); 8167 8168 // Look for cached value. 8169 std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst); 8170 EdgeMaskCacheTy::iterator ECEntryIt = EdgeMaskCache.find(Edge); 8171 if (ECEntryIt != EdgeMaskCache.end()) 8172 return ECEntryIt->second; 8173 8174 VPValue *SrcMask = createBlockInMask(Src, Plan); 8175 8176 // The terminator has to be a branch inst! 8177 BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator()); 8178 assert(BI && "Unexpected terminator found"); 8179 8180 if (!BI->isConditional() || BI->getSuccessor(0) == BI->getSuccessor(1)) 8181 return EdgeMaskCache[Edge] = SrcMask; 8182 8183 // If source is an exiting block, we know the exit edge is dynamically dead 8184 // in the vector loop, and thus we don't need to restrict the mask. Avoid 8185 // adding uses of an otherwise potentially dead instruction. 8186 if (OrigLoop->isLoopExiting(Src)) 8187 return EdgeMaskCache[Edge] = SrcMask; 8188 8189 VPValue *EdgeMask = Plan.getVPValueOrAddLiveIn(BI->getCondition()); 8190 assert(EdgeMask && "No Edge Mask found for condition"); 8191 8192 if (BI->getSuccessor(0) != Dst) 8193 EdgeMask = Builder.createNot(EdgeMask, BI->getDebugLoc()); 8194 8195 if (SrcMask) { // Otherwise block in-mask is all-one, no need to AND. 
8196 // The condition is 'SrcMask && EdgeMask', which is equivalent to 8197 // 'select i1 SrcMask, i1 EdgeMask, i1 false'. 8198 // The select version does not introduce new UB if SrcMask is false and 8199 // EdgeMask is poison. Using 'and' here introduces undefined behavior. 8200 VPValue *False = Plan.getVPValueOrAddLiveIn( 8201 ConstantInt::getFalse(BI->getCondition()->getType())); 8202 EdgeMask = 8203 Builder.createSelect(SrcMask, EdgeMask, False, BI->getDebugLoc()); 8204 } 8205 8206 return EdgeMaskCache[Edge] = EdgeMask; 8207 } 8208 8209 VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlan &Plan) { 8210 assert(OrigLoop->contains(BB) && "Block is not a part of a loop"); 8211 8212 // Look for cached value. 8213 BlockMaskCacheTy::iterator BCEntryIt = BlockMaskCache.find(BB); 8214 if (BCEntryIt != BlockMaskCache.end()) 8215 return BCEntryIt->second; 8216 8217 // All-one mask is modelled as no-mask following the convention for masked 8218 // load/store/gather/scatter. Initialize BlockMask to no-mask. 8219 VPValue *BlockMask = nullptr; 8220 8221 if (OrigLoop->getHeader() == BB) { 8222 if (!CM.blockNeedsPredicationForAnyReason(BB)) 8223 return BlockMaskCache[BB] = BlockMask; // Loop incoming mask is all-one. 8224 8225 assert(CM.foldTailByMasking() && "must fold the tail"); 8226 8227 // If we're using the active lane mask for control flow, then we get the 8228 // mask from the active lane mask PHI that is cached in the VPlan. 8229 TailFoldingStyle TFStyle = CM.getTailFoldingStyle(); 8230 if (useActiveLaneMaskForControlFlow(TFStyle)) 8231 return BlockMaskCache[BB] = Plan.getActiveLaneMaskPhi(); 8232 8233 // Introduce the early-exit compare IV <= BTC to form header block mask. 8234 // This is used instead of IV < TC because TC may wrap, unlike BTC. Start by 8235 // constructing the desired canonical IV in the header block as its first 8236 // non-phi instructions. 8237 8238 VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock(); 8239 auto NewInsertionPoint = HeaderVPBB->getFirstNonPhi(); 8240 auto *IV = new VPWidenCanonicalIVRecipe(Plan.getCanonicalIV()); 8241 HeaderVPBB->insert(IV, HeaderVPBB->getFirstNonPhi()); 8242 8243 VPBuilder::InsertPointGuard Guard(Builder); 8244 Builder.setInsertPoint(HeaderVPBB, NewInsertionPoint); 8245 if (useActiveLaneMask(TFStyle)) { 8246 VPValue *TC = Plan.getTripCount(); 8247 BlockMask = Builder.createNaryOp(VPInstruction::ActiveLaneMask, {IV, TC}, 8248 nullptr, "active.lane.mask"); 8249 } else { 8250 VPValue *BTC = Plan.getOrCreateBackedgeTakenCount(); 8251 BlockMask = Builder.createNaryOp(VPInstruction::ICmpULE, {IV, BTC}); 8252 } 8253 return BlockMaskCache[BB] = BlockMask; 8254 } 8255 8256 // This is the block mask. We OR all incoming edges. 8257 for (auto *Predecessor : predecessors(BB)) { 8258 VPValue *EdgeMask = createEdgeMask(Predecessor, BB, Plan); 8259 if (!EdgeMask) // Mask of predecessor is all-one so mask of block is too. 8260 return BlockMaskCache[BB] = EdgeMask; 8261 8262 if (!BlockMask) { // BlockMask has its initialized nullptr value. 
8263 BlockMask = EdgeMask; 8264 continue; 8265 } 8266 8267 BlockMask = Builder.createOr(BlockMask, EdgeMask, {}); 8268 } 8269 8270 return BlockMaskCache[BB] = BlockMask; 8271 } 8272 8273 VPRecipeBase *VPRecipeBuilder::tryToWidenMemory(Instruction *I, 8274 ArrayRef<VPValue *> Operands, 8275 VFRange &Range, 8276 VPlanPtr &Plan) { 8277 assert((isa<LoadInst>(I) || isa<StoreInst>(I)) && 8278 "Must be called with either a load or store"); 8279 8280 auto willWiden = [&](ElementCount VF) -> bool { 8281 LoopVectorizationCostModel::InstWidening Decision = 8282 CM.getWideningDecision(I, VF); 8283 assert(Decision != LoopVectorizationCostModel::CM_Unknown && 8284 "CM decision should be taken at this point."); 8285 if (Decision == LoopVectorizationCostModel::CM_Interleave) 8286 return true; 8287 if (CM.isScalarAfterVectorization(I, VF) || 8288 CM.isProfitableToScalarize(I, VF)) 8289 return false; 8290 return Decision != LoopVectorizationCostModel::CM_Scalarize; 8291 }; 8292 8293 if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range)) 8294 return nullptr; 8295 8296 VPValue *Mask = nullptr; 8297 if (Legal->isMaskRequired(I)) 8298 Mask = createBlockInMask(I->getParent(), *Plan); 8299 8300 // Determine if the pointer operand of the access is either consecutive or 8301 // reverse consecutive. 8302 LoopVectorizationCostModel::InstWidening Decision = 8303 CM.getWideningDecision(I, Range.Start); 8304 bool Reverse = Decision == LoopVectorizationCostModel::CM_Widen_Reverse; 8305 bool Consecutive = 8306 Reverse || Decision == LoopVectorizationCostModel::CM_Widen; 8307 8308 if (LoadInst *Load = dyn_cast<LoadInst>(I)) 8309 return new VPWidenMemoryInstructionRecipe(*Load, Operands[0], Mask, 8310 Consecutive, Reverse); 8311 8312 StoreInst *Store = cast<StoreInst>(I); 8313 return new VPWidenMemoryInstructionRecipe(*Store, Operands[1], Operands[0], 8314 Mask, Consecutive, Reverse); 8315 } 8316 8317 /// Creates a VPWidenIntOrFpInductionRecipe for \p Phi. If needed, it will also 8318 /// insert a recipe to expand the step for the induction recipe. 8319 static VPWidenIntOrFpInductionRecipe * 8320 createWidenInductionRecipes(PHINode *Phi, Instruction *PhiOrTrunc, 8321 VPValue *Start, const InductionDescriptor &IndDesc, 8322 VPlan &Plan, ScalarEvolution &SE, Loop &OrigLoop, 8323 VFRange &Range) { 8324 assert(IndDesc.getStartValue() == 8325 Phi->getIncomingValueForBlock(OrigLoop.getLoopPreheader())); 8326 assert(SE.isLoopInvariant(IndDesc.getStep(), &OrigLoop) && 8327 "step must be loop invariant"); 8328 8329 VPValue *Step = 8330 vputils::getOrCreateVPValueForSCEVExpr(Plan, IndDesc.getStep(), SE); 8331 if (auto *TruncI = dyn_cast<TruncInst>(PhiOrTrunc)) { 8332 return new VPWidenIntOrFpInductionRecipe(Phi, Start, Step, IndDesc, TruncI); 8333 } 8334 assert(isa<PHINode>(PhiOrTrunc) && "must be a phi node here"); 8335 return new VPWidenIntOrFpInductionRecipe(Phi, Start, Step, IndDesc); 8336 } 8337 8338 VPRecipeBase *VPRecipeBuilder::tryToOptimizeInductionPHI( 8339 PHINode *Phi, ArrayRef<VPValue *> Operands, VPlan &Plan, VFRange &Range) { 8340 8341 // Check if this is an integer or fp induction. If so, build the recipe that 8342 // produces its scalar and vector values. 8343 if (auto *II = Legal->getIntOrFpInductionDescriptor(Phi)) 8344 return createWidenInductionRecipes(Phi, Phi, Operands[0], *II, Plan, 8345 *PSE.getSE(), *OrigLoop, Range); 8346 8347 // Check if this is pointer induction. If so, build the recipe for it.
8348 if (auto *II = Legal->getPointerInductionDescriptor(Phi)) { 8349 VPValue *Step = vputils::getOrCreateVPValueForSCEVExpr(Plan, II->getStep(), 8350 *PSE.getSE()); 8351 return new VPWidenPointerInductionRecipe( 8352 Phi, Operands[0], Step, *II, 8353 LoopVectorizationPlanner::getDecisionAndClampRange( 8354 [&](ElementCount VF) { 8355 return CM.isScalarAfterVectorization(Phi, VF); 8356 }, 8357 Range)); 8358 } 8359 return nullptr; 8360 } 8361 8362 VPWidenIntOrFpInductionRecipe *VPRecipeBuilder::tryToOptimizeInductionTruncate( 8363 TruncInst *I, ArrayRef<VPValue *> Operands, VFRange &Range, VPlan &Plan) { 8364 // Optimize the special case where the source is a constant integer 8365 // induction variable. Notice that we can only optimize the 'trunc' case 8366 // because (a) FP conversions lose precision, (b) sext/zext may wrap, and 8367 // (c) other casts depend on pointer size. 8368 8369 // Determine whether \p K is a truncation based on an induction variable that 8370 // can be optimized. 8371 auto isOptimizableIVTruncate = 8372 [&](Instruction *K) -> std::function<bool(ElementCount)> { 8373 return [=](ElementCount VF) -> bool { 8374 return CM.isOptimizableIVTruncate(K, VF); 8375 }; 8376 }; 8377 8378 if (LoopVectorizationPlanner::getDecisionAndClampRange( 8379 isOptimizableIVTruncate(I), Range)) { 8380 8381 auto *Phi = cast<PHINode>(I->getOperand(0)); 8382 const InductionDescriptor &II = *Legal->getIntOrFpInductionDescriptor(Phi); 8383 VPValue *Start = Plan.getVPValueOrAddLiveIn(II.getStartValue()); 8384 return createWidenInductionRecipes(Phi, I, Start, II, Plan, *PSE.getSE(), 8385 *OrigLoop, Range); 8386 } 8387 return nullptr; 8388 } 8389 8390 VPRecipeOrVPValueTy VPRecipeBuilder::tryToBlend(PHINode *Phi, 8391 ArrayRef<VPValue *> Operands, 8392 VPlanPtr &Plan) { 8393 // If all incoming values are equal, the incoming VPValue can be used directly 8394 // instead of creating a new VPBlendRecipe. 8395 if (llvm::all_equal(Operands)) 8396 return Operands[0]; 8397 8398 unsigned NumIncoming = Phi->getNumIncomingValues(); 8399 // For in-loop reductions, we do not need to create an additional select. 8400 VPValue *InLoopVal = nullptr; 8401 for (unsigned In = 0; In < NumIncoming; In++) { 8402 PHINode *PhiOp = 8403 dyn_cast_or_null<PHINode>(Operands[In]->getUnderlyingValue()); 8404 if (PhiOp && CM.isInLoopReduction(PhiOp)) { 8405 assert(!InLoopVal && "Found more than one in-loop reduction!"); 8406 InLoopVal = Operands[In]; 8407 } 8408 } 8409 8410 assert((!InLoopVal || NumIncoming == 2) && 8411 "Found an in-loop reduction for PHI with unexpected number of " 8412 "incoming values"); 8413 if (InLoopVal) 8414 return Operands[Operands[0] == InLoopVal ? 1 : 0]; 8415 8416 // We know that all PHIs in non-header blocks are converted into selects, so 8417 // we don't have to worry about the insertion order and we can just use the 8418 // builder. At this point we generate the predication tree. There may be 8419 // duplications since this is a simple recursive scan, but future 8420 // optimizations will clean it up. 
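// For illustration only (the value names and VF below are assumptions, not
// something this code emits verbatim): a non-header phi such as
//   %p = phi i32 [ %a, %if.then ], [ %b, %if.else ]
// becomes a VPBlendRecipe whose lowering amounts to a mask-driven select
// chain, roughly
//   %p.vec = select <4 x i1> %mask.if.then, <4 x i32> %a.vec, <4 x i32> %b.vec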
8421 SmallVector<VPValue *, 2> OperandsWithMask; 8422 8423 for (unsigned In = 0; In < NumIncoming; In++) { 8424 VPValue *EdgeMask = 8425 createEdgeMask(Phi->getIncomingBlock(In), Phi->getParent(), *Plan); 8426 assert((EdgeMask || NumIncoming == 1) && 8427 "Multiple predecessors with one having a full mask"); 8428 OperandsWithMask.push_back(Operands[In]); 8429 if (EdgeMask) 8430 OperandsWithMask.push_back(EdgeMask); 8431 } 8432 return toVPRecipeResult(new VPBlendRecipe(Phi, OperandsWithMask)); 8433 } 8434 8435 VPWidenCallRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI, 8436 ArrayRef<VPValue *> Operands, 8437 VFRange &Range, 8438 VPlanPtr &Plan) { 8439 bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange( 8440 [this, CI](ElementCount VF) { 8441 return CM.isScalarWithPredication(CI, VF); 8442 }, 8443 Range); 8444 8445 if (IsPredicated) 8446 return nullptr; 8447 8448 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 8449 if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end || 8450 ID == Intrinsic::lifetime_start || ID == Intrinsic::sideeffect || 8451 ID == Intrinsic::pseudoprobe || 8452 ID == Intrinsic::experimental_noalias_scope_decl)) 8453 return nullptr; 8454 8455 SmallVector<VPValue *, 4> Ops(Operands.take_front(CI->arg_size())); 8456 8457 // Is it beneficial to perform intrinsic call compared to lib call? 8458 bool ShouldUseVectorIntrinsic = 8459 ID && LoopVectorizationPlanner::getDecisionAndClampRange( 8460 [&](ElementCount VF) -> bool { 8461 Function *Variant; 8462 // Is it beneficial to perform intrinsic call compared to lib 8463 // call? 8464 InstructionCost CallCost = 8465 CM.getVectorCallCost(CI, VF, &Variant); 8466 InstructionCost IntrinsicCost = 8467 CM.getVectorIntrinsicCost(CI, VF); 8468 return IntrinsicCost <= CallCost; 8469 }, 8470 Range); 8471 if (ShouldUseVectorIntrinsic) 8472 return new VPWidenCallRecipe(*CI, make_range(Ops.begin(), Ops.end()), ID); 8473 8474 Function *Variant = nullptr; 8475 ElementCount VariantVF; 8476 bool NeedsMask = false; 8477 // Is it better to call a vectorized version of the function than to 8478 // scalarize the call? 8479 auto ShouldUseVectorCall = LoopVectorizationPlanner::getDecisionAndClampRange( 8480 [&](ElementCount VF) -> bool { 8481 // The following case may be scalarized depending on the VF. 8482 // The flag shows whether we can use a usual Call for vectorized 8483 // version of the instruction. 8484 8485 // If we've found a variant at a previous VF, then stop looking. A 8486 // vectorized variant of a function expects input in a certain shape 8487 // -- basically the number of input registers, the number of lanes 8488 // per register, and whether there's a mask required. 8489 // We store a pointer to the variant in the VPWidenCallRecipe, so 8490 // once we have an appropriate variant it's only valid for that VF. 8491 // This will force a different vplan to be generated for each VF that 8492 // finds a valid variant. 8493 if (Variant) 8494 return false; 8495 CM.getVectorCallCost(CI, VF, &Variant, &NeedsMask); 8496 // If we found a valid vector variant at this VF, then store the VF 8497 // in case we need to generate a mask.
8498 if (Variant) 8499 VariantVF = VF; 8500 return Variant != nullptr; 8501 }, 8502 Range); 8503 if (ShouldUseVectorCall) { 8504 if (NeedsMask) { 8505 // We have 2 cases that would require a mask: 8506 // 1) The block needs to be predicated, either due to a conditional 8507 // in the scalar loop or use of an active lane mask with 8508 // tail-folding, and we use the appropriate mask for the block. 8509 // 2) No mask is required for the block, but the only available 8510 // vector variant at this VF requires a mask, so we synthesize an 8511 // all-true mask. 8512 VPValue *Mask = nullptr; 8513 if (Legal->isMaskRequired(CI)) 8514 Mask = createBlockInMask(CI->getParent(), *Plan); 8515 else 8516 Mask = Plan->getVPValueOrAddLiveIn(ConstantInt::getTrue( 8517 IntegerType::getInt1Ty(Variant->getFunctionType()->getContext()))); 8518 8519 VFShape Shape = VFShape::get(*CI, VariantVF, /*HasGlobalPred=*/true); 8520 unsigned MaskPos = 0; 8521 8522 for (const VFInfo &Info : VFDatabase::getMappings(*CI)) 8523 if (Info.Shape == Shape) { 8524 assert(Info.isMasked() && "Vector function info shape mismatch"); 8525 MaskPos = Info.getParamIndexForOptionalMask().value(); 8526 break; 8527 } 8528 8529 Ops.insert(Ops.begin() + MaskPos, Mask); 8530 } 8531 8532 return new VPWidenCallRecipe(*CI, make_range(Ops.begin(), Ops.end()), 8533 Intrinsic::not_intrinsic, Variant); 8534 } 8535 8536 return nullptr; 8537 } 8538 8539 bool VPRecipeBuilder::shouldWiden(Instruction *I, VFRange &Range) const { 8540 assert(!isa<BranchInst>(I) && !isa<PHINode>(I) && !isa<LoadInst>(I) && 8541 !isa<StoreInst>(I) && "Instruction should have been handled earlier"); 8542 // Instruction should be widened, unless it is scalar after vectorization, 8543 // scalarization is profitable or it is predicated. 8544 auto WillScalarize = [this, I](ElementCount VF) -> bool { 8545 return CM.isScalarAfterVectorization(I, VF) || 8546 CM.isProfitableToScalarize(I, VF) || 8547 CM.isScalarWithPredication(I, VF); 8548 }; 8549 return !LoopVectorizationPlanner::getDecisionAndClampRange(WillScalarize, 8550 Range); 8551 } 8552 8553 VPRecipeBase *VPRecipeBuilder::tryToWiden(Instruction *I, 8554 ArrayRef<VPValue *> Operands, 8555 VPBasicBlock *VPBB, VPlanPtr &Plan) { 8556 switch (I->getOpcode()) { 8557 default: 8558 return nullptr; 8559 case Instruction::SDiv: 8560 case Instruction::UDiv: 8561 case Instruction::SRem: 8562 case Instruction::URem: { 8563 // If not provably safe, use a select to form a safe divisor before widening the 8564 // div/rem operation itself. Otherwise fall through to general handling below. 
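// Illustrative sketch only (assumed names, VF=4): a predicated
//   %q = udiv i32 %x, %d
// is widened roughly as
//   %safe.d = select <4 x i1> %block.mask, <4 x i32> %d.vec,
//                    <4 x i32> <i32 1, i32 1, i32 1, i32 1>
//   %q.vec  = udiv <4 x i32> %x.vec, %safe.d
// so lanes that are masked off divide by 1 rather than by a potentially
// poison or zero divisor.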
8565 if (CM.isPredicatedInst(I)) { 8566 SmallVector<VPValue *> Ops(Operands.begin(), Operands.end()); 8567 VPValue *Mask = createBlockInMask(I->getParent(), *Plan); 8568 VPValue *One = Plan->getVPValueOrAddLiveIn( 8569 ConstantInt::get(I->getType(), 1u, false)); 8570 auto *SafeRHS = 8571 new VPInstruction(Instruction::Select, {Mask, Ops[1], One}, 8572 I->getDebugLoc()); 8573 VPBB->appendRecipe(SafeRHS); 8574 Ops[1] = SafeRHS; 8575 return new VPWidenRecipe(*I, make_range(Ops.begin(), Ops.end())); 8576 } 8577 [[fallthrough]]; 8578 } 8579 case Instruction::Add: 8580 case Instruction::And: 8581 case Instruction::AShr: 8582 case Instruction::FAdd: 8583 case Instruction::FCmp: 8584 case Instruction::FDiv: 8585 case Instruction::FMul: 8586 case Instruction::FNeg: 8587 case Instruction::FRem: 8588 case Instruction::FSub: 8589 case Instruction::ICmp: 8590 case Instruction::LShr: 8591 case Instruction::Mul: 8592 case Instruction::Or: 8593 case Instruction::Select: 8594 case Instruction::Shl: 8595 case Instruction::Sub: 8596 case Instruction::Xor: 8597 case Instruction::Freeze: 8598 return new VPWidenRecipe(*I, make_range(Operands.begin(), Operands.end())); 8599 }; 8600 } 8601 8602 void VPRecipeBuilder::fixHeaderPhis() { 8603 BasicBlock *OrigLatch = OrigLoop->getLoopLatch(); 8604 for (VPHeaderPHIRecipe *R : PhisToFix) { 8605 auto *PN = cast<PHINode>(R->getUnderlyingValue()); 8606 VPRecipeBase *IncR = 8607 getRecipe(cast<Instruction>(PN->getIncomingValueForBlock(OrigLatch))); 8608 R->addOperand(IncR->getVPSingleValue()); 8609 } 8610 } 8611 8612 VPRecipeOrVPValueTy VPRecipeBuilder::handleReplication(Instruction *I, 8613 VFRange &Range, 8614 VPlan &Plan) { 8615 bool IsUniform = LoopVectorizationPlanner::getDecisionAndClampRange( 8616 [&](ElementCount VF) { return CM.isUniformAfterVectorization(I, VF); }, 8617 Range); 8618 8619 bool IsPredicated = CM.isPredicatedInst(I); 8620 8621 // Even if the instruction is not marked as uniform, there are certain 8622 // intrinsic calls that can be effectively treated as such, so we check for 8623 // them here. Conservatively, we only do this for scalable vectors, since 8624 // for fixed-width VFs we can always fall back on full scalarization. 8625 if (!IsUniform && Range.Start.isScalable() && isa<IntrinsicInst>(I)) { 8626 switch (cast<IntrinsicInst>(I)->getIntrinsicID()) { 8627 case Intrinsic::assume: 8628 case Intrinsic::lifetime_start: 8629 case Intrinsic::lifetime_end: 8630 // For scalable vectors if one of the operands is variant then we still 8631 // want to mark as uniform, which will generate one instruction for just 8632 // the first lane of the vector. We can't scalarize the call in the same 8633 // way as for fixed-width vectors because we don't know how many lanes 8634 // there are. 8635 // 8636 // The reasons for doing it this way for scalable vectors are: 8637 // 1. For the assume intrinsic generating the instruction for the first 8638 // lane is still better than not generating any at all. For 8639 // example, the input may be a splat across all lanes. 8640 // 2. For the lifetime start/end intrinsics the pointer operand only 8641 // does anything useful when the input comes from a stack object, 8642 // which suggests it should always be uniform. For non-stack objects 8643 // the effect is to poison the object, which still allows us to 8644 // remove the call.
8645 IsUniform = true; 8646 break; 8647 default: 8648 break; 8649 } 8650 } 8651 VPValue *BlockInMask = nullptr; 8652 if (!IsPredicated) { 8653 // Finalize the recipe for Instr, first if it is not predicated. 8654 LLVM_DEBUG(dbgs() << "LV: Scalarizing:" << *I << "\n"); 8655 } else { 8656 LLVM_DEBUG(dbgs() << "LV: Scalarizing and predicating:" << *I << "\n"); 8657 // Instructions marked for predication are replicated and a mask operand is 8658 // added initially. Masked replicate recipes will later be placed under an 8659 // if-then construct to prevent side-effects. Generate recipes to compute 8660 // the block mask for this region. 8661 BlockInMask = createBlockInMask(I->getParent(), Plan); 8662 } 8663 8664 auto *Recipe = new VPReplicateRecipe(I, Plan.mapToVPValues(I->operands()), 8665 IsUniform, BlockInMask); 8666 return toVPRecipeResult(Recipe); 8667 } 8668 8669 VPRecipeOrVPValueTy 8670 VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr, 8671 ArrayRef<VPValue *> Operands, 8672 VFRange &Range, VPBasicBlock *VPBB, 8673 VPlanPtr &Plan) { 8674 // First, check for specific widening recipes that deal with inductions, Phi 8675 // nodes, calls and memory operations. 8676 VPRecipeBase *Recipe; 8677 if (auto Phi = dyn_cast<PHINode>(Instr)) { 8678 if (Phi->getParent() != OrigLoop->getHeader()) 8679 return tryToBlend(Phi, Operands, Plan); 8680 8681 // Always record recipes for header phis. Later first-order recurrence phis 8682 // can have earlier phis as incoming values. 8683 recordRecipeOf(Phi); 8684 8685 if ((Recipe = tryToOptimizeInductionPHI(Phi, Operands, *Plan, Range))) 8686 return toVPRecipeResult(Recipe); 8687 8688 VPHeaderPHIRecipe *PhiRecipe = nullptr; 8689 assert((Legal->isReductionVariable(Phi) || 8690 Legal->isFixedOrderRecurrence(Phi)) && 8691 "can only widen reductions and fixed-order recurrences here"); 8692 VPValue *StartV = Operands[0]; 8693 if (Legal->isReductionVariable(Phi)) { 8694 const RecurrenceDescriptor &RdxDesc = 8695 Legal->getReductionVars().find(Phi)->second; 8696 assert(RdxDesc.getRecurrenceStartValue() == 8697 Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader())); 8698 PhiRecipe = new VPReductionPHIRecipe(Phi, RdxDesc, *StartV, 8699 CM.isInLoopReduction(Phi), 8700 CM.useOrderedReductions(RdxDesc)); 8701 } else { 8702 // TODO: Currently fixed-order recurrences are modeled as chains of 8703 // first-order recurrences. If there are no users of the intermediate 8704 // recurrences in the chain, the fixed order recurrence should be modeled 8705 // directly, enabling more efficient codegen. 8706 PhiRecipe = new VPFirstOrderRecurrencePHIRecipe(Phi, *StartV); 8707 } 8708 8709 // Record the incoming value from the backedge, so we can add the incoming 8710 // value from the backedge after all recipes have been created. 8711 auto *Inc = cast<Instruction>( 8712 Phi->getIncomingValueForBlock(OrigLoop->getLoopLatch())); 8713 auto RecipeIter = Ingredient2Recipe.find(Inc); 8714 if (RecipeIter == Ingredient2Recipe.end()) 8715 recordRecipeOf(Inc); 8716 8717 PhisToFix.push_back(PhiRecipe); 8718 return toVPRecipeResult(PhiRecipe); 8719 } 8720 8721 if (isa<TruncInst>(Instr) && 8722 (Recipe = tryToOptimizeInductionTruncate(cast<TruncInst>(Instr), Operands, 8723 Range, *Plan))) 8724 return toVPRecipeResult(Recipe); 8725 8726 // All widen recipes below deal only with VF > 1. 
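// For illustration (the concrete numbers are assumptions): if Range is {1, 8}
// and the predicate "VF.isScalar()" is true at VF=1 but false at VF=2,
// getDecisionAndClampRange clamps Range.End to 2 and returns true, so this
// VPlan only covers VF=1 and none of the widen recipes below are created.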
8727 if (LoopVectorizationPlanner::getDecisionAndClampRange( 8728 [&](ElementCount VF) { return VF.isScalar(); }, Range)) 8729 return nullptr; 8730 8731 if (auto *CI = dyn_cast<CallInst>(Instr)) 8732 return toVPRecipeResult(tryToWidenCall(CI, Operands, Range, Plan)); 8733 8734 if (isa<LoadInst>(Instr) || isa<StoreInst>(Instr)) 8735 return toVPRecipeResult(tryToWidenMemory(Instr, Operands, Range, Plan)); 8736 8737 if (!shouldWiden(Instr, Range)) 8738 return nullptr; 8739 8740 if (auto GEP = dyn_cast<GetElementPtrInst>(Instr)) 8741 return toVPRecipeResult(new VPWidenGEPRecipe( 8742 GEP, make_range(Operands.begin(), Operands.end()))); 8743 8744 if (auto *SI = dyn_cast<SelectInst>(Instr)) { 8745 return toVPRecipeResult(new VPWidenSelectRecipe( 8746 *SI, make_range(Operands.begin(), Operands.end()))); 8747 } 8748 8749 if (auto *CI = dyn_cast<CastInst>(Instr)) { 8750 return toVPRecipeResult( 8751 new VPWidenCastRecipe(CI->getOpcode(), Operands[0], CI->getType(), CI)); 8752 } 8753 8754 return toVPRecipeResult(tryToWiden(Instr, Operands, VPBB, Plan)); 8755 } 8756 8757 void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF, 8758 ElementCount MaxVF) { 8759 assert(OrigLoop->isInnermost() && "Inner loop expected."); 8760 8761 // Add assume instructions we need to drop to DeadInstructions, to prevent 8762 // them from being added to the VPlan. 8763 // TODO: We only need to drop assumes in blocks that get flattened. If the 8764 // control flow is preserved, we should keep them. 8765 SmallPtrSet<Instruction *, 4> DeadInstructions; 8766 auto &ConditionalAssumes = Legal->getConditionalAssumes(); 8767 DeadInstructions.insert(ConditionalAssumes.begin(), ConditionalAssumes.end()); 8768 8769 auto MaxVFTimes2 = MaxVF * 2; 8770 for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFTimes2);) { 8771 VFRange SubRange = {VF, MaxVFTimes2}; 8772 if (auto Plan = tryToBuildVPlanWithVPRecipes(SubRange, DeadInstructions)) 8773 VPlans.push_back(std::move(*Plan)); 8774 VF = SubRange.End; 8775 } 8776 } 8777 8778 // Add the necessary canonical IV and branch recipes required to control the 8779 // loop. 8780 static void addCanonicalIVRecipes(VPlan &Plan, Type *IdxTy, DebugLoc DL, 8781 TailFoldingStyle Style) { 8782 Value *StartIdx = ConstantInt::get(IdxTy, 0); 8783 auto *StartV = Plan.getVPValueOrAddLiveIn(StartIdx); 8784 8785 // Add a VPCanonicalIVPHIRecipe starting at 0 to the header. 8786 auto *CanonicalIVPHI = new VPCanonicalIVPHIRecipe(StartV, DL); 8787 VPRegionBlock *TopRegion = Plan.getVectorLoopRegion(); 8788 VPBasicBlock *Header = TopRegion->getEntryBasicBlock(); 8789 Header->insert(CanonicalIVPHI, Header->begin()); 8790 8791 // Add a CanonicalIVIncrement{NUW} VPInstruction to increment the scalar 8792 // IV by VF * UF. 8793 bool HasNUW = Style == TailFoldingStyle::None; 8794 auto *CanonicalIVIncrement = 8795 new VPInstruction(HasNUW ? VPInstruction::CanonicalIVIncrementNUW 8796 : VPInstruction::CanonicalIVIncrement, 8797 {CanonicalIVPHI}, DL, "index.next"); 8798 CanonicalIVPHI->addOperand(CanonicalIVIncrement); 8799 8800 VPBasicBlock *EB = TopRegion->getExitingBasicBlock(); 8801 if (useActiveLaneMaskForControlFlow(Style)) { 8802 // Create the active lane mask instruction in the vplan preheader. 8803 VPBasicBlock *VecPreheader = 8804 cast<VPBasicBlock>(Plan.getVectorLoopRegion()->getSinglePredecessor()); 8805 8806 // We can't use StartV directly in the ActiveLaneMask VPInstruction, since 8807 // we have to take unrolling into account.
Each part needs to start at 8808 // Part * VF 8809 auto *CanonicalIVIncrementParts = 8810 new VPInstruction(HasNUW ? VPInstruction::CanonicalIVIncrementForPartNUW 8811 : VPInstruction::CanonicalIVIncrementForPart, 8812 {StartV}, DL, "index.part.next"); 8813 VecPreheader->appendRecipe(CanonicalIVIncrementParts); 8814 8815 // Create the ActiveLaneMask instruction using the correct start values. 8816 VPValue *TC = Plan.getTripCount(); 8817 8818 VPValue *TripCount, *IncrementValue; 8819 if (Style == TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck) { 8820 // When avoiding a runtime check, the active.lane.mask inside the loop 8821 // uses a modified trip count and the induction variable increment is 8822 // done after the active.lane.mask intrinsic is called. 8823 auto *TCMinusVF = 8824 new VPInstruction(VPInstruction::CalculateTripCountMinusVF, {TC}, DL); 8825 VecPreheader->appendRecipe(TCMinusVF); 8826 IncrementValue = CanonicalIVPHI; 8827 TripCount = TCMinusVF; 8828 } else { 8829 // When the loop is guarded by a runtime overflow check for the loop 8830 // induction variable increment by VF, we can increment the value before 8831 // the get.active.lane mask and use the unmodified tripcount. 8832 EB->appendRecipe(CanonicalIVIncrement); 8833 IncrementValue = CanonicalIVIncrement; 8834 TripCount = TC; 8835 } 8836 8837 auto *EntryALM = new VPInstruction(VPInstruction::ActiveLaneMask, 8838 {CanonicalIVIncrementParts, TC}, DL, 8839 "active.lane.mask.entry"); 8840 VecPreheader->appendRecipe(EntryALM); 8841 8842 // Now create the ActiveLaneMaskPhi recipe in the main loop using the 8843 // preheader ActiveLaneMask instruction. 8844 auto *LaneMaskPhi = new VPActiveLaneMaskPHIRecipe(EntryALM, DebugLoc()); 8845 Header->insert(LaneMaskPhi, Header->getFirstNonPhi()); 8846 8847 // Create the active lane mask for the next iteration of the loop. 8848 CanonicalIVIncrementParts = 8849 new VPInstruction(HasNUW ? VPInstruction::CanonicalIVIncrementForPartNUW 8850 : VPInstruction::CanonicalIVIncrementForPart, 8851 {IncrementValue}, DL); 8852 EB->appendRecipe(CanonicalIVIncrementParts); 8853 8854 auto *ALM = new VPInstruction(VPInstruction::ActiveLaneMask, 8855 {CanonicalIVIncrementParts, TripCount}, DL, 8856 "active.lane.mask.next"); 8857 EB->appendRecipe(ALM); 8858 LaneMaskPhi->addOperand(ALM); 8859 8860 if (Style == TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck) { 8861 // Do the increment of the canonical IV after the active.lane.mask, because 8862 // that value is still based off %CanonicalIVPHI 8863 EB->appendRecipe(CanonicalIVIncrement); 8864 } 8865 8866 // We have to invert the mask here because a true condition means jumping 8867 // to the exit block. 8868 auto *NotMask = new VPInstruction(VPInstruction::Not, ALM, DL); 8869 EB->appendRecipe(NotMask); 8870 8871 VPInstruction *BranchBack = 8872 new VPInstruction(VPInstruction::BranchOnCond, {NotMask}, DL); 8873 EB->appendRecipe(BranchBack); 8874 } else { 8875 EB->appendRecipe(CanonicalIVIncrement); 8876 8877 // Add the BranchOnCount VPInstruction to the latch. 8878 VPInstruction *BranchBack = new VPInstruction( 8879 VPInstruction::BranchOnCount, 8880 {CanonicalIVIncrement, &Plan.getVectorTripCount()}, DL); 8881 EB->appendRecipe(BranchBack); 8882 } 8883 } 8884 8885 // Add exit values to \p Plan. VPLiveOuts are added for each LCSSA phi in the 8886 // original exit block. 
8887 static void addUsersInExitBlock(VPBasicBlock *HeaderVPBB, 8888 VPBasicBlock *MiddleVPBB, Loop *OrigLoop, 8889 VPlan &Plan) { 8890 BasicBlock *ExitBB = OrigLoop->getUniqueExitBlock(); 8891 BasicBlock *ExitingBB = OrigLoop->getExitingBlock(); 8892 // Only handle single-exit loops with unique exit blocks for now. 8893 if (!ExitBB || !ExitBB->getSinglePredecessor() || !ExitingBB) 8894 return; 8895 8896 // Introduce VPUsers modeling the exit values. 8897 for (PHINode &ExitPhi : ExitBB->phis()) { 8898 Value *IncomingValue = 8899 ExitPhi.getIncomingValueForBlock(ExitingBB); 8900 VPValue *V = Plan.getVPValueOrAddLiveIn(IncomingValue); 8901 Plan.addLiveOut(&ExitPhi, V); 8902 } 8903 } 8904 8905 std::optional<VPlanPtr> LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes( 8906 VFRange &Range, SmallPtrSetImpl<Instruction *> &DeadInstructions) { 8907 8908 SmallPtrSet<const InterleaveGroup<Instruction> *, 1> InterleaveGroups; 8909 8910 VPRecipeBuilder RecipeBuilder(OrigLoop, TLI, Legal, CM, PSE, Builder); 8911 8912 // --------------------------------------------------------------------------- 8913 // Pre-construction: record ingredients whose recipes we'll need to further 8914 // process after constructing the initial VPlan. 8915 // --------------------------------------------------------------------------- 8916 8917 for (const auto &Reduction : CM.getInLoopReductionChains()) { 8918 PHINode *Phi = Reduction.first; 8919 RecurKind Kind = 8920 Legal->getReductionVars().find(Phi)->second.getRecurrenceKind(); 8921 const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second; 8922 8923 RecipeBuilder.recordRecipeOf(Phi); 8924 for (const auto &R : ReductionOperations) { 8925 RecipeBuilder.recordRecipeOf(R); 8926 // For min/max reductions, where we have a pair of icmp/select, we also 8927 // need to record the ICmp recipe, so it can be removed later. 8928 assert(!RecurrenceDescriptor::isSelectCmpRecurrenceKind(Kind) && 8929 "Only min/max recurrences allowed for inloop reductions"); 8930 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) 8931 RecipeBuilder.recordRecipeOf(cast<Instruction>(R->getOperand(0))); 8932 } 8933 } 8934 8935 // For each interleave group which is relevant for this (possibly trimmed) 8936 // Range, add it to the set of groups to be later applied to the VPlan and add 8937 // placeholders for its members' Recipes which we'll be replacing with a 8938 // single VPInterleaveRecipe. 8939 for (InterleaveGroup<Instruction> *IG : IAI.getInterleaveGroups()) { 8940 auto applyIG = [IG, this](ElementCount VF) -> bool { 8941 bool Result = (VF.isVector() && // Query is illegal for VF == 1 8942 CM.getWideningDecision(IG->getInsertPos(), VF) == 8943 LoopVectorizationCostModel::CM_Interleave); 8944 // For scalable vectors, the only interleave factor currently supported 8945 // is 2 since we require the (de)interleave2 intrinsics instead of 8946 // shufflevectors. 
8947 assert((!Result || !VF.isScalable() || IG->getFactor() == 2) && 8948 "Unsupported interleave factor for scalable vectors"); 8949 return Result; 8950 }; 8951 if (!getDecisionAndClampRange(applyIG, Range)) 8952 continue; 8953 InterleaveGroups.insert(IG); 8954 for (unsigned i = 0; i < IG->getFactor(); i++) 8955 if (Instruction *Member = IG->getMember(i)) 8956 RecipeBuilder.recordRecipeOf(Member); 8957 }; 8958 8959 // --------------------------------------------------------------------------- 8960 // Build initial VPlan: Scan the body of the loop in a topological order to 8961 // visit each basic block after having visited its predecessor basic blocks. 8962 // --------------------------------------------------------------------------- 8963 8964 // Create initial VPlan skeleton, having a basic block for the pre-header 8965 // which contains SCEV expansions that need to happen before the CFG is 8966 // modified; a basic block for the vector pre-header, followed by a region for 8967 // the vector loop, followed by the middle basic block. The skeleton vector 8968 // loop region contains a header and latch basic blocks. 8969 VPlanPtr Plan = VPlan::createInitialVPlan( 8970 createTripCountSCEV(Legal->getWidestInductionType(), PSE, OrigLoop), 8971 *PSE.getSE()); 8972 VPBasicBlock *HeaderVPBB = new VPBasicBlock("vector.body"); 8973 VPBasicBlock *LatchVPBB = new VPBasicBlock("vector.latch"); 8974 VPBlockUtils::insertBlockAfter(LatchVPBB, HeaderVPBB); 8975 auto *TopRegion = new VPRegionBlock(HeaderVPBB, LatchVPBB, "vector loop"); 8976 VPBlockUtils::insertBlockAfter(TopRegion, Plan->getEntry()); 8977 VPBasicBlock *MiddleVPBB = new VPBasicBlock("middle.block"); 8978 VPBlockUtils::insertBlockAfter(MiddleVPBB, TopRegion); 8979 8980 // Don't use getDecisionAndClampRange here, because we don't know the UF 8981 // so this function is better to be conservative, rather than to split 8982 // it up into different VPlans. 8983 bool IVUpdateMayOverflow = false; 8984 for (ElementCount VF : Range) 8985 IVUpdateMayOverflow |= !isIndvarOverflowCheckKnownFalse(&CM, VF); 8986 8987 Instruction *DLInst = 8988 getDebugLocFromInstOrOperands(Legal->getPrimaryInduction()); 8989 addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(), 8990 DLInst ? DLInst->getDebugLoc() : DebugLoc(), 8991 CM.getTailFoldingStyle(IVUpdateMayOverflow)); 8992 8993 // Scan the body of the loop in a topological order to visit each basic block 8994 // after having visited its predecessor basic blocks. 8995 LoopBlocksDFS DFS(OrigLoop); 8996 DFS.perform(LI); 8997 8998 VPBasicBlock *VPBB = HeaderVPBB; 8999 for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) { 9000 // Relevant instructions from basic block BB will be grouped into VPRecipe 9001 // ingredients and fill a new VPBasicBlock. 9002 if (VPBB != HeaderVPBB) 9003 VPBB->setName(BB->getName()); 9004 Builder.setInsertPoint(VPBB); 9005 9006 // Introduce each ingredient into VPlan. 9007 // TODO: Model and preserve debug intrinsics in VPlan. 9008 for (Instruction &I : BB->instructionsWithoutDebug(false)) { 9009 Instruction *Instr = &I; 9010 9011 // First filter out irrelevant instructions, to ensure no recipes are 9012 // built for them. 
9013 if (isa<BranchInst>(Instr) || DeadInstructions.count(Instr)) 9014 continue; 9015 9016 SmallVector<VPValue *, 4> Operands; 9017 auto *Phi = dyn_cast<PHINode>(Instr); 9018 if (Phi && Phi->getParent() == OrigLoop->getHeader()) { 9019 Operands.push_back(Plan->getVPValueOrAddLiveIn( 9020 Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader()))); 9021 } else { 9022 auto OpRange = Plan->mapToVPValues(Instr->operands()); 9023 Operands = {OpRange.begin(), OpRange.end()}; 9024 } 9025 9026 // Invariant stores inside loop will be deleted and a single store 9027 // with the final reduction value will be added to the exit block 9028 StoreInst *SI; 9029 if ((SI = dyn_cast<StoreInst>(&I)) && 9030 Legal->isInvariantAddressOfReduction(SI->getPointerOperand())) 9031 continue; 9032 9033 auto RecipeOrValue = RecipeBuilder.tryToCreateWidenRecipe( 9034 Instr, Operands, Range, VPBB, Plan); 9035 if (!RecipeOrValue) 9036 RecipeOrValue = RecipeBuilder.handleReplication(Instr, Range, *Plan); 9037 // If Instr can be simplified to an existing VPValue, use it. 9038 if (isa<VPValue *>(RecipeOrValue)) { 9039 auto *VPV = cast<VPValue *>(RecipeOrValue); 9040 Plan->addVPValue(Instr, VPV); 9041 // If the re-used value is a recipe, register the recipe for the 9042 // instruction, in case the recipe for Instr needs to be recorded. 9043 if (VPRecipeBase *R = VPV->getDefiningRecipe()) 9044 RecipeBuilder.setRecipe(Instr, R); 9045 continue; 9046 } 9047 // Otherwise, add the new recipe. 9048 VPRecipeBase *Recipe = cast<VPRecipeBase *>(RecipeOrValue); 9049 for (auto *Def : Recipe->definedValues()) { 9050 auto *UV = Def->getUnderlyingValue(); 9051 Plan->addVPValue(UV, Def); 9052 } 9053 9054 RecipeBuilder.setRecipe(Instr, Recipe); 9055 if (isa<VPWidenIntOrFpInductionRecipe>(Recipe) && 9056 HeaderVPBB->getFirstNonPhi() != VPBB->end()) { 9057 // Move VPWidenIntOrFpInductionRecipes for optimized truncates to the 9058 // phi section of HeaderVPBB. 9059 assert(isa<TruncInst>(Instr)); 9060 Recipe->insertBefore(*HeaderVPBB, HeaderVPBB->getFirstNonPhi()); 9061 } else 9062 VPBB->appendRecipe(Recipe); 9063 } 9064 9065 VPBlockUtils::insertBlockAfter(new VPBasicBlock(), VPBB); 9066 VPBB = cast<VPBasicBlock>(VPBB->getSingleSuccessor()); 9067 } 9068 9069 // After here, VPBB should not be used. 9070 VPBB = nullptr; 9071 9072 if (CM.requiresScalarEpilogue(Range)) { 9073 // No edge from the middle block to the unique exit block has been inserted 9074 // and there is nothing to fix from vector loop; phis should have incoming 9075 // from scalar loop only. 9076 } else 9077 addUsersInExitBlock(HeaderVPBB, MiddleVPBB, OrigLoop, *Plan); 9078 9079 assert(isa<VPRegionBlock>(Plan->getVectorLoopRegion()) && 9080 !Plan->getVectorLoopRegion()->getEntryBasicBlock()->empty() && 9081 "entry block must be set to a VPRegionBlock having a non-empty entry " 9082 "VPBasicBlock"); 9083 RecipeBuilder.fixHeaderPhis(); 9084 9085 // --------------------------------------------------------------------------- 9086 // Transform initial VPlan: Apply previously taken decisions, in order, to 9087 // bring the VPlan to its final state. 9088 // --------------------------------------------------------------------------- 9089 9090 // Adjust the recipes for any inloop reductions. 
9091 adjustRecipesForReductions(cast<VPBasicBlock>(TopRegion->getExiting()), Plan, 9092 RecipeBuilder, Range.Start); 9093 9094 // Interleave memory: for each Interleave Group we marked earlier as relevant 9095 // for this VPlan, replace the Recipes widening its memory instructions with a 9096 // single VPInterleaveRecipe at its insertion point. 9097 for (const auto *IG : InterleaveGroups) { 9098 auto *Recipe = cast<VPWidenMemoryInstructionRecipe>( 9099 RecipeBuilder.getRecipe(IG->getInsertPos())); 9100 SmallVector<VPValue *, 4> StoredValues; 9101 for (unsigned i = 0; i < IG->getFactor(); ++i) 9102 if (auto *SI = dyn_cast_or_null<StoreInst>(IG->getMember(i))) { 9103 auto *StoreR = 9104 cast<VPWidenMemoryInstructionRecipe>(RecipeBuilder.getRecipe(SI)); 9105 StoredValues.push_back(StoreR->getStoredValue()); 9106 } 9107 9108 bool NeedsMaskForGaps = 9109 IG->requiresScalarEpilogue() && !CM.isScalarEpilogueAllowed(); 9110 auto *VPIG = new VPInterleaveRecipe(IG, Recipe->getAddr(), StoredValues, 9111 Recipe->getMask(), NeedsMaskForGaps); 9112 VPIG->insertBefore(Recipe); 9113 unsigned J = 0; 9114 for (unsigned i = 0; i < IG->getFactor(); ++i) 9115 if (Instruction *Member = IG->getMember(i)) { 9116 VPRecipeBase *MemberR = RecipeBuilder.getRecipe(Member); 9117 if (!Member->getType()->isVoidTy()) { 9118 VPValue *OriginalV = MemberR->getVPSingleValue(); 9119 OriginalV->replaceAllUsesWith(VPIG->getVPValue(J)); 9120 J++; 9121 } 9122 MemberR->eraseFromParent(); 9123 } 9124 } 9125 9126 for (ElementCount VF : Range) 9127 Plan->addVF(VF); 9128 Plan->setName("Initial VPlan"); 9129 9130 // Replace VPValues for known constant strides guaranteed by predicate scalar 9131 // evolution. 9132 for (auto [_, Stride] : Legal->getLAI()->getSymbolicStrides()) { 9133 auto *StrideV = cast<SCEVUnknown>(Stride)->getValue(); 9134 auto *ScevStride = dyn_cast<SCEVConstant>(PSE.getSCEV(StrideV)); 9135 // Only handle constant strides for now. 9136 if (!ScevStride) 9137 continue; 9138 Constant *CI = ConstantInt::get(Stride->getType(), ScevStride->getAPInt()); 9139 9140 auto *ConstVPV = Plan->getVPValueOrAddLiveIn(CI); 9141 // The versioned value may not be used in the loop directly, so just add a 9142 // new live-in in those cases. 9143 Plan->getVPValueOrAddLiveIn(StrideV)->replaceAllUsesWith(ConstVPV); 9144 } 9145 9146 // From this point onwards, VPlan-to-VPlan transformations may change the plan 9147 // in ways that accessing values using original IR values is incorrect. 9148 Plan->disableValue2VPValue(); 9149 9150 // Sink users of fixed-order recurrence past the recipe defining the previous 9151 // value and introduce FirstOrderRecurrenceSplice VPInstructions. 
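// Illustrative source-level example only (names assumed): in
//   for (i) { out[i] = a[i] + prev; prev = a[i]; }
// the user of 'prev' must observe the value from the previous iteration, so
// it is sunk past the recipe defining a[i] and fed by a splice of the last
// lane of the previous vector iteration with the current a[i] vector.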
9152 if (!VPlanTransforms::adjustFixedOrderRecurrences(*Plan, Builder)) 9153 return std::nullopt; 9154 9155 VPlanTransforms::removeRedundantCanonicalIVs(*Plan); 9156 VPlanTransforms::removeRedundantInductionCasts(*Plan); 9157 9158 VPlanTransforms::optimizeInductions(*Plan, *PSE.getSE()); 9159 VPlanTransforms::removeDeadRecipes(*Plan); 9160 9161 VPlanTransforms::createAndOptimizeReplicateRegions(*Plan); 9162 9163 VPlanTransforms::removeRedundantExpandSCEVRecipes(*Plan); 9164 VPlanTransforms::mergeBlocksIntoPredecessors(*Plan); 9165 9166 assert(VPlanVerifier::verifyPlanIsValid(*Plan) && "VPlan is invalid"); 9167 return std::make_optional(std::move(Plan)); 9168 } 9169 9170 VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) { 9171 // Outer loop handling: outer loops may require CFG and instruction level 9172 // transformations before even evaluating whether vectorization is profitable. 9173 // Since we cannot modify the incoming IR, we need to build VPlan upfront in 9174 // the vectorization pipeline. 9175 assert(!OrigLoop->isInnermost()); 9176 assert(EnableVPlanNativePath && "VPlan-native path is not enabled."); 9177 9178 // Create new empty VPlan 9179 auto Plan = VPlan::createInitialVPlan( 9180 createTripCountSCEV(Legal->getWidestInductionType(), PSE, OrigLoop), 9181 *PSE.getSE()); 9182 9183 // Build hierarchical CFG 9184 VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI, *Plan); 9185 HCFGBuilder.buildHierarchicalCFG(); 9186 9187 for (ElementCount VF : Range) 9188 Plan->addVF(VF); 9189 9190 VPlanTransforms::VPInstructionsToVPRecipes( 9191 Plan, 9192 [this](PHINode *P) { return Legal->getIntOrFpInductionDescriptor(P); }, 9193 *PSE.getSE(), *TLI); 9194 9195 // Remove the existing terminator of the exiting block of the top-most region. 9196 // A BranchOnCount will be added instead when adding the canonical IV recipes. 9197 auto *Term = 9198 Plan->getVectorLoopRegion()->getExitingBasicBlock()->getTerminator(); 9199 Term->eraseFromParent(); 9200 9201 addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(), DebugLoc(), 9202 CM.getTailFoldingStyle()); 9203 return Plan; 9204 } 9205 9206 // Adjust the recipes for reductions. For in-loop reductions the chain of 9207 // instructions leading from the loop exit instr to the phi needs to be converted 9208 // to reductions, with one operand being vector and the other being the scalar 9209 // reduction chain. For other reductions, a select is introduced between the phi 9210 // and live-out recipes when folding the tail. 9211 void LoopVectorizationPlanner::adjustRecipesForReductions( 9212 VPBasicBlock *LatchVPBB, VPlanPtr &Plan, VPRecipeBuilder &RecipeBuilder, 9213 ElementCount MinVF) { 9214 for (const auto &Reduction : CM.getInLoopReductionChains()) { 9215 PHINode *Phi = Reduction.first; 9216 const RecurrenceDescriptor &RdxDesc = 9217 Legal->getReductionVars().find(Phi)->second; 9218 const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second; 9219 9220 if (MinVF.isScalar() && !CM.useOrderedReductions(RdxDesc)) 9221 continue; 9222 9223 // ReductionOperations are ordered top-down from the phi's use to the 9224 // LoopExitValue. We keep track of the previous item (the Chain) to tell 9225 // which of the two operands will remain scalar and which will be reduced. 9226 // For minmax the chain will be the select instructions.
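// For example (illustrative): for an in-loop 'sum += a[i]' reduction the
// chain starts at the phi; the add's chain operand is the running scalar sum
// and its other operand is the widened load of a[i], and the add itself is
// what gets replaced by a VPReductionRecipe below.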
9227 Instruction *Chain = Phi; 9228 for (Instruction *R : ReductionOperations) { 9229 VPRecipeBase *WidenRecipe = RecipeBuilder.getRecipe(R); 9230 RecurKind Kind = RdxDesc.getRecurrenceKind(); 9231 9232 VPValue *ChainOp = Plan->getVPValue(Chain); 9233 unsigned FirstOpId; 9234 assert(!RecurrenceDescriptor::isSelectCmpRecurrenceKind(Kind) && 9235 "Only min/max recurrences allowed for inloop reductions"); 9236 // Recognize a call to the llvm.fmuladd intrinsic. 9237 bool IsFMulAdd = (Kind == RecurKind::FMulAdd); 9238 assert((!IsFMulAdd || RecurrenceDescriptor::isFMulAddIntrinsic(R)) && 9239 "Expected instruction to be a call to the llvm.fmuladd intrinsic"); 9240 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) { 9241 assert(isa<VPWidenSelectRecipe>(WidenRecipe) && 9242 "Expected to replace a VPWidenSelectSC"); 9243 FirstOpId = 1; 9244 } else { 9245 assert((MinVF.isScalar() || isa<VPWidenRecipe>(WidenRecipe) || 9246 (IsFMulAdd && isa<VPWidenCallRecipe>(WidenRecipe))) && 9247 "Expected to replace a VPWidenSC"); 9248 FirstOpId = 0; 9249 } 9250 unsigned VecOpId = 9251 R->getOperand(FirstOpId) == Chain ? FirstOpId + 1 : FirstOpId; 9252 VPValue *VecOp = Plan->getVPValue(R->getOperand(VecOpId)); 9253 9254 VPValue *CondOp = nullptr; 9255 if (CM.blockNeedsPredicationForAnyReason(R->getParent())) { 9256 VPBuilder::InsertPointGuard Guard(Builder); 9257 Builder.setInsertPoint(WidenRecipe->getParent(), 9258 WidenRecipe->getIterator()); 9259 CondOp = RecipeBuilder.createBlockInMask(R->getParent(), *Plan); 9260 } 9261 9262 if (IsFMulAdd) { 9263 // If the instruction is a call to the llvm.fmuladd intrinsic then we 9264 // need to create an fmul recipe to use as the vector operand for the 9265 // fadd reduction. 9266 VPInstruction *FMulRecipe = new VPInstruction( 9267 Instruction::FMul, {VecOp, Plan->getVPValue(R->getOperand(1))}); 9268 FMulRecipe->setFastMathFlags(R->getFastMathFlags()); 9269 WidenRecipe->getParent()->insert(FMulRecipe, 9270 WidenRecipe->getIterator()); 9271 VecOp = FMulRecipe; 9272 } 9273 VPReductionRecipe *RedRecipe = 9274 new VPReductionRecipe(&RdxDesc, R, ChainOp, VecOp, CondOp, &TTI); 9275 WidenRecipe->getVPSingleValue()->replaceAllUsesWith(RedRecipe); 9276 Plan->removeVPValueFor(R); 9277 Plan->addVPValue(R, RedRecipe); 9278 // Append the recipe to the end of the VPBasicBlock because we need to 9279 // ensure that it comes after all of its inputs, including CondOp. 9280 WidenRecipe->getParent()->appendRecipe(RedRecipe); 9281 WidenRecipe->getVPSingleValue()->replaceAllUsesWith(RedRecipe); 9282 WidenRecipe->eraseFromParent(); 9283 9284 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) { 9285 VPRecipeBase *CompareRecipe = 9286 RecipeBuilder.getRecipe(cast<Instruction>(R->getOperand(0))); 9287 assert(isa<VPWidenRecipe>(CompareRecipe) && 9288 "Expected to replace a VPWidenSC"); 9289 assert(cast<VPWidenRecipe>(CompareRecipe)->getNumUsers() == 0 && 9290 "Expected no remaining users"); 9291 CompareRecipe->eraseFromParent(); 9292 } 9293 Chain = R; 9294 } 9295 } 9296 9297 // If tail is folded by masking, introduce selects between the phi 9298 // and the live-out instruction of each reduction, at the beginning of the 9299 // dedicated latch block.
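// Illustrative sketch only (assumed names, VF=4): each such select has the
// form
//   %rdx.sel = select <4 x i1> %header.mask, <4 x i32> %rdx.next,
//                     <4 x i32> %rdx.phi
// so lanes disabled by the fold-tail mask carry the previous partial
// reduction value forward.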
9300 if (CM.foldTailByMasking()) { 9301 Builder.setInsertPoint(LatchVPBB, LatchVPBB->begin()); 9302 for (VPRecipeBase &R : 9303 Plan->getVectorLoopRegion()->getEntryBasicBlock()->phis()) { 9304 VPReductionPHIRecipe *PhiR = dyn_cast<VPReductionPHIRecipe>(&R); 9305 if (!PhiR || PhiR->isInLoop()) 9306 continue; 9307 VPValue *Cond = 9308 RecipeBuilder.createBlockInMask(OrigLoop->getHeader(), *Plan); 9309 VPValue *Red = PhiR->getBackedgeValue(); 9310 assert(Red->getDefiningRecipe()->getParent() != LatchVPBB && 9311 "reduction recipe must be defined before latch"); 9312 Builder.createNaryOp(Instruction::Select, {Cond, Red, PhiR}); 9313 } 9314 } 9315 9316 VPlanTransforms::clearReductionWrapFlags(*Plan); 9317 } 9318 9319 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 9320 void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent, 9321 VPSlotTracker &SlotTracker) const { 9322 O << Indent << "INTERLEAVE-GROUP with factor " << IG->getFactor() << " at "; 9323 IG->getInsertPos()->printAsOperand(O, false); 9324 O << ", "; 9325 getAddr()->printAsOperand(O, SlotTracker); 9326 VPValue *Mask = getMask(); 9327 if (Mask) { 9328 O << ", "; 9329 Mask->printAsOperand(O, SlotTracker); 9330 } 9331 9332 unsigned OpIdx = 0; 9333 for (unsigned i = 0; i < IG->getFactor(); ++i) { 9334 if (!IG->getMember(i)) 9335 continue; 9336 if (getNumStoreOperands() > 0) { 9337 O << "\n" << Indent << " store "; 9338 getOperand(1 + OpIdx)->printAsOperand(O, SlotTracker); 9339 O << " to index " << i; 9340 } else { 9341 O << "\n" << Indent << " "; 9342 getVPValue(OpIdx)->printAsOperand(O, SlotTracker); 9343 O << " = load from index " << i; 9344 } 9345 ++OpIdx; 9346 } 9347 } 9348 #endif 9349 9350 void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) { 9351 assert(!State.Instance && "Int or FP induction being replicated."); 9352 9353 Value *Start = getStartValue()->getLiveInIRValue(); 9354 const InductionDescriptor &ID = getInductionDescriptor(); 9355 TruncInst *Trunc = getTruncInst(); 9356 IRBuilderBase &Builder = State.Builder; 9357 assert(IV->getType() == ID.getStartValue()->getType() && "Types must match"); 9358 assert(State.VF.isVector() && "must have vector VF"); 9359 9360 // The value from the original loop to which we are mapping the new induction 9361 // variable. 9362 Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : IV; 9363 9364 // Fast-math-flags propagate from the original induction instruction. 9365 IRBuilder<>::FastMathFlagGuard FMFG(Builder); 9366 if (ID.getInductionBinOp() && isa<FPMathOperator>(ID.getInductionBinOp())) 9367 Builder.setFastMathFlags(ID.getInductionBinOp()->getFastMathFlags()); 9368 9369 // Now do the actual transformations, and start with fetching the step value. 
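// Illustrative shape of the generated IR (assumed names, VF=4, UF=1, integer
// step %s; the code below also handles FP inductions and truncation):
//   vector.ph:   %ind.init = %start.splat + <0, 1, 2, 3> * %s.splat
//   vector.body: %vec.ind  = phi [ %ind.init, %vector.ph ],
//                                [ %step.add, %vector.body ]
//                %step.add = add %vec.ind, (4 * %s) splat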
9370 Value *Step = State.get(getStepValue(), VPIteration(0, 0)); 9371 9372 assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) && 9373 "Expected either an induction phi-node or a truncate of it!"); 9374 9375 // Construct the initial value of the vector IV in the vector loop preheader 9376 auto CurrIP = Builder.saveIP(); 9377 BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this); 9378 Builder.SetInsertPoint(VectorPH->getTerminator()); 9379 if (isa<TruncInst>(EntryVal)) { 9380 assert(Start->getType()->isIntegerTy() && 9381 "Truncation requires an integer type"); 9382 auto *TruncType = cast<IntegerType>(EntryVal->getType()); 9383 Step = Builder.CreateTrunc(Step, TruncType); 9384 Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType); 9385 } 9386 9387 Value *Zero = getSignedIntOrFpConstant(Start->getType(), 0); 9388 Value *SplatStart = Builder.CreateVectorSplat(State.VF, Start); 9389 Value *SteppedStart = getStepVector( 9390 SplatStart, Zero, Step, ID.getInductionOpcode(), State.VF, State.Builder); 9391 9392 // We create vector phi nodes for both integer and floating-point induction 9393 // variables. Here, we determine the kind of arithmetic we will perform. 9394 Instruction::BinaryOps AddOp; 9395 Instruction::BinaryOps MulOp; 9396 if (Step->getType()->isIntegerTy()) { 9397 AddOp = Instruction::Add; 9398 MulOp = Instruction::Mul; 9399 } else { 9400 AddOp = ID.getInductionOpcode(); 9401 MulOp = Instruction::FMul; 9402 } 9403 9404 // Multiply the vectorization factor by the step using integer or 9405 // floating-point arithmetic as appropriate. 9406 Type *StepType = Step->getType(); 9407 Value *RuntimeVF; 9408 if (Step->getType()->isFloatingPointTy()) 9409 RuntimeVF = getRuntimeVFAsFloat(Builder, StepType, State.VF); 9410 else 9411 RuntimeVF = getRuntimeVF(Builder, StepType, State.VF); 9412 Value *Mul = Builder.CreateBinOp(MulOp, Step, RuntimeVF); 9413 9414 // Create a vector splat to use in the induction update. 9415 // 9416 // FIXME: If the step is non-constant, we create the vector splat with 9417 // IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't 9418 // handle a constant vector splat. 9419 Value *SplatVF = isa<Constant>(Mul) 9420 ? ConstantVector::getSplat(State.VF, cast<Constant>(Mul)) 9421 : Builder.CreateVectorSplat(State.VF, Mul); 9422 Builder.restoreIP(CurrIP); 9423 9424 // We may need to add the step a number of times, depending on the unroll 9425 // factor. The last of those goes into the PHI. 9426 PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind", 9427 &*State.CFG.PrevBB->getFirstInsertionPt()); 9428 VecInd->setDebugLoc(EntryVal->getDebugLoc()); 9429 Instruction *LastInduction = VecInd; 9430 for (unsigned Part = 0; Part < State.UF; ++Part) { 9431 State.set(this, LastInduction, Part); 9432 9433 if (isa<TruncInst>(EntryVal)) 9434 State.addMetadata(LastInduction, EntryVal); 9435 9436 LastInduction = cast<Instruction>( 9437 Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add")); 9438 LastInduction->setDebugLoc(EntryVal->getDebugLoc()); 9439 } 9440 9441 LastInduction->setName("vec.ind.next"); 9442 VecInd->addIncoming(SteppedStart, VectorPH); 9443 // Add induction update using an incorrect block temporarily. The phi node 9444 // will be fixed after VPlan execution. Note that at this point the latch 9445 // block cannot be used, as it does not exist yet. 9446 // TODO: Model increment value in VPlan, by turning the recipe into a 9447 // multi-def and a subclass of VPHeaderPHIRecipe. 
9448 VecInd->addIncoming(LastInduction, VectorPH); 9449 } 9450 9451 void VPWidenPointerInductionRecipe::execute(VPTransformState &State) { 9452 assert(IndDesc.getKind() == InductionDescriptor::IK_PtrInduction && 9453 "Not a pointer induction according to InductionDescriptor!"); 9454 assert(cast<PHINode>(getUnderlyingInstr())->getType()->isPointerTy() && 9455 "Unexpected type."); 9456 9457 auto *IVR = getParent()->getPlan()->getCanonicalIV(); 9458 PHINode *CanonicalIV = cast<PHINode>(State.get(IVR, 0)); 9459 9460 if (onlyScalarsGenerated(State.VF)) { 9461 // This is the normalized GEP that starts counting at zero. 9462 Value *PtrInd = State.Builder.CreateSExtOrTrunc( 9463 CanonicalIV, IndDesc.getStep()->getType()); 9464 // Determine the number of scalars we need to generate for each unroll 9465 // iteration. If the instruction is uniform, we only need to generate the 9466 // first lane. Otherwise, we generate all VF values. 9467 bool IsUniform = vputils::onlyFirstLaneUsed(this); 9468 assert((IsUniform || !State.VF.isScalable()) && 9469 "Cannot scalarize a scalable VF"); 9470 unsigned Lanes = IsUniform ? 1 : State.VF.getFixedValue(); 9471 9472 for (unsigned Part = 0; Part < State.UF; ++Part) { 9473 Value *PartStart = 9474 createStepForVF(State.Builder, PtrInd->getType(), State.VF, Part); 9475 9476 for (unsigned Lane = 0; Lane < Lanes; ++Lane) { 9477 Value *Idx = State.Builder.CreateAdd( 9478 PartStart, ConstantInt::get(PtrInd->getType(), Lane)); 9479 Value *GlobalIdx = State.Builder.CreateAdd(PtrInd, Idx); 9480 9481 Value *Step = State.get(getOperand(1), VPIteration(Part, Lane)); 9482 Value *SclrGep = emitTransformedIndex( 9483 State.Builder, GlobalIdx, IndDesc.getStartValue(), Step, IndDesc); 9484 SclrGep->setName("next.gep"); 9485 State.set(this, SclrGep, VPIteration(Part, Lane)); 9486 } 9487 } 9488 return; 9489 } 9490 9491 Type *PhiType = IndDesc.getStep()->getType(); 9492 9493 // Build a pointer phi 9494 Value *ScalarStartValue = getStartValue()->getLiveInIRValue(); 9495 Type *ScStValueType = ScalarStartValue->getType(); 9496 PHINode *NewPointerPhi = 9497 PHINode::Create(ScStValueType, 2, "pointer.phi", CanonicalIV); 9498 9499 BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this); 9500 NewPointerPhi->addIncoming(ScalarStartValue, VectorPH); 9501 9502 // A pointer induction, performed by using a gep 9503 Instruction *InductionLoc = &*State.Builder.GetInsertPoint(); 9504 9505 Value *ScalarStepValue = State.get(getOperand(1), VPIteration(0, 0)); 9506 Value *RuntimeVF = getRuntimeVF(State.Builder, PhiType, State.VF); 9507 Value *NumUnrolledElems = 9508 State.Builder.CreateMul(RuntimeVF, ConstantInt::get(PhiType, State.UF)); 9509 Value *InductionGEP = GetElementPtrInst::Create( 9510 State.Builder.getInt8Ty(), NewPointerPhi, 9511 State.Builder.CreateMul(ScalarStepValue, NumUnrolledElems), "ptr.ind", 9512 InductionLoc); 9513 // Add induction update using an incorrect block temporarily. The phi node 9514 // will be fixed after VPlan execution. Note that at this point the latch 9515 // block cannot be used, as it does not exist yet. 9516 // TODO: Model increment value in VPlan, by turning the recipe into a 9517 // multi-def and a subclass of VPHeaderPHIRecipe. 9518 NewPointerPhi->addIncoming(InductionGEP, VectorPH); 9519 9520 // Create UF many actual address geps that use the pointer 9521 // phi as base and a vectorized version of the step value 9522 // (<step*0, ..., step*N>) as offset. 
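// For example (illustrative, VF=4, UF=2, scalar step S): part 0 uses the
// offset vector <0*S, 1*S, 2*S, 3*S> and part 1 uses <4*S, 5*S, 6*S, 7*S>,
// each applied to the pointer phi through a single i8-typed GEP.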
9523 for (unsigned Part = 0; Part < State.UF; ++Part) { 9524 Type *VecPhiType = VectorType::get(PhiType, State.VF); 9525 Value *StartOffsetScalar = 9526 State.Builder.CreateMul(RuntimeVF, ConstantInt::get(PhiType, Part)); 9527 Value *StartOffset = 9528 State.Builder.CreateVectorSplat(State.VF, StartOffsetScalar); 9529 // Create a vector of consecutive numbers from zero to VF. 9530 StartOffset = State.Builder.CreateAdd( 9531 StartOffset, State.Builder.CreateStepVector(VecPhiType)); 9532 9533 assert(ScalarStepValue == State.get(getOperand(1), VPIteration(Part, 0)) && 9534 "scalar step must be the same across all parts"); 9535 Value *GEP = State.Builder.CreateGEP( 9536 State.Builder.getInt8Ty(), NewPointerPhi, 9537 State.Builder.CreateMul( 9538 StartOffset, 9539 State.Builder.CreateVectorSplat(State.VF, ScalarStepValue), 9540 "vector.gep")); 9541 State.set(this, GEP, Part); 9542 } 9543 } 9544 9545 void VPDerivedIVRecipe::execute(VPTransformState &State) { 9546 assert(!State.Instance && "VPDerivedIVRecipe being replicated."); 9547 9548 // Fast-math-flags propagate from the original induction instruction. 9549 IRBuilder<>::FastMathFlagGuard FMFG(State.Builder); 9550 if (IndDesc.getInductionBinOp() && 9551 isa<FPMathOperator>(IndDesc.getInductionBinOp())) 9552 State.Builder.setFastMathFlags( 9553 IndDesc.getInductionBinOp()->getFastMathFlags()); 9554 9555 Value *Step = State.get(getStepValue(), VPIteration(0, 0)); 9556 Value *CanonicalIV = State.get(getCanonicalIV(), VPIteration(0, 0)); 9557 Value *DerivedIV = 9558 emitTransformedIndex(State.Builder, CanonicalIV, 9559 getStartValue()->getLiveInIRValue(), Step, IndDesc); 9560 DerivedIV->setName("offset.idx"); 9561 if (ResultTy != DerivedIV->getType()) { 9562 assert(Step->getType()->isIntegerTy() && 9563 "Truncation requires an integer step"); 9564 DerivedIV = State.Builder.CreateTrunc(DerivedIV, ResultTy); 9565 } 9566 assert(DerivedIV != CanonicalIV && "IV didn't need transforming?"); 9567 9568 State.set(this, DerivedIV, VPIteration(0, 0)); 9569 } 9570 9571 void VPScalarIVStepsRecipe::execute(VPTransformState &State) { 9572 // Fast-math-flags propagate from the original induction instruction. 9573 IRBuilder<>::FastMathFlagGuard FMFG(State.Builder); 9574 if (IndDesc.getInductionBinOp() && 9575 isa<FPMathOperator>(IndDesc.getInductionBinOp())) 9576 State.Builder.setFastMathFlags( 9577 IndDesc.getInductionBinOp()->getFastMathFlags()); 9578 9579 Value *BaseIV = State.get(getOperand(0), VPIteration(0, 0)); 9580 Value *Step = State.get(getStepValue(), VPIteration(0, 0)); 9581 9582 buildScalarSteps(BaseIV, Step, IndDesc, this, State); 9583 } 9584 9585 void VPInterleaveRecipe::execute(VPTransformState &State) { 9586 assert(!State.Instance && "Interleave group being replicated."); 9587 State.ILV->vectorizeInterleaveGroup(IG, definedValues(), State, getAddr(), 9588 getStoredValues(), getMask(), 9589 NeedsMaskForGaps); 9590 } 9591 9592 void VPReductionRecipe::execute(VPTransformState &State) { 9593 assert(!State.Instance && "Reduction being replicated."); 9594 Value *PrevInChain = State.get(getChainOp(), 0); 9595 RecurKind Kind = RdxDesc->getRecurrenceKind(); 9596 bool IsOrdered = State.ILV->useOrderedReductions(*RdxDesc); 9597 // Propagate the fast-math flags carried by the underlying instruction. 
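// Illustrative sketch only (assumed names): when a condition is present, each
// part first masks out disabled lanes with the recurrence identity, e.g. for
// an fadd reduction
//   %vec.op = select <4 x i1> %cond, <4 x float> %val, <4 x float> %iden.splat
// and only then feeds the (ordered or unordered) reduction.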
9598 IRBuilderBase::FastMathFlagGuard FMFGuard(State.Builder); 9599 State.Builder.setFastMathFlags(RdxDesc->getFastMathFlags()); 9600 for (unsigned Part = 0; Part < State.UF; ++Part) { 9601 Value *NewVecOp = State.get(getVecOp(), Part); 9602 if (VPValue *Cond = getCondOp()) { 9603 Value *NewCond = State.get(Cond, Part); 9604 VectorType *VecTy = cast<VectorType>(NewVecOp->getType()); 9605 Value *Iden = RdxDesc->getRecurrenceIdentity( 9606 Kind, VecTy->getElementType(), RdxDesc->getFastMathFlags()); 9607 Value *IdenVec = 9608 State.Builder.CreateVectorSplat(VecTy->getElementCount(), Iden); 9609 Value *Select = State.Builder.CreateSelect(NewCond, NewVecOp, IdenVec); 9610 NewVecOp = Select; 9611 } 9612 Value *NewRed; 9613 Value *NextInChain; 9614 if (IsOrdered) { 9615 if (State.VF.isVector()) 9616 NewRed = createOrderedReduction(State.Builder, *RdxDesc, NewVecOp, 9617 PrevInChain); 9618 else 9619 NewRed = State.Builder.CreateBinOp( 9620 (Instruction::BinaryOps)RdxDesc->getOpcode(Kind), PrevInChain, 9621 NewVecOp); 9622 PrevInChain = NewRed; 9623 } else { 9624 PrevInChain = State.get(getChainOp(), Part); 9625 NewRed = createTargetReduction(State.Builder, TTI, *RdxDesc, NewVecOp); 9626 } 9627 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) { 9628 NextInChain = 9629 createMinMaxOp(State.Builder, RdxDesc->getRecurrenceKind(), 9630 NewRed, PrevInChain); 9631 } else if (IsOrdered) 9632 NextInChain = NewRed; 9633 else 9634 NextInChain = State.Builder.CreateBinOp( 9635 (Instruction::BinaryOps)RdxDesc->getOpcode(Kind), NewRed, 9636 PrevInChain); 9637 State.set(this, NextInChain, Part); 9638 } 9639 } 9640 9641 void VPReplicateRecipe::execute(VPTransformState &State) { 9642 Instruction *UI = getUnderlyingInstr(); 9643 if (State.Instance) { // Generate a single instance. 9644 assert(!State.VF.isScalable() && "Can't scalarize a scalable vector"); 9645 State.ILV->scalarizeInstruction(UI, this, *State.Instance, State); 9646 // Insert scalar instance packing it into a vector. 9647 if (State.VF.isVector() && shouldPack()) { 9648 // If we're constructing lane 0, initialize to start from poison. 9649 if (State.Instance->Lane.isFirstLane()) { 9650 assert(!State.VF.isScalable() && "VF is assumed to be non scalable."); 9651 Value *Poison = PoisonValue::get( 9652 VectorType::get(UI->getType(), State.VF)); 9653 State.set(this, Poison, State.Instance->Part); 9654 } 9655 State.ILV->packScalarIntoVectorValue(this, *State.Instance, State); 9656 } 9657 return; 9658 } 9659 9660 if (IsUniform) { 9661 // If the recipe is uniform across all parts (instead of just per VF), only 9662 // generate a single instance. 9663 if ((isa<LoadInst>(UI) || isa<StoreInst>(UI)) && 9664 all_of(operands(), [](VPValue *Op) { 9665 return Op->isDefinedOutsideVectorRegions(); 9666 })) { 9667 State.ILV->scalarizeInstruction(UI, this, VPIteration(0, 0), State); 9668 if (user_begin() != user_end()) { 9669 for (unsigned Part = 1; Part < State.UF; ++Part) 9670 State.set(this, State.get(this, VPIteration(0, 0)), 9671 VPIteration(Part, 0)); 9672 } 9673 return; 9674 } 9675 9676 // Uniform within VL means we need to generate lane 0 only for each 9677 // unrolled copy. 9678 for (unsigned Part = 0; Part < State.UF; ++Part) 9679 State.ILV->scalarizeInstruction(UI, this, VPIteration(Part, 0), State); 9680 return; 9681 } 9682 9683 // A store of a loop varying value to a uniform address only needs the last 9684 // copy of the store. 
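  // For example, for a source loop like 'for (i) *p = a[i];' with a
  // loop-invariant pointer p, only the value stored on the final iteration is
  // observable after the loop, so emitting the store for the last lane of the
  // last unrolled part is sufficient.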
9685 if (isa<StoreInst>(UI) && 9686 vputils::isUniformAfterVectorization(getOperand(1))) { 9687 auto Lane = VPLane::getLastLaneForVF(State.VF); 9688 State.ILV->scalarizeInstruction(UI, this, VPIteration(State.UF - 1, Lane), 9689 State); 9690 return; 9691 } 9692 9693 // Generate scalar instances for all VF lanes of all UF parts. 9694 assert(!State.VF.isScalable() && "Can't scalarize a scalable vector"); 9695 const unsigned EndLane = State.VF.getKnownMinValue(); 9696 for (unsigned Part = 0; Part < State.UF; ++Part) 9697 for (unsigned Lane = 0; Lane < EndLane; ++Lane) 9698 State.ILV->scalarizeInstruction(UI, this, VPIteration(Part, Lane), State); 9699 } 9700 9701 void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) { 9702 VPValue *StoredValue = isStore() ? getStoredValue() : nullptr; 9703 9704 // Attempt to issue a wide load. 9705 LoadInst *LI = dyn_cast<LoadInst>(&Ingredient); 9706 StoreInst *SI = dyn_cast<StoreInst>(&Ingredient); 9707 9708 assert((LI || SI) && "Invalid Load/Store instruction"); 9709 assert((!SI || StoredValue) && "No stored value provided for widened store"); 9710 assert((!LI || !StoredValue) && "Stored value provided for widened load"); 9711 9712 Type *ScalarDataTy = getLoadStoreType(&Ingredient); 9713 9714 auto *DataTy = VectorType::get(ScalarDataTy, State.VF); 9715 const Align Alignment = getLoadStoreAlignment(&Ingredient); 9716 bool CreateGatherScatter = !isConsecutive(); 9717 9718 auto &Builder = State.Builder; 9719 InnerLoopVectorizer::VectorParts BlockInMaskParts(State.UF); 9720 bool isMaskRequired = getMask(); 9721 if (isMaskRequired) 9722 for (unsigned Part = 0; Part < State.UF; ++Part) 9723 BlockInMaskParts[Part] = State.get(getMask(), Part); 9724 9725 const auto CreateVecPtr = [&](unsigned Part, Value *Ptr) -> Value * { 9726 // Calculate the pointer for the specific unroll-part. 9727 Value *PartPtr = nullptr; 9728 9729 // Use i32 for the gep index type when the value is constant, 9730 // or query DataLayout for a more suitable index type otherwise. 9731 const DataLayout &DL = 9732 Builder.GetInsertBlock()->getModule()->getDataLayout(); 9733 Type *IndexTy = State.VF.isScalable() && (isReverse() || Part > 0) 9734 ? DL.getIndexType(ScalarDataTy->getPointerTo()) 9735 : Builder.getInt32Ty(); 9736 bool InBounds = false; 9737 if (auto *gep = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts())) 9738 InBounds = gep->isInBounds(); 9739 if (isReverse()) { 9740 // If the address is consecutive but reversed, then the 9741 // wide store needs to start at the last vector element. 9742 // RunTimeVF = VScale * VF.getKnownMinValue() 9743 // For fixed-width VScale is 1, then RunTimeVF = VF.getKnownMinValue() 9744 Value *RunTimeVF = getRuntimeVF(Builder, IndexTy, State.VF); 9745 // NumElt = -Part * RunTimeVF 9746 Value *NumElt = 9747 Builder.CreateMul(ConstantInt::get(IndexTy, -(int64_t)Part), RunTimeVF); 9748 // LastLane = 1 - RunTimeVF 9749 Value *LastLane = 9750 Builder.CreateSub(ConstantInt::get(IndexTy, 1), RunTimeVF); 9751 PartPtr = Builder.CreateGEP(ScalarDataTy, Ptr, NumElt, "", InBounds); 9752 PartPtr = 9753 Builder.CreateGEP(ScalarDataTy, PartPtr, LastLane, "", InBounds); 9754 if (isMaskRequired) // Reverse of a null all-one mask is a null mask. 
9755 BlockInMaskParts[Part] = 9756 Builder.CreateVectorReverse(BlockInMaskParts[Part], "reverse"); 9757 } else { 9758 Value *Increment = createStepForVF(Builder, IndexTy, State.VF, Part); 9759 PartPtr = Builder.CreateGEP(ScalarDataTy, Ptr, Increment, "", InBounds); 9760 } 9761 9762 unsigned AddressSpace = Ptr->getType()->getPointerAddressSpace(); 9763 return Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace)); 9764 }; 9765 9766 // Handle Stores: 9767 if (SI) { 9768 State.setDebugLocFromInst(SI); 9769 9770 for (unsigned Part = 0; Part < State.UF; ++Part) { 9771 Instruction *NewSI = nullptr; 9772 Value *StoredVal = State.get(StoredValue, Part); 9773 if (CreateGatherScatter) { 9774 Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr; 9775 Value *VectorGep = State.get(getAddr(), Part); 9776 NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep, Alignment, 9777 MaskPart); 9778 } else { 9779 if (isReverse()) { 9780 // If we store to reverse consecutive memory locations, then we need 9781 // to reverse the order of elements in the stored value. 9782 StoredVal = Builder.CreateVectorReverse(StoredVal, "reverse"); 9783 // We don't want to update the value in the map as it might be used in 9784 // another expression. So don't call resetVectorValue(StoredVal). 9785 } 9786 auto *VecPtr = 9787 CreateVecPtr(Part, State.get(getAddr(), VPIteration(0, 0))); 9788 if (isMaskRequired) 9789 NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr, Alignment, 9790 BlockInMaskParts[Part]); 9791 else 9792 NewSI = Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment); 9793 } 9794 State.addMetadata(NewSI, SI); 9795 } 9796 return; 9797 } 9798 9799 // Handle loads. 9800 assert(LI && "Must have a load instruction"); 9801 State.setDebugLocFromInst(LI); 9802 for (unsigned Part = 0; Part < State.UF; ++Part) { 9803 Value *NewLI; 9804 if (CreateGatherScatter) { 9805 Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr; 9806 Value *VectorGep = State.get(getAddr(), Part); 9807 NewLI = Builder.CreateMaskedGather(DataTy, VectorGep, Alignment, MaskPart, 9808 nullptr, "wide.masked.gather"); 9809 State.addMetadata(NewLI, LI); 9810 } else { 9811 auto *VecPtr = 9812 CreateVecPtr(Part, State.get(getAddr(), VPIteration(0, 0))); 9813 if (isMaskRequired) 9814 NewLI = Builder.CreateMaskedLoad( 9815 DataTy, VecPtr, Alignment, BlockInMaskParts[Part], 9816 PoisonValue::get(DataTy), "wide.masked.load"); 9817 else 9818 NewLI = 9819 Builder.CreateAlignedLoad(DataTy, VecPtr, Alignment, "wide.load"); 9820 9821 // Add metadata to the load, but setVectorValue to the reverse shuffle. 9822 State.addMetadata(NewLI, LI); 9823 if (Reverse) 9824 NewLI = Builder.CreateVectorReverse(NewLI, "reverse"); 9825 } 9826 9827 State.set(getVPSingleValue(), NewLI, Part); 9828 } 9829 } 9830 9831 // Determine how to lower the scalar epilogue, which depends on 1) optimising 9832 // for minimum code-size, 2) predicate compiler options, 3) loop hints forcing 9833 // predication, and 4) a TTI hook that analyses whether the loop is suitable 9834 // for predication. 9835 static ScalarEpilogueLowering getScalarEpilogueLowering( 9836 Function *F, Loop *L, LoopVectorizeHints &Hints, ProfileSummaryInfo *PSI, 9837 BlockFrequencyInfo *BFI, TargetTransformInfo *TTI, TargetLibraryInfo *TLI, 9838 LoopVectorizationLegality &LVL, InterleavedAccessInfo *IAI) { 9839 // 1) OptSize takes precedence over all other options, i.e. if this is set, 9840 // don't look at hints or options, and don't request a scalar epilogue. 
9841 // (For PGSO, as shouldOptimizeForSize isn't currently accessible from 9842 // LoopAccessInfo (due to code dependency and not being able to reliably get 9843 // PSI/BFI from a loop analysis under NPM), we cannot suppress the collection 9844 // of strides in LoopAccessInfo::analyzeLoop() and vectorize without 9845 // versioning when the vectorization is forced, unlike hasOptSize. So revert 9846 // back to the old way and vectorize with versioning when forced. See D81345.) 9847 if (F->hasOptSize() || (llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI, 9848 PGSOQueryType::IRPass) && 9849 Hints.getForce() != LoopVectorizeHints::FK_Enabled)) 9850 return CM_ScalarEpilogueNotAllowedOptSize; 9851 9852 // 2) If set, obey the directives 9853 if (PreferPredicateOverEpilogue.getNumOccurrences()) { 9854 switch (PreferPredicateOverEpilogue) { 9855 case PreferPredicateTy::ScalarEpilogue: 9856 return CM_ScalarEpilogueAllowed; 9857 case PreferPredicateTy::PredicateElseScalarEpilogue: 9858 return CM_ScalarEpilogueNotNeededUsePredicate; 9859 case PreferPredicateTy::PredicateOrDontVectorize: 9860 return CM_ScalarEpilogueNotAllowedUsePredicate; 9861 }; 9862 } 9863 9864 // 3) If set, obey the hints 9865 switch (Hints.getPredicate()) { 9866 case LoopVectorizeHints::FK_Enabled: 9867 return CM_ScalarEpilogueNotNeededUsePredicate; 9868 case LoopVectorizeHints::FK_Disabled: 9869 return CM_ScalarEpilogueAllowed; 9870 }; 9871 9872 // 4) if the TTI hook indicates this is profitable, request predication. 9873 TailFoldingInfo TFI(TLI, &LVL, IAI); 9874 if (TTI->preferPredicateOverEpilogue(&TFI)) 9875 return CM_ScalarEpilogueNotNeededUsePredicate; 9876 9877 return CM_ScalarEpilogueAllowed; 9878 } 9879 9880 Value *VPTransformState::get(VPValue *Def, unsigned Part) { 9881 // If Values have been set for this Def return the one relevant for \p Part. 9882 if (hasVectorValue(Def, Part)) 9883 return Data.PerPartOutput[Def][Part]; 9884 9885 auto GetBroadcastInstrs = [this, Def](Value *V) { 9886 bool SafeToHoist = Def->isDefinedOutsideVectorRegions(); 9887 if (VF.isScalar()) 9888 return V; 9889 // Place the code for broadcasting invariant variables in the new preheader. 9890 IRBuilder<>::InsertPointGuard Guard(Builder); 9891 if (SafeToHoist) { 9892 BasicBlock *LoopVectorPreHeader = CFG.VPBB2IRBB[cast<VPBasicBlock>( 9893 Plan->getVectorLoopRegion()->getSinglePredecessor())]; 9894 if (LoopVectorPreHeader) 9895 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); 9896 } 9897 9898 // Place the code for broadcasting invariant variables in the new preheader. 9899 // Broadcast the scalar into all locations in the vector. 9900 Value *Shuf = Builder.CreateVectorSplat(VF, V, "broadcast"); 9901 9902 return Shuf; 9903 }; 9904 9905 if (!hasScalarValue(Def, {Part, 0})) { 9906 Value *IRV = Def->getLiveInIRValue(); 9907 Value *B = GetBroadcastInstrs(IRV); 9908 set(Def, B, Part); 9909 return B; 9910 } 9911 9912 Value *ScalarValue = get(Def, {Part, 0}); 9913 // If we aren't vectorizing, we can just copy the scalar map values over 9914 // to the vector map. 9915 if (VF.isScalar()) { 9916 set(Def, ScalarValue, Part); 9917 return ScalarValue; 9918 } 9919 9920 bool IsUniform = vputils::isUniformAfterVectorization(Def); 9921 9922 unsigned LastLane = IsUniform ? 0 : VF.getKnownMinValue() - 1; 9923 // Check if there is a scalar value for the selected lane. 9924 if (!hasScalarValue(Def, {Part, LastLane})) { 9925 // At the moment, VPWidenIntOrFpInductionRecipes, VPScalarIVStepsRecipes and 9926 // VPExpandSCEVRecipes can also be uniform. 
9927 assert((isa<VPWidenIntOrFpInductionRecipe>(Def->getDefiningRecipe()) || 9928 isa<VPScalarIVStepsRecipe>(Def->getDefiningRecipe()) || 9929 isa<VPExpandSCEVRecipe>(Def->getDefiningRecipe())) && 9930 "unexpected recipe found to be invariant"); 9931 IsUniform = true; 9932 LastLane = 0; 9933 } 9934 9935 auto *LastInst = cast<Instruction>(get(Def, {Part, LastLane})); 9936 // Set the insert point after the last scalarized instruction or after the 9937 // last PHI, if LastInst is a PHI. This ensures the insertelement sequence 9938 // will directly follow the scalar definitions. 9939 auto OldIP = Builder.saveIP(); 9940 auto NewIP = 9941 isa<PHINode>(LastInst) 9942 ? BasicBlock::iterator(LastInst->getParent()->getFirstNonPHI()) 9943 : std::next(BasicBlock::iterator(LastInst)); 9944 Builder.SetInsertPoint(&*NewIP); 9945 9946 // However, if we are vectorizing, we need to construct the vector values. 9947 // If the value is known to be uniform after vectorization, we can just 9948 // broadcast the scalar value corresponding to lane zero for each unroll 9949 // iteration. Otherwise, we construct the vector values using 9950 // insertelement instructions. Since the resulting vectors are stored in 9951 // State, we will only generate the insertelements once. 9952 Value *VectorValue = nullptr; 9953 if (IsUniform) { 9954 VectorValue = GetBroadcastInstrs(ScalarValue); 9955 set(Def, VectorValue, Part); 9956 } else { 9957 // Initialize packing with insertelements to start from undef. 9958 assert(!VF.isScalable() && "VF is assumed to be non scalable."); 9959 Value *Undef = PoisonValue::get(VectorType::get(LastInst->getType(), VF)); 9960 set(Def, Undef, Part); 9961 for (unsigned Lane = 0; Lane < VF.getKnownMinValue(); ++Lane) 9962 ILV->packScalarIntoVectorValue(Def, {Part, Lane}, *this); 9963 VectorValue = get(Def, Part); 9964 } 9965 Builder.restoreIP(OldIP); 9966 return VectorValue; 9967 } 9968 9969 // Process the loop in the VPlan-native vectorization path. This path builds 9970 // VPlan upfront in the vectorization pipeline, which allows to apply 9971 // VPlan-to-VPlan transformations from the very beginning without modifying the 9972 // input LLVM IR. 9973 static bool processLoopInVPlanNativePath( 9974 Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT, 9975 LoopVectorizationLegality *LVL, TargetTransformInfo *TTI, 9976 TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC, 9977 OptimizationRemarkEmitter *ORE, BlockFrequencyInfo *BFI, 9978 ProfileSummaryInfo *PSI, LoopVectorizeHints &Hints, 9979 LoopVectorizationRequirements &Requirements) { 9980 9981 if (isa<SCEVCouldNotCompute>(PSE.getBackedgeTakenCount())) { 9982 LLVM_DEBUG(dbgs() << "LV: cannot compute the outer-loop trip count\n"); 9983 return false; 9984 } 9985 assert(EnableVPlanNativePath && "VPlan-native path is disabled."); 9986 Function *F = L->getHeader()->getParent(); 9987 InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI()); 9988 9989 ScalarEpilogueLowering SEL = 9990 getScalarEpilogueLowering(F, L, Hints, PSI, BFI, TTI, TLI, *LVL, &IAI); 9991 9992 LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F, 9993 &Hints, IAI); 9994 // Use the planner for outer loop vectorization. 9995 // TODO: CM is not used at this point inside the planner. Turn CM into an 9996 // optional argument if we don't need it in the future. 9997 LoopVectorizationPlanner LVP(L, LI, TLI, *TTI, LVL, CM, IAI, PSE, Hints, ORE); 9998 9999 // Get user vectorization factor. 
10000 ElementCount UserVF = Hints.getWidth(); 10001 10002 CM.collectElementTypesForWidening(); 10003 10004 // Plan how to best vectorize, return the best VF and its cost. 10005 const VectorizationFactor VF = LVP.planInVPlanNativePath(UserVF); 10006 10007 // If we are stress testing VPlan builds, do not attempt to generate vector 10008 // code. Masked vector code generation support will follow soon. 10009 // Also, do not attempt to vectorize if no vector code will be produced. 10010 if (VPlanBuildStressTest || VectorizationFactor::Disabled() == VF) 10011 return false; 10012 10013 VPlan &BestPlan = LVP.getBestPlanFor(VF.Width); 10014 10015 { 10016 GeneratedRTChecks Checks(*PSE.getSE(), DT, LI, TTI, 10017 F->getParent()->getDataLayout()); 10018 InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, 10019 VF.Width, 1, LVL, &CM, BFI, PSI, Checks); 10020 LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \"" 10021 << L->getHeader()->getParent()->getName() << "\"\n"); 10022 LVP.executePlan(VF.Width, 1, BestPlan, LB, DT, false); 10023 } 10024 10025 // Mark the loop as already vectorized to avoid vectorizing again. 10026 Hints.setAlreadyVectorized(); 10027 assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs())); 10028 return true; 10029 } 10030 10031 // Emit a remark if there are stores to floats that required a floating point 10032 // extension. If the vectorized loop was generated with floating point there 10033 // will be a performance penalty from the conversion overhead and the change in 10034 // the vector width. 10035 static void checkMixedPrecision(Loop *L, OptimizationRemarkEmitter *ORE) { 10036 SmallVector<Instruction *, 4> Worklist; 10037 for (BasicBlock *BB : L->getBlocks()) { 10038 for (Instruction &Inst : *BB) { 10039 if (auto *S = dyn_cast<StoreInst>(&Inst)) { 10040 if (S->getValueOperand()->getType()->isFloatTy()) 10041 Worklist.push_back(S); 10042 } 10043 } 10044 } 10045 10046 // Traverse the floating point stores upwards searching, for floating point 10047 // conversions. 10048 SmallPtrSet<const Instruction *, 4> Visited; 10049 SmallPtrSet<const Instruction *, 4> EmittedRemark; 10050 while (!Worklist.empty()) { 10051 auto *I = Worklist.pop_back_val(); 10052 if (!L->contains(I)) 10053 continue; 10054 if (!Visited.insert(I).second) 10055 continue; 10056 10057 // Emit a remark if the floating point store required a floating 10058 // point conversion. 10059 // TODO: More work could be done to identify the root cause such as a 10060 // constant or a function return type and point the user to it. 10061 if (isa<FPExtInst>(I) && EmittedRemark.insert(I).second) 10062 ORE->emit([&]() { 10063 return OptimizationRemarkAnalysis(LV_NAME, "VectorMixedPrecision", 10064 I->getDebugLoc(), L->getHeader()) 10065 << "floating point conversion changes vector width. " 10066 << "Mixed floating point precision requires an up/down " 10067 << "cast that will negatively impact performance."; 10068 }); 10069 10070 for (Use &Op : I->operands()) 10071 if (auto *OpI = dyn_cast<Instruction>(Op)) 10072 Worklist.push_back(OpI); 10073 } 10074 } 10075 10076 static bool areRuntimeChecksProfitable(GeneratedRTChecks &Checks, 10077 VectorizationFactor &VF, 10078 std::optional<unsigned> VScale, Loop *L, 10079 ScalarEvolution &SE) { 10080 InstructionCost CheckCost = Checks.getCost(); 10081 if (!CheckCost.isValid()) 10082 return false; 10083 10084 // When interleaving only scalar and vector cost will be equal, which in turn 10085 // would lead to a divide by 0. Fall back to hard threshold. 
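  // (With a scalar VF the vector and scalar per-iteration costs coincide, so
  // the MinTC1 denominator ScalarC - VecC / VF computed below would be zero.)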
10086   if (VF.Width.isScalar()) {
10087     if (CheckCost > VectorizeMemoryCheckThreshold) {
10088       LLVM_DEBUG(
10089           dbgs()
10090           << "LV: Interleaving only is not profitable due to runtime checks\n");
10091       return false;
10092     }
10093     return true;
10094   }
10095 
10096   // The scalar cost should only be 0 when vectorizing with a user-specified
        // VF/IC. In those cases, runtime checks should always be generated.
10097   double ScalarC = *VF.ScalarCost.getValue();
10098   if (ScalarC == 0)
10099     return true;
10100 
10101   // First, compute the minimum iteration count required so that the vector
10102   // loop outperforms the scalar loop.
10103   //  The total cost of the scalar loop is
10104   //    ScalarC * TC
10105   //  where
10106   //  * TC is the actual trip count of the loop.
10107   //  * ScalarC is the cost of a single scalar iteration.
10108   //
10109   //  The total cost of the vector loop is
10110   //    RtC + VecC * (TC / VF) + EpiC
10111   //  where
10112   //  * RtC is the cost of the generated runtime checks
10113   //  * VecC is the cost of a single vector iteration.
10114   //  * TC is the actual trip count of the loop
10115   //  * VF is the vectorization factor
10116   //  * EpiC is the cost of the generated epilogue, including the cost
10117   //    of the remaining scalar operations.
10118   //
10119   // Vectorization is profitable once the total vector cost is less than the
10120   // total scalar cost:
10121   //   RtC + VecC * (TC / VF) + EpiC < ScalarC * TC
10122   //
10123   // Now we can compute the minimum required trip count TC as
10124   //   (RtC + EpiC) / (ScalarC - (VecC / VF)) < TC
10125   //
10126   // For now we assume the epilogue cost EpiC = 0 for simplicity. Note that
10127   // the computations are performed on doubles, not integers, and the result
10128   // is rounded up, hence we get an upper estimate of the TC.
10129   unsigned IntVF = VF.Width.getKnownMinValue();
10130   if (VF.Width.isScalable()) {
10131     unsigned AssumedMinimumVscale = 1;
10132     if (VScale)
10133       AssumedMinimumVscale = *VScale;
10134     IntVF *= AssumedMinimumVscale;
10135   }
10136   double VecCOverVF = double(*VF.Cost.getValue()) / IntVF;
10137   double RtC = *CheckCost.getValue();
10138   double MinTC1 = RtC / (ScalarC - VecCOverVF);
10139 
10140   // Second, compute a minimum iteration count so that the cost of the
10141   // runtime checks is only a fraction of the total scalar loop cost. This
10142   // adds a loop-dependent bound on the overhead incurred if the runtime
10143   // checks fail. In case the runtime checks fail, the cost is RtC + ScalarC
10144   // * TC. To bound the runtime check to be a fraction 1/X of the scalar
10145   // cost, compute
10146   //   RtC < ScalarC * TC * (1 / X)  ==>  RtC * X / ScalarC < TC
10147   double MinTC2 = RtC * 10 / ScalarC;
10148 
10149   // Now pick the larger minimum. If it is not a multiple of VF, choose the
10150   // next closest multiple of VF. This should partly compensate for ignoring
10151   // the epilogue cost.
10152   uint64_t MinTC = std::ceil(std::max(MinTC1, MinTC2));
10153   VF.MinProfitableTripCount = ElementCount::getFixed(alignTo(MinTC, IntVF));
10154 
10155   LLVM_DEBUG(
10156       dbgs() << "LV: Minimum required TC for runtime checks to be profitable:"
10157              << VF.MinProfitableTripCount << "\n");
10158 
10159   // Skip vectorization if the expected trip count is less than the minimum
10160   // required trip count.
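  // Purely illustrative example (made-up costs, not produced by the cost
  // model): with ScalarC = 4, VecC = 8, VF = 4 and RtC = 20, MinTC1 is
  // 20 / (4 - 8/4) = 10, MinTC2 is 20 * 10 / 4 = 50, and the minimum
  // profitable trip count becomes the next multiple of VF, i.e. 52; an
  // expected trip count below that is rejected here.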
10161 if (auto ExpectedTC = getSmallBestKnownTC(SE, L)) { 10162 if (ElementCount::isKnownLT(ElementCount::getFixed(*ExpectedTC), 10163 VF.MinProfitableTripCount)) { 10164 LLVM_DEBUG(dbgs() << "LV: Vectorization is not beneficial: expected " 10165 "trip count < minimum profitable VF (" 10166 << *ExpectedTC << " < " << VF.MinProfitableTripCount 10167 << ")\n"); 10168 10169 return false; 10170 } 10171 } 10172 return true; 10173 } 10174 10175 LoopVectorizePass::LoopVectorizePass(LoopVectorizeOptions Opts) 10176 : InterleaveOnlyWhenForced(Opts.InterleaveOnlyWhenForced || 10177 !EnableLoopInterleaving), 10178 VectorizeOnlyWhenForced(Opts.VectorizeOnlyWhenForced || 10179 !EnableLoopVectorization) {} 10180 10181 bool LoopVectorizePass::processLoop(Loop *L) { 10182 assert((EnableVPlanNativePath || L->isInnermost()) && 10183 "VPlan-native path is not enabled. Only process inner loops."); 10184 10185 #ifndef NDEBUG 10186 const std::string DebugLocStr = getDebugLocString(L); 10187 #endif /* NDEBUG */ 10188 10189 LLVM_DEBUG(dbgs() << "\nLV: Checking a loop in '" 10190 << L->getHeader()->getParent()->getName() << "' from " 10191 << DebugLocStr << "\n"); 10192 10193 LoopVectorizeHints Hints(L, InterleaveOnlyWhenForced, *ORE, TTI); 10194 10195 LLVM_DEBUG( 10196 dbgs() << "LV: Loop hints:" 10197 << " force=" 10198 << (Hints.getForce() == LoopVectorizeHints::FK_Disabled 10199 ? "disabled" 10200 : (Hints.getForce() == LoopVectorizeHints::FK_Enabled 10201 ? "enabled" 10202 : "?")) 10203 << " width=" << Hints.getWidth() 10204 << " interleave=" << Hints.getInterleave() << "\n"); 10205 10206 // Function containing loop 10207 Function *F = L->getHeader()->getParent(); 10208 10209 // Looking at the diagnostic output is the only way to determine if a loop 10210 // was vectorized (other than looking at the IR or machine code), so it 10211 // is important to generate an optimization remark for each loop. Most of 10212 // these messages are generated as OptimizationRemarkAnalysis. Remarks 10213 // generated as OptimizationRemark and OptimizationRemarkMissed are 10214 // less verbose reporting vectorized loops and unvectorized loops that may 10215 // benefit from vectorization, respectively. 10216 10217 if (!Hints.allowVectorization(F, L, VectorizeOnlyWhenForced)) { 10218 LLVM_DEBUG(dbgs() << "LV: Loop hints prevent vectorization.\n"); 10219 return false; 10220 } 10221 10222 PredicatedScalarEvolution PSE(*SE, *L); 10223 10224 // Check if it is legal to vectorize the loop. 10225 LoopVectorizationRequirements Requirements; 10226 LoopVectorizationLegality LVL(L, PSE, DT, TTI, TLI, F, *LAIs, LI, ORE, 10227 &Requirements, &Hints, DB, AC, BFI, PSI); 10228 if (!LVL.canVectorize(EnableVPlanNativePath)) { 10229 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n"); 10230 Hints.emitRemarkWithHints(); 10231 return false; 10232 } 10233 10234 // Entrance to the VPlan-native vectorization path. Outer loops are processed 10235 // here. They may require CFG and instruction level transformations before 10236 // even evaluating whether vectorization is profitable. Since we cannot modify 10237 // the incoming IR, we need to build VPlan upfront in the vectorization 10238 // pipeline. 
10239   if (!L->isInnermost())
10240     return processLoopInVPlanNativePath(L, PSE, LI, DT, &LVL, TTI, TLI, DB, AC,
10241                                         ORE, BFI, PSI, Hints, Requirements);
10242 
10243   assert(L->isInnermost() && "Inner loop expected.");
10244 
10245   InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL.getLAI());
10246   bool UseInterleaved = TTI->enableInterleavedAccessVectorization();
10247 
10248   // If an override option has been passed in for interleaved accesses, use it.
10249   if (EnableInterleavedMemAccesses.getNumOccurrences() > 0)
10250     UseInterleaved = EnableInterleavedMemAccesses;
10251 
10252   // Analyze interleaved memory accesses.
10253   if (UseInterleaved)
10254     IAI.analyzeInterleaving(useMaskedInterleavedAccesses(*TTI));
10255 
10256   // Check the function attributes and profiles to find out if this function
10257   // should be optimized for size.
10258   ScalarEpilogueLowering SEL =
10259       getScalarEpilogueLowering(F, L, Hints, PSI, BFI, TTI, TLI, LVL, &IAI);
10260 
10261   // Check the loop for a trip count threshold: vectorize loops with a tiny trip
10262   // count by optimizing for size, to minimize overheads.
10263   auto ExpectedTC = getSmallBestKnownTC(*SE, L);
10264   if (ExpectedTC && *ExpectedTC < TinyTripCountVectorThreshold) {
10265     LLVM_DEBUG(dbgs() << "LV: Found a loop with a very small trip count. "
10266                       << "This loop is worth vectorizing only if no scalar "
10267                       << "iteration overheads are incurred.");
10268     if (Hints.getForce() == LoopVectorizeHints::FK_Enabled)
10269       LLVM_DEBUG(dbgs() << " But vectorizing was explicitly forced.\n");
10270     else {
10271       if (*ExpectedTC > TTI->getMinTripCountTailFoldingThreshold()) {
10272         LLVM_DEBUG(dbgs() << "\n");
10273         SEL = CM_ScalarEpilogueNotAllowedLowTripLoop;
10274       } else {
10275         LLVM_DEBUG(dbgs() << " But the target considers the trip count too "
10276                              "small to consider vectorizing.\n");
10277         reportVectorizationFailure(
10278             "The trip count is below the minimal threshold value.",
10279             "loop trip count is too low, avoiding vectorization",
10280             "LowTripCount", ORE, L);
10281         Hints.emitRemarkWithHints();
10282         return false;
10283       }
10284     }
10285   }
10286 
10287   // Check the function attributes to see if implicit floats or vectors are
10288   // allowed.
10289   if (F->hasFnAttribute(Attribute::NoImplicitFloat)) {
10290     reportVectorizationFailure(
10291         "Can't vectorize when the NoImplicitFloat attribute is used",
10292         "loop not vectorized due to NoImplicitFloat attribute",
10293         "NoImplicitFloat", ORE, L);
10294     Hints.emitRemarkWithHints();
10295     return false;
10296   }
10297 
10298   // Check if the target supports potentially unsafe FP vectorization.
10299   // FIXME: Add a check for the type of safety issue (denormal, signaling)
10300   // for the target we're vectorizing for, to make sure none of the
10301   // additional fp-math flags can help.
10302   if (Hints.isPotentiallyUnsafe() &&
10303       TTI->isFPVectorizationPotentiallyUnsafe()) {
10304     reportVectorizationFailure(
10305         "Potentially unsafe FP op prevents vectorization",
10306         "loop not vectorized due to unsafe FP support.",
10307         "UnsafeFP", ORE, L);
10308     Hints.emitRemarkWithHints();
10309     return false;
10310   }
10311 
10312   bool AllowOrderedReductions;
10313   // If the flag is set, use that instead and override the TTI behaviour.
10314 if (ForceOrderedReductions.getNumOccurrences() > 0) 10315 AllowOrderedReductions = ForceOrderedReductions; 10316 else 10317 AllowOrderedReductions = TTI->enableOrderedReductions(); 10318 if (!LVL.canVectorizeFPMath(AllowOrderedReductions)) { 10319 ORE->emit([&]() { 10320 auto *ExactFPMathInst = Requirements.getExactFPInst(); 10321 return OptimizationRemarkAnalysisFPCommute(DEBUG_TYPE, "CantReorderFPOps", 10322 ExactFPMathInst->getDebugLoc(), 10323 ExactFPMathInst->getParent()) 10324 << "loop not vectorized: cannot prove it is safe to reorder " 10325 "floating-point operations"; 10326 }); 10327 LLVM_DEBUG(dbgs() << "LV: loop not vectorized: cannot prove it is safe to " 10328 "reorder floating-point operations\n"); 10329 Hints.emitRemarkWithHints(); 10330 return false; 10331 } 10332 10333 // Use the cost model. 10334 LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE, 10335 F, &Hints, IAI); 10336 // Use the planner for vectorization. 10337 LoopVectorizationPlanner LVP(L, LI, TLI, *TTI, &LVL, CM, IAI, PSE, Hints, 10338 ORE); 10339 10340 // Get user vectorization factor and interleave count. 10341 ElementCount UserVF = Hints.getWidth(); 10342 unsigned UserIC = Hints.getInterleave(); 10343 10344 // Plan how to best vectorize, return the best VF and its cost. 10345 std::optional<VectorizationFactor> MaybeVF = LVP.plan(UserVF, UserIC); 10346 10347 VectorizationFactor VF = VectorizationFactor::Disabled(); 10348 unsigned IC = 1; 10349 10350 GeneratedRTChecks Checks(*PSE.getSE(), DT, LI, TTI, 10351 F->getParent()->getDataLayout()); 10352 if (MaybeVF) { 10353 VF = *MaybeVF; 10354 // Select the interleave count. 10355 IC = CM.selectInterleaveCount(VF.Width, VF.Cost); 10356 10357 unsigned SelectedIC = std::max(IC, UserIC); 10358 // Optimistically generate runtime checks if they are needed. Drop them if 10359 // they turn out to not be profitable. 10360 if (VF.Width.isVector() || SelectedIC > 1) 10361 Checks.Create(L, *LVL.getLAI(), PSE.getPredicate(), VF.Width, SelectedIC); 10362 10363 // Check if it is profitable to vectorize with runtime checks. 10364 bool ForceVectorization = 10365 Hints.getForce() == LoopVectorizeHints::FK_Enabled; 10366 if (!ForceVectorization && 10367 !areRuntimeChecksProfitable(Checks, VF, getVScaleForTuning(L, *TTI), L, 10368 *PSE.getSE())) { 10369 ORE->emit([&]() { 10370 return OptimizationRemarkAnalysisAliasing( 10371 DEBUG_TYPE, "CantReorderMemOps", L->getStartLoc(), 10372 L->getHeader()) 10373 << "loop not vectorized: cannot prove it is safe to reorder " 10374 "memory operations"; 10375 }); 10376 LLVM_DEBUG(dbgs() << "LV: Too many memory checks needed.\n"); 10377 Hints.emitRemarkWithHints(); 10378 return false; 10379 } 10380 } 10381 10382 // Identify the diagnostic messages that should be produced. 10383 std::pair<StringRef, std::string> VecDiagMsg, IntDiagMsg; 10384 bool VectorizeLoop = true, InterleaveLoop = true; 10385 if (VF.Width.isScalar()) { 10386 LLVM_DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n"); 10387 VecDiagMsg = std::make_pair( 10388 "VectorizationNotBeneficial", 10389 "the cost-model indicates that vectorization is not beneficial"); 10390 VectorizeLoop = false; 10391 } 10392 10393 if (!MaybeVF && UserIC > 1) { 10394 // Tell the user interleaving was avoided up-front, despite being explicitly 10395 // requested. 
10396     LLVM_DEBUG(dbgs() << "LV: Ignoring UserIC, because vectorization and "
10397                          "interleaving should be avoided up front\n");
10398     IntDiagMsg = std::make_pair(
10399         "InterleavingAvoided",
10400         "Ignoring UserIC, because interleaving was avoided up front");
10401     InterleaveLoop = false;
10402   } else if (IC == 1 && UserIC <= 1) {
10403     // Tell the user interleaving is not beneficial.
10404     LLVM_DEBUG(dbgs() << "LV: Interleaving is not beneficial.\n");
10405     IntDiagMsg = std::make_pair(
10406         "InterleavingNotBeneficial",
10407         "the cost-model indicates that interleaving is not beneficial");
10408     InterleaveLoop = false;
10409     if (UserIC == 1) {
10410       IntDiagMsg.first = "InterleavingNotBeneficialAndDisabled";
10411       IntDiagMsg.second +=
10412           " and is explicitly disabled or interleave count is set to 1";
10413     }
10414   } else if (IC > 1 && UserIC == 1) {
10415     // Tell the user interleaving is beneficial, but it is explicitly disabled.
10416     LLVM_DEBUG(
10417         dbgs() << "LV: Interleaving is beneficial but is explicitly disabled.");
10418     IntDiagMsg = std::make_pair(
10419         "InterleavingBeneficialButDisabled",
10420         "the cost-model indicates that interleaving is beneficial "
10421         "but is explicitly disabled or interleave count is set to 1");
10422     InterleaveLoop = false;
10423   }
10424 
10425   // Override IC if user provided an interleave count.
10426   IC = UserIC > 0 ? UserIC : IC;
10427 
10428   // Emit diagnostic messages, if any.
10429   const char *VAPassName = Hints.vectorizeAnalysisPassName();
10430   if (!VectorizeLoop && !InterleaveLoop) {
10431     // Do not vectorize or interleave the loop.
10432     ORE->emit([&]() {
10433       return OptimizationRemarkMissed(VAPassName, VecDiagMsg.first,
10434                                       L->getStartLoc(), L->getHeader())
10435              << VecDiagMsg.second;
10436     });
10437     ORE->emit([&]() {
10438       return OptimizationRemarkMissed(LV_NAME, IntDiagMsg.first,
10439                                       L->getStartLoc(), L->getHeader())
10440              << IntDiagMsg.second;
10441     });
10442     return false;
10443   } else if (!VectorizeLoop && InterleaveLoop) {
10444     LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
10445     ORE->emit([&]() {
10446       return OptimizationRemarkAnalysis(VAPassName, VecDiagMsg.first,
10447                                         L->getStartLoc(), L->getHeader())
10448              << VecDiagMsg.second;
10449     });
10450   } else if (VectorizeLoop && !InterleaveLoop) {
10451     LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
10452                       << ") in " << DebugLocStr << '\n');
10453     ORE->emit([&]() {
10454       return OptimizationRemarkAnalysis(LV_NAME, IntDiagMsg.first,
10455                                         L->getStartLoc(), L->getHeader())
10456              << IntDiagMsg.second;
10457     });
10458   } else if (VectorizeLoop && InterleaveLoop) {
10459     LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
10460                       << ") in " << DebugLocStr << '\n');
10461     LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
10462   }
10463 
10464   bool DisableRuntimeUnroll = false;
10465   MDNode *OrigLoopID = L->getLoopID();
10466   {
10467     using namespace ore;
10468     if (!VectorizeLoop) {
10469       assert(IC > 1 && "interleave count should not be 1 or 0");
10470       // If we decided that it is not profitable to vectorize the loop, then
10471       // interleave it.
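      // Interleaving with a scalar VF still emits IC copies of the loop body
      // per iteration of the generated loop, which can expose additional ILP,
      // e.g. by giving a reduction several independent accumulators.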
10472 InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL, 10473 &CM, BFI, PSI, Checks); 10474 10475 VPlan &BestPlan = LVP.getBestPlanFor(VF.Width); 10476 LVP.executePlan(VF.Width, IC, BestPlan, Unroller, DT, false); 10477 10478 ORE->emit([&]() { 10479 return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(), 10480 L->getHeader()) 10481 << "interleaved loop (interleaved count: " 10482 << NV("InterleaveCount", IC) << ")"; 10483 }); 10484 } else { 10485 // If we decided that it is *legal* to vectorize the loop, then do it. 10486 10487 // Consider vectorizing the epilogue too if it's profitable. 10488 VectorizationFactor EpilogueVF = 10489 LVP.selectEpilogueVectorizationFactor(VF.Width, IC); 10490 if (EpilogueVF.Width.isVector()) { 10491 10492 // The first pass vectorizes the main loop and creates a scalar epilogue 10493 // to be vectorized by executing the plan (potentially with a different 10494 // factor) again shortly afterwards. 10495 EpilogueLoopVectorizationInfo EPI(VF.Width, IC, EpilogueVF.Width, 1); 10496 EpilogueVectorizerMainLoop MainILV(L, PSE, LI, DT, TLI, TTI, AC, ORE, 10497 EPI, &LVL, &CM, BFI, PSI, Checks); 10498 10499 VPlan &BestMainPlan = LVP.getBestPlanFor(EPI.MainLoopVF); 10500 auto ExpandedSCEVs = LVP.executePlan(EPI.MainLoopVF, EPI.MainLoopUF, 10501 BestMainPlan, MainILV, DT, true); 10502 ++LoopsVectorized; 10503 10504 // Second pass vectorizes the epilogue and adjusts the control flow 10505 // edges from the first pass. 10506 EPI.MainLoopVF = EPI.EpilogueVF; 10507 EPI.MainLoopUF = EPI.EpilogueUF; 10508 EpilogueVectorizerEpilogueLoop EpilogILV(L, PSE, LI, DT, TLI, TTI, AC, 10509 ORE, EPI, &LVL, &CM, BFI, PSI, 10510 Checks); 10511 10512 VPlan &BestEpiPlan = LVP.getBestPlanFor(EPI.EpilogueVF); 10513 VPRegionBlock *VectorLoop = BestEpiPlan.getVectorLoopRegion(); 10514 VPBasicBlock *Header = VectorLoop->getEntryBasicBlock(); 10515 Header->setName("vec.epilog.vector.body"); 10516 10517 // Re-use the trip count and steps expanded for the main loop, as 10518 // skeleton creation needs it as a value that dominates both the scalar 10519 // and vector epilogue loops 10520 // TODO: This is a workaround needed for epilogue vectorization and it 10521 // should be removed once induction resume value creation is done 10522 // directly in VPlan. 10523 EpilogILV.setTripCount(MainILV.getTripCount()); 10524 for (auto &R : make_early_inc_range(*BestEpiPlan.getPreheader())) { 10525 auto *ExpandR = cast<VPExpandSCEVRecipe>(&R); 10526 auto *ExpandedVal = BestEpiPlan.getVPValueOrAddLiveIn( 10527 ExpandedSCEVs.find(ExpandR->getSCEV())->second); 10528 ExpandR->replaceAllUsesWith(ExpandedVal); 10529 ExpandR->eraseFromParent(); 10530 } 10531 10532 // Ensure that the start values for any VPWidenIntOrFpInductionRecipe, 10533 // VPWidenPointerInductionRecipe and VPReductionPHIRecipes are updated 10534 // before vectorizing the epilogue loop. 10535 for (VPRecipeBase &R : Header->phis()) { 10536 if (isa<VPCanonicalIVPHIRecipe>(&R)) 10537 continue; 10538 10539 Value *ResumeV = nullptr; 10540 // TODO: Move setting of resume values to prepareToExecute. 10541 if (auto *ReductionPhi = dyn_cast<VPReductionPHIRecipe>(&R)) { 10542 ResumeV = MainILV.getReductionResumeValue( 10543 ReductionPhi->getRecurrenceDescriptor()); 10544 } else { 10545 // Create induction resume values for both widened pointer and 10546 // integer/fp inductions and update the start value of the induction 10547 // recipes to use the resume value. 
10548 PHINode *IndPhi = nullptr; 10549 const InductionDescriptor *ID; 10550 if (auto *Ind = dyn_cast<VPWidenPointerInductionRecipe>(&R)) { 10551 IndPhi = cast<PHINode>(Ind->getUnderlyingValue()); 10552 ID = &Ind->getInductionDescriptor(); 10553 } else { 10554 auto *WidenInd = cast<VPWidenIntOrFpInductionRecipe>(&R); 10555 IndPhi = WidenInd->getPHINode(); 10556 ID = &WidenInd->getInductionDescriptor(); 10557 } 10558 10559 ResumeV = MainILV.createInductionResumeValue( 10560 IndPhi, *ID, getExpandedStep(*ID, ExpandedSCEVs), 10561 {EPI.MainLoopIterationCountCheck}); 10562 } 10563 assert(ResumeV && "Must have a resume value"); 10564 VPValue *StartVal = BestEpiPlan.getVPValueOrAddLiveIn(ResumeV); 10565 cast<VPHeaderPHIRecipe>(&R)->setStartValue(StartVal); 10566 } 10567 10568 LVP.executePlan(EPI.EpilogueVF, EPI.EpilogueUF, BestEpiPlan, EpilogILV, 10569 DT, true, &ExpandedSCEVs); 10570 ++LoopsEpilogueVectorized; 10571 10572 if (!MainILV.areSafetyChecksAdded()) 10573 DisableRuntimeUnroll = true; 10574 } else { 10575 InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, 10576 VF.MinProfitableTripCount, IC, &LVL, &CM, BFI, 10577 PSI, Checks); 10578 10579 VPlan &BestPlan = LVP.getBestPlanFor(VF.Width); 10580 LVP.executePlan(VF.Width, IC, BestPlan, LB, DT, false); 10581 ++LoopsVectorized; 10582 10583 // Add metadata to disable runtime unrolling a scalar loop when there 10584 // are no runtime checks about strides and memory. A scalar loop that is 10585 // rarely used is not worth unrolling. 10586 if (!LB.areSafetyChecksAdded()) 10587 DisableRuntimeUnroll = true; 10588 } 10589 // Report the vectorization decision. 10590 ORE->emit([&]() { 10591 return OptimizationRemark(LV_NAME, "Vectorized", L->getStartLoc(), 10592 L->getHeader()) 10593 << "vectorized loop (vectorization width: " 10594 << NV("VectorizationFactor", VF.Width) 10595 << ", interleaved count: " << NV("InterleaveCount", IC) << ")"; 10596 }); 10597 } 10598 10599 if (ORE->allowExtraAnalysis(LV_NAME)) 10600 checkMixedPrecision(L, ORE); 10601 } 10602 10603 std::optional<MDNode *> RemainderLoopID = 10604 makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll, 10605 LLVMLoopVectorizeFollowupEpilogue}); 10606 if (RemainderLoopID) { 10607 L->setLoopID(*RemainderLoopID); 10608 } else { 10609 if (DisableRuntimeUnroll) 10610 AddRuntimeUnrollDisableMetaData(L); 10611 10612 // Mark the loop as already vectorized to avoid vectorizing again. 10613 Hints.setAlreadyVectorized(); 10614 } 10615 10616 assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs())); 10617 return true; 10618 } 10619 10620 LoopVectorizeResult LoopVectorizePass::runImpl( 10621 Function &F, ScalarEvolution &SE_, LoopInfo &LI_, TargetTransformInfo &TTI_, 10622 DominatorTree &DT_, BlockFrequencyInfo *BFI_, TargetLibraryInfo *TLI_, 10623 DemandedBits &DB_, AssumptionCache &AC_, LoopAccessInfoManager &LAIs_, 10624 OptimizationRemarkEmitter &ORE_, ProfileSummaryInfo *PSI_) { 10625 SE = &SE_; 10626 LI = &LI_; 10627 TTI = &TTI_; 10628 DT = &DT_; 10629 BFI = BFI_; 10630 TLI = TLI_; 10631 AC = &AC_; 10632 LAIs = &LAIs_; 10633 DB = &DB_; 10634 ORE = &ORE_; 10635 PSI = PSI_; 10636 10637 // Don't attempt if 10638 // 1. the target claims to have no vector registers, and 10639 // 2. interleaving won't help ILP. 10640 // 10641 // The second condition is necessary because, even if the target has no 10642 // vector registers, loop vectorization may still enable scalar 10643 // interleaving. 
10644 if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true)) && 10645 TTI->getMaxInterleaveFactor(ElementCount::getFixed(1)) < 2) 10646 return LoopVectorizeResult(false, false); 10647 10648 bool Changed = false, CFGChanged = false; 10649 10650 // The vectorizer requires loops to be in simplified form. 10651 // Since simplification may add new inner loops, it has to run before the 10652 // legality and profitability checks. This means running the loop vectorizer 10653 // will simplify all loops, regardless of whether anything end up being 10654 // vectorized. 10655 for (const auto &L : *LI) 10656 Changed |= CFGChanged |= 10657 simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */); 10658 10659 // Build up a worklist of inner-loops to vectorize. This is necessary as 10660 // the act of vectorizing or partially unrolling a loop creates new loops 10661 // and can invalidate iterators across the loops. 10662 SmallVector<Loop *, 8> Worklist; 10663 10664 for (Loop *L : *LI) 10665 collectSupportedLoops(*L, LI, ORE, Worklist); 10666 10667 LoopsAnalyzed += Worklist.size(); 10668 10669 // Now walk the identified inner loops. 10670 while (!Worklist.empty()) { 10671 Loop *L = Worklist.pop_back_val(); 10672 10673 // For the inner loops we actually process, form LCSSA to simplify the 10674 // transform. 10675 Changed |= formLCSSARecursively(*L, *DT, LI, SE); 10676 10677 Changed |= CFGChanged |= processLoop(L); 10678 10679 if (Changed) 10680 LAIs->clear(); 10681 } 10682 10683 // Process each loop nest in the function. 10684 return LoopVectorizeResult(Changed, CFGChanged); 10685 } 10686 10687 PreservedAnalyses LoopVectorizePass::run(Function &F, 10688 FunctionAnalysisManager &AM) { 10689 auto &LI = AM.getResult<LoopAnalysis>(F); 10690 // There are no loops in the function. Return before computing other expensive 10691 // analyses. 10692 if (LI.empty()) 10693 return PreservedAnalyses::all(); 10694 auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F); 10695 auto &TTI = AM.getResult<TargetIRAnalysis>(F); 10696 auto &DT = AM.getResult<DominatorTreeAnalysis>(F); 10697 auto &TLI = AM.getResult<TargetLibraryAnalysis>(F); 10698 auto &AC = AM.getResult<AssumptionAnalysis>(F); 10699 auto &DB = AM.getResult<DemandedBitsAnalysis>(F); 10700 auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F); 10701 10702 LoopAccessInfoManager &LAIs = AM.getResult<LoopAccessAnalysis>(F); 10703 auto &MAMProxy = AM.getResult<ModuleAnalysisManagerFunctionProxy>(F); 10704 ProfileSummaryInfo *PSI = 10705 MAMProxy.getCachedResult<ProfileSummaryAnalysis>(*F.getParent()); 10706 BlockFrequencyInfo *BFI = nullptr; 10707 if (PSI && PSI->hasProfileSummary()) 10708 BFI = &AM.getResult<BlockFrequencyAnalysis>(F); 10709 LoopVectorizeResult Result = 10710 runImpl(F, SE, LI, TTI, DT, BFI, &TLI, DB, AC, LAIs, ORE, PSI); 10711 if (!Result.MadeAnyChange) 10712 return PreservedAnalyses::all(); 10713 PreservedAnalyses PA; 10714 10715 if (isAssignmentTrackingEnabled(*F.getParent())) { 10716 for (auto &BB : F) 10717 RemoveRedundantDbgInstrs(&BB); 10718 } 10719 10720 // We currently do not preserve loopinfo/dominator analyses with outer loop 10721 // vectorization. Until this is addressed, mark these analyses as preserved 10722 // only for non-VPlan-native path. 10723 // TODO: Preserve Loop and Dominator analyses for VPlan-native path. 
10724   if (!EnableVPlanNativePath) {
10725     PA.preserve<LoopAnalysis>();
10726     PA.preserve<DominatorTreeAnalysis>();
10727     PA.preserve<ScalarEvolutionAnalysis>();
10728 
10729 #ifdef EXPENSIVE_CHECKS
10730     SE.verify();
10731 #endif
10732   }
10733 
10734   if (Result.MadeCFGChange) {
10735     // Making CFG changes likely means a loop got vectorized. Indicate that
10736     // extra simplification passes should be run.
10737     // TODO: MadeCFGChange is not a perfect proxy. Extra passes should only
10738     // be run if runtime checks have been added.
10739     AM.getResult<ShouldRunExtraVectorPasses>(F);
10740     PA.preserve<ShouldRunExtraVectorPasses>();
10741   } else {
10742     PA.preserveSet<CFGAnalyses>();
10743   }
10744   return PA;
10745 }
10746 
10747 void LoopVectorizePass::printPipeline(
10748     raw_ostream &OS, function_ref<StringRef(StringRef)> MapClassName2PassName) {
10749   static_cast<PassInfoMixin<LoopVectorizePass> *>(this)->printPipeline(
10750       OS, MapClassName2PassName);
10751 
10752   OS << '<';
10753   OS << (InterleaveOnlyWhenForced ? "" : "no-") << "interleave-forced-only;";
10754   OS << (VectorizeOnlyWhenForced ? "" : "no-") << "vectorize-forced-only;";
10755   OS << '>';
10756 }
10757 
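// Note: given the printing above, a default-constructed LoopVectorizePass
// (interleaving and vectorization both enabled) is expected to appear in
// -print-pipeline-passes output roughly as
//   loop-vectorize<no-interleave-forced-only;no-vectorize-forced-only;>
// with the exact pass name supplied by MapClassName2PassName.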