1 //===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
10 // and generates target-independent LLVM-IR.
11 // The vectorizer uses the TargetTransformInfo analysis to estimate the costs
12 // of instructions in order to estimate the profitability of vectorization.
13 //
14 // The loop vectorizer combines consecutive loop iterations into a single
15 // 'wide' iteration. After this transformation the index is incremented
16 // by the SIMD vector width, and not by one.
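//
// For illustration only (a simplified sketch, not the exact IR this pass
// emits): with a vectorization factor of 4, a scalar loop such as
//
//   for (i = 0; i < n; ++i)
//     a[i] = b[i] + 1;
//
// is conceptually rewritten so that each wide iteration processes four
// elements at once,
//
//   for (i = 0; i + 4 <= n; i += 4)
//     a[i:i+4] = b[i:i+4] + 1;   // one SIMD add per wide iteration
//
// with any remaining n % 4 iterations handled by a scalar epilogue loop.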
17 //
18 // This pass has four parts:
19 // 1. The main loop pass that drives the different parts.
20 // 2. LoopVectorizationLegality - A unit that checks for the legality
21 // of the vectorization.
22 // 3. InnerLoopVectorizer - A unit that performs the actual
23 // widening of instructions.
24 // 4. LoopVectorizationCostModel - A unit that checks for the profitability
25 // of vectorization. It decides on the optimal vector width, which
26 // can be one, if vectorization is not profitable.
27 //
28 // There is a development effort going on to migrate the loop vectorizer to the
29 // VPlan infrastructure and to introduce outer loop vectorization support (see
30 // docs/Proposal/VectorizationPlan.rst and
31 // http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html). For this
32 // purpose, we temporarily introduced the VPlan-native vectorization path: an
33 // alternative vectorization path that is natively implemented on top of the
34 // VPlan infrastructure. See EnableVPlanNativePath for enabling.
35 //
36 //===----------------------------------------------------------------------===//
37 //
38 // The reduction-variable vectorization is based on the paper:
39 // D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
40 //
41 // Variable uniformity checks are inspired by:
42 // Karrenberg, R. and Hack, S. Whole Function Vectorization.
43 //
44 // The interleaved access vectorization is based on the paper:
45 // Dorit Nuzman, Ira Rosen and Ayal Zaks. Auto-Vectorization of Interleaved
46 // Data for SIMD
47 //
48 // Other ideas/concepts are from:
49 // A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
50 //
51 // S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua. An Evaluation of
52 // Vectorizing Compilers.
53 //
54 //===----------------------------------------------------------------------===//
55
56 #include "llvm/Transforms/Vectorize/LoopVectorize.h"
57 #include "LoopVectorizationPlanner.h"
58 #include "VPRecipeBuilder.h"
59 #include "VPlan.h"
60 #include "VPlanHCFGBuilder.h"
61 #include "VPlanTransforms.h"
62 #include "llvm/ADT/APInt.h"
63 #include "llvm/ADT/ArrayRef.h"
64 #include "llvm/ADT/DenseMap.h"
65 #include "llvm/ADT/DenseMapInfo.h"
66 #include "llvm/ADT/Hashing.h"
67 #include "llvm/ADT/MapVector.h"
68 #include "llvm/ADT/STLExtras.h"
69 #include "llvm/ADT/SmallPtrSet.h"
70 #include "llvm/ADT/SmallSet.h"
71 #include "llvm/ADT/SmallVector.h"
72 #include "llvm/ADT/Statistic.h"
73 #include "llvm/ADT/StringRef.h"
74 #include "llvm/ADT/Twine.h"
75 #include "llvm/ADT/iterator_range.h"
76 #include "llvm/Analysis/AssumptionCache.h"
77 #include "llvm/Analysis/BasicAliasAnalysis.h"
78 #include "llvm/Analysis/BlockFrequencyInfo.h"
79 #include "llvm/Analysis/CFG.h"
80 #include "llvm/Analysis/CodeMetrics.h"
81 #include "llvm/Analysis/DemandedBits.h"
82 #include "llvm/Analysis/GlobalsModRef.h"
83 #include "llvm/Analysis/LoopAccessAnalysis.h"
84 #include "llvm/Analysis/LoopAnalysisManager.h"
85 #include "llvm/Analysis/LoopInfo.h"
86 #include "llvm/Analysis/LoopIterator.h"
87 #include "llvm/Analysis/OptimizationRemarkEmitter.h"
88 #include "llvm/Analysis/ProfileSummaryInfo.h"
89 #include "llvm/Analysis/ScalarEvolution.h"
90 #include "llvm/Analysis/ScalarEvolutionExpressions.h"
91 #include "llvm/Analysis/TargetLibraryInfo.h"
92 #include "llvm/Analysis/TargetTransformInfo.h"
93 #include "llvm/Analysis/ValueTracking.h"
94 #include "llvm/Analysis/VectorUtils.h"
95 #include "llvm/IR/Attributes.h"
96 #include "llvm/IR/BasicBlock.h"
97 #include "llvm/IR/CFG.h"
98 #include "llvm/IR/Constant.h"
99 #include "llvm/IR/Constants.h"
100 #include "llvm/IR/DataLayout.h"
101 #include "llvm/IR/DebugInfoMetadata.h"
102 #include "llvm/IR/DebugLoc.h"
103 #include "llvm/IR/DerivedTypes.h"
104 #include "llvm/IR/DiagnosticInfo.h"
105 #include "llvm/IR/Dominators.h"
106 #include "llvm/IR/Function.h"
107 #include "llvm/IR/IRBuilder.h"
108 #include "llvm/IR/InstrTypes.h"
109 #include "llvm/IR/Instruction.h"
110 #include "llvm/IR/Instructions.h"
111 #include "llvm/IR/IntrinsicInst.h"
112 #include "llvm/IR/Intrinsics.h"
113 #include "llvm/IR/Metadata.h"
114 #include "llvm/IR/Module.h"
115 #include "llvm/IR/Operator.h"
116 #include "llvm/IR/PatternMatch.h"
117 #include "llvm/IR/Type.h"
118 #include "llvm/IR/Use.h"
119 #include "llvm/IR/User.h"
120 #include "llvm/IR/Value.h"
121 #include "llvm/IR/ValueHandle.h"
122 #include "llvm/IR/Verifier.h"
123 #include "llvm/InitializePasses.h"
124 #include "llvm/Pass.h"
125 #include "llvm/Support/Casting.h"
126 #include "llvm/Support/CommandLine.h"
127 #include "llvm/Support/Compiler.h"
128 #include "llvm/Support/Debug.h"
129 #include "llvm/Support/ErrorHandling.h"
130 #include "llvm/Support/InstructionCost.h"
131 #include "llvm/Support/MathExtras.h"
132 #include "llvm/Support/raw_ostream.h"
133 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
134 #include "llvm/Transforms/Utils/InjectTLIMappings.h"
135 #include "llvm/Transforms/Utils/LoopSimplify.h"
136 #include "llvm/Transforms/Utils/LoopUtils.h"
137 #include "llvm/Transforms/Utils/LoopVersioning.h"
138 #include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
139 #include "llvm/Transforms/Utils/SizeOpts.h"
140 #include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
141 #include <algorithm>
142 #include <cassert>
143 #include <cmath>
144 #include <cstdint>
145 #include <functional>
146 #include <iterator>
147 #include <limits>
148 #include <map>
149 #include <memory>
150 #include <string>
151 #include <tuple>
152 #include <utility>
153
154 using namespace llvm;
155
156 #define LV_NAME "loop-vectorize"
157 #define DEBUG_TYPE LV_NAME
158
159 #ifndef NDEBUG
160 const char VerboseDebug[] = DEBUG_TYPE "-verbose";
161 #endif
162
163 /// @{
164 /// Metadata attribute names
165 const char LLVMLoopVectorizeFollowupAll[] = "llvm.loop.vectorize.followup_all";
166 const char LLVMLoopVectorizeFollowupVectorized[] =
167 "llvm.loop.vectorize.followup_vectorized";
168 const char LLVMLoopVectorizeFollowupEpilogue[] =
169 "llvm.loop.vectorize.followup_epilogue";
170 /// @}
171
172 STATISTIC(LoopsVectorized, "Number of loops vectorized");
173 STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");
174 STATISTIC(LoopsEpilogueVectorized, "Number of epilogues vectorized");
175
176 static cl::opt<bool> EnableEpilogueVectorization(
177 "enable-epilogue-vectorization", cl::init(true), cl::Hidden,
178 cl::desc("Enable vectorization of epilogue loops."));
179
180 static cl::opt<unsigned> EpilogueVectorizationForceVF(
181 "epilogue-vectorization-force-VF", cl::init(1), cl::Hidden,
182 cl::desc("When epilogue vectorization is enabled, and a value greater than "
183 "1 is specified, forces the given VF for all applicable epilogue "
184 "loops."));
185
186 static cl::opt<unsigned> EpilogueVectorizationMinVF(
187 "epilogue-vectorization-minimum-VF", cl::init(16), cl::Hidden,
188 cl::desc("Only loops with vectorization factor equal to or larger than "
189 "the specified value are considered for epilogue vectorization."));
190
191 /// Loops with a known constant trip count below this number are vectorized only
192 /// if no scalar iteration overheads are incurred.
193 static cl::opt<unsigned> TinyTripCountVectorThreshold(
194 "vectorizer-min-trip-count", cl::init(16), cl::Hidden,
195 cl::desc("Loops with a constant trip count that is smaller than this "
196 "value are vectorized only if no scalar iteration overheads "
197 "are incurred."));
198
199 static cl::opt<unsigned> VectorizeMemoryCheckThreshold(
200 "vectorize-memory-check-threshold", cl::init(128), cl::Hidden,
201 cl::desc("The maximum allowed number of runtime memory checks"));
202
203 // Option prefer-predicate-over-epilogue indicates that an epilogue is undesired,
204 // that predication is preferred, and this lists all options. I.e., the
205 // vectorizer will try to fold the tail-loop (epilogue) into the vector body
206 // and predicate the instructions accordingly. If tail-folding fails, there are
207 // different fallback strategies depending on these values:
208 namespace PreferPredicateTy {
209 enum Option {
210 ScalarEpilogue = 0,
211 PredicateElseScalarEpilogue,
212 PredicateOrDontVectorize
213 };
214 } // namespace PreferPredicateTy
215
216 static cl::opt<PreferPredicateTy::Option> PreferPredicateOverEpilogue(
217 "prefer-predicate-over-epilogue",
218 cl::init(PreferPredicateTy::ScalarEpilogue),
219 cl::Hidden,
220 cl::desc("Tail-folding and predication preferences over creating a scalar "
221 "epilogue loop."),
222 cl::values(clEnumValN(PreferPredicateTy::ScalarEpilogue,
223 "scalar-epilogue",
224 "Don't tail-predicate loops, create scalar epilogue"),
225 clEnumValN(PreferPredicateTy::PredicateElseScalarEpilogue,
226 "predicate-else-scalar-epilogue",
227 "prefer tail-folding, create scalar epilogue if tail "
228 "folding fails."),
229 clEnumValN(PreferPredicateTy::PredicateOrDontVectorize,
230 "predicate-dont-vectorize",
231 "prefers tail-folding, don't attempt vectorization if "
232 "tail-folding fails.")));
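// As a usage sketch (assuming the pass is driven through 'opt'; adjust for
// your driver), the option can be exercised with, e.g.:
//   opt -passes=loop-vectorize \
//       -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue <input>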
233
234 static cl::opt<bool> MaximizeBandwidth(
235 "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
236 cl::desc("Maximize bandwidth when selecting vectorization factor which "
237              "will be determined by the smallest type in the loop."));
238
239 static cl::opt<bool> EnableInterleavedMemAccesses(
240 "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden,
241 cl::desc("Enable vectorization on interleaved memory accesses in a loop"));
242
243 /// An interleave-group may need masking if it resides in a block that needs
244 /// predication, or in order to mask away gaps.
245 static cl::opt<bool> EnableMaskedInterleavedMemAccesses(
246 "enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden,
247 cl::desc("Enable vectorization on masked interleaved memory accesses in a loop"));
248
249 static cl::opt<unsigned> TinyTripCountInterleaveThreshold(
250 "tiny-trip-count-interleave-threshold", cl::init(128), cl::Hidden,
251     cl::desc("We don't interleave loops with an estimated constant trip count "
252 "below this number"));
253
254 static cl::opt<unsigned> ForceTargetNumScalarRegs(
255 "force-target-num-scalar-regs", cl::init(0), cl::Hidden,
256 cl::desc("A flag that overrides the target's number of scalar registers."));
257
258 static cl::opt<unsigned> ForceTargetNumVectorRegs(
259 "force-target-num-vector-regs", cl::init(0), cl::Hidden,
260 cl::desc("A flag that overrides the target's number of vector registers."));
261
262 static cl::opt<unsigned> ForceTargetMaxScalarInterleaveFactor(
263 "force-target-max-scalar-interleave", cl::init(0), cl::Hidden,
264 cl::desc("A flag that overrides the target's max interleave factor for "
265 "scalar loops."));
266
267 static cl::opt<unsigned> ForceTargetMaxVectorInterleaveFactor(
268 "force-target-max-vector-interleave", cl::init(0), cl::Hidden,
269 cl::desc("A flag that overrides the target's max interleave factor for "
270 "vectorized loops."));
271
272 static cl::opt<unsigned> ForceTargetInstructionCost(
273 "force-target-instruction-cost", cl::init(0), cl::Hidden,
274 cl::desc("A flag that overrides the target's expected cost for "
275 "an instruction to a single constant value. Mostly "
276 "useful for getting consistent testing."));
277
278 static cl::opt<bool> ForceTargetSupportsScalableVectors(
279 "force-target-supports-scalable-vectors", cl::init(false), cl::Hidden,
280 cl::desc(
281 "Pretend that scalable vectors are supported, even if the target does "
282 "not support them. This flag should only be used for testing."));
283
284 static cl::opt<unsigned> SmallLoopCost(
285 "small-loop-cost", cl::init(20), cl::Hidden,
286 cl::desc(
287 "The cost of a loop that is considered 'small' by the interleaver."));
288
289 static cl::opt<bool> LoopVectorizeWithBlockFrequency(
290 "loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden,
291 cl::desc("Enable the use of the block frequency analysis to access PGO "
292 "heuristics minimizing code growth in cold regions and being more "
293 "aggressive in hot regions."));
294
295 // Runtime interleave loops for load/store throughput.
296 static cl::opt<bool> EnableLoadStoreRuntimeInterleave(
297 "enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden,
298 cl::desc(
299 "Enable runtime interleaving until load/store ports are saturated"));
300
301 /// Interleave small loops with scalar reductions.
302 static cl::opt<bool> InterleaveSmallLoopScalarReduction(
303 "interleave-small-loop-scalar-reduction", cl::init(false), cl::Hidden,
304 cl::desc("Enable interleaving for loops with small iteration counts that "
305 "contain scalar reductions to expose ILP."));
306
307 /// The number of stores in a loop that are allowed to need predication.
308 static cl::opt<unsigned> NumberOfStoresToPredicate(
309 "vectorize-num-stores-pred", cl::init(1), cl::Hidden,
310 cl::desc("Max number of stores to be predicated behind an if."));
311
312 static cl::opt<bool> EnableIndVarRegisterHeur(
313 "enable-ind-var-reg-heur", cl::init(true), cl::Hidden,
314 cl::desc("Count the induction variable only once when interleaving"));
315
316 static cl::opt<bool> EnableCondStoresVectorization(
317 "enable-cond-stores-vec", cl::init(true), cl::Hidden,
318 cl::desc("Enable if predication of stores during vectorization."));
319
320 static cl::opt<unsigned> MaxNestedScalarReductionIC(
321 "max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden,
322 cl::desc("The maximum interleave count to use when interleaving a scalar "
323 "reduction in a nested loop."));
324
325 static cl::opt<bool>
326 PreferInLoopReductions("prefer-inloop-reductions", cl::init(false),
327 cl::Hidden,
328 cl::desc("Prefer in-loop vector reductions, "
329                                     "overriding the target's preference."));
330
331 static cl::opt<bool> ForceOrderedReductions(
332 "force-ordered-reductions", cl::init(false), cl::Hidden,
333 cl::desc("Enable the vectorisation of loops with in-order (strict) "
334 "FP reductions"));
335
336 static cl::opt<bool> PreferPredicatedReductionSelect(
337 "prefer-predicated-reduction-select", cl::init(false), cl::Hidden,
338 cl::desc(
339 "Prefer predicating a reduction operation over an after loop select."));
340
341 cl::opt<bool> EnableVPlanNativePath(
342 "enable-vplan-native-path", cl::init(false), cl::Hidden,
343 cl::desc("Enable VPlan-native vectorization path with "
344 "support for outer loop vectorization."));
345
346 // This flag enables the stress testing of the VPlan H-CFG construction in the
347 // VPlan-native vectorization path. It must be used in conjunction with
348 // -enable-vplan-native-path. -vplan-verify-hcfg can also be used to enable the
349 // verification of the H-CFGs built.
350 static cl::opt<bool> VPlanBuildStressTest(
351 "vplan-build-stress-test", cl::init(false), cl::Hidden,
352 cl::desc(
353 "Build VPlan for every supported loop nest in the function and bail "
354 "out right after the build (stress test the VPlan H-CFG construction "
355 "in the VPlan-native vectorization path)."));
356
357 cl::opt<bool> llvm::EnableLoopInterleaving(
358 "interleave-loops", cl::init(true), cl::Hidden,
359 cl::desc("Enable loop interleaving in Loop vectorization passes"));
360 cl::opt<bool> llvm::EnableLoopVectorization(
361 "vectorize-loops", cl::init(true), cl::Hidden,
362 cl::desc("Run the Loop vectorization passes"));
363
364 static cl::opt<bool> PrintVPlansInDotFormat(
365 "vplan-print-in-dot-format", cl::Hidden,
366 cl::desc("Use dot format instead of plain text when dumping VPlans"));
367
368 static cl::opt<cl::boolOrDefault> ForceSafeDivisor(
369 "force-widen-divrem-via-safe-divisor", cl::Hidden,
370 cl::desc(
371 "Override cost based safe divisor widening for div/rem instructions"));
372
373 /// A helper function that returns true if the given type is irregular. The
374 /// type is irregular if its allocated size doesn't equal the store size of an
375 /// element of the corresponding vector type.
376 static bool hasIrregularType(Type *Ty, const DataLayout &DL) {
377 // Determine if an array of N elements of type Ty is "bitcast compatible"
378 // with a <N x Ty> vector.
379 // This is only true if there is no padding between the array elements.
380 return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty);
381 }
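// Illustrative examples (target and data-layout dependent): an i1 has a type
// size of 1 bit but an alloc size of 8 bits, and an x86_fp80 has a type size
// of 80 bits but an alloc size of 96 or 128 bits, so both are irregular here;
// i32 and float have matching sizes and are regular.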
382
383 /// A helper function that returns the reciprocal of the block probability of
384 /// predicated blocks. If we return X, we are assuming the predicated block
385 /// will execute once for every X iterations of the loop header.
386 ///
387 /// TODO: We should use actual block probability here, if available. Currently,
388 /// we always assume predicated blocks have a 50% chance of executing.
389 static unsigned getReciprocalPredBlockProb() { return 2; }
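// For example, a cost C incurred inside a predicated block is discounted to
// C / getReciprocalPredBlockProb() = C / 2 when accumulating per-iteration
// costs, reflecting the assumed 50% execution probability.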
390
391 /// A helper function that returns an integer or floating-point constant with
392 /// value C.
393 static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) {
394 return Ty->isIntegerTy() ? ConstantInt::getSigned(Ty, C)
395 : ConstantFP::get(Ty, C);
396 }
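// For example, getSignedIntOrFpConstant(Int32Ty, -1) yields the i32 constant
// -1, while for a float type it yields the floating-point constant -1.0.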
397
398 /// Returns "best known" trip count for the specified loop \p L as defined by
399 /// the following procedure:
400 /// 1) Returns exact trip count if it is known.
401 /// 2) Returns expected trip count according to profile data if any.
402 /// 3) Returns upper bound estimate if it is known.
403 /// 4) Returns std::nullopt if all of the above failed.
404 static std::optional<unsigned> getSmallBestKnownTC(ScalarEvolution &SE,
405 Loop *L) {
406 // Check if exact trip count is known.
407 if (unsigned ExpectedTC = SE.getSmallConstantTripCount(L))
408 return ExpectedTC;
409
410 // Check if there is an expected trip count available from profile data.
411 if (LoopVectorizeWithBlockFrequency)
412 if (auto EstimatedTC = getLoopEstimatedTripCount(L))
413 return *EstimatedTC;
414
415 // Check if upper bound estimate is known.
416 if (unsigned ExpectedTC = SE.getSmallConstantMaxTripCount(L))
417 return ExpectedTC;
418
419 return std::nullopt;
420 }
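// As a sketch of how this is typically consumed, callers compare the result
// against thresholds such as TinyTripCountVectorThreshold, e.g.:
//   if (auto ExpectedTC = getSmallBestKnownTC(SE, L))
//     if (*ExpectedTC < TinyTripCountVectorThreshold)
//       ; // treat L as a low-trip-count loop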
421
422 namespace {
423 // Forward declare GeneratedRTChecks.
424 class GeneratedRTChecks;
425 } // namespace
426
427 namespace llvm {
428
429 AnalysisKey ShouldRunExtraVectorPasses::Key;
430
431 /// InnerLoopVectorizer vectorizes loops which contain only one basic
432 /// block to a specified vectorization factor (VF).
433 /// This class performs the widening of scalars into vectors, or multiple
434 /// scalars. This class also implements the following features:
435 /// * It inserts an epilogue loop for handling loops that don't have iteration
436 /// counts that are known to be a multiple of the vectorization factor.
437 /// * It handles the code generation for reduction variables.
438 /// * Scalarization (implementation using scalars) of un-vectorizable
439 /// instructions.
440 /// InnerLoopVectorizer does not perform any vectorization-legality
441 /// checks, and relies on the caller to check for the different legality
442 /// aspects. The InnerLoopVectorizer relies on the
443 /// LoopVectorizationLegality class to provide information about the induction
444 /// and reduction variables that were found to a given vectorization factor.
445 class InnerLoopVectorizer {
446 public:
447   InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
448 LoopInfo *LI, DominatorTree *DT,
449 const TargetLibraryInfo *TLI,
450 const TargetTransformInfo *TTI, AssumptionCache *AC,
451 OptimizationRemarkEmitter *ORE, ElementCount VecWidth,
452 ElementCount MinProfitableTripCount,
453 unsigned UnrollFactor, LoopVectorizationLegality *LVL,
454 LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
455 ProfileSummaryInfo *PSI, GeneratedRTChecks &RTChecks)
456 : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
457 AC(AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor),
458 Builder(PSE.getSE()->getContext()), Legal(LVL), Cost(CM), BFI(BFI),
459 PSI(PSI), RTChecks(RTChecks) {
460 // Query this against the original loop and save it here because the profile
461 // of the original loop header may change as the transformation happens.
462 OptForSizeBasedOnProfile = llvm::shouldOptimizeForSize(
463 OrigLoop->getHeader(), PSI, BFI, PGSOQueryType::IRPass);
464
465 if (MinProfitableTripCount.isZero())
466 this->MinProfitableTripCount = VecWidth;
467 else
468 this->MinProfitableTripCount = MinProfitableTripCount;
469 }
470
471 virtual ~InnerLoopVectorizer() = default;
472
473 /// Create a new empty loop that will contain vectorized instructions later
474 /// on, while the old loop will be used as the scalar remainder. Control flow
475 /// is generated around the vectorized (and scalar epilogue) loops consisting
476 /// of various checks and bypasses. Return the pre-header block of the new
477 /// loop and the start value for the canonical induction, if it is != 0. The
478 /// latter is the case when vectorizing the epilogue loop. In the case of
479 /// epilogue vectorization, this function is overridden to handle the more
480 /// complex control flow around the loops.
481 virtual std::pair<BasicBlock *, Value *> createVectorizedLoopSkeleton();
482
483 /// Fix the vectorized code, taking care of header phi's, live-outs, and more.
484 void fixVectorizedLoop(VPTransformState &State, VPlan &Plan);
485
486 // Return true if any runtime check is added.
487   bool areSafetyChecksAdded() { return AddedSafetyChecks; }
488
489 /// A type for vectorized values in the new loop. Each value from the
490 /// original loop, when vectorized, is represented by UF vector values in the
491 /// new unrolled loop, where UF is the unroll factor.
492 using VectorParts = SmallVector<Value *, 2>;
493
494 /// A helper function to scalarize a single Instruction in the innermost loop.
495 /// Generates a sequence of scalar instances for each lane between \p MinLane
496 /// and \p MaxLane, times each part between \p MinPart and \p MaxPart,
497 /// inclusive. Uses the VPValue operands from \p RepRecipe instead of \p
498 /// Instr's operands.
499 void scalarizeInstruction(const Instruction *Instr,
500 VPReplicateRecipe *RepRecipe,
501 const VPIteration &Instance, bool IfPredicateInstr,
502 VPTransformState &State);
503
504 /// Construct the vector value of a scalarized value \p V one lane at a time.
505 void packScalarIntoVectorValue(VPValue *Def, const VPIteration &Instance,
506 VPTransformState &State);
507
508 /// Try to vectorize interleaved access group \p Group with the base address
509 /// given in \p Addr, optionally masking the vector operations if \p
510 /// BlockInMask is non-null. Use \p State to translate given VPValues to IR
511 /// values in the vectorized loop.
512 void vectorizeInterleaveGroup(const InterleaveGroup<Instruction> *Group,
513 ArrayRef<VPValue *> VPDefs,
514 VPTransformState &State, VPValue *Addr,
515 ArrayRef<VPValue *> StoredValues,
516 VPValue *BlockInMask = nullptr);
517
518 /// Fix the non-induction PHIs in \p Plan.
519 void fixNonInductionPHIs(VPlan &Plan, VPTransformState &State);
520
521 /// Returns true if the reordering of FP operations is not allowed, but we are
522 /// able to vectorize with strict in-order reductions for the given RdxDesc.
523 bool useOrderedReductions(const RecurrenceDescriptor &RdxDesc);
524
525 /// Create a broadcast instruction. This method generates a broadcast
526 /// instruction (shuffle) for loop invariant values and for the induction
527 /// value. If this is the induction variable then we extend it to N, N+1, ...
528 /// this is needed because each iteration in the loop corresponds to a SIMD
529 /// element.
530 virtual Value *getBroadcastInstrs(Value *V);
531
532 // Returns the resume value (bc.merge.rdx) for a reduction as
533 // generated by fixReduction.
534 PHINode *getReductionResumeValue(const RecurrenceDescriptor &RdxDesc);
535
536 /// Create a new phi node for the induction variable \p OrigPhi to resume
537 /// iteration count in the scalar epilogue, from where the vectorized loop
538 /// left off. In cases where the loop skeleton is more complicated (e.g.
539 /// epilogue vectorization) and the resume values can come from an additional
540 /// bypass block, the \p AdditionalBypass pair provides information about the
541 /// bypass block and the end value on the edge from bypass to this loop.
542 PHINode *createInductionResumeValue(
543 PHINode *OrigPhi, const InductionDescriptor &ID,
544 ArrayRef<BasicBlock *> BypassBlocks,
545 std::pair<BasicBlock *, Value *> AdditionalBypass = {nullptr, nullptr});
546
547 protected:
548 friend class LoopVectorizationPlanner;
549
550 /// A small list of PHINodes.
551 using PhiVector = SmallVector<PHINode *, 4>;
552
553 /// A type for scalarized values in the new loop. Each value from the
554 /// original loop, when scalarized, is represented by UF x VF scalar values
555 /// in the new unrolled loop, where UF is the unroll factor and VF is the
556 /// vectorization factor.
557 using ScalarParts = SmallVector<SmallVector<Value *, 4>, 2>;
558
559 /// Set up the values of the IVs correctly when exiting the vector loop.
560 void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II,
561 Value *VectorTripCount, Value *EndValue,
562 BasicBlock *MiddleBlock, BasicBlock *VectorHeader,
563 VPlan &Plan);
564
565 /// Handle all cross-iteration phis in the header.
566 void fixCrossIterationPHIs(VPTransformState &State);
567
568 /// Create the exit value of first order recurrences in the middle block and
569 /// update their users.
570 void fixFixedOrderRecurrence(VPFirstOrderRecurrencePHIRecipe *PhiR,
571 VPTransformState &State);
572
573 /// Create code for the loop exit value of the reduction.
574 void fixReduction(VPReductionPHIRecipe *Phi, VPTransformState &State);
575
576 /// Clear NSW/NUW flags from reduction instructions if necessary.
577 void clearReductionWrapFlags(VPReductionPHIRecipe *PhiR,
578 VPTransformState &State);
579
580 /// Iteratively sink the scalarized operands of a predicated instruction into
581 /// the block that was created for it.
582 void sinkScalarOperands(Instruction *PredInst);
583
584 /// Shrinks vector element sizes to the smallest bitwidth they can be legally
585 /// represented as.
586 void truncateToMinimalBitwidths(VPTransformState &State);
587
588 /// Returns (and creates if needed) the original loop trip count.
589 Value *getOrCreateTripCount(BasicBlock *InsertBlock);
590
591 /// Returns (and creates if needed) the trip count of the widened loop.
592 Value *getOrCreateVectorTripCount(BasicBlock *InsertBlock);
593
594 /// Returns a bitcasted value to the requested vector type.
595 /// Also handles bitcasts of vector<float> <-> vector<pointer> types.
596 Value *createBitOrPointerCast(Value *V, VectorType *DstVTy,
597 const DataLayout &DL);
598
599 /// Emit a bypass check to see if the vector trip count is zero, including if
600 /// it overflows.
601 void emitIterationCountCheck(BasicBlock *Bypass);
602
603 /// Emit a bypass check to see if all of the SCEV assumptions we've
604 /// had to make are correct. Returns the block containing the checks or
605 /// nullptr if no checks have been added.
606 BasicBlock *emitSCEVChecks(BasicBlock *Bypass);
607
608 /// Emit bypass checks to check any memory assumptions we may have made.
609 /// Returns the block containing the checks or nullptr if no checks have been
610 /// added.
611 BasicBlock *emitMemRuntimeChecks(BasicBlock *Bypass);
612
613 /// Emit basic blocks (prefixed with \p Prefix) for the iteration check,
614 /// vector loop preheader, middle block and scalar preheader.
615 void createVectorLoopSkeleton(StringRef Prefix);
616
617 /// Create new phi nodes for the induction variables to resume iteration count
618 /// in the scalar epilogue, from where the vectorized loop left off.
619 /// In cases where the loop skeleton is more complicated (e.g. epilogue
620 /// vectorization) and the resume values can come from an additional bypass
621 /// block, the \p AdditionalBypass pair provides information about the bypass
622 /// block and the end value on the edge from bypass to this loop.
623 void createInductionResumeValues(
624 std::pair<BasicBlock *, Value *> AdditionalBypass = {nullptr, nullptr});
625
626 /// Complete the loop skeleton by adding debug MDs, creating appropriate
627 /// conditional branches in the middle block, preparing the builder and
628 /// running the verifier. Return the preheader of the completed vector loop.
629 BasicBlock *completeLoopSkeleton();
630
631 /// Collect poison-generating recipes that may generate a poison value that is
632 /// used after vectorization, even when their operands are not poison. Those
633 /// recipes meet the following conditions:
634 /// * Contribute to the address computation of a recipe generating a widen
635 /// memory load/store (VPWidenMemoryInstructionRecipe or
636 /// VPInterleaveRecipe).
637 /// * Such a widen memory load/store has at least one underlying Instruction
638 /// that is in a basic block that needs predication and after vectorization
639 /// the generated instruction won't be predicated.
640 void collectPoisonGeneratingRecipes(VPTransformState &State);
641
642 /// Allow subclasses to override and print debug traces before/after vplan
643 /// execution, when trace information is requested.
644   virtual void printDebugTracesAtStart(){};
645   virtual void printDebugTracesAtEnd(){};
646
647 /// The original loop.
648 Loop *OrigLoop;
649
650 /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies
651 /// dynamic knowledge to simplify SCEV expressions and converts them to a
652 /// more usable form.
653 PredicatedScalarEvolution &PSE;
654
655 /// Loop Info.
656 LoopInfo *LI;
657
658 /// Dominator Tree.
659 DominatorTree *DT;
660
661 /// Target Library Info.
662 const TargetLibraryInfo *TLI;
663
664 /// Target Transform Info.
665 const TargetTransformInfo *TTI;
666
667 /// Assumption Cache.
668 AssumptionCache *AC;
669
670 /// Interface to emit optimization remarks.
671 OptimizationRemarkEmitter *ORE;
672
673 /// The vectorization SIMD factor to use. Each vector will have this many
674 /// vector elements.
675 ElementCount VF;
676
677 ElementCount MinProfitableTripCount;
678
679 /// The vectorization unroll factor to use. Each scalar is vectorized to this
680 /// many different vector instructions.
681 unsigned UF;
682
683 /// The builder that we use
684 IRBuilder<> Builder;
685
686 // --- Vectorization state ---
687
688 /// The vector-loop preheader.
689 BasicBlock *LoopVectorPreHeader;
690
691 /// The scalar-loop preheader.
692 BasicBlock *LoopScalarPreHeader;
693
694 /// Middle Block between the vector and the scalar.
695 BasicBlock *LoopMiddleBlock;
696
697 /// The unique ExitBlock of the scalar loop if one exists. Note that
698 /// there can be multiple exiting edges reaching this block.
699 BasicBlock *LoopExitBlock;
700
701 /// The scalar loop body.
702 BasicBlock *LoopScalarBody;
703
704 /// A list of all bypass blocks. The first block is the entry of the loop.
705 SmallVector<BasicBlock *, 4> LoopBypassBlocks;
706
707 /// Store instructions that were predicated.
708 SmallVector<Instruction *, 4> PredicatedInstructions;
709
710 /// Trip count of the original loop.
711 Value *TripCount = nullptr;
712
713 /// Trip count of the widened loop (TripCount - TripCount % (VF*UF))
714 Value *VectorTripCount = nullptr;
715
716 /// The legality analysis.
717 LoopVectorizationLegality *Legal;
718
719   /// The profitability analysis.
720 LoopVectorizationCostModel *Cost;
721
722 // Record whether runtime checks are added.
723 bool AddedSafetyChecks = false;
724
725 // Holds the end values for each induction variable. We save the end values
726 // so we can later fix-up the external users of the induction variables.
727 DenseMap<PHINode *, Value *> IVEndValues;
728
729 /// BFI and PSI are used to check for profile guided size optimizations.
730 BlockFrequencyInfo *BFI;
731 ProfileSummaryInfo *PSI;
732
733 // Whether this loop should be optimized for size based on profile guided size
734   // optimizations.
735 bool OptForSizeBasedOnProfile;
736
737 /// Structure to hold information about generated runtime checks, responsible
738 /// for cleaning the checks, if vectorization turns out unprofitable.
739 GeneratedRTChecks &RTChecks;
740
741 // Holds the resume values for reductions in the loops, used to set the
742 // correct start value of reduction PHIs when vectorizing the epilogue.
743 SmallMapVector<const RecurrenceDescriptor *, PHINode *, 4>
744 ReductionResumeValues;
745 };
746
747 class InnerLoopUnroller : public InnerLoopVectorizer {
748 public:
749   InnerLoopUnroller(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
750 LoopInfo *LI, DominatorTree *DT,
751 const TargetLibraryInfo *TLI,
752 const TargetTransformInfo *TTI, AssumptionCache *AC,
753 OptimizationRemarkEmitter *ORE, unsigned UnrollFactor,
754 LoopVectorizationLegality *LVL,
755 LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
756 ProfileSummaryInfo *PSI, GeneratedRTChecks &Check)
757 : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
758 ElementCount::getFixed(1),
759 ElementCount::getFixed(1), UnrollFactor, LVL, CM,
760 BFI, PSI, Check) {}
761
762 private:
763 Value *getBroadcastInstrs(Value *V) override;
764 };
765
766 /// Encapsulate information regarding vectorization of a loop and its epilogue.
767 /// This information is meant to be updated and used across two stages of
768 /// epilogue vectorization.
769 struct EpilogueLoopVectorizationInfo {
770 ElementCount MainLoopVF = ElementCount::getFixed(0);
771 unsigned MainLoopUF = 0;
772 ElementCount EpilogueVF = ElementCount::getFixed(0);
773 unsigned EpilogueUF = 0;
774 BasicBlock *MainLoopIterationCountCheck = nullptr;
775 BasicBlock *EpilogueIterationCountCheck = nullptr;
776 BasicBlock *SCEVSafetyCheck = nullptr;
777 BasicBlock *MemSafetyCheck = nullptr;
778 Value *TripCount = nullptr;
779 Value *VectorTripCount = nullptr;
780
781   EpilogueLoopVectorizationInfo(ElementCount MVF, unsigned MUF,
782 ElementCount EVF, unsigned EUF)
783 : MainLoopVF(MVF), MainLoopUF(MUF), EpilogueVF(EVF), EpilogueUF(EUF) {
784 assert(EUF == 1 &&
785 "A high UF for the epilogue loop is likely not beneficial.");
786 }
787 };
788
789 /// An extension of the inner loop vectorizer that creates a skeleton for a
790 /// vectorized loop that has its epilogue (residual) also vectorized.
791 /// The idea is to run the vplan on a given loop twice, firstly to setup the
792 /// skeleton and vectorize the main loop, and secondly to complete the skeleton
793 /// from the first step and vectorize the epilogue. This is achieved by
794 /// deriving two concrete strategy classes from this base class and invoking
795 /// them in succession from the loop vectorizer planner.
796 class InnerLoopAndEpilogueVectorizer : public InnerLoopVectorizer {
797 public:
798   InnerLoopAndEpilogueVectorizer(
799 Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
800 DominatorTree *DT, const TargetLibraryInfo *TLI,
801 const TargetTransformInfo *TTI, AssumptionCache *AC,
802 OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
803 LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
804 BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
805 GeneratedRTChecks &Checks)
806 : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
807 EPI.MainLoopVF, EPI.MainLoopVF, EPI.MainLoopUF, LVL,
808 CM, BFI, PSI, Checks),
809 EPI(EPI) {}
810
811 // Override this function to handle the more complex control flow around the
812 // three loops.
813   std::pair<BasicBlock *, Value *> createVectorizedLoopSkeleton() final {
814 return createEpilogueVectorizedLoopSkeleton();
815 }
816
817 /// The interface for creating a vectorized skeleton using one of two
818 /// different strategies, each corresponding to one execution of the vplan
819 /// as described above.
820 virtual std::pair<BasicBlock *, Value *>
821 createEpilogueVectorizedLoopSkeleton() = 0;
822
823 /// Holds and updates state information required to vectorize the main loop
824 /// and its epilogue in two separate passes. This setup helps us avoid
825 /// regenerating and recomputing runtime safety checks. It also helps us to
826 /// shorten the iteration-count-check path length for the cases where the
827 /// iteration count of the loop is so small that the main vector loop is
828 /// completely skipped.
829 EpilogueLoopVectorizationInfo &EPI;
830 };
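// As a worked example of the scheme (the numbers are illustrative only): with
// a main loop of VF=8 and UF=2, each main-loop iteration consumes 16 elements;
// once fewer than 16 elements remain, a vectorized epilogue loop with, say,
// VF=4 consumes most of the remainder, and a final scalar epilogue handles the
// last few iterations.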
831
832 /// A specialized derived class of inner loop vectorizer that performs
833 /// vectorization of *main* loops in the process of vectorizing loops and their
834 /// epilogues.
835 class EpilogueVectorizerMainLoop : public InnerLoopAndEpilogueVectorizer {
836 public:
837   EpilogueVectorizerMainLoop(
838 Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
839 DominatorTree *DT, const TargetLibraryInfo *TLI,
840 const TargetTransformInfo *TTI, AssumptionCache *AC,
841 OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
842 LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
843 BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
844 GeneratedRTChecks &Check)
845 : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
846 EPI, LVL, CM, BFI, PSI, Check) {}
847 /// Implements the interface for creating a vectorized skeleton using the
848 /// *main loop* strategy (ie the first pass of vplan execution).
849 std::pair<BasicBlock *, Value *> createEpilogueVectorizedLoopSkeleton() final;
850
851 protected:
852 /// Emits an iteration count bypass check once for the main loop (when \p
853 /// ForEpilogue is false) and once for the epilogue loop (when \p
854 /// ForEpilogue is true).
855 BasicBlock *emitIterationCountCheck(BasicBlock *Bypass, bool ForEpilogue);
856 void printDebugTracesAtStart() override;
857 void printDebugTracesAtEnd() override;
858 };
859
860 // A specialized derived class of inner loop vectorizer that performs
861 // vectorization of *epilogue* loops in the process of vectorizing loops and
862 // their epilogues.
863 class EpilogueVectorizerEpilogueLoop : public InnerLoopAndEpilogueVectorizer {
864 public:
865   EpilogueVectorizerEpilogueLoop(
866 Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
867 DominatorTree *DT, const TargetLibraryInfo *TLI,
868 const TargetTransformInfo *TTI, AssumptionCache *AC,
869 OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
870 LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
871 BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
872 GeneratedRTChecks &Checks)
873 : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
874 EPI, LVL, CM, BFI, PSI, Checks) {
875 TripCount = EPI.TripCount;
876 }
877 /// Implements the interface for creating a vectorized skeleton using the
878 /// *epilogue loop* strategy (ie the second pass of vplan execution).
879 std::pair<BasicBlock *, Value *> createEpilogueVectorizedLoopSkeleton() final;
880
881 protected:
882 /// Emits an iteration count bypass check after the main vector loop has
883 /// finished to see if there are any iterations left to execute by either
884 /// the vector epilogue or the scalar epilogue.
885 BasicBlock *emitMinimumVectorEpilogueIterCountCheck(
886 BasicBlock *Bypass,
887 BasicBlock *Insert);
888 void printDebugTracesAtStart() override;
889 void printDebugTracesAtEnd() override;
890 };
891 } // end namespace llvm
892
893 /// Look for a meaningful debug location on the instruction or its
894 /// operands.
895 static Instruction *getDebugLocFromInstOrOperands(Instruction *I) {
896 if (!I)
897 return I;
898
899 DebugLoc Empty;
900 if (I->getDebugLoc() != Empty)
901 return I;
902
903 for (Use &Op : I->operands()) {
904 if (Instruction *OpInst = dyn_cast<Instruction>(Op))
905 if (OpInst->getDebugLoc() != Empty)
906 return OpInst;
907 }
908
909 return I;
910 }
911
912 /// Write a \p DebugMsg about vectorization to the debug output stream. If \p I
913 /// is passed, the message relates to that particular instruction.
914 #ifndef NDEBUG
915 static void debugVectorizationMessage(const StringRef Prefix,
916 const StringRef DebugMsg,
917 Instruction *I) {
918 dbgs() << "LV: " << Prefix << DebugMsg;
919 if (I != nullptr)
920 dbgs() << " " << *I;
921 else
922 dbgs() << '.';
923 dbgs() << '\n';
924 }
925 #endif
926
927 /// Create an analysis remark that explains why vectorization failed
928 ///
929 /// \p PassName is the name of the pass (e.g. can be AlwaysPrint). \p
930 /// RemarkName is the identifier for the remark. If \p I is passed it is an
931 /// instruction that prevents vectorization. Otherwise \p TheLoop is used for
932 /// the location of the remark. \return the remark object that can be
933 /// streamed to.
934 static OptimizationRemarkAnalysis createLVAnalysis(const char *PassName,
935 StringRef RemarkName, Loop *TheLoop, Instruction *I) {
936 Value *CodeRegion = TheLoop->getHeader();
937 DebugLoc DL = TheLoop->getStartLoc();
938
939 if (I) {
940 CodeRegion = I->getParent();
941     // If there is no debug location attached to the instruction, revert to
942 // using the loop's.
943 if (I->getDebugLoc())
944 DL = I->getDebugLoc();
945 }
946
947 return OptimizationRemarkAnalysis(PassName, RemarkName, DL, CodeRegion);
948 }
949
950 namespace llvm {
951
952 /// Return a value for Step multiplied by VF.
953 Value *createStepForVF(IRBuilderBase &B, Type *Ty, ElementCount VF,
954 int64_t Step) {
955 assert(Ty->isIntegerTy() && "Expected an integer step");
956 Constant *StepVal = ConstantInt::get(Ty, Step * VF.getKnownMinValue());
957 return VF.isScalable() ? B.CreateVScale(StepVal) : StepVal;
958 }
959
960 /// Return the runtime value for VF.
961 Value *getRuntimeVF(IRBuilderBase &B, Type *Ty, ElementCount VF) {
962 Constant *EC = ConstantInt::get(Ty, VF.getKnownMinValue());
963 return VF.isScalable() ? B.CreateVScale(EC) : EC;
964 }
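// Worked example (illustrative): for a fixed VF of 4 and Step = 2,
// createStepForVF returns the constant 8; for a scalable VF of 4 it returns
// 8 * vscale, materialized through the llvm.vscale intrinsic. getRuntimeVF
// behaves the same way with an implicit Step of 1.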
965
966 const SCEV *createTripCountSCEV(Type *IdxTy, PredicatedScalarEvolution &PSE) {
967 const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount();
968 assert(!isa<SCEVCouldNotCompute>(BackedgeTakenCount) && "Invalid loop count");
969
970 ScalarEvolution &SE = *PSE.getSE();
971
972 // The exit count might have the type of i64 while the phi is i32. This can
973 // happen if we have an induction variable that is sign extended before the
974 // compare. The only way that we get a backedge taken count is that the
975 // induction variable was signed and as such will not overflow. In such a case
976 // truncation is legal.
977 if (SE.getTypeSizeInBits(BackedgeTakenCount->getType()) >
978 IdxTy->getPrimitiveSizeInBits())
979 BackedgeTakenCount = SE.getTruncateOrNoop(BackedgeTakenCount, IdxTy);
980 BackedgeTakenCount = SE.getNoopOrZeroExtend(BackedgeTakenCount, IdxTy);
981
982 // Get the total trip count from the count by adding 1.
983 return SE.getAddExpr(BackedgeTakenCount,
984 SE.getOne(BackedgeTakenCount->getType()));
985 }
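// For example (illustrative): a loop whose header executes 10 times has a
// backedge-taken count of 9, so the trip count computed here is 9 + 1 = 10,
// after the count is truncated or zero-extended to the requested type IdxTy.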
986
987 static Value *getRuntimeVFAsFloat(IRBuilderBase &B, Type *FTy,
988 ElementCount VF) {
989 assert(FTy->isFloatingPointTy() && "Expected floating point type!");
990 Type *IntTy = IntegerType::get(FTy->getContext(), FTy->getScalarSizeInBits());
991 Value *RuntimeVF = getRuntimeVF(B, IntTy, VF);
992 return B.CreateUIToFP(RuntimeVF, FTy);
993 }
994
995 void reportVectorizationFailure(const StringRef DebugMsg,
996 const StringRef OREMsg, const StringRef ORETag,
997 OptimizationRemarkEmitter *ORE, Loop *TheLoop,
998 Instruction *I) {
999 LLVM_DEBUG(debugVectorizationMessage("Not vectorizing: ", DebugMsg, I));
1000 LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
1001 ORE->emit(
1002 createLVAnalysis(Hints.vectorizeAnalysisPassName(), ORETag, TheLoop, I)
1003 << "loop not vectorized: " << OREMsg);
1004 }
1005
1006 void reportVectorizationInfo(const StringRef Msg, const StringRef ORETag,
1007 OptimizationRemarkEmitter *ORE, Loop *TheLoop,
1008 Instruction *I) {
1009 LLVM_DEBUG(debugVectorizationMessage("", Msg, I));
1010 LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
1011 ORE->emit(
1012 createLVAnalysis(Hints.vectorizeAnalysisPassName(), ORETag, TheLoop, I)
1013 << Msg);
1014 }
1015
1016 } // end namespace llvm
1017
1018 #ifndef NDEBUG
1019 /// \return string containing a file name and a line # for the given loop.
1020 static std::string getDebugLocString(const Loop *L) {
1021 std::string Result;
1022 if (L) {
1023 raw_string_ostream OS(Result);
1024 if (const DebugLoc LoopDbgLoc = L->getStartLoc())
1025 LoopDbgLoc.print(OS);
1026 else
1027 // Just print the module name.
1028 OS << L->getHeader()->getParent()->getParent()->getModuleIdentifier();
1029 OS.flush();
1030 }
1031 return Result;
1032 }
1033 #endif
1034
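// Illustrative scenario (a sketch, not tied to specific IR): a
// 'getelementptr inbounds' that feeds the address of a load executed only
// under a condition in the scalar loop may, after vectorization, feed an
// unconditional consecutive wide load. The 'inbounds' flag could then turn
// lanes that were never executed in the original loop into poison, so the
// recipes in that address computation are recorded and their
// poison-generating flags can later be dropped.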
1035 void InnerLoopVectorizer::collectPoisonGeneratingRecipes(
1036 VPTransformState &State) {
1037
1038 // Collect recipes in the backward slice of `Root` that may generate a poison
1039 // value that is used after vectorization.
1040 SmallPtrSet<VPRecipeBase *, 16> Visited;
1041 auto collectPoisonGeneratingInstrsInBackwardSlice([&](VPRecipeBase *Root) {
1042 SmallVector<VPRecipeBase *, 16> Worklist;
1043 Worklist.push_back(Root);
1044
1045 // Traverse the backward slice of Root through its use-def chain.
1046 while (!Worklist.empty()) {
1047 VPRecipeBase *CurRec = Worklist.back();
1048 Worklist.pop_back();
1049
1050 if (!Visited.insert(CurRec).second)
1051 continue;
1052
1053 // Prune search if we find another recipe generating a widen memory
1054 // instruction. Widen memory instructions involved in address computation
1055 // will lead to gather/scatter instructions, which don't need to be
1056 // handled.
1057 if (isa<VPWidenMemoryInstructionRecipe>(CurRec) ||
1058 isa<VPInterleaveRecipe>(CurRec) ||
1059 isa<VPScalarIVStepsRecipe>(CurRec) ||
1060 isa<VPCanonicalIVPHIRecipe>(CurRec) ||
1061 isa<VPActiveLaneMaskPHIRecipe>(CurRec))
1062 continue;
1063
1064 // This recipe contributes to the address computation of a widen
1065 // load/store. Collect recipe if its underlying instruction has
1066 // poison-generating flags.
1067 Instruction *Instr = CurRec->getUnderlyingInstr();
1068 if (Instr && Instr->hasPoisonGeneratingFlags())
1069 State.MayGeneratePoisonRecipes.insert(CurRec);
1070
1071 // Add new definitions to the worklist.
1072 for (VPValue *operand : CurRec->operands())
1073 if (VPRecipeBase *OpDef = operand->getDefiningRecipe())
1074 Worklist.push_back(OpDef);
1075 }
1076 });
1077
1078 // Traverse all the recipes in the VPlan and collect the poison-generating
1079 // recipes in the backward slice starting at the address of a
1080 // VPWidenMemoryInstructionRecipe or VPInterleaveRecipe.
1081 auto Iter = vp_depth_first_deep(State.Plan->getEntry());
1082 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) {
1083 for (VPRecipeBase &Recipe : *VPBB) {
1084 if (auto *WidenRec = dyn_cast<VPWidenMemoryInstructionRecipe>(&Recipe)) {
1085 Instruction &UnderlyingInstr = WidenRec->getIngredient();
1086 VPRecipeBase *AddrDef = WidenRec->getAddr()->getDefiningRecipe();
1087 if (AddrDef && WidenRec->isConsecutive() &&
1088 Legal->blockNeedsPredication(UnderlyingInstr.getParent()))
1089 collectPoisonGeneratingInstrsInBackwardSlice(AddrDef);
1090 } else if (auto *InterleaveRec = dyn_cast<VPInterleaveRecipe>(&Recipe)) {
1091 VPRecipeBase *AddrDef = InterleaveRec->getAddr()->getDefiningRecipe();
1092 if (AddrDef) {
1093 // Check if any member of the interleave group needs predication.
1094 const InterleaveGroup<Instruction> *InterGroup =
1095 InterleaveRec->getInterleaveGroup();
1096 bool NeedPredication = false;
1097 for (int I = 0, NumMembers = InterGroup->getNumMembers();
1098 I < NumMembers; ++I) {
1099 Instruction *Member = InterGroup->getMember(I);
1100 if (Member)
1101 NeedPredication |=
1102 Legal->blockNeedsPredication(Member->getParent());
1103 }
1104
1105 if (NeedPredication)
1106 collectPoisonGeneratingInstrsInBackwardSlice(AddrDef);
1107 }
1108 }
1109 }
1110 }
1111 }
1112
1113 PHINode *InnerLoopVectorizer::getReductionResumeValue(
1114 const RecurrenceDescriptor &RdxDesc) {
1115 auto It = ReductionResumeValues.find(&RdxDesc);
1116 assert(It != ReductionResumeValues.end() &&
1117 "Expected to find a resume value for the reduction.");
1118 return It->second;
1119 }
1120
1121 namespace llvm {
1122
1123 // Loop vectorization cost-model hints how the scalar epilogue loop should be
1124 // lowered.
1125 enum ScalarEpilogueLowering {
1126
1127 // The default: allowing scalar epilogues.
1128 CM_ScalarEpilogueAllowed,
1129
1130 // Vectorization with OptForSize: don't allow epilogues.
1131 CM_ScalarEpilogueNotAllowedOptSize,
1132
1133 // A special case of vectorisation with OptForSize: loops with a very small
1134 // trip count are considered for vectorization under OptForSize, thereby
1135 // making sure the cost of their loop body is dominant, free of runtime
1136 // guards and scalar iteration overheads.
1137 CM_ScalarEpilogueNotAllowedLowTripLoop,
1138
1139 // Loop hint predicate indicating an epilogue is undesired.
1140 CM_ScalarEpilogueNotNeededUsePredicate,
1141
1142 // Directive indicating we must either tail fold or not vectorize
1143 CM_ScalarEpilogueNotAllowedUsePredicate
1144 };
1145
1146 /// ElementCountComparator creates a total ordering for ElementCount
1147 /// for the purposes of using it in a set structure.
1148 struct ElementCountComparator {
1149   bool operator()(const ElementCount &LHS, const ElementCount &RHS) const {
1150 return std::make_tuple(LHS.isScalable(), LHS.getKnownMinValue()) <
1151 std::make_tuple(RHS.isScalable(), RHS.getKnownMinValue());
1152 }
1153 };
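// For example, under this ordering all fixed VFs precede all scalable VFs:
// 2 < 4 < 8 < vscale x 2 < vscale x 4, because the comparison key is the pair
// (isScalable(), KnownMinValue).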
1154 using ElementCountSet = SmallSet<ElementCount, 16, ElementCountComparator>;
1155
1156 /// LoopVectorizationCostModel - estimates the expected speedups due to
1157 /// vectorization.
1158 /// In many cases vectorization is not profitable. This can happen because of
1159 /// a number of reasons. In this class we mainly attempt to predict the
1160 /// expected speedup/slowdowns due to the supported instruction set. We use the
1161 /// TargetTransformInfo to query the different backends for the cost of
1162 /// different operations.
1163 class LoopVectorizationCostModel {
1164 public:
1165   LoopVectorizationCostModel(ScalarEpilogueLowering SEL, Loop *L,
1166 PredicatedScalarEvolution &PSE, LoopInfo *LI,
1167 LoopVectorizationLegality *Legal,
1168 const TargetTransformInfo &TTI,
1169 const TargetLibraryInfo *TLI, DemandedBits *DB,
1170 AssumptionCache *AC,
1171 OptimizationRemarkEmitter *ORE, const Function *F,
1172 const LoopVectorizeHints *Hints,
1173 InterleavedAccessInfo &IAI)
1174 : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal),
1175 TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F),
1176 Hints(Hints), InterleaveInfo(IAI) {}
1177
1178 /// \return An upper bound for the vectorization factors (both fixed and
1179 /// scalable). If the factors are 0, vectorization and interleaving should be
1180 /// avoided up front.
1181 FixedScalableVFPair computeMaxVF(ElementCount UserVF, unsigned UserIC);
1182
1183 /// \return True if runtime checks are required for vectorization, and false
1184 /// otherwise.
1185 bool runtimeChecksRequired();
1186
1187 /// \return The most profitable vectorization factor and the cost of that VF.
1188 /// This method checks every VF in \p CandidateVFs. If UserVF is not ZERO
1189 /// then this vectorization factor will be selected if vectorization is
1190 /// possible.
1191 VectorizationFactor
1192 selectVectorizationFactor(const ElementCountSet &CandidateVFs);
1193
1194 VectorizationFactor
1195 selectEpilogueVectorizationFactor(const ElementCount MaxVF,
1196 const LoopVectorizationPlanner &LVP);
1197
1198 /// Setup cost-based decisions for user vectorization factor.
1199 /// \return true if the UserVF is a feasible VF to be chosen.
1200   bool selectUserVectorizationFactor(ElementCount UserVF) {
1201 collectUniformsAndScalars(UserVF);
1202 collectInstsToScalarize(UserVF);
1203 return expectedCost(UserVF).first.isValid();
1204 }
1205
1206 /// \return The size (in bits) of the smallest and widest types in the code
1207 /// that needs to be vectorized. We ignore values that remain scalar such as
1208 /// 64 bit loop indices.
1209 std::pair<unsigned, unsigned> getSmallestAndWidestTypes();
1210
1211 /// \return The desired interleave count.
1212 /// If interleave count has been specified by metadata it will be returned.
1213 /// Otherwise, the interleave count is computed and returned. VF and LoopCost
1214 /// are the selected vectorization factor and the cost of the selected VF.
1215 unsigned selectInterleaveCount(ElementCount VF, InstructionCost LoopCost);
1216
1217 /// A memory access instruction may be vectorized in more than one way.
1218 /// The form of the instruction after vectorization depends on cost.
1219 /// This function makes cost-based decisions for Load/Store instructions
1220 /// and collects them in a map. This decision map is used for building
1221 /// the lists of loop-uniform and loop-scalar instructions.
1222 /// The calculated cost is saved with widening decision in order to
1223 /// avoid redundant calculations.
1224 void setCostBasedWideningDecision(ElementCount VF);
1225
1226 /// A struct that represents some properties of the register usage
1227 /// of a loop.
1228 struct RegisterUsage {
1229 /// Holds the number of loop invariant values that are used in the loop.
1230 /// The key is ClassID of target-provided register class.
1231 SmallMapVector<unsigned, unsigned, 4> LoopInvariantRegs;
1232 /// Holds the maximum number of concurrent live intervals in the loop.
1233 /// The key is ClassID of target-provided register class.
1234 SmallMapVector<unsigned, unsigned, 4> MaxLocalUsers;
1235 };
1236
1237 /// \return Returns information about the register usages of the loop for the
1238 /// given vectorization factors.
1239 SmallVector<RegisterUsage, 8>
1240 calculateRegisterUsage(ArrayRef<ElementCount> VFs);
1241
1242 /// Collect values we want to ignore in the cost model.
1243 void collectValuesToIgnore();
1244
1245 /// Collect all element types in the loop for which widening is needed.
1246 void collectElementTypesForWidening();
1247
1248 /// Split reductions into those that happen in the loop, and those that happen
1249 /// outside. In-loop reductions are collected into InLoopReductionChains.
1250 void collectInLoopReductions();
1251
1252 /// Returns true if we should use strict in-order reductions for the given
1253 /// RdxDesc. This is true if the -enable-strict-reductions flag is passed,
1254 /// the IsOrdered flag of RdxDesc is set and we do not allow reordering
1255 /// of FP operations.
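/// Illustrative example (assumed C source, not from this file): a strict
/// floating-point sum such as
/// \code
///   float S = 0.0f;
///   for (int I = 0; I < N; ++I)
///     S += A[I]; // no fast-math reassociation allowed
/// \endcode
/// has an ordered recurrence, so unless the loop hints permit reordering this
/// returns true and the additions are kept in their original sequential order.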
1256   bool useOrderedReductions(const RecurrenceDescriptor &RdxDesc) const {
1257 return !Hints->allowReordering() && RdxDesc.isOrdered();
1258 }
1259
1260 /// \returns The smallest bitwidth each instruction can be represented with.
1261 /// The vector equivalents of these instructions should be truncated to this
1262 /// type.
1263   const MapVector<Instruction *, uint64_t> &getMinimalBitwidths() const {
1264 return MinBWs;
1265 }
1266
1267 /// \returns True if it is more profitable to scalarize instruction \p I for
1268 /// vectorization factor \p VF.
1269   bool isProfitableToScalarize(Instruction *I, ElementCount VF) const {
1270 assert(VF.isVector() &&
1271 "Profitable to scalarize relevant only for VF > 1.");
1272
1273 // Cost model is not run in the VPlan-native path - return conservative
1274 // result until this changes.
1275 if (EnableVPlanNativePath)
1276 return false;
1277
1278 auto Scalars = InstsToScalarize.find(VF);
1279 assert(Scalars != InstsToScalarize.end() &&
1280 "VF not yet analyzed for scalarization profitability");
1281 return Scalars->second.find(I) != Scalars->second.end();
1282 }
1283
1284 /// Returns true if \p I is known to be uniform after vectorization.
1285   bool isUniformAfterVectorization(Instruction *I, ElementCount VF) const {
1286 if (VF.isScalar())
1287 return true;
1288
1289 // Cost model is not run in the VPlan-native path - return conservative
1290 // result until this changes.
1291 if (EnableVPlanNativePath)
1292 return false;
1293
1294 auto UniformsPerVF = Uniforms.find(VF);
1295 assert(UniformsPerVF != Uniforms.end() &&
1296 "VF not yet analyzed for uniformity");
1297 return UniformsPerVF->second.count(I);
1298 }
1299
1300 /// Returns true if \p I is known to be scalar after vectorization.
1301   bool isScalarAfterVectorization(Instruction *I, ElementCount VF) const {
1302 if (VF.isScalar())
1303 return true;
1304
1305 // Cost model is not run in the VPlan-native path - return conservative
1306 // result until this changes.
1307 if (EnableVPlanNativePath)
1308 return false;
1309
1310 auto ScalarsPerVF = Scalars.find(VF);
1311 assert(ScalarsPerVF != Scalars.end() &&
1312 "Scalar values are not calculated for VF");
1313 return ScalarsPerVF->second.count(I);
1314 }
1315
1316 /// \returns True if instruction \p I can be truncated to a smaller bitwidth
1317 /// for vectorization factor \p VF.
1318   bool canTruncateToMinimalBitwidth(Instruction *I, ElementCount VF) const {
1319 return VF.isVector() && MinBWs.find(I) != MinBWs.end() &&
1320 !isProfitableToScalarize(I, VF) &&
1321 !isScalarAfterVectorization(I, VF);
1322 }
1323
1324 /// Decision that was taken during cost calculation for memory instruction.
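/// Illustrative mapping (a sketch, not exhaustive):
/// \code
///   x = A[i];     // stride +1 -> CM_Widen
///   y = B[N - i]; // stride -1 -> CM_Widen_Reverse
///   z = C[2 * i]; // strided   -> CM_Interleave, CM_GatherScatter or
///                 //              CM_Scalarize, whichever is cheapest
/// \endcode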
1325 enum InstWidening {
1326 CM_Unknown,
1327 CM_Widen, // For consecutive accesses with stride +1.
1328 CM_Widen_Reverse, // For consecutive accesses with stride -1.
1329 CM_Interleave,
1330 CM_GatherScatter,
1331 CM_Scalarize
1332 };
1333
1334 /// Save vectorization decision \p W and \p Cost taken by the cost model for
1335 /// instruction \p I and vector width \p VF.
1336   void setWideningDecision(Instruction *I, ElementCount VF, InstWidening W,
1337 InstructionCost Cost) {
1338 assert(VF.isVector() && "Expected VF >=2");
1339 WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
1340 }
1341
1342 /// Save vectorization decision \p W and \p Cost taken by the cost model for
1343 /// interleaving group \p Grp and vector width \p VF.
1344   void setWideningDecision(const InterleaveGroup<Instruction> *Grp,
1345 ElementCount VF, InstWidening W,
1346 InstructionCost Cost) {
1347 assert(VF.isVector() && "Expected VF >=2");
1348 /// Broadcast this decision to all instructions inside the group.
1349 /// But the cost will be assigned to one instruction only.
1350 for (unsigned i = 0; i < Grp->getFactor(); ++i) {
1351 if (auto *I = Grp->getMember(i)) {
1352 if (Grp->getInsertPos() == I)
1353 WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
1354 else
1355 WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, 0);
1356 }
1357 }
1358 }
1359
1360 /// Return the cost model decision for the given instruction \p I and vector
1361 /// width \p VF. Return CM_Unknown if this instruction did not pass
1362 /// through the cost modeling.
1363   InstWidening getWideningDecision(Instruction *I, ElementCount VF) const {
1364 assert(VF.isVector() && "Expected VF to be a vector VF");
1365 // Cost model is not run in the VPlan-native path - return conservative
1366 // result until this changes.
1367 if (EnableVPlanNativePath)
1368 return CM_GatherScatter;
1369
1370 std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
1371 auto Itr = WideningDecisions.find(InstOnVF);
1372 if (Itr == WideningDecisions.end())
1373 return CM_Unknown;
1374 return Itr->second.first;
1375 }
1376
1377 /// Return the vectorization cost for the given instruction \p I and vector
1378 /// width \p VF.
1379   InstructionCost getWideningCost(Instruction *I, ElementCount VF) {
1380 assert(VF.isVector() && "Expected VF >=2");
1381 std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
1382 assert(WideningDecisions.find(InstOnVF) != WideningDecisions.end() &&
1383 "The cost is not calculated");
1384 return WideningDecisions[InstOnVF].second;
1385 }
1386
1387 /// Return True if instruction \p I is an optimizable truncate whose operand
1388 /// is an induction variable. Such a truncate will be removed by adding a new
1389 /// induction variable with the destination type.
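/// Illustrative IR sketch (hypothetical names): with a primary i64 induction
/// \code
///   %iv  = phi i64 [ 0, %ph ], [ %iv.next, %loop ]
///   %idx = trunc i64 %iv to i32   ; optimizable truncate
///   %use = add i32 %idx, 1
/// \endcode
/// the truncate is removed by introducing a new i32 induction variable.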
1390   bool isOptimizableIVTruncate(Instruction *I, ElementCount VF) {
1391 // If the instruction is not a truncate, return false.
1392 auto *Trunc = dyn_cast<TruncInst>(I);
1393 if (!Trunc)
1394 return false;
1395
1396 // Get the source and destination types of the truncate.
1397 Type *SrcTy = ToVectorTy(cast<CastInst>(I)->getSrcTy(), VF);
1398 Type *DestTy = ToVectorTy(cast<CastInst>(I)->getDestTy(), VF);
1399
1400 // If the truncate is free for the given types, return false. Replacing a
1401 // free truncate with an induction variable would add an induction variable
1402 // update instruction to each iteration of the loop. We exclude from this
1403 // check the primary induction variable since it will need an update
1404 // instruction regardless.
1405 Value *Op = Trunc->getOperand(0);
1406 if (Op != Legal->getPrimaryInduction() && TTI.isTruncateFree(SrcTy, DestTy))
1407 return false;
1408
1409 // If the truncated value is not an induction variable, return false.
1410 return Legal->isInductionPhi(Op);
1411 }
1412
1413 /// Collects the instructions to scalarize for each predicated instruction in
1414 /// the loop.
1415 void collectInstsToScalarize(ElementCount VF);
1416
1417 /// Collect Uniform and Scalar values for the given \p VF.
1418 /// The sets depend on CM decision for Load/Store instructions
1419 /// that may be vectorized as interleave, gather-scatter or scalarized.
1420   void collectUniformsAndScalars(ElementCount VF) {
1421 // Do the analysis once.
1422 if (VF.isScalar() || Uniforms.find(VF) != Uniforms.end())
1423 return;
1424 setCostBasedWideningDecision(VF);
1425 collectLoopUniforms(VF);
1426 collectLoopScalars(VF);
1427 }
1428
1429 /// Returns true if the target machine supports masked store operation
1430 /// for the given \p DataType and kind of access to \p Ptr.
1431   bool isLegalMaskedStore(Type *DataType, Value *Ptr, Align Alignment) const {
1432 return Legal->isConsecutivePtr(DataType, Ptr) &&
1433 TTI.isLegalMaskedStore(DataType, Alignment);
1434 }
1435
1436 /// Returns true if the target machine supports masked load operation
1437 /// for the given \p DataType and kind of access to \p Ptr.
1438   bool isLegalMaskedLoad(Type *DataType, Value *Ptr, Align Alignment) const {
1439 return Legal->isConsecutivePtr(DataType, Ptr) &&
1440 TTI.isLegalMaskedLoad(DataType, Alignment);
1441 }
1442
1443 /// Returns true if the target machine can represent \p V as a masked gather
1444 /// or scatter operation.
1445   bool isLegalGatherOrScatter(Value *V,
1446 ElementCount VF = ElementCount::getFixed(1)) {
1447 bool LI = isa<LoadInst>(V);
1448 bool SI = isa<StoreInst>(V);
1449 if (!LI && !SI)
1450 return false;
1451 auto *Ty = getLoadStoreType(V);
1452 Align Align = getLoadStoreAlignment(V);
1453 if (VF.isVector())
1454 Ty = VectorType::get(Ty, VF);
1455 return (LI && TTI.isLegalMaskedGather(Ty, Align)) ||
1456 (SI && TTI.isLegalMaskedScatter(Ty, Align));
1457 }
1458
1459 /// Returns true if the target machine supports all of the reduction
1460 /// variables found for the given VF.
1461   bool canVectorizeReductions(ElementCount VF) const {
1462 return (all_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool {
1463 const RecurrenceDescriptor &RdxDesc = Reduction.second;
1464 return TTI.isLegalToVectorizeReduction(RdxDesc, VF);
1465 }));
1466 }
1467
1468 /// Given costs for both strategies, return true if the scalar predication
1469 /// lowering should be used for div/rem. This incorporates an override
1470 /// option so it is not simply a cost comparison.
1471   bool isDivRemScalarWithPredication(InstructionCost ScalarCost,
1472 InstructionCost SafeDivisorCost) const {
1473 switch (ForceSafeDivisor) {
1474 case cl::BOU_UNSET:
1475 return ScalarCost < SafeDivisorCost;
1476 case cl::BOU_TRUE:
1477 return false;
1478 case cl::BOU_FALSE:
1479 return true;
1480 };
1481 llvm_unreachable("impossible case value");
1482 }
1483
1484 /// Returns true if \p I is an instruction which requires predication and
1485 /// for which our chosen predication strategy is scalarization (i.e. we
1486 /// don't have an alternate strategy such as masking available).
1487 /// \p VF is the vectorization factor that will be used to vectorize \p I.
1488 bool isScalarWithPredication(Instruction *I, ElementCount VF) const;
1489
1490 /// Returns true if \p I is an instruction that needs to be predicated
1491 /// at runtime. The result is independent of the predication mechanism.
1492 /// Superset of instructions that return true for isScalarWithPredication.
1493 bool isPredicatedInst(Instruction *I) const;
1494
1495 /// Return the costs for our two available strategies for lowering a
1496 /// div/rem operation which requires speculating at least one lane.
1497 /// First result is for scalarization (will be invalid for scalable
1498 /// vectors); second is for the safe-divisor strategy.
1499 std::pair<InstructionCost, InstructionCost>
1500 getDivRemSpeculationCost(Instruction *I,
1501 ElementCount VF) const;
1502
1503 /// Returns true if \p I is a memory instruction with consecutive memory
1504 /// access that can be widened.
1505 bool memoryInstructionCanBeWidened(Instruction *I, ElementCount VF);
1506
1507 /// Returns true if \p I is a memory instruction in an interleaved-group
1508 /// of memory accesses that can be vectorized with wide vector loads/stores
1509 /// and shuffles.
1510 bool interleavedAccessCanBeWidened(Instruction *I, ElementCount VF);
1511
1512 /// Check if \p Instr belongs to any interleaved access group.
1513   bool isAccessInterleaved(Instruction *Instr) {
1514 return InterleaveInfo.isInterleaved(Instr);
1515 }
1516
1517 /// Get the interleaved access group that \p Instr belongs to.
1518 const InterleaveGroup<Instruction> *
1519   getInterleavedAccessGroup(Instruction *Instr) {
1520 return InterleaveInfo.getInterleaveGroup(Instr);
1521 }
1522
1523 /// Returns true if we're required to use a scalar epilogue for at least
1524 /// the final iteration of the original loop.
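/// Illustrative case (a sketch): a loop that only reads A[2 * i] forms an
/// interleave group with a gap,
/// \code
///   for (int i = 0; i < n; ++i)
///     Sum += A[2 * i]; // a widened load would also touch A[2 * i + 1]
/// \endcode
/// so reading past the last element is avoided by running the final
/// iteration(s) in the scalar epilogue.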
1525   bool requiresScalarEpilogue(ElementCount VF) const {
1526 if (!isScalarEpilogueAllowed())
1527 return false;
1528 // If we might exit from anywhere but the latch, we must run the exiting
1529 // iteration in scalar form.
1530 if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch())
1531 return true;
1532 return VF.isVector() && InterleaveInfo.requiresScalarEpilogue();
1533 }
1534
1535 /// Returns true if a scalar epilogue is not allowed due to optsize or a
1536 /// loop hint annotation.
1537   bool isScalarEpilogueAllowed() const {
1538 return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed;
1539 }
1540
1541 /// Returns true if all loop blocks should be masked to fold tail loop.
1542   bool foldTailByMasking() const { return FoldTailByMasking; }
1543
1544 /// Returns true if we're tail-folding and want to use the active lane mask
1545 /// for vector loop control flow.
1546   bool useActiveLaneMaskForControlFlow() const {
1547 return FoldTailByMasking &&
1548 TTI.emitGetActiveLaneMask() == PredicationStyle::DataAndControlFlow;
1549 }
1550
1551 /// Returns true if the instructions in this block require predication
1552 /// for any reason, e.g. because tail folding now requires a predicate
1553 /// or because the block in the original loop was predicated.
1554   bool blockNeedsPredicationForAnyReason(BasicBlock *BB) const {
1555 return foldTailByMasking() || Legal->blockNeedsPredication(BB);
1556 }
1557
1558 /// A SmallMapVector to store the InLoop reduction op chains, mapping phi
1559 /// nodes to the chain of instructions representing the reductions. Uses a
1560 /// MapVector to ensure deterministic iteration order.
1561 using ReductionChainMap =
1562 SmallMapVector<PHINode *, SmallVector<Instruction *, 4>, 4>;
1563
1564 /// Return the chain of instructions representing an inloop reduction.
1565   const ReductionChainMap &getInLoopReductionChains() const {
1566 return InLoopReductionChains;
1567 }
1568
1569 /// Returns true if the Phi is part of an inloop reduction.
1570   bool isInLoopReduction(PHINode *Phi) const {
1571 return InLoopReductionChains.count(Phi);
1572 }
1573
1574 /// Estimate cost of an intrinsic call instruction CI if it were vectorized
1575 /// with factor VF. Return the cost of the instruction, including
1576 /// scalarization overhead if it's needed.
1577 InstructionCost getVectorIntrinsicCost(CallInst *CI, ElementCount VF) const;
1578
1579 /// Estimate cost of a call instruction CI if it were vectorized with factor
1580 /// VF. Return the cost of the instruction, including scalarization overhead
1581 /// if it's needed. The flag NeedToScalarize shows if the call needs to be
1582 /// scalarized -
1583 /// i.e. either a vector version isn't available, or it is too expensive.
1584 InstructionCost getVectorCallCost(CallInst *CI, ElementCount VF,
1585 bool &NeedToScalarize) const;
1586
1587 /// Returns true if the per-lane cost of VectorizationFactor A is lower than
1588 /// that of B.
1589 bool isMoreProfitable(const VectorizationFactor &A,
1590 const VectorizationFactor &B) const;
1591
1592 /// Invalidates decisions already taken by the cost model.
1593   void invalidateCostModelingDecisions() {
1594 WideningDecisions.clear();
1595 Uniforms.clear();
1596 Scalars.clear();
1597 }
1598
1599 /// Convenience function that returns the value of vscale_range iff
1600 /// vscale_range.min == vscale_range.max or otherwise returns the value
1601 /// returned by the corresponding TLI method.
1602 std::optional<unsigned> getVScaleForTuning() const;
1603
1604 private:
1605 unsigned NumPredStores = 0;
1606
1607 /// \return An upper bound for the vectorization factors for both
1608 /// fixed and scalable vectorization, where the minimum-known number of
1609 /// elements is a power-of-2 larger than zero. If scalable vectorization is
1610 /// disabled or unsupported, then the scalable part will be equal to
1611 /// ElementCount::getScalable(0).
1612 FixedScalableVFPair computeFeasibleMaxVF(unsigned ConstTripCount,
1613 ElementCount UserVF,
1614 bool FoldTailByMasking);
1615
1616 /// \return the maximized element count based on the target's vector
1617 /// registers and the loop trip-count, but limited to a maximum safe VF.
1618 /// This is a helper function of computeFeasibleMaxVF.
1619 ElementCount getMaximizedVFForTarget(unsigned ConstTripCount,
1620 unsigned SmallestType,
1621 unsigned WidestType,
1622 ElementCount MaxSafeVF,
1623 bool FoldTailByMasking);
1624
1625 /// \return the maximum legal scalable VF, based on the safe max number
1626 /// of elements.
1627 ElementCount getMaxLegalScalableVF(unsigned MaxSafeElements);
1628
1629 /// The vectorization cost is a combination of the cost itself and a boolean
1630 /// indicating whether any of the contributing operations will actually
1631 /// operate on vector values after type legalization in the backend. If this
1632 /// latter value is false, then all operations will be scalarized (i.e. no
1633 /// vectorization has actually taken place).
1634 using VectorizationCostTy = std::pair<InstructionCost, bool>;
1635
1636 /// Returns the expected execution cost. The unit of the cost does
1637 /// not matter because we use the 'cost' units to compare different
1638 /// vector widths. The cost that is returned is *not* normalized by
1639 /// the factor width. If \p Invalid is not nullptr, this function
1640 /// will add a pair(Instruction*, ElementCount) to \p Invalid for
1641 /// each instruction that has an Invalid cost for the given VF.
1642 using InstructionVFPair = std::pair<Instruction *, ElementCount>;
1643 VectorizationCostTy
1644 expectedCost(ElementCount VF,
1645 SmallVectorImpl<InstructionVFPair> *Invalid = nullptr);
1646
1647 /// Returns the execution time cost of an instruction for a given vector
1648 /// width. Vector width of one means scalar.
1649 VectorizationCostTy getInstructionCost(Instruction *I, ElementCount VF);
1650
1651 /// The cost-computation logic from getInstructionCost which provides
1652 /// the vector type as an output parameter.
1653 InstructionCost getInstructionCost(Instruction *I, ElementCount VF,
1654 Type *&VectorTy);
1655
1656 /// Return the cost of instructions in an inloop reduction pattern, if I is
1657 /// part of that pattern.
1658 std::optional<InstructionCost>
1659 getReductionPatternCost(Instruction *I, ElementCount VF, Type *VectorTy,
1660 TTI::TargetCostKind CostKind);
1661
1662 /// Calculate vectorization cost of memory instruction \p I.
1663 InstructionCost getMemoryInstructionCost(Instruction *I, ElementCount VF);
1664
1665 /// The cost computation for scalarized memory instruction.
1666 InstructionCost getMemInstScalarizationCost(Instruction *I, ElementCount VF);
1667
1668 /// The cost computation for interleaving group of memory instructions.
1669 InstructionCost getInterleaveGroupCost(Instruction *I, ElementCount VF);
1670
1671 /// The cost computation for Gather/Scatter instruction.
1672 InstructionCost getGatherScatterCost(Instruction *I, ElementCount VF);
1673
1674 /// The cost computation for widening instruction \p I with consecutive
1675 /// memory access.
1676 InstructionCost getConsecutiveMemOpCost(Instruction *I, ElementCount VF);
1677
1678 /// The cost calculation for Load/Store instruction \p I with uniform pointer -
1679 /// Load: scalar load + broadcast.
1680 /// Store: scalar store + (loop invariant value stored? 0 : extract of last
1681 /// element)
1682 InstructionCost getUniformMemOpCost(Instruction *I, ElementCount VF);
1683
1684 /// Estimate the overhead of scalarizing an instruction. This is a
1685 /// convenience wrapper for the type-based getScalarizationOverhead API.
1686 InstructionCost getScalarizationOverhead(Instruction *I, ElementCount VF,
1687 TTI::TargetCostKind CostKind) const;
1688
1689 /// Returns true if an artificially high cost for emulated masked memrefs
1690 /// should be used.
1691 bool useEmulatedMaskMemRefHack(Instruction *I, ElementCount VF);
1692
1693 /// Map of scalar integer values to the smallest bitwidth they can be legally
1694 /// represented as. The vector equivalents of these values should be truncated
1695 /// to this type.
1696 MapVector<Instruction *, uint64_t> MinBWs;
1697
1698 /// A type representing the costs for instructions if they were to be
1699 /// scalarized rather than vectorized. The entries are Instruction-Cost
1700 /// pairs.
1701 using ScalarCostsTy = DenseMap<Instruction *, InstructionCost>;
1702
1703 /// A set containing all BasicBlocks that are known to be present after
1704 /// vectorization as predicated blocks.
1705 DenseMap<ElementCount, SmallPtrSet<BasicBlock *, 4>>
1706 PredicatedBBsAfterVectorization;
1707
1708 /// Records whether it is allowed to have the original scalar loop execute at
1709 /// least once. This may be needed as a fallback loop in case runtime
1710 /// aliasing/dependence checks fail, or to handle the tail/remainder
1711 /// iterations when the trip count is unknown or is not divisible by the VF,
1712 /// or as a peel-loop to handle gaps in interleave-groups.
1713 /// Under optsize and when the trip count is very small we don't allow any
1714 /// iterations to execute in the scalar loop.
1715 ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
1716
1717 /// All blocks of loop are to be masked to fold tail of scalar iterations.
1718 bool FoldTailByMasking = false;
1719
1720 /// A map holding scalar costs for different vectorization factors. The
1721 /// presence of a cost for an instruction in the mapping indicates that the
1722 /// instruction will be scalarized when vectorizing with the associated
1723 /// vectorization factor. The entries are VF-ScalarCostTy pairs.
1724 DenseMap<ElementCount, ScalarCostsTy> InstsToScalarize;
1725
1726 /// Holds the instructions known to be uniform after vectorization.
1727 /// The data is collected per VF.
1728 DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Uniforms;
1729
1730 /// Holds the instructions known to be scalar after vectorization.
1731 /// The data is collected per VF.
1732 DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Scalars;
1733
1734 /// Holds the instructions (address computations) that are forced to be
1735 /// scalarized.
1736 DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> ForcedScalars;
1737
1738 /// PHINodes of the reductions that should be expanded in-loop along with
1739 /// their associated chains of reduction operations, in program order from top
1740 /// (PHI) to bottom
1741 ReductionChainMap InLoopReductionChains;
1742
1743 /// A Map of inloop reduction operations and their immediate chain operand.
1744 /// FIXME: This can be removed once reductions can be costed correctly in
1745 /// vplan. This was added to allow quick lookup to the inloop operations,
1746 /// without having to loop through InLoopReductionChains.
1747 DenseMap<Instruction *, Instruction *> InLoopReductionImmediateChains;
1748
1749 /// Returns the expected difference in cost from scalarizing the expression
1750 /// feeding a predicated instruction \p PredInst. The instructions to
1751 /// scalarize and their scalar costs are collected in \p ScalarCosts. A
1752 /// non-negative return value implies the expression will be scalarized.
1753 /// Currently, only single-use chains are considered for scalarization.
1754 InstructionCost computePredInstDiscount(Instruction *PredInst,
1755 ScalarCostsTy &ScalarCosts,
1756 ElementCount VF);
1757
1758 /// Collect the instructions that are uniform after vectorization. An
1759 /// instruction is uniform if we represent it with a single scalar value in
1760 /// the vectorized loop corresponding to each vector iteration. Examples of
1761 /// uniform instructions include pointer operands of consecutive or
1762 /// interleaved memory accesses. Note that although uniformity implies an
1763 /// instruction will be scalar, the reverse is not true. In general, a
1764 /// scalarized instruction will be represented by VF scalar values in the
1765 /// vectorized loop, each corresponding to an iteration of the original
1766 /// scalar loop.
1767 void collectLoopUniforms(ElementCount VF);
1768
1769 /// Collect the instructions that are scalar after vectorization. An
1770 /// instruction is scalar if it is known to be uniform or will be scalarized
1771 /// during vectorization. collectLoopScalars should only add non-uniform nodes
1772 /// to the list if they are used by a load/store instruction that is marked as
1773 /// CM_Scalarize. Non-uniform scalarized instructions will be represented by
1774 /// VF values in the vectorized loop, each corresponding to an iteration of
1775 /// the original scalar loop.
1776 void collectLoopScalars(ElementCount VF);
1777
1778 /// Keeps cost model vectorization decisions and costs for instructions.
1779 /// Right now it is used for memory instructions only.
1780 using DecisionList = DenseMap<std::pair<Instruction *, ElementCount>,
1781 std::pair<InstWidening, InstructionCost>>;
1782
1783 DecisionList WideningDecisions;
1784
1785 /// Returns true if \p V is expected to be vectorized and it needs to be
1786 /// extracted.
1787   bool needsExtract(Value *V, ElementCount VF) const {
1788 Instruction *I = dyn_cast<Instruction>(V);
1789 if (VF.isScalar() || !I || !TheLoop->contains(I) ||
1790 TheLoop->isLoopInvariant(I))
1791 return false;
1792
1793 // Assume we can vectorize V (and hence we need extraction) if the
1794 // scalars are not computed yet. This can happen because it is called
1795 // via getScalarizationOverhead from setCostBasedWideningDecision, before
1796 // the scalars are collected. That should be a safe assumption in most
1797 // cases, because we check if the operands have vectorizable types
1798 // beforehand in LoopVectorizationLegality.
1799 return Scalars.find(VF) == Scalars.end() ||
1800 !isScalarAfterVectorization(I, VF);
1801 };
1802
1803 /// Returns a range containing only operands needing to be extracted.
1804   SmallVector<Value *, 4> filterExtractingOperands(Instruction::op_range Ops,
1805 ElementCount VF) const {
1806 return SmallVector<Value *, 4>(make_filter_range(
1807 Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); }));
1808 }
1809
1810 /// Determines if we have the infrastructure to vectorize loop \p L and its
1811 /// epilogue, assuming the main loop is vectorized by \p VF.
1812 bool isCandidateForEpilogueVectorization(const Loop &L,
1813 const ElementCount VF) const;
1814
1815 /// Returns true if epilogue vectorization is considered profitable, and
1816 /// false otherwise.
1817 /// \p VF is the vectorization factor chosen for the original loop.
1818 bool isEpilogueVectorizationProfitable(const ElementCount VF) const;
1819
1820 public:
1821 /// The loop that we evaluate.
1822 Loop *TheLoop;
1823
1824 /// Predicated scalar evolution analysis.
1825 PredicatedScalarEvolution &PSE;
1826
1827 /// Loop Info analysis.
1828 LoopInfo *LI;
1829
1830 /// Vectorization legality.
1831 LoopVectorizationLegality *Legal;
1832
1833 /// Vector target information.
1834 const TargetTransformInfo &TTI;
1835
1836 /// Target Library Info.
1837 const TargetLibraryInfo *TLI;
1838
1839 /// Demanded bits analysis.
1840 DemandedBits *DB;
1841
1842 /// Assumption cache.
1843 AssumptionCache *AC;
1844
1845 /// Interface to emit optimization remarks.
1846 OptimizationRemarkEmitter *ORE;
1847
1848 const Function *TheFunction;
1849
1850 /// Loop Vectorize Hint.
1851 const LoopVectorizeHints *Hints;
1852
1853 /// The interleave access information contains groups of interleaved accesses
1854 /// with the same stride that are close to each other.
1855 InterleavedAccessInfo &InterleaveInfo;
1856
1857 /// Values to ignore in the cost model.
1858 SmallPtrSet<const Value *, 16> ValuesToIgnore;
1859
1860 /// Values to ignore in the cost model when VF > 1.
1861 SmallPtrSet<const Value *, 16> VecValuesToIgnore;
1862
1863 /// All element types found in the loop.
1864 SmallPtrSet<Type *, 16> ElementTypesInLoop;
1865
1866 /// Profitable vector factors.
1867 SmallVector<VectorizationFactor, 8> ProfitableVFs;
1868 };
1869 } // end namespace llvm
1870
1871 namespace {
1872 /// Helper struct to manage generating runtime checks for vectorization.
1873 ///
1874 /// The runtime checks are created up-front in temporary blocks to allow better
1875 /// estimation of their cost; they are un-linked from the existing IR. After
1876 /// deciding to vectorize, the checks are moved back. If we decide not to
1877 /// vectorize, the temporary blocks are completely removed.
1878 class GeneratedRTChecks {
1879 /// Basic block which contains the generated SCEV checks, if any.
1880 BasicBlock *SCEVCheckBlock = nullptr;
1881
1882 /// The value representing the result of the generated SCEV checks. If it is
1883 /// nullptr, either no SCEV checks have been generated or they have been used.
1884 Value *SCEVCheckCond = nullptr;
1885
1886 /// Basic block which contains the generated memory runtime checks, if any.
1887 BasicBlock *MemCheckBlock = nullptr;
1888
1889 /// The value representing the result of the generated memory runtime checks.
1890 /// If it is nullptr, either no memory runtime checks have been generated or
1891 /// they have been used.
1892 Value *MemRuntimeCheckCond = nullptr;
1893
1894 DominatorTree *DT;
1895 LoopInfo *LI;
1896 TargetTransformInfo *TTI;
1897
1898 SCEVExpander SCEVExp;
1899 SCEVExpander MemCheckExp;
1900
1901 bool CostTooHigh = false;
1902
1903 public:
1904   GeneratedRTChecks(ScalarEvolution &SE, DominatorTree *DT, LoopInfo *LI,
1905 TargetTransformInfo *TTI, const DataLayout &DL)
1906 : DT(DT), LI(LI), TTI(TTI), SCEVExp(SE, DL, "scev.check"),
1907 MemCheckExp(SE, DL, "scev.check") {}
1908
1909 /// Generate runtime checks in SCEVCheckBlock and MemCheckBlock, so we can
1910 /// accurately estimate the cost of the runtime checks. The blocks are
1911 /// un-linked from the IR and are added back during vector code generation. If
1912 /// there is no vector code generation, the check blocks are removed
1913 /// completely.
1914   void Create(Loop *L, const LoopAccessInfo &LAI,
1915 const SCEVPredicate &UnionPred, ElementCount VF, unsigned IC) {
1916
1917 // Hard cutoff to limit compile-time increase in case a very large number of
1918 // runtime checks needs to be generated.
1919 // TODO: Skip cutoff if the loop is guaranteed to execute, e.g. due to
1920 // profile info.
1921 CostTooHigh =
1922 LAI.getNumRuntimePointerChecks() > VectorizeMemoryCheckThreshold;
1923 if (CostTooHigh)
1924 return;
1925
1926 BasicBlock *LoopHeader = L->getHeader();
1927 BasicBlock *Preheader = L->getLoopPreheader();
1928
1929 // Use SplitBlock to create blocks for SCEV & memory runtime checks to
1930 // ensure the blocks are properly added to LoopInfo & DominatorTree. Those
1931 // may be used by SCEVExpander. The blocks will be un-linked from their
1932 // predecessors and removed from LI & DT at the end of the function.
1933 if (!UnionPred.isAlwaysTrue()) {
1934 SCEVCheckBlock = SplitBlock(Preheader, Preheader->getTerminator(), DT, LI,
1935 nullptr, "vector.scevcheck");
1936
1937 SCEVCheckCond = SCEVExp.expandCodeForPredicate(
1938 &UnionPred, SCEVCheckBlock->getTerminator());
1939 }
1940
1941 const auto &RtPtrChecking = *LAI.getRuntimePointerChecking();
1942 if (RtPtrChecking.Need) {
1943 auto *Pred = SCEVCheckBlock ? SCEVCheckBlock : Preheader;
1944 MemCheckBlock = SplitBlock(Pred, Pred->getTerminator(), DT, LI, nullptr,
1945 "vector.memcheck");
1946
1947 auto DiffChecks = RtPtrChecking.getDiffChecks();
1948 if (DiffChecks) {
1949 Value *RuntimeVF = nullptr;
1950 MemRuntimeCheckCond = addDiffRuntimeChecks(
1951 MemCheckBlock->getTerminator(), *DiffChecks, MemCheckExp,
1952 [VF, &RuntimeVF](IRBuilderBase &B, unsigned Bits) {
1953 if (!RuntimeVF)
1954 RuntimeVF = getRuntimeVF(B, B.getIntNTy(Bits), VF);
1955 return RuntimeVF;
1956 },
1957 IC);
1958 } else {
1959 MemRuntimeCheckCond =
1960 addRuntimeChecks(MemCheckBlock->getTerminator(), L,
1961 RtPtrChecking.getChecks(), MemCheckExp);
1962 }
1963 assert(MemRuntimeCheckCond &&
1964 "no RT checks generated although RtPtrChecking "
1965 "claimed checks are required");
1966 }
1967
1968 if (!MemCheckBlock && !SCEVCheckBlock)
1969 return;
1970
1971 // Unhook the temporary block with the checks, update various places
1972 // accordingly.
1973 if (SCEVCheckBlock)
1974 SCEVCheckBlock->replaceAllUsesWith(Preheader);
1975 if (MemCheckBlock)
1976 MemCheckBlock->replaceAllUsesWith(Preheader);
1977
1978 if (SCEVCheckBlock) {
1979 SCEVCheckBlock->getTerminator()->moveBefore(Preheader->getTerminator());
1980 new UnreachableInst(Preheader->getContext(), SCEVCheckBlock);
1981 Preheader->getTerminator()->eraseFromParent();
1982 }
1983 if (MemCheckBlock) {
1984 MemCheckBlock->getTerminator()->moveBefore(Preheader->getTerminator());
1985 new UnreachableInst(Preheader->getContext(), MemCheckBlock);
1986 Preheader->getTerminator()->eraseFromParent();
1987 }
1988
1989 DT->changeImmediateDominator(LoopHeader, Preheader);
1990 if (MemCheckBlock) {
1991 DT->eraseNode(MemCheckBlock);
1992 LI->removeBlock(MemCheckBlock);
1993 }
1994 if (SCEVCheckBlock) {
1995 DT->eraseNode(SCEVCheckBlock);
1996 LI->removeBlock(SCEVCheckBlock);
1997 }
1998 }
1999
2000   InstructionCost getCost() {
2001 if (SCEVCheckBlock || MemCheckBlock)
2002 LLVM_DEBUG(dbgs() << "Calculating cost of runtime checks:\n");
2003
2004 if (CostTooHigh) {
2005 InstructionCost Cost;
2006 Cost.setInvalid();
2007 LLVM_DEBUG(dbgs() << " number of checks exceeded threshold\n");
2008 return Cost;
2009 }
2010
2011 InstructionCost RTCheckCost = 0;
2012 if (SCEVCheckBlock)
2013 for (Instruction &I : *SCEVCheckBlock) {
2014 if (SCEVCheckBlock->getTerminator() == &I)
2015 continue;
2016 InstructionCost C =
2017 TTI->getInstructionCost(&I, TTI::TCK_RecipThroughput);
2018 LLVM_DEBUG(dbgs() << " " << C << " for " << I << "\n");
2019 RTCheckCost += C;
2020 }
2021 if (MemCheckBlock)
2022 for (Instruction &I : *MemCheckBlock) {
2023 if (MemCheckBlock->getTerminator() == &I)
2024 continue;
2025 InstructionCost C =
2026 TTI->getInstructionCost(&I, TTI::TCK_RecipThroughput);
2027 LLVM_DEBUG(dbgs() << " " << C << " for " << I << "\n");
2028 RTCheckCost += C;
2029 }
2030
2031 if (SCEVCheckBlock || MemCheckBlock)
2032 LLVM_DEBUG(dbgs() << "Total cost of runtime checks: " << RTCheckCost
2033 << "\n");
2034
2035 return RTCheckCost;
2036 }
2037
2038 /// Remove the created SCEV & memory runtime check blocks & instructions, if
2039 /// unused.
2040   ~GeneratedRTChecks() {
2041 SCEVExpanderCleaner SCEVCleaner(SCEVExp);
2042 SCEVExpanderCleaner MemCheckCleaner(MemCheckExp);
2043 if (!SCEVCheckCond)
2044 SCEVCleaner.markResultUsed();
2045
2046 if (!MemRuntimeCheckCond)
2047 MemCheckCleaner.markResultUsed();
2048
2049 if (MemRuntimeCheckCond) {
2050 auto &SE = *MemCheckExp.getSE();
2051 // Memory runtime check generation creates compares that use expanded
2052 // values. Remove them before running the SCEVExpanderCleaners.
2053 for (auto &I : make_early_inc_range(reverse(*MemCheckBlock))) {
2054 if (MemCheckExp.isInsertedInstruction(&I))
2055 continue;
2056 SE.forgetValue(&I);
2057 I.eraseFromParent();
2058 }
2059 }
2060 MemCheckCleaner.cleanup();
2061 SCEVCleaner.cleanup();
2062
2063 if (SCEVCheckCond)
2064 SCEVCheckBlock->eraseFromParent();
2065 if (MemRuntimeCheckCond)
2066 MemCheckBlock->eraseFromParent();
2067 }
2068
2069 /// Adds the generated SCEVCheckBlock before \p LoopVectorPreHeader and
2070 /// adjusts the branches to branch to the vector preheader or \p Bypass,
2071 /// depending on the generated condition.
2072   BasicBlock *emitSCEVChecks(BasicBlock *Bypass,
2073 BasicBlock *LoopVectorPreHeader,
2074 BasicBlock *LoopExitBlock) {
2075 if (!SCEVCheckCond)
2076 return nullptr;
2077
2078 Value *Cond = SCEVCheckCond;
2079 // Mark the check as used, to prevent it from being removed during cleanup.
2080 SCEVCheckCond = nullptr;
2081 if (auto *C = dyn_cast<ConstantInt>(Cond))
2082 if (C->isZero())
2083 return nullptr;
2084
2085 auto *Pred = LoopVectorPreHeader->getSinglePredecessor();
2086
2087 BranchInst::Create(LoopVectorPreHeader, SCEVCheckBlock);
2088 // Create new preheader for vector loop.
2089 if (auto *PL = LI->getLoopFor(LoopVectorPreHeader))
2090 PL->addBasicBlockToLoop(SCEVCheckBlock, *LI);
2091
2092 SCEVCheckBlock->getTerminator()->eraseFromParent();
2093 SCEVCheckBlock->moveBefore(LoopVectorPreHeader);
2094 Pred->getTerminator()->replaceSuccessorWith(LoopVectorPreHeader,
2095 SCEVCheckBlock);
2096
2097 DT->addNewBlock(SCEVCheckBlock, Pred);
2098 DT->changeImmediateDominator(LoopVectorPreHeader, SCEVCheckBlock);
2099
2100 ReplaceInstWithInst(SCEVCheckBlock->getTerminator(),
2101 BranchInst::Create(Bypass, LoopVectorPreHeader, Cond));
2102 return SCEVCheckBlock;
2103 }
2104
2105 /// Adds the generated MemCheckBlock before \p LoopVectorPreHeader and adjusts
2106 /// the branches to branch to the vector preheader or \p Bypass, depending on
2107 /// the generated condition.
2108   BasicBlock *emitMemRuntimeChecks(BasicBlock *Bypass,
2109 BasicBlock *LoopVectorPreHeader) {
2110 // Check if we generated code that checks in runtime if arrays overlap.
2111 if (!MemRuntimeCheckCond)
2112 return nullptr;
2113
2114 auto *Pred = LoopVectorPreHeader->getSinglePredecessor();
2115 Pred->getTerminator()->replaceSuccessorWith(LoopVectorPreHeader,
2116 MemCheckBlock);
2117
2118 DT->addNewBlock(MemCheckBlock, Pred);
2119 DT->changeImmediateDominator(LoopVectorPreHeader, MemCheckBlock);
2120 MemCheckBlock->moveBefore(LoopVectorPreHeader);
2121
2122 if (auto *PL = LI->getLoopFor(LoopVectorPreHeader))
2123 PL->addBasicBlockToLoop(MemCheckBlock, *LI);
2124
2125 ReplaceInstWithInst(
2126 MemCheckBlock->getTerminator(),
2127 BranchInst::Create(Bypass, LoopVectorPreHeader, MemRuntimeCheckCond));
2128 MemCheckBlock->getTerminator()->setDebugLoc(
2129 Pred->getTerminator()->getDebugLoc());
2130
2131 // Mark the check as used, to prevent it from being removed during cleanup.
2132 MemRuntimeCheckCond = nullptr;
2133 return MemCheckBlock;
2134 }
2135 };
2136 } // namespace
2137
2138 // Return true if \p OuterLp is an outer loop annotated with hints for explicit
2139 // vectorization. The loop needs to be annotated with #pragma omp simd
2140 // simdlen(#) or #pragma clang vectorize(enable) vectorize_width(#). If the
2141 // vector length information is not provided, vectorization is not considered
2142 // explicit. Interleave hints are not allowed either. These limitations will be
2143 // relaxed in the future.
2144 // Please note that we are currently forced to abuse the pragma 'clang
2145 // vectorize' semantics. This pragma provides *auto-vectorization hints*
2146 // (i.e., LV must check that vectorization is legal) whereas pragma 'omp simd'
2147 // provides *explicit vectorization hints* (LV can bypass legal checks and
2148 // assume that vectorization is legal). However, both hints are implemented
2149 // using the same metadata (llvm.loop.vectorize, processed by
2150 // LoopVectorizeHints). This will be fixed in the future when the native IR
2151 // representation for pragma 'omp simd' is introduced.
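//
// For example (illustrative source, assuming a conforming front end), an outer
// loop annotated as follows is considered for explicit vectorization:
//   #pragma clang loop vectorize(enable) vectorize_width(4)
//   for (int i = 0; i < N; ++i)
//     for (int j = 0; j < M; ++j)
//       A[i][j] += B[i][j];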
2152 static bool isExplicitVecOuterLoop(Loop *OuterLp,
2153 OptimizationRemarkEmitter *ORE) {
2154 assert(!OuterLp->isInnermost() && "This is not an outer loop");
2155 LoopVectorizeHints Hints(OuterLp, true /*DisableInterleaving*/, *ORE);
2156
2157 // Only outer loops with an explicit vectorization hint are supported.
2158 // Unannotated outer loops are ignored.
2159 if (Hints.getForce() == LoopVectorizeHints::FK_Undefined)
2160 return false;
2161
2162 Function *Fn = OuterLp->getHeader()->getParent();
2163 if (!Hints.allowVectorization(Fn, OuterLp,
2164 true /*VectorizeOnlyWhenForced*/)) {
2165 LLVM_DEBUG(dbgs() << "LV: Loop hints prevent outer loop vectorization.\n");
2166 return false;
2167 }
2168
2169 if (Hints.getInterleave() > 1) {
2170 // TODO: Interleave support is future work.
2171 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for "
2172 "outer loops.\n");
2173 Hints.emitRemarkWithHints();
2174 return false;
2175 }
2176
2177 return true;
2178 }
2179
2180 static void collectSupportedLoops(Loop &L, LoopInfo *LI,
2181 OptimizationRemarkEmitter *ORE,
2182 SmallVectorImpl<Loop *> &V) {
2183 // Collect inner loops and outer loops without irreducible control flow. For
2184 // now, only collect outer loops that have explicit vectorization hints. If we
2185 // are stress testing the VPlan H-CFG construction, we collect the outermost
2186 // loop of every loop nest.
2187 if (L.isInnermost() || VPlanBuildStressTest ||
2188 (EnableVPlanNativePath && isExplicitVecOuterLoop(&L, ORE))) {
2189 LoopBlocksRPO RPOT(&L);
2190 RPOT.perform(LI);
2191 if (!containsIrreducibleCFG<const BasicBlock *>(RPOT, *LI)) {
2192 V.push_back(&L);
2193 // TODO: Collect inner loops inside marked outer loops in case
2194 // vectorization fails for the outer loop. Do not invoke
2195 // 'containsIrreducibleCFG' again for inner loops when the outer loop is
2196 // already known to be reducible. We can use an inherited attribute for
2197 // that.
2198 return;
2199 }
2200 }
2201 for (Loop *InnerL : L)
2202 collectSupportedLoops(*InnerL, LI, ORE, V);
2203 }
2204
2205 namespace {
2206
2207 /// The LoopVectorize Pass.
2208 struct LoopVectorize : public FunctionPass {
2209 /// Pass identification, replacement for typeid
2210 static char ID;
2211
2212 LoopVectorizePass Impl;
2213
2214   explicit LoopVectorize(bool InterleaveOnlyWhenForced = false,
2215 bool VectorizeOnlyWhenForced = false)
2216 : FunctionPass(ID),
2217 Impl({InterleaveOnlyWhenForced, VectorizeOnlyWhenForced}) {
2218 initializeLoopVectorizePass(*PassRegistry::getPassRegistry());
2219 }
2220
2221   bool runOnFunction(Function &F) override {
2222 if (skipFunction(F))
2223 return false;
2224
2225 auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
2226 auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
2227 auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
2228 auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
2229 auto *BFI = &getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI();
2230 auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
2231 auto *TLI = TLIP ? &TLIP->getTLI(F) : nullptr;
2232 auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
2233 auto &LAIs = getAnalysis<LoopAccessLegacyAnalysis>().getLAIs();
2234 auto *DB = &getAnalysis<DemandedBitsWrapperPass>().getDemandedBits();
2235 auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
2236 auto *PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
2237
2238 return Impl
2239 .runImpl(F, *SE, *LI, *TTI, *DT, *BFI, TLI, *DB, *AC, LAIs, *ORE, PSI)
2240 .MadeAnyChange;
2241 }
2242
2243   void getAnalysisUsage(AnalysisUsage &AU) const override {
2244 AU.addRequired<AssumptionCacheTracker>();
2245 AU.addRequired<BlockFrequencyInfoWrapperPass>();
2246 AU.addRequired<DominatorTreeWrapperPass>();
2247 AU.addRequired<LoopInfoWrapperPass>();
2248 AU.addRequired<ScalarEvolutionWrapperPass>();
2249 AU.addRequired<TargetTransformInfoWrapperPass>();
2250 AU.addRequired<LoopAccessLegacyAnalysis>();
2251 AU.addRequired<DemandedBitsWrapperPass>();
2252 AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
2253 AU.addRequired<InjectTLIMappingsLegacy>();
2254
2255 // We currently do not preserve loopinfo/dominator analyses with outer loop
2256 // vectorization. Until this is addressed, mark these analyses as preserved
2257 // only for non-VPlan-native path.
2258 // TODO: Preserve Loop and Dominator analyses for VPlan-native path.
2259 if (!EnableVPlanNativePath) {
2260 AU.addPreserved<LoopInfoWrapperPass>();
2261 AU.addPreserved<DominatorTreeWrapperPass>();
2262 }
2263
2264 AU.addPreserved<BasicAAWrapperPass>();
2265 AU.addPreserved<GlobalsAAWrapperPass>();
2266 AU.addRequired<ProfileSummaryInfoWrapperPass>();
2267 }
2268 };
2269
2270 } // end anonymous namespace
2271
2272 //===----------------------------------------------------------------------===//
2273 // Implementation of LoopVectorizationLegality, InnerLoopVectorizer and
2274 // LoopVectorizationCostModel and LoopVectorizationPlanner.
2275 //===----------------------------------------------------------------------===//
2276
2277 Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) {
2278 // We need to place the broadcast of invariant variables outside the loop,
2279 // but only if it's proven safe to do so. Otherwise, the broadcast will be
2280 // inside the vector loop body.
2281 Instruction *Instr = dyn_cast<Instruction>(V);
2282 bool SafeToHoist = OrigLoop->isLoopInvariant(V) &&
2283 (!Instr ||
2284 DT->dominates(Instr->getParent(), LoopVectorPreHeader));
2285 // Place the code for broadcasting invariant variables in the new preheader.
2286 IRBuilder<>::InsertPointGuard Guard(Builder);
2287 if (SafeToHoist)
2288 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
2289
2290 // Broadcast the scalar into all locations in the vector.
2291 Value *Shuf = Builder.CreateVectorSplat(VF, V, "broadcast");
2292
2293 return Shuf;
2294 }
2295
2296 /// This function adds
2297 /// (StartIdx * Step, (StartIdx + 1) * Step, (StartIdx + 2) * Step, ...)
2298 /// to each vector element of Val. The sequence starts at StartIdx.
2299 /// \p BinOp is only relevant for FP induction variables.
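/// Illustrative example (hypothetical values): for an integer induction with
/// VF = 4, StartIdx = 0 and Step = 2, the vector added to Val is
/// \code
///   <0 * 2, 1 * 2, 2 * 2, 3 * 2> == <0, 2, 4, 6>
/// \endcode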
2300 static Value *getStepVector(Value *Val, Value *StartIdx, Value *Step,
2301 Instruction::BinaryOps BinOp, ElementCount VF,
2302 IRBuilderBase &Builder) {
2303 assert(VF.isVector() && "only vector VFs are supported");
2304
2305 // Create and check the types.
2306 auto *ValVTy = cast<VectorType>(Val->getType());
2307 ElementCount VLen = ValVTy->getElementCount();
2308
2309 Type *STy = Val->getType()->getScalarType();
2310 assert((STy->isIntegerTy() || STy->isFloatingPointTy()) &&
2311 "Induction Step must be an integer or FP");
2312 assert(Step->getType() == STy && "Step has wrong type");
2313
2314 SmallVector<Constant *, 8> Indices;
2315
2316 // Create a vector of consecutive numbers from zero to VF.
2317 VectorType *InitVecValVTy = ValVTy;
2318 if (STy->isFloatingPointTy()) {
2319 Type *InitVecValSTy =
2320 IntegerType::get(STy->getContext(), STy->getScalarSizeInBits());
2321 InitVecValVTy = VectorType::get(InitVecValSTy, VLen);
2322 }
2323 Value *InitVec = Builder.CreateStepVector(InitVecValVTy);
2324
2325 // Splat the StartIdx
2326 Value *StartIdxSplat = Builder.CreateVectorSplat(VLen, StartIdx);
2327
2328 if (STy->isIntegerTy()) {
2329 InitVec = Builder.CreateAdd(InitVec, StartIdxSplat);
2330 Step = Builder.CreateVectorSplat(VLen, Step);
2331 assert(Step->getType() == Val->getType() && "Invalid step vec");
2332 // FIXME: The newly created binary instructions should contain nsw/nuw
2333 // flags, which can be found from the original scalar operations.
2334 Step = Builder.CreateMul(InitVec, Step);
2335 return Builder.CreateAdd(Val, Step, "induction");
2336 }
2337
2338 // Floating point induction.
2339 assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) &&
2340 "Binary Opcode should be specified for FP induction");
2341 InitVec = Builder.CreateUIToFP(InitVec, ValVTy);
2342 InitVec = Builder.CreateFAdd(InitVec, StartIdxSplat);
2343
2344 Step = Builder.CreateVectorSplat(VLen, Step);
2345 Value *MulOp = Builder.CreateFMul(InitVec, Step);
2346 return Builder.CreateBinOp(BinOp, Val, MulOp, "induction");
2347 }
2348
2349 /// Compute scalar induction steps. \p ScalarIV is the scalar induction
2350 /// variable on which to base the steps, \p Step is the size of the step.
2351 static void buildScalarSteps(Value *ScalarIV, Value *Step,
2352 const InductionDescriptor &ID, VPValue *Def,
2353 VPTransformState &State) {
2354 IRBuilderBase &Builder = State.Builder;
2355
2356 // Ensure step has the same type as that of scalar IV.
2357 Type *ScalarIVTy = ScalarIV->getType()->getScalarType();
2358 if (ScalarIVTy != Step->getType()) {
2359 // TODO: Also use VPDerivedIVRecipe when only the step needs truncating, to
2360 // avoid separate truncate here.
2361 assert(Step->getType()->isIntegerTy() &&
2362 "Truncation requires an integer step");
2363 Step = State.Builder.CreateTrunc(Step, ScalarIVTy);
2364 }
2365
2366 // We build scalar steps for both integer and floating-point induction
2367 // variables. Here, we determine the kind of arithmetic we will perform.
2368 Instruction::BinaryOps AddOp;
2369 Instruction::BinaryOps MulOp;
2370 if (ScalarIVTy->isIntegerTy()) {
2371 AddOp = Instruction::Add;
2372 MulOp = Instruction::Mul;
2373 } else {
2374 AddOp = ID.getInductionOpcode();
2375 MulOp = Instruction::FMul;
2376 }
2377
2378 // Determine the number of scalars we need to generate for each unroll
2379 // iteration.
2380 bool FirstLaneOnly = vputils::onlyFirstLaneUsed(Def);
2381 // Compute the scalar steps and save the results in State.
2382 Type *IntStepTy = IntegerType::get(ScalarIVTy->getContext(),
2383 ScalarIVTy->getScalarSizeInBits());
2384 Type *VecIVTy = nullptr;
2385 Value *UnitStepVec = nullptr, *SplatStep = nullptr, *SplatIV = nullptr;
2386 if (!FirstLaneOnly && State.VF.isScalable()) {
2387 VecIVTy = VectorType::get(ScalarIVTy, State.VF);
2388 UnitStepVec =
2389 Builder.CreateStepVector(VectorType::get(IntStepTy, State.VF));
2390 SplatStep = Builder.CreateVectorSplat(State.VF, Step);
2391 SplatIV = Builder.CreateVectorSplat(State.VF, ScalarIV);
2392 }
2393
2394 unsigned StartPart = 0;
2395 unsigned EndPart = State.UF;
2396 unsigned StartLane = 0;
2397 unsigned EndLane = FirstLaneOnly ? 1 : State.VF.getKnownMinValue();
2398 if (State.Instance) {
2399 StartPart = State.Instance->Part;
2400 EndPart = StartPart + 1;
2401 StartLane = State.Instance->Lane.getKnownLane();
2402 EndLane = StartLane + 1;
2403 }
2404 for (unsigned Part = StartPart; Part < EndPart; ++Part) {
2405 Value *StartIdx0 = createStepForVF(Builder, IntStepTy, State.VF, Part);
2406
2407 if (!FirstLaneOnly && State.VF.isScalable()) {
2408 auto *SplatStartIdx = Builder.CreateVectorSplat(State.VF, StartIdx0);
2409 auto *InitVec = Builder.CreateAdd(SplatStartIdx, UnitStepVec);
2410 if (ScalarIVTy->isFloatingPointTy())
2411 InitVec = Builder.CreateSIToFP(InitVec, VecIVTy);
2412 auto *Mul = Builder.CreateBinOp(MulOp, InitVec, SplatStep);
2413 auto *Add = Builder.CreateBinOp(AddOp, SplatIV, Mul);
2414 State.set(Def, Add, Part);
2415 // It's useful to record the lane values too for the known minimum number
2416 // of elements so we do those below. This improves the code quality when
2417 // trying to extract the first element, for example.
2418 }
2419
2420 if (ScalarIVTy->isFloatingPointTy())
2421 StartIdx0 = Builder.CreateSIToFP(StartIdx0, ScalarIVTy);
2422
2423 for (unsigned Lane = StartLane; Lane < EndLane; ++Lane) {
2424 Value *StartIdx = Builder.CreateBinOp(
2425 AddOp, StartIdx0, getSignedIntOrFpConstant(ScalarIVTy, Lane));
2426 // The step returned by `createStepForVF` is a runtime-evaluated value
2427 // when VF is scalable. Otherwise, it should be folded into a Constant.
2428 assert((State.VF.isScalable() || isa<Constant>(StartIdx)) &&
2429 "Expected StartIdx to be folded to a constant when VF is not "
2430 "scalable");
2431 auto *Mul = Builder.CreateBinOp(MulOp, StartIdx, Step);
2432 auto *Add = Builder.CreateBinOp(AddOp, ScalarIV, Mul);
2433 State.set(Def, Add, VPIteration(Part, Lane));
2434 }
2435 }
2436 }
2437
2438 // Generate code for the induction step. Note that induction steps are
2439 // required to be loop-invariant.
2440 static Value *CreateStepValue(const SCEV *Step, ScalarEvolution &SE,
2441 Instruction *InsertBefore,
2442 Loop *OrigLoop = nullptr) {
2443 const DataLayout &DL = SE.getDataLayout();
2444 assert((!OrigLoop || SE.isLoopInvariant(Step, OrigLoop)) &&
2445 "Induction step should be loop invariant");
2446 if (auto *E = dyn_cast<SCEVUnknown>(Step))
2447 return E->getValue();
2448
2449 SCEVExpander Exp(SE, DL, "induction");
2450 return Exp.expandCodeFor(Step, Step->getType(), InsertBefore);
2451 }
2452
2453 /// Compute the transformed value of Index at offset StartValue using step
2454 /// StepValue.
2455 /// For integer induction, returns StartValue + Index * StepValue.
2456 /// For pointer induction, returns StartValue[Index * StepValue].
2457 /// FIXME: The newly created binary instructions should contain nsw/nuw
2458 /// flags, which can be found from the original scalar operations.
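/// Illustrative example (hypothetical values): for an integer induction with
/// StartValue = 16, Step = 4 and Index = 3, the emitted value computes
/// \code
///   IK_IntInduction: 16 + 3 * 4                                      ==> 28
///   IK_PtrInduction: getelementptr ElemTy, ptr StartValue, (3 * 4)
/// \endcode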
2459 static Value *emitTransformedIndex(IRBuilderBase &B, Value *Index,
2460 Value *StartValue, Value *Step,
2461 const InductionDescriptor &ID) {
2462 Type *StepTy = Step->getType();
2463 Value *CastedIndex = StepTy->isIntegerTy()
2464 ? B.CreateSExtOrTrunc(Index, StepTy)
2465 : B.CreateCast(Instruction::SIToFP, Index, StepTy);
2466 if (CastedIndex != Index) {
2467 CastedIndex->setName(CastedIndex->getName() + ".cast");
2468 Index = CastedIndex;
2469 }
2470
2471 // Note: the IR at this point is broken. We cannot use SE to create any new
2472 // SCEV and then expand it, hoping that SCEV's simplification will give us
2473 // a more optimal code. Unfortunately, attempt of doing so on invalid IR may
2474 // lead to various SCEV crashes. So all we can do is to use builder and rely
2475 // on InstCombine for future simplifications. Here we handle some trivial
2476 // cases only.
2477 auto CreateAdd = [&B](Value *X, Value *Y) {
2478 assert(X->getType() == Y->getType() && "Types don't match!");
2479 if (auto *CX = dyn_cast<ConstantInt>(X))
2480 if (CX->isZero())
2481 return Y;
2482 if (auto *CY = dyn_cast<ConstantInt>(Y))
2483 if (CY->isZero())
2484 return X;
2485 return B.CreateAdd(X, Y);
2486 };
2487
2488 // We allow X to be a vector type, in which case Y will potentially be
2489 // splatted into a vector with the same element count.
2490 auto CreateMul = [&B](Value *X, Value *Y) {
2491 assert(X->getType()->getScalarType() == Y->getType() &&
2492 "Types don't match!");
2493 if (auto *CX = dyn_cast<ConstantInt>(X))
2494 if (CX->isOne())
2495 return Y;
2496 if (auto *CY = dyn_cast<ConstantInt>(Y))
2497 if (CY->isOne())
2498 return X;
2499 VectorType *XVTy = dyn_cast<VectorType>(X->getType());
2500 if (XVTy && !isa<VectorType>(Y->getType()))
2501 Y = B.CreateVectorSplat(XVTy->getElementCount(), Y);
2502 return B.CreateMul(X, Y);
2503 };
2504
2505 switch (ID.getKind()) {
2506 case InductionDescriptor::IK_IntInduction: {
2507 assert(!isa<VectorType>(Index->getType()) &&
2508 "Vector indices not supported for integer inductions yet");
2509 assert(Index->getType() == StartValue->getType() &&
2510 "Index type does not match StartValue type");
2511 if (isa<ConstantInt>(Step) && cast<ConstantInt>(Step)->isMinusOne())
2512 return B.CreateSub(StartValue, Index);
2513 auto *Offset = CreateMul(Index, Step);
2514 return CreateAdd(StartValue, Offset);
2515 }
2516 case InductionDescriptor::IK_PtrInduction: {
2517 assert(isa<Constant>(Step) &&
2518 "Expected constant step for pointer induction");
2519 return B.CreateGEP(ID.getElementType(), StartValue, CreateMul(Index, Step));
2520 }
2521 case InductionDescriptor::IK_FpInduction: {
2522 assert(!isa<VectorType>(Index->getType()) &&
2523 "Vector indices not supported for FP inductions yet");
2524 assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value");
2525 auto InductionBinOp = ID.getInductionBinOp();
2526 assert(InductionBinOp &&
2527 (InductionBinOp->getOpcode() == Instruction::FAdd ||
2528 InductionBinOp->getOpcode() == Instruction::FSub) &&
2529 "Original bin op should be defined for FP induction");
2530
2531 Value *MulExp = B.CreateFMul(Step, Index);
2532 return B.CreateBinOp(InductionBinOp->getOpcode(), StartValue, MulExp,
2533 "induction");
2534 }
2535 case InductionDescriptor::IK_NoInduction:
2536 return nullptr;
2537 }
2538 llvm_unreachable("invalid enum");
2539 }
2540
2541 void InnerLoopVectorizer::packScalarIntoVectorValue(VPValue *Def,
2542 const VPIteration &Instance,
2543 VPTransformState &State) {
2544 Value *ScalarInst = State.get(Def, Instance);
2545 Value *VectorValue = State.get(Def, Instance.Part);
2546 VectorValue = Builder.CreateInsertElement(
2547 VectorValue, ScalarInst,
2548 Instance.Lane.getAsRuntimeExpr(State.Builder, VF));
2549 State.set(Def, VectorValue, Instance.Part);
2550 }
2551
2552 // Return whether we allow using masked interleave-groups (for dealing with
2553 // strided loads/stores that reside in predicated blocks, or for dealing
2554 // with gaps).
2555 static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) {
2556 // If an override option has been passed in for interleaved accesses, use it.
2557 if (EnableMaskedInterleavedMemAccesses.getNumOccurrences() > 0)
2558 return EnableMaskedInterleavedMemAccesses;
2559
2560 return TTI.enableMaskedInterleavedAccessVectorization();
2561 }
2562
2563 // Try to vectorize the interleave group that \p Instr belongs to.
2564 //
2565 // E.g. Translate following interleaved load group (factor = 3):
2566 // for (i = 0; i < N; i+=3) {
2567 // R = Pic[i]; // Member of index 0
2568 // G = Pic[i+1]; // Member of index 1
2569 // B = Pic[i+2]; // Member of index 2
2570 // ... // do something to R, G, B
2571 // }
2572 // To:
2573 // %wide.vec = load <12 x i32> ; Read 4 tuples of R,G,B
2574 // %R.vec = shuffle %wide.vec, poison, <0, 3, 6, 9> ; R elements
2575 // %G.vec = shuffle %wide.vec, poison, <1, 4, 7, 10> ; G elements
2576 // %B.vec = shuffle %wide.vec, poison, <2, 5, 8, 11> ; B elements
2577 //
2578 // Or translate following interleaved store group (factor = 3):
2579 // for (i = 0; i < N; i+=3) {
2580 // ... do something to R, G, B
2581 // Pic[i] = R; // Member of index 0
2582 // Pic[i+1] = G; // Member of index 1
2583 // Pic[i+2] = B; // Member of index 2
2584 // }
2585 // To:
2586 // %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7>
2587 // %B_U.vec = shuffle %B.vec, poison, <0, 1, 2, 3, u, u, u, u>
2588 // %interleaved.vec = shuffle %R_G.vec, %B_U.vec,
2589 // <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11> ; Interleave R,G,B elements
2590 // store <12 x i32> %interleaved.vec ; Write 4 tuples of R,G,B
2591 void InnerLoopVectorizer::vectorizeInterleaveGroup(
2592 const InterleaveGroup<Instruction> *Group, ArrayRef<VPValue *> VPDefs,
2593 VPTransformState &State, VPValue *Addr, ArrayRef<VPValue *> StoredValues,
2594 VPValue *BlockInMask) {
2595 Instruction *Instr = Group->getInsertPos();
2596 const DataLayout &DL = Instr->getModule()->getDataLayout();
2597
2598   // Compute the vector type for the interleaved load/store.
2599 Type *ScalarTy = getLoadStoreType(Instr);
2600 unsigned InterleaveFactor = Group->getFactor();
2601 assert(!VF.isScalable() && "scalable vectors not yet supported.");
2602 auto *VecTy = VectorType::get(ScalarTy, VF * InterleaveFactor);
2603
2604   // Prepare the new pointers.
2605 SmallVector<Value *, 2> AddrParts;
2606 unsigned Index = Group->getIndex(Instr);
2607
2608 // TODO: extend the masked interleaved-group support to reversed access.
2609 assert((!BlockInMask || !Group->isReverse()) &&
2610 "Reversed masked interleave-group not supported.");
2611
2612 // If the group is reverse, adjust the index to refer to the last vector lane
2613 // instead of the first. We adjust the index from the first vector lane,
2614 // rather than directly getting the pointer for lane VF - 1, because the
2615 // pointer operand of the interleaved access is supposed to be uniform. For
2616 // uniform instructions, we're only required to generate a value for the
2617 // first vector lane in each unroll iteration.
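  // For example (illustrative numbers only): with VF = 4, a factor-3 group and
  // the insert position being member 1 (lane-0 pointer at A[i+1]), the index
  // becomes 1 + (4 - 1) * 3 = 10, so the GEP of -Index below rewinds the
  // pointer to A[i-9], i.e. member 0 of the lane with the lowest address.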
2618 if (Group->isReverse())
2619 Index += (VF.getKnownMinValue() - 1) * Group->getFactor();
2620
2621 for (unsigned Part = 0; Part < UF; Part++) {
2622 Value *AddrPart = State.get(Addr, VPIteration(Part, 0));
2623 State.setDebugLocFromInst(AddrPart);
2624
2625     // Note that the current instruction could be at any member index. We need to
2626     // adjust the address to the member of index 0.
2627 //
2628 // E.g. a = A[i+1]; // Member of index 1 (Current instruction)
2629 // b = A[i]; // Member of index 0
2630 // Current pointer is pointed to A[i+1], adjust it to A[i].
2631 //
2632 // E.g. A[i+1] = a; // Member of index 1
2633 // A[i] = b; // Member of index 0
2634 // A[i+2] = c; // Member of index 2 (Current instruction)
2635 // Current pointer is pointed to A[i+2], adjust it to A[i].
2636
2637 bool InBounds = false;
2638 if (auto *gep = dyn_cast<GetElementPtrInst>(AddrPart->stripPointerCasts()))
2639 InBounds = gep->isInBounds();
2640 AddrPart = Builder.CreateGEP(ScalarTy, AddrPart, Builder.getInt32(-Index));
2641 cast<GetElementPtrInst>(AddrPart)->setIsInBounds(InBounds);
2642
2643 // Cast to the vector pointer type.
2644 unsigned AddressSpace = AddrPart->getType()->getPointerAddressSpace();
2645 Type *PtrTy = VecTy->getPointerTo(AddressSpace);
2646 AddrParts.push_back(Builder.CreateBitCast(AddrPart, PtrTy));
2647 }
2648
2649 State.setDebugLocFromInst(Instr);
2650 Value *PoisonVec = PoisonValue::get(VecTy);
2651
2652 Value *MaskForGaps = nullptr;
2653 if (Group->requiresScalarEpilogue() && !Cost->isScalarEpilogueAllowed()) {
2654 MaskForGaps = createBitMaskForGaps(Builder, VF.getKnownMinValue(), *Group);
2655 assert(MaskForGaps && "Mask for Gaps is required but it is null");
2656 }
2657
2658 // Vectorize the interleaved load group.
2659 if (isa<LoadInst>(Instr)) {
2660 // For each unroll part, create a wide load for the group.
2661 SmallVector<Value *, 2> NewLoads;
2662 for (unsigned Part = 0; Part < UF; Part++) {
2663 Instruction *NewLoad;
2664 if (BlockInMask || MaskForGaps) {
2665 assert(useMaskedInterleavedAccesses(*TTI) &&
2666 "masked interleaved groups are not allowed.");
2667 Value *GroupMask = MaskForGaps;
2668 if (BlockInMask) {
2669 Value *BlockInMaskPart = State.get(BlockInMask, Part);
2670 Value *ShuffledMask = Builder.CreateShuffleVector(
2671 BlockInMaskPart,
2672 createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()),
2673 "interleaved.mask");
2674 GroupMask = MaskForGaps
2675 ? Builder.CreateBinOp(Instruction::And, ShuffledMask,
2676 MaskForGaps)
2677 : ShuffledMask;
2678 }
2679 NewLoad =
2680 Builder.CreateMaskedLoad(VecTy, AddrParts[Part], Group->getAlign(),
2681 GroupMask, PoisonVec, "wide.masked.vec");
2682 }
2683 else
2684 NewLoad = Builder.CreateAlignedLoad(VecTy, AddrParts[Part],
2685 Group->getAlign(), "wide.vec");
2686 Group->addMetadata(NewLoad);
2687 NewLoads.push_back(NewLoad);
2688 }
2689
2690 // For each member in the group, shuffle out the appropriate data from the
2691 // wide loads.
2692 unsigned J = 0;
2693 for (unsigned I = 0; I < InterleaveFactor; ++I) {
2694 Instruction *Member = Group->getMember(I);
2695
2696 // Skip the gaps in the group.
2697 if (!Member)
2698 continue;
2699
2700 auto StrideMask =
2701 createStrideMask(I, InterleaveFactor, VF.getKnownMinValue());
2702 for (unsigned Part = 0; Part < UF; Part++) {
2703 Value *StridedVec = Builder.CreateShuffleVector(
2704 NewLoads[Part], StrideMask, "strided.vec");
2705
2706         // If this member has a different type, cast the result to that type.
2707 if (Member->getType() != ScalarTy) {
2708 assert(!VF.isScalable() && "VF is assumed to be non scalable.");
2709 VectorType *OtherVTy = VectorType::get(Member->getType(), VF);
2710 StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL);
2711 }
2712
2713 if (Group->isReverse())
2714 StridedVec = Builder.CreateVectorReverse(StridedVec, "reverse");
2715
2716 State.set(VPDefs[J], StridedVec, Part);
2717 }
2718 ++J;
2719 }
2720 return;
2721 }
2722
2723   // The sub-vector type for the current instruction.
2724 auto *SubVT = VectorType::get(ScalarTy, VF);
2725
2726 // Vectorize the interleaved store group.
2727 MaskForGaps = createBitMaskForGaps(Builder, VF.getKnownMinValue(), *Group);
2728 assert((!MaskForGaps || useMaskedInterleavedAccesses(*TTI)) &&
2729 "masked interleaved groups are not allowed.");
2730 assert((!MaskForGaps || !VF.isScalable()) &&
2731 "masking gaps for scalable vectors is not yet supported.");
2732 for (unsigned Part = 0; Part < UF; Part++) {
2733 // Collect the stored vector from each member.
2734 SmallVector<Value *, 4> StoredVecs;
2735 unsigned StoredIdx = 0;
2736 for (unsigned i = 0; i < InterleaveFactor; i++) {
2737 assert((Group->getMember(i) || MaskForGaps) &&
2738 "Fail to get a member from an interleaved store group");
2739 Instruction *Member = Group->getMember(i);
2740
2741 // Skip the gaps in the group.
2742 if (!Member) {
2743 Value *Undef = PoisonValue::get(SubVT);
2744 StoredVecs.push_back(Undef);
2745 continue;
2746 }
2747
2748 Value *StoredVec = State.get(StoredValues[StoredIdx], Part);
2749 ++StoredIdx;
2750
2751 if (Group->isReverse())
2752 StoredVec = Builder.CreateVectorReverse(StoredVec, "reverse");
2753
2754       // If this member has a different type, cast it to a unified type.
2756 if (StoredVec->getType() != SubVT)
2757 StoredVec = createBitOrPointerCast(StoredVec, SubVT, DL);
2758
2759 StoredVecs.push_back(StoredVec);
2760 }
2761
2762 // Concatenate all vectors into a wide vector.
2763 Value *WideVec = concatenateVectors(Builder, StoredVecs);
2764
2765 // Interleave the elements in the wide vector.
2766 Value *IVec = Builder.CreateShuffleVector(
2767 WideVec, createInterleaveMask(VF.getKnownMinValue(), InterleaveFactor),
2768 "interleaved.vec");
2769
2770 Instruction *NewStoreInstr;
2771 if (BlockInMask || MaskForGaps) {
2772 Value *GroupMask = MaskForGaps;
2773 if (BlockInMask) {
2774 Value *BlockInMaskPart = State.get(BlockInMask, Part);
2775 Value *ShuffledMask = Builder.CreateShuffleVector(
2776 BlockInMaskPart,
2777 createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()),
2778 "interleaved.mask");
2779 GroupMask = MaskForGaps ? Builder.CreateBinOp(Instruction::And,
2780 ShuffledMask, MaskForGaps)
2781 : ShuffledMask;
2782 }
2783 NewStoreInstr = Builder.CreateMaskedStore(IVec, AddrParts[Part],
2784 Group->getAlign(), GroupMask);
2785 } else
2786 NewStoreInstr =
2787 Builder.CreateAlignedStore(IVec, AddrParts[Part], Group->getAlign());
2788
2789 Group->addMetadata(NewStoreInstr);
2790 }
2791 }
2792
2793 void InnerLoopVectorizer::scalarizeInstruction(const Instruction *Instr,
2794 VPReplicateRecipe *RepRecipe,
2795 const VPIteration &Instance,
2796 bool IfPredicateInstr,
2797 VPTransformState &State) {
2798 assert(!Instr->getType()->isAggregateType() && "Can't handle vectors");
2799
2800 // llvm.experimental.noalias.scope.decl intrinsics must only be duplicated for
2801 // the first lane and part.
2802 if (isa<NoAliasScopeDeclInst>(Instr))
2803 if (!Instance.isFirstIteration())
2804 return;
2805
2806   // Does this instruction return a value?
2807 bool IsVoidRetTy = Instr->getType()->isVoidTy();
2808
2809 Instruction *Cloned = Instr->clone();
2810 if (!IsVoidRetTy)
2811 Cloned->setName(Instr->getName() + ".cloned");
2812
2813   // If the scalarized instruction contributes to the address computation of a
2814   // widened masked load/store which was in a basic block that needed predication
2815   // and is not predicated after vectorization, we can't propagate
2816   // poison-generating flags (nuw/nsw, exact, inbounds, etc.). The scalarized
2817   // instruction could feed a poison value to the base address of the widened
2818   // load/store.
2819 if (State.MayGeneratePoisonRecipes.contains(RepRecipe))
2820 Cloned->dropPoisonGeneratingFlags();
2821
2822 if (Instr->getDebugLoc())
2823 State.setDebugLocFromInst(Instr);
2824
2825 // Replace the operands of the cloned instructions with their scalar
2826 // equivalents in the new loop.
2827 for (const auto &I : enumerate(RepRecipe->operands())) {
2828 auto InputInstance = Instance;
2829 VPValue *Operand = I.value();
2830 if (vputils::isUniformAfterVectorization(Operand))
2831 InputInstance.Lane = VPLane::getFirstLane();
2832 Cloned->setOperand(I.index(), State.get(Operand, InputInstance));
2833 }
2834 State.addNewMetadata(Cloned, Instr);
2835
2836 // Place the cloned scalar in the new loop.
2837 State.Builder.Insert(Cloned);
2838
2839 State.set(RepRecipe, Cloned, Instance);
2840
2841   // If we just cloned a new assumption, add it to the assumption cache.
2842 if (auto *II = dyn_cast<AssumeInst>(Cloned))
2843 AC->registerAssumption(II);
2844
2845 // End if-block.
2846 if (IfPredicateInstr)
2847 PredicatedInstructions.push_back(Cloned);
2848 }
2849
2850 Value *InnerLoopVectorizer::getOrCreateTripCount(BasicBlock *InsertBlock) {
2851 if (TripCount)
2852 return TripCount;
2853
2854 assert(InsertBlock);
2855 IRBuilder<> Builder(InsertBlock->getTerminator());
2856 // Find the loop boundaries.
2857 Type *IdxTy = Legal->getWidestInductionType();
2858 assert(IdxTy && "No type for induction");
2859 const SCEV *ExitCount = createTripCountSCEV(IdxTy, PSE);
2860
2861 const DataLayout &DL = InsertBlock->getModule()->getDataLayout();
2862
2863 // Expand the trip count and place the new instructions in the preheader.
2864 // Notice that the pre-header does not change, only the loop body.
2865 SCEVExpander Exp(*PSE.getSE(), DL, "induction");
2866
2867 // Count holds the overall loop count (N).
2868 TripCount = Exp.expandCodeFor(ExitCount, ExitCount->getType(),
2869 InsertBlock->getTerminator());
2870
2871 if (TripCount->getType()->isPointerTy())
2872 TripCount =
2873 CastInst::CreatePointerCast(TripCount, IdxTy, "exitcount.ptrcnt.to.int",
2874 InsertBlock->getTerminator());
2875
2876 return TripCount;
2877 }
2878
2879 Value *
2880 InnerLoopVectorizer::getOrCreateVectorTripCount(BasicBlock *InsertBlock) {
2881 if (VectorTripCount)
2882 return VectorTripCount;
2883
2884 Value *TC = getOrCreateTripCount(InsertBlock);
2885 IRBuilder<> Builder(InsertBlock->getTerminator());
2886
2887 Type *Ty = TC->getType();
2888 // This is where we can make the step a runtime constant.
2889 Value *Step = createStepForVF(Builder, Ty, VF, UF);
2890
2891 // If the tail is to be folded by masking, round the number of iterations N
2892 // up to a multiple of Step instead of rounding down. This is done by first
2893 // adding Step-1 and then rounding down. Note that it's ok if this addition
2894 // overflows: the vector induction variable will eventually wrap to zero given
2895 // that it starts at zero and its Step is a power of two; the loop will then
2896 // exit, with the last early-exit vector comparison also producing all-true.
2897 // For scalable vectors the VF is not guaranteed to be a power of 2, but this
2898 // is accounted for in emitIterationCountCheck that adds an overflow check.
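  // Worked example (numbers chosen for illustration): with a trip count N = 10
  // and VF * UF = 4, TC becomes 10 + 3 = 13; the URem/Sub below then give
  // n.mod.vf = 1 and n.vec = 12, i.e. three masked vector iterations covering
  // indices 0..11, with lanes 10 and 11 disabled by the mask in the last one.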
2899 if (Cost->foldTailByMasking()) {
2900 assert(isPowerOf2_32(VF.getKnownMinValue() * UF) &&
2901 "VF*UF must be a power of 2 when folding tail by masking");
2902 Value *NumLanes = getRuntimeVF(Builder, Ty, VF * UF);
2903 TC = Builder.CreateAdd(
2904 TC, Builder.CreateSub(NumLanes, ConstantInt::get(Ty, 1)), "n.rnd.up");
2905 }
2906
2907 // Now we need to generate the expression for the part of the loop that the
2908 // vectorized body will execute. This is equal to N - (N % Step) if scalar
2909 // iterations are not required for correctness, or N - Step, otherwise. Step
2910 // is equal to the vectorization factor (number of SIMD elements) times the
2911 // unroll factor (number of SIMD instructions).
2912 Value *R = Builder.CreateURem(TC, Step, "n.mod.vf");
2913
2914 // There are cases where we *must* run at least one iteration in the remainder
2915 // loop. See the cost model for when this can happen. If the step evenly
2916 // divides the trip count, we set the remainder to be equal to the step. If
2917 // the step does not evenly divide the trip count, no adjustment is necessary
2918 // since there will already be scalar iterations. Note that the minimum
2919 // iterations check ensures that N >= Step.
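  // For instance (illustrative only): with N = 12 and Step = 4, R would be 0,
  // so the select below forces R = 4 and n.vec = 8, leaving four iterations
  // for the scalar epilogue; with N = 14, R = 2 already guarantees scalar
  // iterations and is left unchanged.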
2920 if (Cost->requiresScalarEpilogue(VF)) {
2921 auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0));
2922 R = Builder.CreateSelect(IsZero, Step, R);
2923 }
2924
2925 VectorTripCount = Builder.CreateSub(TC, R, "n.vec");
2926
2927 return VectorTripCount;
2928 }
2929
2930 Value *InnerLoopVectorizer::createBitOrPointerCast(Value *V, VectorType *DstVTy,
2931 const DataLayout &DL) {
2932 // Verify that V is a vector type with same number of elements as DstVTy.
2933 auto *DstFVTy = cast<FixedVectorType>(DstVTy);
2934 unsigned VF = DstFVTy->getNumElements();
2935 auto *SrcVecTy = cast<FixedVectorType>(V->getType());
2936 assert((VF == SrcVecTy->getNumElements()) && "Vector dimensions do not match");
2937 Type *SrcElemTy = SrcVecTy->getElementType();
2938 Type *DstElemTy = DstFVTy->getElementType();
2939 assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) &&
2940 "Vector elements must have same size");
2941
2942 // Do a direct cast if element types are castable.
2943 if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) {
2944 return Builder.CreateBitOrPointerCast(V, DstFVTy);
2945 }
2946 // V cannot be directly casted to desired vector type.
2947 // May happen when V is a floating point vector but DstVTy is a vector of
2948 // pointers or vice-versa. Handle this using a two-step bitcast using an
2949 // intermediate Integer type for the bitcast i.e. Ptr <-> Int <-> Float.
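  // Illustrative sketch (assuming 64-bit pointers in the DataLayout): casting
  // <4 x double> to <4 x i64*> is done as
  //   %tmp = bitcast <4 x double> %v to <4 x i64>
  //   %res = inttoptr <4 x i64> %tmp to <4 x i64*>
  // where the second step is emitted by the final CreateBitOrPointerCast below.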
2950 assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) &&
2951 "Only one type should be a pointer type");
2952 assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) &&
2953 "Only one type should be a floating point type");
2954 Type *IntTy =
2955 IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy));
2956 auto *VecIntTy = FixedVectorType::get(IntTy, VF);
2957 Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy);
2958 return Builder.CreateBitOrPointerCast(CastVal, DstFVTy);
2959 }
2960
2961 void InnerLoopVectorizer::emitIterationCountCheck(BasicBlock *Bypass) {
2962 Value *Count = getOrCreateTripCount(LoopVectorPreHeader);
2963 // Reuse existing vector loop preheader for TC checks.
2964   // Note that a new preheader block is generated for the vector loop.
2965 BasicBlock *const TCCheckBlock = LoopVectorPreHeader;
2966 IRBuilder<> Builder(TCCheckBlock->getTerminator());
2967
2968 // Generate code to check if the loop's trip count is less than VF * UF, or
2969 // equal to it in case a scalar epilogue is required; this implies that the
2970 // vector trip count is zero. This check also covers the case where adding one
2971 // to the backedge-taken count overflowed leading to an incorrect trip count
2972 // of zero. In this case we will also jump to the scalar loop.
2973 auto P = Cost->requiresScalarEpilogue(VF) ? ICmpInst::ICMP_ULE
2974 : ICmpInst::ICMP_ULT;
2975
2976 // If tail is to be folded, vector loop takes care of all iterations.
2977 Type *CountTy = Count->getType();
2978 Value *CheckMinIters = Builder.getFalse();
2979 auto CreateStep = [&]() -> Value * {
2980     // Create the step as max(MinProfitableTripCount, UF * VF).
2981 if (UF * VF.getKnownMinValue() >= MinProfitableTripCount.getKnownMinValue())
2982 return createStepForVF(Builder, CountTy, VF, UF);
2983
2984 Value *MinProfTC =
2985 createStepForVF(Builder, CountTy, MinProfitableTripCount, 1);
2986 if (!VF.isScalable())
2987 return MinProfTC;
2988 return Builder.CreateBinaryIntrinsic(
2989 Intrinsic::umax, MinProfTC, createStepForVF(Builder, CountTy, VF, UF));
2990 };
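  // For example (illustrative values): with VF = vscale x 2, UF = 2 and
  // MinProfitableTripCount = 16, the known minimum step (4) is below 16, so
  // the lambda above emits umax(16, vscale * 4); with a fixed VF of 8 and
  // UF = 2, the 16-lane step is returned directly.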
2991
2992 if (!Cost->foldTailByMasking())
2993 CheckMinIters =
2994 Builder.CreateICmp(P, Count, CreateStep(), "min.iters.check");
2995 else if (VF.isScalable()) {
2996 // vscale is not necessarily a power-of-2, which means we cannot guarantee
2997 // an overflow to zero when updating induction variables and so an
2998 // additional overflow check is required before entering the vector loop.
2999
3000 // Get the maximum unsigned value for the type.
3001 Value *MaxUIntTripCount =
3002 ConstantInt::get(CountTy, cast<IntegerType>(CountTy)->getMask());
3003 Value *LHS = Builder.CreateSub(MaxUIntTripCount, Count);
3004
3005 // Don't execute the vector loop if (UMax - n) < (VF * UF).
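    // For example, with an illustrative i8 trip count: UMax = 255, n = 253 and
    // VF * UF = 8 gives 255 - 253 = 2 < 8, so CheckMinIters is true and the
    // scalar loop is taken instead of risking the rounded-up count wrapping.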
3006 CheckMinIters = Builder.CreateICmp(ICmpInst::ICMP_ULT, LHS, CreateStep());
3007 }
3008
3009 // Create new preheader for vector loop.
3010 LoopVectorPreHeader =
3011 SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), DT, LI, nullptr,
3012 "vector.ph");
3013
3014 assert(DT->properlyDominates(DT->getNode(TCCheckBlock),
3015 DT->getNode(Bypass)->getIDom()) &&
3016 "TC check is expected to dominate Bypass");
3017
3018 // Update dominator for Bypass & LoopExit (if needed).
3019 DT->changeImmediateDominator(Bypass, TCCheckBlock);
3020 if (!Cost->requiresScalarEpilogue(VF))
3021 // If there is an epilogue which must run, there's no edge from the
3022 // middle block to exit blocks and thus no need to update the immediate
3023 // dominator of the exit blocks.
3024 DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock);
3025
3026 ReplaceInstWithInst(
3027 TCCheckBlock->getTerminator(),
3028 BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters));
3029 LoopBypassBlocks.push_back(TCCheckBlock);
3030 }
3031
3032 BasicBlock *InnerLoopVectorizer::emitSCEVChecks(BasicBlock *Bypass) {
3033 BasicBlock *const SCEVCheckBlock =
3034 RTChecks.emitSCEVChecks(Bypass, LoopVectorPreHeader, LoopExitBlock);
3035 if (!SCEVCheckBlock)
3036 return nullptr;
3037
3038 assert(!(SCEVCheckBlock->getParent()->hasOptSize() ||
3039 (OptForSizeBasedOnProfile &&
3040 Cost->Hints->getForce() != LoopVectorizeHints::FK_Enabled)) &&
3041 "Cannot SCEV check stride or overflow when optimizing for size");
3042
3044   // Update the dominator only if this is the first RT check.
3045 if (LoopBypassBlocks.empty()) {
3046 DT->changeImmediateDominator(Bypass, SCEVCheckBlock);
3047 if (!Cost->requiresScalarEpilogue(VF))
3048 // If there is an epilogue which must run, there's no edge from the
3049 // middle block to exit blocks and thus no need to update the immediate
3050 // dominator of the exit blocks.
3051 DT->changeImmediateDominator(LoopExitBlock, SCEVCheckBlock);
3052 }
3053
3054 LoopBypassBlocks.push_back(SCEVCheckBlock);
3055 AddedSafetyChecks = true;
3056 return SCEVCheckBlock;
3057 }
3058
3059 BasicBlock *InnerLoopVectorizer::emitMemRuntimeChecks(BasicBlock *Bypass) {
3060 // VPlan-native path does not do any analysis for runtime checks currently.
3061 if (EnableVPlanNativePath)
3062 return nullptr;
3063
3064 BasicBlock *const MemCheckBlock =
3065 RTChecks.emitMemRuntimeChecks(Bypass, LoopVectorPreHeader);
3066
3067   // Check if we generated code that checks at runtime whether arrays overlap. We put
3068 // the checks into a separate block to make the more common case of few
3069 // elements faster.
3070 if (!MemCheckBlock)
3071 return nullptr;
3072
3073 if (MemCheckBlock->getParent()->hasOptSize() || OptForSizeBasedOnProfile) {
3074 assert(Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled &&
3075 "Cannot emit memory checks when optimizing for size, unless forced "
3076 "to vectorize.");
3077 ORE->emit([&]() {
3078 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationCodeSize",
3079 OrigLoop->getStartLoc(),
3080 OrigLoop->getHeader())
3081 << "Code-size may be reduced by not forcing "
3082 "vectorization, or by source-code modifications "
3083 "eliminating the need for runtime checks "
3084 "(e.g., adding 'restrict').";
3085 });
3086 }
3087
3088 LoopBypassBlocks.push_back(MemCheckBlock);
3089
3090 AddedSafetyChecks = true;
3091
3092 return MemCheckBlock;
3093 }
3094
3095 void InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) {
3096 LoopScalarBody = OrigLoop->getHeader();
3097 LoopVectorPreHeader = OrigLoop->getLoopPreheader();
3098 assert(LoopVectorPreHeader && "Invalid loop structure");
3099 LoopExitBlock = OrigLoop->getUniqueExitBlock(); // may be nullptr
3100 assert((LoopExitBlock || Cost->requiresScalarEpilogue(VF)) &&
3101 "multiple exit loop without required epilogue?");
3102
3103 LoopMiddleBlock =
3104 SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
3105 LI, nullptr, Twine(Prefix) + "middle.block");
3106 LoopScalarPreHeader =
3107 SplitBlock(LoopMiddleBlock, LoopMiddleBlock->getTerminator(), DT, LI,
3108 nullptr, Twine(Prefix) + "scalar.ph");
3109
3110 auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator();
3111
3112 // Set up the middle block terminator. Two cases:
3113 // 1) If we know that we must execute the scalar epilogue, emit an
3114 // unconditional branch.
3115 // 2) Otherwise, we must have a single unique exit block (due to how we
3116 // implement the multiple exit case). In this case, set up a conditional
3117 // branch from the middle block to the loop scalar preheader, and the
3118 // exit block. completeLoopSkeleton will update the condition to use an
3119 // iteration check, if required to decide whether to execute the remainder.
3120 BranchInst *BrInst = Cost->requiresScalarEpilogue(VF) ?
3121 BranchInst::Create(LoopScalarPreHeader) :
3122 BranchInst::Create(LoopExitBlock, LoopScalarPreHeader,
3123 Builder.getTrue());
3124 BrInst->setDebugLoc(ScalarLatchTerm->getDebugLoc());
3125 ReplaceInstWithInst(LoopMiddleBlock->getTerminator(), BrInst);
3126
3127 // Update dominator for loop exit. During skeleton creation, only the vector
3128 // pre-header and the middle block are created. The vector loop is entirely
3129   // created during VPlan execution.
3130 if (!Cost->requiresScalarEpilogue(VF))
3131 // If there is an epilogue which must run, there's no edge from the
3132 // middle block to exit blocks and thus no need to update the immediate
3133 // dominator of the exit blocks.
3134 DT->changeImmediateDominator(LoopExitBlock, LoopMiddleBlock);
3135 }
3136
3137 PHINode *InnerLoopVectorizer::createInductionResumeValue(
3138 PHINode *OrigPhi, const InductionDescriptor &II,
3139 ArrayRef<BasicBlock *> BypassBlocks,
3140 std::pair<BasicBlock *, Value *> AdditionalBypass) {
3141 Value *VectorTripCount = getOrCreateVectorTripCount(LoopVectorPreHeader);
3142 assert(VectorTripCount && "Expected valid arguments");
3143
3144 Instruction *OldInduction = Legal->getPrimaryInduction();
3145 Value *&EndValue = IVEndValues[OrigPhi];
3146 Value *EndValueFromAdditionalBypass = AdditionalBypass.second;
3147 if (OrigPhi == OldInduction) {
3148 // We know what the end value is.
3149 EndValue = VectorTripCount;
3150 } else {
3151 IRBuilder<> B(LoopVectorPreHeader->getTerminator());
3152
3153 // Fast-math-flags propagate from the original induction instruction.
3154 if (II.getInductionBinOp() && isa<FPMathOperator>(II.getInductionBinOp()))
3155 B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags());
3156
3157 Value *Step =
3158 CreateStepValue(II.getStep(), *PSE.getSE(), &*B.GetInsertPoint());
3159 EndValue =
3160 emitTransformedIndex(B, VectorTripCount, II.getStartValue(), Step, II);
3161 EndValue->setName("ind.end");
3162
3163 // Compute the end value for the additional bypass (if applicable).
3164 if (AdditionalBypass.first) {
3165 B.SetInsertPoint(&(*AdditionalBypass.first->getFirstInsertionPt()));
3166 Value *Step =
3167 CreateStepValue(II.getStep(), *PSE.getSE(), &*B.GetInsertPoint());
3168 EndValueFromAdditionalBypass = emitTransformedIndex(
3169 B, AdditionalBypass.second, II.getStartValue(), Step, II);
3170 EndValueFromAdditionalBypass->setName("ind.end");
3171 }
3172 }
3173
3174 // Create phi nodes to merge from the backedge-taken check block.
3175 PHINode *BCResumeVal = PHINode::Create(OrigPhi->getType(), 3, "bc.resume.val",
3176 LoopScalarPreHeader->getTerminator());
3177 // Copy original phi DL over to the new one.
3178 BCResumeVal->setDebugLoc(OrigPhi->getDebugLoc());
3179
3180 // The new PHI merges the original incoming value, in case of a bypass,
3181 // or the value at the end of the vectorized loop.
3182 BCResumeVal->addIncoming(EndValue, LoopMiddleBlock);
3183
3184 // Fix the scalar body counter (PHI node).
3185 // The old induction's phi node in the scalar body needs the truncated
3186 // value.
3187 for (BasicBlock *BB : BypassBlocks)
3188 BCResumeVal->addIncoming(II.getStartValue(), BB);
3189
3190 if (AdditionalBypass.first)
3191 BCResumeVal->setIncomingValueForBlock(AdditionalBypass.first,
3192 EndValueFromAdditionalBypass);
3193 return BCResumeVal;
3194 }
3195
3196 void InnerLoopVectorizer::createInductionResumeValues(
3197 std::pair<BasicBlock *, Value *> AdditionalBypass) {
3198 assert(((AdditionalBypass.first && AdditionalBypass.second) ||
3199 (!AdditionalBypass.first && !AdditionalBypass.second)) &&
3200 "Inconsistent information about additional bypass.");
3201 // We are going to resume the execution of the scalar loop.
3202 // Go over all of the induction variables that we found and fix the
3203 // PHIs that are left in the scalar version of the loop.
3204 // The starting values of PHI nodes depend on the counter of the last
3205 // iteration in the vectorized loop.
3206 // If we come from a bypass edge then we need to start from the original
3207 // start value.
3208 for (const auto &InductionEntry : Legal->getInductionVars()) {
3209 PHINode *OrigPhi = InductionEntry.first;
3210 const InductionDescriptor &II = InductionEntry.second;
3211 PHINode *BCResumeVal = createInductionResumeValue(
3212 OrigPhi, II, LoopBypassBlocks, AdditionalBypass);
3213 OrigPhi->setIncomingValueForBlock(LoopScalarPreHeader, BCResumeVal);
3214 }
3215 }
3216
3217 BasicBlock *InnerLoopVectorizer::completeLoopSkeleton() {
3218 // The trip counts should be cached by now.
3219 Value *Count = getOrCreateTripCount(LoopVectorPreHeader);
3220 Value *VectorTripCount = getOrCreateVectorTripCount(LoopVectorPreHeader);
3221
3222 auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator();
3223
3224 // Add a check in the middle block to see if we have completed
3225 // all of the iterations in the first vector loop. Three cases:
3226 // 1) If we require a scalar epilogue, there is no conditional branch as
3227 // we unconditionally branch to the scalar preheader. Do nothing.
3228 // 2) If (N - N%VF) == N, then we *don't* need to run the remainder.
3229 // Thus if tail is to be folded, we know we don't need to run the
3230 // remainder and we can use the previous value for the condition (true).
3231 // 3) Otherwise, construct a runtime check.
3232 if (!Cost->requiresScalarEpilogue(VF) && !Cost->foldTailByMasking()) {
3233 Instruction *CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ,
3234 Count, VectorTripCount, "cmp.n",
3235 LoopMiddleBlock->getTerminator());
3236
3237 // Here we use the same DebugLoc as the scalar loop latch terminator instead
3238 // of the corresponding compare because they may have ended up with
3239 // different line numbers and we want to avoid awkward line stepping while
3240     // debugging. E.g. if the compare has a line number inside the loop.
3241 CmpN->setDebugLoc(ScalarLatchTerm->getDebugLoc());
3242 cast<BranchInst>(LoopMiddleBlock->getTerminator())->setCondition(CmpN);
3243 }
3244
3245 #ifdef EXPENSIVE_CHECKS
3246 assert(DT->verify(DominatorTree::VerificationLevel::Fast));
3247 #endif
3248
3249 return LoopVectorPreHeader;
3250 }
3251
3252 std::pair<BasicBlock *, Value *>
3253 InnerLoopVectorizer::createVectorizedLoopSkeleton() {
3254 /*
3255 In this function we generate a new loop. The new loop will contain
3256 the vectorized instructions while the old loop will continue to run the
3257 scalar remainder.
3258
3259 [ ] <-- loop iteration number check.
3260 / |
3261 / v
3262 | [ ] <-- vector loop bypass (may consist of multiple blocks).
3263 | / |
3264 | / v
3265 || [ ] <-- vector pre header.
3266 |/ |
3267 | v
3268 | [ ] \
3269 | [ ]_| <-- vector loop (created during VPlan execution).
3270 | |
3271 | v
3272 \ -[ ] <--- middle-block.
3273 \/ |
3274 /\ v
3275 | ->[ ] <--- new preheader.
3276 | |
3277 (opt) v <-- edge from middle to exit iff epilogue is not required.
3278 | [ ] \
3279 | [ ]_| <-- old scalar loop to handle remainder (scalar epilogue).
3280 \ |
3281 \ v
3282 >[ ] <-- exit block(s).
3283 ...
3284 */
3285
3286 // Create an empty vector loop, and prepare basic blocks for the runtime
3287 // checks.
3288 createVectorLoopSkeleton("");
3289
3290 // Now, compare the new count to zero. If it is zero skip the vector loop and
3291 // jump to the scalar loop. This check also covers the case where the
3292 // backedge-taken count is uint##_max: adding one to it will overflow leading
3293 // to an incorrect trip count of zero. In this (rare) case we will also jump
3294 // to the scalar loop.
3295 emitIterationCountCheck(LoopScalarPreHeader);
3296
3297 // Generate the code to check any assumptions that we've made for SCEV
3298 // expressions.
3299 emitSCEVChecks(LoopScalarPreHeader);
3300
3301   // Generate the code that checks at runtime whether arrays overlap. We put the
3302 // checks into a separate block to make the more common case of few elements
3303 // faster.
3304 emitMemRuntimeChecks(LoopScalarPreHeader);
3305
3306 // Emit phis for the new starting index of the scalar loop.
3307 createInductionResumeValues();
3308
3309 return {completeLoopSkeleton(), nullptr};
3310 }
3311
3312 // Fix up external users of the induction variable. At this point, we are
3313 // in LCSSA form, with all external PHIs that use the IV having one input value,
3314 // coming from the remainder loop. We need those PHIs to also have a correct
3315 // value for the IV when arriving directly from the middle block.
3316 void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi,
3317 const InductionDescriptor &II,
3318 Value *VectorTripCount, Value *EndValue,
3319 BasicBlock *MiddleBlock,
3320 BasicBlock *VectorHeader, VPlan &Plan) {
3321 // There are two kinds of external IV usages - those that use the value
3322 // computed in the last iteration (the PHI) and those that use the penultimate
3323 // value (the value that feeds into the phi from the loop latch).
3324 // We allow both, but they, obviously, have different values.
3325
3326 assert(OrigLoop->getUniqueExitBlock() && "Expected a single exit block");
3327
3328 DenseMap<Value *, Value *> MissingVals;
3329
3330 // An external user of the last iteration's value should see the value that
3331 // the remainder loop uses to initialize its own IV.
3332 Value *PostInc = OrigPhi->getIncomingValueForBlock(OrigLoop->getLoopLatch());
3333 for (User *U : PostInc->users()) {
3334 Instruction *UI = cast<Instruction>(U);
3335 if (!OrigLoop->contains(UI)) {
3336 assert(isa<PHINode>(UI) && "Expected LCSSA form");
3337 MissingVals[UI] = EndValue;
3338 }
3339 }
3340
3341   // An external user of the penultimate value needs to see EndValue - Step.
3342 // The simplest way to get this is to recompute it from the constituent SCEVs,
3343 // that is Start + (Step * (CRD - 1)).
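  // As a concrete (made-up) example: with Start = 0, Step = 2 and a vector
  // trip count of 8, EndValue is 16 and the escape value computed below is
  // 0 + 2 * (8 - 1) = 14, the value the phi held during the last iteration
  // covered by the vector loop.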
3344 for (User *U : OrigPhi->users()) {
3345 auto *UI = cast<Instruction>(U);
3346 if (!OrigLoop->contains(UI)) {
3347 assert(isa<PHINode>(UI) && "Expected LCSSA form");
3348
3349 IRBuilder<> B(MiddleBlock->getTerminator());
3350
3351 // Fast-math-flags propagate from the original induction instruction.
3352 if (II.getInductionBinOp() && isa<FPMathOperator>(II.getInductionBinOp()))
3353 B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags());
3354
3355 Value *CountMinusOne = B.CreateSub(
3356 VectorTripCount, ConstantInt::get(VectorTripCount->getType(), 1));
3357 CountMinusOne->setName("cmo");
3358 Value *Step = CreateStepValue(II.getStep(), *PSE.getSE(),
3359 VectorHeader->getTerminator());
3360 Value *Escape =
3361 emitTransformedIndex(B, CountMinusOne, II.getStartValue(), Step, II);
3362 Escape->setName("ind.escape");
3363 MissingVals[UI] = Escape;
3364 }
3365 }
3366
3367 for (auto &I : MissingVals) {
3368 PHINode *PHI = cast<PHINode>(I.first);
3369     // One corner case we have to handle is two IVs "chasing" each other,
3370 // that is %IV2 = phi [...], [ %IV1, %latch ]
3371 // In this case, if IV1 has an external use, we need to avoid adding both
3372 // "last value of IV1" and "penultimate value of IV2". So, verify that we
3373 // don't already have an incoming value for the middle block.
3374 if (PHI->getBasicBlockIndex(MiddleBlock) == -1) {
3375 PHI->addIncoming(I.second, MiddleBlock);
3376 Plan.removeLiveOut(PHI);
3377 }
3378 }
3379 }
3380
3381 namespace {
3382
3383 struct CSEDenseMapInfo {
3384   static bool canHandle(const Instruction *I) {
3385 return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) ||
3386 isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I);
3387 }
3388
3389   static inline Instruction *getEmptyKey() {
3390 return DenseMapInfo<Instruction *>::getEmptyKey();
3391 }
3392
3393   static inline Instruction *getTombstoneKey() {
3394 return DenseMapInfo<Instruction *>::getTombstoneKey();
3395 }
3396
3397   static unsigned getHashValue(const Instruction *I) {
3398 assert(canHandle(I) && "Unknown instruction!");
3399 return hash_combine(I->getOpcode(), hash_combine_range(I->value_op_begin(),
3400 I->value_op_end()));
3401 }
3402
3403   static bool isEqual(const Instruction *LHS, const Instruction *RHS) {
3404 if (LHS == getEmptyKey() || RHS == getEmptyKey() ||
3405 LHS == getTombstoneKey() || RHS == getTombstoneKey())
3406 return LHS == RHS;
3407 return LHS->isIdenticalTo(RHS);
3408 }
3409 };
3410
3411 } // end anonymous namespace
3412
3413 /// Perform CSE of induction variable instructions.
3414 static void cse(BasicBlock *BB) {
3415   // Perform simple CSE.
3416 SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap;
3417 for (Instruction &In : llvm::make_early_inc_range(*BB)) {
3418 if (!CSEDenseMapInfo::canHandle(&In))
3419 continue;
3420
3421 // Check if we can replace this instruction with any of the
3422 // visited instructions.
3423 if (Instruction *V = CSEMap.lookup(&In)) {
3424 In.replaceAllUsesWith(V);
3425 In.eraseFromParent();
3426 continue;
3427 }
3428
3429 CSEMap[&In] = &In;
3430 }
3431 }
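// A hypothetical example of what cse() catches: if the vector loop header ends
// up with two identical instructions such as
//   %e1 = extractelement <4 x i32> %v, i32 0
//   %e2 = extractelement <4 x i32> %v, i32 0
// the second one is replaced by the first and erased.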
3432
3433 InstructionCost
3434 LoopVectorizationCostModel::getVectorCallCost(CallInst *CI, ElementCount VF,
3435 bool &NeedToScalarize) const {
3436 Function *F = CI->getCalledFunction();
3437 Type *ScalarRetTy = CI->getType();
3438 SmallVector<Type *, 4> Tys, ScalarTys;
3439 for (auto &ArgOp : CI->args())
3440 ScalarTys.push_back(ArgOp->getType());
3441
3442 // Estimate cost of scalarized vector call. The source operands are assumed
3443   // to be vectors, so we need to extract individual elements from them,
3444 // execute VF scalar calls, and then gather the result into the vector return
3445 // value.
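  // Illustrative numbers only: with VF = 4, a scalar call cost of 10 and a
  // scalarization overhead of 6, the scalarized estimate is 4 * 10 + 6 = 46;
  // if a vector variant of the callee exists and costs 20, NeedToScalarize is
  // cleared and 20 is returned instead.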
3446 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
3447 InstructionCost ScalarCallCost =
3448 TTI.getCallInstrCost(F, ScalarRetTy, ScalarTys, CostKind);
3449 if (VF.isScalar())
3450 return ScalarCallCost;
3451
3452 // Compute corresponding vector type for return value and arguments.
3453 Type *RetTy = ToVectorTy(ScalarRetTy, VF);
3454 for (Type *ScalarTy : ScalarTys)
3455 Tys.push_back(ToVectorTy(ScalarTy, VF));
3456
3457 // Compute costs of unpacking argument values for the scalar calls and
3458 // packing the return values to a vector.
3459 InstructionCost ScalarizationCost =
3460 getScalarizationOverhead(CI, VF, CostKind);
3461
3462 InstructionCost Cost =
3463 ScalarCallCost * VF.getKnownMinValue() + ScalarizationCost;
3464
3465 // If we can't emit a vector call for this function, then the currently found
3466 // cost is the cost we need to return.
3467 NeedToScalarize = true;
3468 VFShape Shape = VFShape::get(*CI, VF, false /*HasGlobalPred*/);
3469 Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);
3470
3471 if (!TLI || CI->isNoBuiltin() || !VecFunc)
3472 return Cost;
3473
3474 // If the corresponding vector cost is cheaper, return its cost.
3475 InstructionCost VectorCallCost =
3476 TTI.getCallInstrCost(nullptr, RetTy, Tys, CostKind);
3477 if (VectorCallCost < Cost) {
3478 NeedToScalarize = false;
3479 Cost = VectorCallCost;
3480 }
3481 return Cost;
3482 }
3483
3484 static Type *MaybeVectorizeType(Type *Elt, ElementCount VF) {
3485 if (VF.isScalar() || (!Elt->isIntOrPtrTy() && !Elt->isFloatingPointTy()))
3486 return Elt;
3487 return VectorType::get(Elt, VF);
3488 }
3489
3490 InstructionCost
3491 LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI,
3492 ElementCount VF) const {
3493 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
3494 assert(ID && "Expected intrinsic call!");
3495 Type *RetTy = MaybeVectorizeType(CI->getType(), VF);
3496 FastMathFlags FMF;
3497 if (auto *FPMO = dyn_cast<FPMathOperator>(CI))
3498 FMF = FPMO->getFastMathFlags();
3499
3500 SmallVector<const Value *> Arguments(CI->args());
3501 FunctionType *FTy = CI->getCalledFunction()->getFunctionType();
3502 SmallVector<Type *> ParamTys;
3503 std::transform(FTy->param_begin(), FTy->param_end(),
3504 std::back_inserter(ParamTys),
3505 [&](Type *Ty) { return MaybeVectorizeType(Ty, VF); });
3506
3507 IntrinsicCostAttributes CostAttrs(ID, RetTy, Arguments, ParamTys, FMF,
3508 dyn_cast<IntrinsicInst>(CI));
3509 return TTI.getIntrinsicInstrCost(CostAttrs,
3510 TargetTransformInfo::TCK_RecipThroughput);
3511 }
3512
3513 static Type *smallestIntegerVectorType(Type *T1, Type *T2) {
3514 auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType());
3515 auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType());
3516 return I1->getBitWidth() < I2->getBitWidth() ? T1 : T2;
3517 }
3518
3519 static Type *largestIntegerVectorType(Type *T1, Type *T2) {
3520 auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType());
3521 auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType());
3522 return I1->getBitWidth() > I2->getBitWidth() ? T1 : T2;
3523 }
3524
3525 void InnerLoopVectorizer::truncateToMinimalBitwidths(VPTransformState &State) {
3526 // For every instruction `I` in MinBWs, truncate the operands, create a
3527 // truncated version of `I` and reextend its result. InstCombine runs
3528 // later and will remove any ext/trunc pairs.
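  // Rough sketch of the rewrite (illustrative types): if MinBWs records that a
  // 32-bit value only needs 8 bits and VF = 4, then
  //   %a = add <4 x i32> %x, %y
  // becomes, modulo naming,
  //   %x.tr = trunc <4 x i32> %x to <4 x i8>
  //   %y.tr = trunc <4 x i32> %y to <4 x i8>
  //   %a.tr = add <4 x i8> %x.tr, %y.tr
  //   %a    = zext <4 x i8> %a.tr to <4 x i32>
  // and InstCombine later removes redundant ext/trunc pairs.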
3529 SmallPtrSet<Value *, 4> Erased;
3530 for (const auto &KV : Cost->getMinimalBitwidths()) {
3531 // If the value wasn't vectorized, we must maintain the original scalar
3532 // type. The absence of the value from State indicates that it
3533 // wasn't vectorized.
3534 // FIXME: Should not rely on getVPValue at this point.
3535 VPValue *Def = State.Plan->getVPValue(KV.first, true);
3536 if (!State.hasAnyVectorValue(Def))
3537 continue;
3538 for (unsigned Part = 0; Part < UF; ++Part) {
3539 Value *I = State.get(Def, Part);
3540 if (Erased.count(I) || I->use_empty() || !isa<Instruction>(I))
3541 continue;
3542 Type *OriginalTy = I->getType();
3543 Type *ScalarTruncatedTy =
3544 IntegerType::get(OriginalTy->getContext(), KV.second);
3545 auto *TruncatedTy = VectorType::get(
3546 ScalarTruncatedTy, cast<VectorType>(OriginalTy)->getElementCount());
3547 if (TruncatedTy == OriginalTy)
3548 continue;
3549
3550 IRBuilder<> B(cast<Instruction>(I));
3551 auto ShrinkOperand = [&](Value *V) -> Value * {
3552 if (auto *ZI = dyn_cast<ZExtInst>(V))
3553 if (ZI->getSrcTy() == TruncatedTy)
3554 return ZI->getOperand(0);
3555 return B.CreateZExtOrTrunc(V, TruncatedTy);
3556 };
3557
3558 // The actual instruction modification depends on the instruction type,
3559 // unfortunately.
3560 Value *NewI = nullptr;
3561 if (auto *BO = dyn_cast<BinaryOperator>(I)) {
3562 NewI = B.CreateBinOp(BO->getOpcode(), ShrinkOperand(BO->getOperand(0)),
3563 ShrinkOperand(BO->getOperand(1)));
3564
3565 // Any wrapping introduced by shrinking this operation shouldn't be
3566 // considered undefined behavior. So, we can't unconditionally copy
3567 // arithmetic wrapping flags to NewI.
3568 cast<BinaryOperator>(NewI)->copyIRFlags(I, /*IncludeWrapFlags=*/false);
3569 } else if (auto *CI = dyn_cast<ICmpInst>(I)) {
3570 NewI =
3571 B.CreateICmp(CI->getPredicate(), ShrinkOperand(CI->getOperand(0)),
3572 ShrinkOperand(CI->getOperand(1)));
3573 } else if (auto *SI = dyn_cast<SelectInst>(I)) {
3574 NewI = B.CreateSelect(SI->getCondition(),
3575 ShrinkOperand(SI->getTrueValue()),
3576 ShrinkOperand(SI->getFalseValue()));
3577 } else if (auto *CI = dyn_cast<CastInst>(I)) {
3578 switch (CI->getOpcode()) {
3579 default:
3580 llvm_unreachable("Unhandled cast!");
3581 case Instruction::Trunc:
3582 NewI = ShrinkOperand(CI->getOperand(0));
3583 break;
3584 case Instruction::SExt:
3585 NewI = B.CreateSExtOrTrunc(
3586 CI->getOperand(0),
3587 smallestIntegerVectorType(OriginalTy, TruncatedTy));
3588 break;
3589 case Instruction::ZExt:
3590 NewI = B.CreateZExtOrTrunc(
3591 CI->getOperand(0),
3592 smallestIntegerVectorType(OriginalTy, TruncatedTy));
3593 break;
3594 }
3595 } else if (auto *SI = dyn_cast<ShuffleVectorInst>(I)) {
3596 auto Elements0 =
3597 cast<VectorType>(SI->getOperand(0)->getType())->getElementCount();
3598 auto *O0 = B.CreateZExtOrTrunc(
3599 SI->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements0));
3600 auto Elements1 =
3601 cast<VectorType>(SI->getOperand(1)->getType())->getElementCount();
3602 auto *O1 = B.CreateZExtOrTrunc(
3603 SI->getOperand(1), VectorType::get(ScalarTruncatedTy, Elements1));
3604
3605 NewI = B.CreateShuffleVector(O0, O1, SI->getShuffleMask());
3606 } else if (isa<LoadInst>(I) || isa<PHINode>(I)) {
3607 // Don't do anything with the operands, just extend the result.
3608 continue;
3609 } else if (auto *IE = dyn_cast<InsertElementInst>(I)) {
3610 auto Elements =
3611 cast<VectorType>(IE->getOperand(0)->getType())->getElementCount();
3612 auto *O0 = B.CreateZExtOrTrunc(
3613 IE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements));
3614 auto *O1 = B.CreateZExtOrTrunc(IE->getOperand(1), ScalarTruncatedTy);
3615 NewI = B.CreateInsertElement(O0, O1, IE->getOperand(2));
3616 } else if (auto *EE = dyn_cast<ExtractElementInst>(I)) {
3617 auto Elements =
3618 cast<VectorType>(EE->getOperand(0)->getType())->getElementCount();
3619 auto *O0 = B.CreateZExtOrTrunc(
3620 EE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements));
3621 NewI = B.CreateExtractElement(O0, EE->getOperand(2));
3622 } else {
3623 // If we don't know what to do, be conservative and don't do anything.
3624 continue;
3625 }
3626
3627 // Lastly, extend the result.
3628 NewI->takeName(cast<Instruction>(I));
3629 Value *Res = B.CreateZExtOrTrunc(NewI, OriginalTy);
3630 I->replaceAllUsesWith(Res);
3631 cast<Instruction>(I)->eraseFromParent();
3632 Erased.insert(I);
3633 State.reset(Def, Res, Part);
3634 }
3635 }
3636
3637 // We'll have created a bunch of ZExts that are now parentless. Clean up.
3638 for (const auto &KV : Cost->getMinimalBitwidths()) {
3639 // If the value wasn't vectorized, we must maintain the original scalar
3640 // type. The absence of the value from State indicates that it
3641 // wasn't vectorized.
3642 // FIXME: Should not rely on getVPValue at this point.
3643 VPValue *Def = State.Plan->getVPValue(KV.first, true);
3644 if (!State.hasAnyVectorValue(Def))
3645 continue;
3646 for (unsigned Part = 0; Part < UF; ++Part) {
3647 Value *I = State.get(Def, Part);
3648 ZExtInst *Inst = dyn_cast<ZExtInst>(I);
3649 if (Inst && Inst->use_empty()) {
3650 Value *NewI = Inst->getOperand(0);
3651 Inst->eraseFromParent();
3652 State.reset(Def, NewI, Part);
3653 }
3654 }
3655 }
3656 }
3657
3658 void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State,
3659 VPlan &Plan) {
3660 // Insert truncates and extends for any truncated instructions as hints to
3661 // InstCombine.
3662 if (VF.isVector())
3663 truncateToMinimalBitwidths(State);
3664
3665 // Fix widened non-induction PHIs by setting up the PHI operands.
3666 if (EnableVPlanNativePath)
3667 fixNonInductionPHIs(Plan, State);
3668
3669 // At this point every instruction in the original loop is widened to a
3670 // vector form. Now we need to fix the recurrences in the loop. These PHI
3671 // nodes are currently empty because we did not want to introduce cycles.
3672 // This is the second stage of vectorizing recurrences.
3673 fixCrossIterationPHIs(State);
3674
3675 // Forget the original basic block.
3676 PSE.getSE()->forgetLoop(OrigLoop);
3677
3678 VPBasicBlock *LatchVPBB = Plan.getVectorLoopRegion()->getExitingBasicBlock();
3679 Loop *VectorLoop = LI->getLoopFor(State.CFG.VPBB2IRBB[LatchVPBB]);
3680 if (Cost->requiresScalarEpilogue(VF)) {
3681     // No edge from the middle block to the unique exit block has been inserted
3682     // and there is nothing to fix from the vector loop; phis should only have an
3683     // incoming value from the scalar loop.
3684 Plan.clearLiveOuts();
3685 } else {
3686 // If we inserted an edge from the middle block to the unique exit block,
3687 // update uses outside the loop (phis) to account for the newly inserted
3688 // edge.
3689
3690 // Fix-up external users of the induction variables.
3691 for (const auto &Entry : Legal->getInductionVars())
3692 fixupIVUsers(Entry.first, Entry.second,
3693 getOrCreateVectorTripCount(VectorLoop->getLoopPreheader()),
3694 IVEndValues[Entry.first], LoopMiddleBlock,
3695 VectorLoop->getHeader(), Plan);
3696 }
3697
3698 // Fix LCSSA phis not already fixed earlier. Extracts may need to be generated
3699 // in the exit block, so update the builder.
3700 State.Builder.SetInsertPoint(State.CFG.ExitBB->getFirstNonPHI());
3701 for (const auto &KV : Plan.getLiveOuts())
3702 KV.second->fixPhi(Plan, State);
3703
3704 for (Instruction *PI : PredicatedInstructions)
3705 sinkScalarOperands(&*PI);
3706
3707 // Remove redundant induction instructions.
3708 cse(VectorLoop->getHeader());
3709
3710 // Set/update profile weights for the vector and remainder loops as original
3711 // loop iterations are now distributed among them. Note that original loop
3712 // represented by LoopScalarBody becomes remainder loop after vectorization.
3713 //
3714   // For cases like foldTailByMasking() and requiresScalarEpilogue() we may
3715   // end up with a slightly less accurate result, but that should be OK since
3716   // profile info is not inherently precise anyway. Note also that a possible
3717   // bypass of the vector code caused by legality checks is ignored, optimistically
3718   // assigning all the weight to the vector loop.
3719 //
3720   // For scalable vectorization we can't know at compile time how many
3721   // iterations of the loop are handled in one vector iteration, so instead
3722   // assume a pessimistic vscale of '1'.
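  // For instance (illustrative profile): a loop profiled at ~1000 iterations
  // and vectorized with VF = 4 and UF = 2 is re-weighted as if each vector
  // iteration stood for 8 original ones, i.e. roughly 125 vector iterations
  // (for scalable VFs the known minimum lane count is used, i.e. vscale = 1).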
3723 setProfileInfoAfterUnrolling(LI->getLoopFor(LoopScalarBody), VectorLoop,
3724 LI->getLoopFor(LoopScalarBody),
3725 VF.getKnownMinValue() * UF);
3726 }
3727
3728 void InnerLoopVectorizer::fixCrossIterationPHIs(VPTransformState &State) {
3729 // In order to support recurrences we need to be able to vectorize Phi nodes.
3730 // Phi nodes have cycles, so we need to vectorize them in two stages. This is
3731 // stage #2: We now need to fix the recurrences by adding incoming edges to
3732 // the currently empty PHI nodes. At this point every instruction in the
3733 // original loop is widened to a vector form so we can use them to construct
3734 // the incoming edges.
3735 VPBasicBlock *Header =
3736 State.Plan->getVectorLoopRegion()->getEntryBasicBlock();
3737 for (VPRecipeBase &R : Header->phis()) {
3738 if (auto *ReductionPhi = dyn_cast<VPReductionPHIRecipe>(&R))
3739 fixReduction(ReductionPhi, State);
3740 else if (auto *FOR = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&R))
3741 fixFixedOrderRecurrence(FOR, State);
3742 }
3743 }
3744
3745 void InnerLoopVectorizer::fixFixedOrderRecurrence(
3746 VPFirstOrderRecurrencePHIRecipe *PhiR, VPTransformState &State) {
3747 // This is the second phase of vectorizing first-order recurrences. An
3748 // overview of the transformation is described below. Suppose we have the
3749 // following loop.
3750 //
3751 // for (int i = 0; i < n; ++i)
3752 // b[i] = a[i] - a[i - 1];
3753 //
3754 // There is a first-order recurrence on "a". For this loop, the shorthand
3755 // scalar IR looks like:
3756 //
3757 // scalar.ph:
3758 // s_init = a[-1]
3759 // br scalar.body
3760 //
3761 // scalar.body:
3762 // i = phi [0, scalar.ph], [i+1, scalar.body]
3763 // s1 = phi [s_init, scalar.ph], [s2, scalar.body]
3764 // s2 = a[i]
3765 // b[i] = s2 - s1
3766 // br cond, scalar.body, ...
3767 //
3768 // In this example, s1 is a recurrence because its value depends on the
3769 // previous iteration. In the first phase of vectorization, we created a
3770 // vector phi v1 for s1. We now complete the vectorization and produce the
3771 // shorthand vector IR shown below (for VF = 4, UF = 1).
3772 //
3773 // vector.ph:
3774 // v_init = vector(..., ..., ..., a[-1])
3775 // br vector.body
3776 //
3777 // vector.body
3778 // i = phi [0, vector.ph], [i+4, vector.body]
3779 // v1 = phi [v_init, vector.ph], [v2, vector.body]
3780 // v2 = a[i, i+1, i+2, i+3];
3781 // v3 = vector(v1(3), v2(0, 1, 2))
3782 // b[i, i+1, i+2, i+3] = v2 - v3
3783 // br cond, vector.body, middle.block
3784 //
3785 // middle.block:
3786 // x = v2(3)
3787 // br scalar.ph
3788 //
3789 // scalar.ph:
3790 // s_init = phi [x, middle.block], [a[-1], otherwise]
3791 // br scalar.body
3792 //
3793 // After the vector loop completes execution, we extract the next value of
3794 // the recurrence (x) to use as the initial value in the scalar loop.
3795
3796 // Extract the last vector element in the middle block. This will be the
3797 // initial value for the recurrence when jumping to the scalar loop.
3798 VPValue *PreviousDef = PhiR->getBackedgeValue();
3799 Value *Incoming = State.get(PreviousDef, UF - 1);
3800 auto *ExtractForScalar = Incoming;
3801 auto *IdxTy = Builder.getInt32Ty();
3802 if (VF.isVector()) {
3803 auto *One = ConstantInt::get(IdxTy, 1);
3804 Builder.SetInsertPoint(LoopMiddleBlock->getTerminator());
3805 auto *RuntimeVF = getRuntimeVF(Builder, IdxTy, VF);
3806 auto *LastIdx = Builder.CreateSub(RuntimeVF, One);
3807 ExtractForScalar = Builder.CreateExtractElement(ExtractForScalar, LastIdx,
3808 "vector.recur.extract");
3809 }
3810 // Extract the second-to-last element in the middle block if the
3811 // Phi is used outside the loop. We need to extract the phi itself
3812 // and not the last element (the phi update in the current iteration). This
3813 // will be the value when jumping to the exit block from the LoopMiddleBlock,
3814 // when the scalar loop is not run at all.
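// As an illustrative sketch (VF = 4, fixed width): if the last unrolled part
// of the recurrence is <s1, s2, s3, s4>, ExtractForScalar above is lane
// VF - 1 (s4), while ExtractForPhiUsedOutsideLoop below is lane VF - 2 (s3).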
3815 Value *ExtractForPhiUsedOutsideLoop = nullptr;
3816 if (VF.isVector()) {
3817 auto *RuntimeVF = getRuntimeVF(Builder, IdxTy, VF);
3818 auto *Idx = Builder.CreateSub(RuntimeVF, ConstantInt::get(IdxTy, 2));
3819 ExtractForPhiUsedOutsideLoop = Builder.CreateExtractElement(
3820 Incoming, Idx, "vector.recur.extract.for.phi");
3821 } else if (UF > 1)
3822 // When the loop is unrolled without vectorizing, initialize
3823 // ExtractForPhiUsedOutsideLoop with the unrolled value just prior to the
3824 // last unrolled value of `Incoming`. This is analogous to the vectorized
3825 // case above: extracting the second-to-last element when VF > 1.
3826 ExtractForPhiUsedOutsideLoop = State.get(PreviousDef, UF - 2);
3827
3828 // Fix the initial value of the original recurrence in the scalar loop.
3829 Builder.SetInsertPoint(&*LoopScalarPreHeader->begin());
3830 PHINode *Phi = cast<PHINode>(PhiR->getUnderlyingValue());
3831 auto *Start = Builder.CreatePHI(Phi->getType(), 2, "scalar.recur.init");
3832 auto *ScalarInit = PhiR->getStartValue()->getLiveInIRValue();
3833 for (auto *BB : predecessors(LoopScalarPreHeader)) {
3834 auto *Incoming = BB == LoopMiddleBlock ? ExtractForScalar : ScalarInit;
3835 Start->addIncoming(Incoming, BB);
3836 }
3837
3838 Phi->setIncomingValueForBlock(LoopScalarPreHeader, Start);
3839 Phi->setName("scalar.recur");
3840
3841 // Finally, fix users of the recurrence outside the loop. The users will need
3842 // either the last value of the scalar recurrence or the last value of the
3843 // vector recurrence we extracted in the middle block. Since the loop is in
3844 // LCSSA form, we just need to find all the phi nodes for the original scalar
3845 // recurrence in the exit block, and then add an edge for the middle block.
3846 // Note that LCSSA does not imply single entry when the original scalar loop
3847 // had multiple exiting edges (as we always run the last iteration in the
3848 // scalar epilogue); in that case, there is no edge from the middle block to
3849 // the exit block, and thus no phis that need updating.
3850 if (!Cost->requiresScalarEpilogue(VF))
3851 for (PHINode &LCSSAPhi : LoopExitBlock->phis())
3852 if (llvm::is_contained(LCSSAPhi.incoming_values(), Phi)) {
3853 LCSSAPhi.addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock);
3854 State.Plan->removeLiveOut(&LCSSAPhi);
3855 }
3856 }
3857
3858 void InnerLoopVectorizer::fixReduction(VPReductionPHIRecipe *PhiR,
3859 VPTransformState &State) {
3860 PHINode *OrigPhi = cast<PHINode>(PhiR->getUnderlyingValue());
3861 // Get its reduction variable descriptor.
3862 assert(Legal->isReductionVariable(OrigPhi) &&
3863 "Unable to find the reduction variable");
3864 const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor();
3865
3866 RecurKind RK = RdxDesc.getRecurrenceKind();
3867 TrackingVH<Value> ReductionStartValue = RdxDesc.getRecurrenceStartValue();
3868 Instruction *LoopExitInst = RdxDesc.getLoopExitInstr();
3869 State.setDebugLocFromInst(ReductionStartValue);
3870
3871 VPValue *LoopExitInstDef = PhiR->getBackedgeValue();
3872 // This is the vector-clone of the value that leaves the loop.
3873 Type *VecTy = State.get(LoopExitInstDef, 0)->getType();
3874
3875 // Wrap flags are in general invalid after vectorization, clear them.
3876 clearReductionWrapFlags(PhiR, State);
3877
3878 // Before each round, move the insertion point right between
3879 // the PHIs and the values we are going to write.
3880 // This allows us to write both PHINodes and the extractelement
3881 // instructions.
3882 Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());
3883
3884 State.setDebugLocFromInst(LoopExitInst);
3885
3886 Type *PhiTy = OrigPhi->getType();
3887
3888 VPBasicBlock *LatchVPBB =
3889 PhiR->getParent()->getEnclosingLoopRegion()->getExitingBasicBlock();
3890 BasicBlock *VectorLoopLatch = State.CFG.VPBB2IRBB[LatchVPBB];
3891 // If tail is folded by masking, the vector value to leave the loop should be
3892 // a Select choosing between the vectorized LoopExitInst and vectorized Phi,
3893 // instead of the former. For an inloop reduction the reduction will already
3894 // be predicated, and does not need to be handled here.
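// In shorthand IR the expected shape is roughly (illustrative only, VF = 4,
// integer add reduction with tail folding):
//   %rdx.sel = select <4 x i1> %mask, <4 x i32> %rdx.next, <4 x i32> %rdx.phi
// where %rdx.sel is the select located by the loop below.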
3895 if (Cost->foldTailByMasking() && !PhiR->isInLoop()) {
3896 for (unsigned Part = 0; Part < UF; ++Part) {
3897 Value *VecLoopExitInst = State.get(LoopExitInstDef, Part);
3898 SelectInst *Sel = nullptr;
3899 for (User *U : VecLoopExitInst->users()) {
3900 if (isa<SelectInst>(U)) {
3901 assert(!Sel && "Reduction exit feeding two selects");
3902 Sel = cast<SelectInst>(U);
3903 } else
3904 assert(isa<PHINode>(U) && "Reduction exit must feed Phi's or select");
3905 }
3906 assert(Sel && "Reduction exit feeds no select");
3907 State.reset(LoopExitInstDef, Sel, Part);
3908
3909 if (isa<FPMathOperator>(Sel))
3910 Sel->setFastMathFlags(RdxDesc.getFastMathFlags());
3911
3912 // If the target can create a predicated operator for the reduction at no
3913 // extra cost in the loop (for example a predicated vadd), it can be
3914 // cheaper for the select to remain in the loop than be sunk out of it,
3915 // and so use the select value for the phi instead of the old
3916 // LoopExitValue.
3917 if (PreferPredicatedReductionSelect ||
3918 TTI->preferPredicatedReductionSelect(
3919 RdxDesc.getOpcode(), PhiTy,
3920 TargetTransformInfo::ReductionFlags())) {
3921 auto *VecRdxPhi =
3922 cast<PHINode>(State.get(PhiR, Part));
3923 VecRdxPhi->setIncomingValueForBlock(VectorLoopLatch, Sel);
3924 }
3925 }
3926 }
3927
3928 // If the vector reduction can be performed in a smaller type, we truncate
3929 // then extend the loop exit value to enable InstCombine to evaluate the
3930 // entire expression in the smaller type.
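// Illustrative shorthand (VF = 4, an i32 reduction known to need only 8
// bits):
//   %trunc = trunc <4 x i32> %rdx to <4 x i8>
//   %extnd = zext <4 x i8> %trunc to <4 x i32>  ; sext for signed reductions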
3931 if (VF.isVector() && PhiTy != RdxDesc.getRecurrenceType()) {
3932 assert(!PhiR->isInLoop() && "Unexpected truncated inloop reduction!");
3933 Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), VF);
3934 Builder.SetInsertPoint(VectorLoopLatch->getTerminator());
3935 VectorParts RdxParts(UF);
3936 for (unsigned Part = 0; Part < UF; ++Part) {
3937 RdxParts[Part] = State.get(LoopExitInstDef, Part);
3938 Value *Trunc = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
3939 Value *Extnd = RdxDesc.isSigned() ? Builder.CreateSExt(Trunc, VecTy)
3940 : Builder.CreateZExt(Trunc, VecTy);
3941 for (User *U : llvm::make_early_inc_range(RdxParts[Part]->users()))
3942 if (U != Trunc) {
3943 U->replaceUsesOfWith(RdxParts[Part], Extnd);
3944 RdxParts[Part] = Extnd;
3945 }
3946 }
3947 Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());
3948 for (unsigned Part = 0; Part < UF; ++Part) {
3949 RdxParts[Part] = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
3950 State.reset(LoopExitInstDef, RdxParts[Part], Part);
3951 }
3952 }
3953
3954 // Reduce all of the unrolled parts into a single vector.
3955 Value *ReducedPartRdx = State.get(LoopExitInstDef, 0);
3956 unsigned Op = RecurrenceDescriptor::getOpcode(RK);
3957
3958 // The middle block terminator has already been assigned a DebugLoc here (the
3959 // OrigLoop's single latch terminator). We want the whole middle block to
3960 // appear to execute on this line because: (a) it is all compiler generated,
3961 // (b) these instructions are always executed after evaluating the latch
3962 // conditional branch, and (c) other passes may add new predecessors which
3963 // terminate on this line. This is the easiest way to ensure we don't
3964 // accidentally cause an extra step back into the loop while debugging.
3965 State.setDebugLocFromInst(LoopMiddleBlock->getTerminator());
3966 if (PhiR->isOrdered())
3967 ReducedPartRdx = State.get(LoopExitInstDef, UF - 1);
3968 else {
3969 // Floating-point operations should have some FMF to enable the reduction.
3970 IRBuilderBase::FastMathFlagGuard FMFG(Builder);
3971 Builder.setFastMathFlags(RdxDesc.getFastMathFlags());
3972 for (unsigned Part = 1; Part < UF; ++Part) {
3973 Value *RdxPart = State.get(LoopExitInstDef, Part);
3974 if (Op != Instruction::ICmp && Op != Instruction::FCmp) {
3975 ReducedPartRdx = Builder.CreateBinOp(
3976 (Instruction::BinaryOps)Op, RdxPart, ReducedPartRdx, "bin.rdx");
3977 } else if (RecurrenceDescriptor::isSelectCmpRecurrenceKind(RK))
3978 ReducedPartRdx = createSelectCmpOp(Builder, ReductionStartValue, RK,
3979 ReducedPartRdx, RdxPart);
3980 else
3981 ReducedPartRdx = createMinMaxOp(Builder, RK, ReducedPartRdx, RdxPart);
3982 }
3983 }
3984
3985 // Create the reduction after the loop. Note that inloop reductions create the
3986 // target reduction in the loop using a Reduction recipe.
3987 if (VF.isVector() && !PhiR->isInLoop()) {
3988 ReducedPartRdx =
3989 createTargetReduction(Builder, TTI, RdxDesc, ReducedPartRdx, OrigPhi);
3990 // If the reduction can be performed in a smaller type, we need to extend
3991 // the reduction to the wider type before we branch to the original loop.
3992 if (PhiTy != RdxDesc.getRecurrenceType())
3993 ReducedPartRdx = RdxDesc.isSigned()
3994 ? Builder.CreateSExt(ReducedPartRdx, PhiTy)
3995 : Builder.CreateZExt(ReducedPartRdx, PhiTy);
3996 }
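// In shorthand, the two steps above look roughly like (illustrative only,
// VF = 4, UF = 2, integer add reduction):
//   %bin.rdx = add <4 x i32> %rdx.part1, %rdx.part0
//   %rdx = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %bin.rdx)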
3997
3998 PHINode *ResumePhi =
3999 dyn_cast<PHINode>(PhiR->getStartValue()->getUnderlyingValue());
4000
4001 // Create a phi node that merges control-flow from the backedge-taken check
4002 // block and the middle block.
4003 PHINode *BCBlockPhi = PHINode::Create(PhiTy, 2, "bc.merge.rdx",
4004 LoopScalarPreHeader->getTerminator());
4005
4006 // If we are fixing reductions in the epilogue loop then we should already
4007 // have created a bc.merge.rdx Phi after the main vector body. Ensure that
4008 // we carry over the incoming values correctly.
4009 for (auto *Incoming : predecessors(LoopScalarPreHeader)) {
4010 if (Incoming == LoopMiddleBlock)
4011 BCBlockPhi->addIncoming(ReducedPartRdx, Incoming);
4012 else if (ResumePhi && llvm::is_contained(ResumePhi->blocks(), Incoming))
4013 BCBlockPhi->addIncoming(ResumePhi->getIncomingValueForBlock(Incoming),
4014 Incoming);
4015 else
4016 BCBlockPhi->addIncoming(ReductionStartValue, Incoming);
4017 }
4018
4019 // Set the resume value for this reduction
4020 ReductionResumeValues.insert({&RdxDesc, BCBlockPhi});
4021
4022 // If there were stores of the reduction value to a uniform memory address
4023 // inside the loop, create the final store here.
4024 if (StoreInst *SI = RdxDesc.IntermediateStore) {
4025 StoreInst *NewSI =
4026 Builder.CreateStore(ReducedPartRdx, SI->getPointerOperand());
4027 propagateMetadata(NewSI, SI);
4028
4029 // If the reduction value is used in other places,
4030 // then let the code below create PHI's for that.
4031 }
4032
4033 // Now, we need to fix the users of the reduction variable
4034 // inside and outside of the scalar remainder loop.
4035
4036 // We know that the loop is in LCSSA form. We need to update the PHI nodes
4037 // in the exit blocks. See comment on analogous loop in
4038 // fixFixedOrderRecurrence for a more complete explanation of the logic.
4039 if (!Cost->requiresScalarEpilogue(VF))
4040 for (PHINode &LCSSAPhi : LoopExitBlock->phis())
4041 if (llvm::is_contained(LCSSAPhi.incoming_values(), LoopExitInst)) {
4042 LCSSAPhi.addIncoming(ReducedPartRdx, LoopMiddleBlock);
4043 State.Plan->removeLiveOut(&LCSSAPhi);
4044 }
4045
4046 // Fix the scalar loop reduction variable with the incoming reduction sum
4047 // from the vector body and from the backedge value.
4048 int IncomingEdgeBlockIdx =
4049 OrigPhi->getBasicBlockIndex(OrigLoop->getLoopLatch());
4050 assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index");
4051 // Pick the other block.
4052 int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1);
4053 OrigPhi->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi);
4054 OrigPhi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst);
4055 }
4056
4057 void InnerLoopVectorizer::clearReductionWrapFlags(VPReductionPHIRecipe *PhiR,
4058 VPTransformState &State) {
4059 const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor();
4060 RecurKind RK = RdxDesc.getRecurrenceKind();
4061 if (RK != RecurKind::Add && RK != RecurKind::Mul)
4062 return;
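// For example (illustrative): a scalar '%sum.next = add nuw nsw i32 %sum, %x'
// is widened to a plain 'add <VF x i32>'; per-lane partial sums may wrap even
// when the final scalar sum does not, so the wrap flags must be dropped.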
4063
4064 SmallVector<VPValue *, 8> Worklist;
4065 SmallPtrSet<VPValue *, 8> Visited;
4066 Worklist.push_back(PhiR);
4067 Visited.insert(PhiR);
4068
4069 while (!Worklist.empty()) {
4070 VPValue *Cur = Worklist.pop_back_val();
4071 for (unsigned Part = 0; Part < UF; ++Part) {
4072 Value *V = State.get(Cur, Part);
4073 if (!isa<OverflowingBinaryOperator>(V))
4074 break;
4075 cast<Instruction>(V)->dropPoisonGeneratingFlags();
4076 }
4077
4078 for (VPUser *U : Cur->users()) {
4079 auto *UserRecipe = dyn_cast<VPRecipeBase>(U);
4080 if (!UserRecipe)
4081 continue;
4082 for (VPValue *V : UserRecipe->definedValues())
4083 if (Visited.insert(V).second)
4084 Worklist.push_back(V);
4085 }
4086 }
4087 }
4088
4089 void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) {
4090 // The basic block and loop containing the predicated instruction.
4091 auto *PredBB = PredInst->getParent();
4092 auto *VectorLoop = LI->getLoopFor(PredBB);
4093
4094 // Initialize a worklist with the operands of the predicated instruction.
4095 SetVector<Value *> Worklist(PredInst->op_begin(), PredInst->op_end());
4096
4097 // Holds instructions that we need to analyze again. An instruction may be
4098 // reanalyzed if we don't yet know if we can sink it or not.
4099 SmallVector<Instruction *, 8> InstsToReanalyze;
4100
4101 // Returns true if a given use occurs in the predicated block. Phi nodes use
4102 // their operands in their corresponding predecessor blocks.
4103 auto isBlockOfUsePredicated = [&](Use &U) -> bool {
4104 auto *I = cast<Instruction>(U.getUser());
4105 BasicBlock *BB = I->getParent();
4106 if (auto *Phi = dyn_cast<PHINode>(I))
4107 BB = Phi->getIncomingBlock(
4108 PHINode::getIncomingValueNumForOperand(U.getOperandNo()));
4109 return BB == PredBB;
4110 };
4111
4112 // Iteratively sink the scalarized operands of the predicated instruction
4113 // into the block we created for it. When an instruction is sunk, its
4114 // operands are then added to the worklist. The algorithm ends when one pass
4115 // through the worklist fails to sink a single instruction.
4116 bool Changed;
4117 do {
4118 // Add the instructions that need to be reanalyzed to the worklist, and
4119 // reset the changed indicator.
4120 Worklist.insert(InstsToReanalyze.begin(), InstsToReanalyze.end());
4121 InstsToReanalyze.clear();
4122 Changed = false;
4123
4124 while (!Worklist.empty()) {
4125 auto *I = dyn_cast<Instruction>(Worklist.pop_back_val());
4126
4127 // We can't sink an instruction if it is a phi node, is not in the loop,
4128 // or may have side effects.
4129 if (!I || isa<PHINode>(I) || !VectorLoop->contains(I) ||
4130 I->mayHaveSideEffects())
4131 continue;
4132
4133 // If the instruction is already in PredBB, check if we can sink its
4134 // operands. In that case, VPlan's sinkScalarOperands() succeeded in
4135 // sinking the scalar instruction I, hence it appears in PredBB; but it
4136 // may have failed to sink I's operands (recursively), which we try
4137 // (again) here.
4138 if (I->getParent() == PredBB) {
4139 Worklist.insert(I->op_begin(), I->op_end());
4140 continue;
4141 }
4142
4143 // It's legal to sink the instruction if all its uses occur in the
4144 // predicated block. Otherwise, there's nothing to do yet, and we may
4145 // need to reanalyze the instruction.
4146 if (!llvm::all_of(I->uses(), isBlockOfUsePredicated)) {
4147 InstsToReanalyze.push_back(I);
4148 continue;
4149 }
4150
4151 // Move the instruction to the beginning of the predicated block, and add
4152 // its operands to the worklist.
4153 I->moveBefore(&*PredBB->getFirstInsertionPt());
4154 Worklist.insert(I->op_begin(), I->op_end());
4155
4156 // The sinking may have enabled other instructions to be sunk, so we will
4157 // need to iterate.
4158 Changed = true;
4159 }
4160 } while (Changed);
4161 }
4162
4163 void InnerLoopVectorizer::fixNonInductionPHIs(VPlan &Plan,
4164 VPTransformState &State) {
4165 auto Iter = vp_depth_first_deep(Plan.getEntry());
4166 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) {
4167 for (VPRecipeBase &P : VPBB->phis()) {
4168 VPWidenPHIRecipe *VPPhi = dyn_cast<VPWidenPHIRecipe>(&P);
4169 if (!VPPhi)
4170 continue;
4171 PHINode *NewPhi = cast<PHINode>(State.get(VPPhi, 0));
4172 // Make sure the builder has a valid insert point.
4173 Builder.SetInsertPoint(NewPhi);
4174 for (unsigned i = 0; i < VPPhi->getNumOperands(); ++i) {
4175 VPValue *Inc = VPPhi->getIncomingValue(i);
4176 VPBasicBlock *VPBB = VPPhi->getIncomingBlock(i);
4177 NewPhi->addIncoming(State.get(Inc, 0), State.CFG.VPBB2IRBB[VPBB]);
4178 }
4179 }
4180 }
4181 }
4182
4183 bool InnerLoopVectorizer::useOrderedReductions(
4184 const RecurrenceDescriptor &RdxDesc) {
4185 return Cost->useOrderedReductions(RdxDesc);
4186 }
4187
4188 void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) {
4189 // We should not collect Scalars more than once per VF. Right now, this
4190 // function is called from collectUniformsAndScalars(), which already does
4191 // this check. Collecting Scalars for VF=1 does not make any sense.
4192 assert(VF.isVector() && Scalars.find(VF) == Scalars.end() &&
4193 "This function should not be visited twice for the same VF");
4194
4195 // This avoids any chances of creating a REPLICATE recipe during planning
4196 // since that would result in generation of scalarized code during execution,
4197 // which is not supported for scalable vectors.
4198 if (VF.isScalable()) {
4199 Scalars[VF].insert(Uniforms[VF].begin(), Uniforms[VF].end());
4200 return;
4201 }
4202
4203 SmallSetVector<Instruction *, 8> Worklist;
4204
4205 // These sets are used to seed the analysis with pointers used by memory
4206 // accesses that will remain scalar.
4207 SmallSetVector<Instruction *, 8> ScalarPtrs;
4208 SmallPtrSet<Instruction *, 8> PossibleNonScalarPtrs;
4209 auto *Latch = TheLoop->getLoopLatch();
4210
4211 // A helper that returns true if the use of Ptr by MemAccess will be scalar.
4212 // The pointer operands of loads and stores will be scalar as long as the
4213 // memory access is not a gather or scatter operation. The value operand of a
4214 // store will remain scalar if the store is scalarized.
4215 auto isScalarUse = [&](Instruction *MemAccess, Value *Ptr) {
4216 InstWidening WideningDecision = getWideningDecision(MemAccess, VF);
4217 assert(WideningDecision != CM_Unknown &&
4218 "Widening decision should be ready at this moment");
4219 if (auto *Store = dyn_cast<StoreInst>(MemAccess))
4220 if (Ptr == Store->getValueOperand())
4221 return WideningDecision == CM_Scalarize;
4222 assert(Ptr == getLoadStorePointerOperand(MemAccess) &&
4223 "Ptr is neither a value or pointer operand");
4224 return WideningDecision != CM_GatherScatter;
4225 };
4226
4227 // A helper that returns true if the given value is a bitcast or
4228 // getelementptr instruction contained in the loop.
4229 auto isLoopVaryingBitCastOrGEP = [&](Value *V) {
4230 return ((isa<BitCastInst>(V) && V->getType()->isPointerTy()) ||
4231 isa<GetElementPtrInst>(V)) &&
4232 !TheLoop->isLoopInvariant(V);
4233 };
4234
4235 // A helper that evaluates a memory access's use of a pointer. If the use will
4236 // be a scalar use and the pointer is only used by memory accesses, we place
4237 // the pointer in ScalarPtrs. Otherwise, the pointer is placed in
4238 // PossibleNonScalarPtrs.
4239 auto evaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) {
4240 // We only care about bitcast and getelementptr instructions contained in
4241 // the loop.
4242 if (!isLoopVaryingBitCastOrGEP(Ptr))
4243 return;
4244
4245 // If the pointer has already been identified as scalar (e.g., if it was
4246 // also identified as uniform), there's nothing to do.
4247 auto *I = cast<Instruction>(Ptr);
4248 if (Worklist.count(I))
4249 return;
4250
4251 // If the use of the pointer will be a scalar use, and all users of the
4252 // pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise,
4253 // place the pointer in PossibleNonScalarPtrs.
4254 if (isScalarUse(MemAccess, Ptr) && llvm::all_of(I->users(), [&](User *U) {
4255 return isa<LoadInst>(U) || isa<StoreInst>(U);
4256 }))
4257 ScalarPtrs.insert(I);
4258 else
4259 PossibleNonScalarPtrs.insert(I);
4260 };
4261
4262 // We seed the scalars analysis with two classes of instructions: (1)
4263 // instructions marked uniform-after-vectorization and (2) bitcast,
4264 // getelementptr and (pointer) phi instructions used by memory accesses
4265 // requiring a scalar use.
4266 //
4267 // (1) Add to the worklist all instructions that have been identified as
4268 // uniform-after-vectorization.
4269 Worklist.insert(Uniforms[VF].begin(), Uniforms[VF].end());
4270
4271 // (2) Add to the worklist all bitcast and getelementptr instructions used by
4272 // memory accesses requiring a scalar use. The pointer operands of loads and
4273 // stores will be scalar as long as the memory accesses is not a gather or
4274 // scatter operation. The value operand of a store will remain scalar if the
4275 // store is scalarized.
4276 for (auto *BB : TheLoop->blocks())
4277 for (auto &I : *BB) {
4278 if (auto *Load = dyn_cast<LoadInst>(&I)) {
4279 evaluatePtrUse(Load, Load->getPointerOperand());
4280 } else if (auto *Store = dyn_cast<StoreInst>(&I)) {
4281 evaluatePtrUse(Store, Store->getPointerOperand());
4282 evaluatePtrUse(Store, Store->getValueOperand());
4283 }
4284 }
4285 for (auto *I : ScalarPtrs)
4286 if (!PossibleNonScalarPtrs.count(I)) {
4287 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n");
4288 Worklist.insert(I);
4289 }
4290
4291 // Insert the forced scalars.
4292 // FIXME: Currently VPWidenPHIRecipe() often creates a dead vector
4293 // induction variable when the PHI user is scalarized.
4294 auto ForcedScalar = ForcedScalars.find(VF);
4295 if (ForcedScalar != ForcedScalars.end())
4296 for (auto *I : ForcedScalar->second) {
4297 LLVM_DEBUG(dbgs() << "LV: Found (forced) scalar instruction: " << *I << "\n");
4298 Worklist.insert(I);
4299 }
4300
4301 // Expand the worklist by looking through any bitcasts and getelementptr
4302 // instructions we've already identified as scalar. This is similar to the
4303 // expansion step in collectLoopUniforms(); however, here we're only
4304 // expanding to include additional bitcasts and getelementptr instructions.
4305 unsigned Idx = 0;
4306 while (Idx != Worklist.size()) {
4307 Instruction *Dst = Worklist[Idx++];
4308 if (!isLoopVaryingBitCastOrGEP(Dst->getOperand(0)))
4309 continue;
4310 auto *Src = cast<Instruction>(Dst->getOperand(0));
4311 if (llvm::all_of(Src->users(), [&](User *U) -> bool {
4312 auto *J = cast<Instruction>(U);
4313 return !TheLoop->contains(J) || Worklist.count(J) ||
4314 ((isa<LoadInst>(J) || isa<StoreInst>(J)) &&
4315 isScalarUse(J, Src));
4316 })) {
4317 Worklist.insert(Src);
4318 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n");
4319 }
4320 }
4321
4322 // An induction variable will remain scalar if all users of the induction
4323 // variable and induction variable update remain scalar.
4324 for (const auto &Induction : Legal->getInductionVars()) {
4325 auto *Ind = Induction.first;
4326 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
4327
4328 // If tail-folding is applied, the primary induction variable will be used
4329 // to feed a vector compare.
4330 if (Ind == Legal->getPrimaryInduction() && foldTailByMasking())
4331 continue;
4332
4333 // Returns true if \p Indvar is a pointer induction that is used directly by
4334 // load/store instruction \p I.
4335 auto IsDirectLoadStoreFromPtrIndvar = [&](Instruction *Indvar,
4336 Instruction *I) {
4337 return Induction.second.getKind() ==
4338 InductionDescriptor::IK_PtrInduction &&
4339 (isa<LoadInst>(I) || isa<StoreInst>(I)) &&
4340 Indvar == getLoadStorePointerOperand(I) && isScalarUse(I, Indvar);
4341 };
4342
4343 // Determine if all users of the induction variable are scalar after
4344 // vectorization.
4345 auto ScalarInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
4346 auto *I = cast<Instruction>(U);
4347 return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) ||
4348 IsDirectLoadStoreFromPtrIndvar(Ind, I);
4349 });
4350 if (!ScalarInd)
4351 continue;
4352
4353 // Determine if all users of the induction variable update instruction are
4354 // scalar after vectorization.
4355 auto ScalarIndUpdate =
4356 llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
4357 auto *I = cast<Instruction>(U);
4358 return I == Ind || !TheLoop->contains(I) || Worklist.count(I) ||
4359 IsDirectLoadStoreFromPtrIndvar(IndUpdate, I);
4360 });
4361 if (!ScalarIndUpdate)
4362 continue;
4363
4364 // The induction variable and its update instruction will remain scalar.
4365 Worklist.insert(Ind);
4366 Worklist.insert(IndUpdate);
4367 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n");
4368 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate
4369 << "\n");
4370 }
4371
4372 Scalars[VF].insert(Worklist.begin(), Worklist.end());
4373 }
4374
4375 bool LoopVectorizationCostModel::isScalarWithPredication(
4376 Instruction *I, ElementCount VF) const {
4377 if (!isPredicatedInst(I))
4378 return false;
4379
4380 // Do we have a non-scalar lowering for this predicated
4381 // instruction? No - it is scalar with predication.
4382 switch(I->getOpcode()) {
4383 default:
4384 return true;
4385 case Instruction::Load:
4386 case Instruction::Store: {
4387 auto *Ptr = getLoadStorePointerOperand(I);
4388 auto *Ty = getLoadStoreType(I);
4389 Type *VTy = Ty;
4390 if (VF.isVector())
4391 VTy = VectorType::get(Ty, VF);
4392 const Align Alignment = getLoadStoreAlignment(I);
4393 return isa<LoadInst>(I) ? !(isLegalMaskedLoad(Ty, Ptr, Alignment) ||
4394 TTI.isLegalMaskedGather(VTy, Alignment))
4395 : !(isLegalMaskedStore(Ty, Ptr, Alignment) ||
4396 TTI.isLegalMaskedScatter(VTy, Alignment));
4397 }
4398 case Instruction::UDiv:
4399 case Instruction::SDiv:
4400 case Instruction::SRem:
4401 case Instruction::URem: {
4402 // We have the option to use the safe-divisor idiom to avoid predication.
4403 // The cost-based decision here will always select safe-divisor for
4404 // scalable vectors as scalarization isn't legal.
4405 const auto [ScalarCost, SafeDivisorCost] = getDivRemSpeculationCost(I, VF);
4406 return isDivRemScalarWithPredication(ScalarCost, SafeDivisorCost);
4407 }
4408 }
4409 }
4410
4411 bool LoopVectorizationCostModel::isPredicatedInst(Instruction *I) const {
4412 if (!blockNeedsPredicationForAnyReason(I->getParent()))
4413 return false;
4414
4415 // Can we prove this instruction is safe to unconditionally execute?
4416 // If not, we must use some form of predication.
4417 switch(I->getOpcode()) {
4418 default:
4419 return false;
4420 case Instruction::Load:
4421 case Instruction::Store: {
4422 if (!Legal->isMaskRequired(I))
4423 return false;
4424 // When we know the load's address is loop invariant and the instruction
4425 // in the original scalar loop was unconditionally executed then we
4426 // don't need to mark it as a predicated instruction. Tail folding may
4427 // introduce additional predication, but we're guaranteed to always have
4428 // at least one active lane. We call Legal->blockNeedsPredication here
4429 // because it doesn't query tail-folding. For stores, we need to prove not
4430 // only speculation safety (which follows from the same argument as loads),
4431 // but also that the value being stored is correct. The easiest
4432 // form of the latter is to require that all values stored are the same.
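// For example (illustrative): an unconditional 'store i32 42, ptr @g' in the
// original loop body need not be treated as predicated under tail folding,
// since at least one lane is always active and every lane stores the same
// value.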
4433 if (Legal->isUniformMemOp(*I) &&
4434 (isa<LoadInst>(I) ||
4435 (isa<StoreInst>(I) &&
4436 TheLoop->isLoopInvariant(cast<StoreInst>(I)->getValueOperand()))) &&
4437 !Legal->blockNeedsPredication(I->getParent()))
4438 return false;
4439 return true;
4440 }
4441 case Instruction::UDiv:
4442 case Instruction::SDiv:
4443 case Instruction::SRem:
4444 case Instruction::URem:
4445 // TODO: We can use the loop-preheader as context point here and get
4446 // context sensitive reasoning
4447 return !isSafeToSpeculativelyExecute(I);
4448 }
4449 }
4450
4451 std::pair<InstructionCost, InstructionCost>
4452 LoopVectorizationCostModel::getDivRemSpeculationCost(Instruction *I,
4453 ElementCount VF) const {
4454 assert(I->getOpcode() == Instruction::UDiv ||
4455 I->getOpcode() == Instruction::SDiv ||
4456 I->getOpcode() == Instruction::SRem ||
4457 I->getOpcode() == Instruction::URem);
4458 assert(!isSafeToSpeculativelyExecute(I));
4459
4460 const TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
4461
4462 // Scalarization isn't legal for scalable vector types
4463 InstructionCost ScalarizationCost = InstructionCost::getInvalid();
4464 if (!VF.isScalable()) {
4465 // Get the scalarization cost and scale this amount by the probability of
4466 // executing the predicated block.
4468 ScalarizationCost = 0;
4469
4470 // These instructions have a non-void type, so account for the phi nodes
4471 // that we will create. This cost is likely to be zero. The phi node
4472 // cost, if any, should be scaled by the block probability because it
4473 // models a copy at the end of each predicated block.
4474 ScalarizationCost += VF.getKnownMinValue() *
4475 TTI.getCFInstrCost(Instruction::PHI, CostKind);
4476
4477 // The cost of the non-predicated instruction.
4478 ScalarizationCost += VF.getKnownMinValue() *
4479 TTI.getArithmeticInstrCost(I->getOpcode(), I->getType(), CostKind);
4480
4481 // The cost of insertelement and extractelement instructions needed for
4482 // scalarization.
4483 ScalarizationCost += getScalarizationOverhead(I, VF, CostKind);
4484
4485 // Scale the cost by the probability of executing the predicated blocks.
4486 // This assumes the predicated block for each vector lane is equally
4487 // likely.
4488 ScalarizationCost = ScalarizationCost / getReciprocalPredBlockProb();
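// Worked example with assumed costs (VF = 4): a PHI cost of 0, a scalar
// div cost of 1 and a scalarization overhead of 4 give
// (0 * 4 + 1 * 4 + 4) / getReciprocalPredBlockProb() = 8 / 2 = 4, assuming
// the usual reciprocal block probability of 2 (i.e. a 50% likely block).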
4489 }
4490 InstructionCost SafeDivisorCost = 0;
4491
4492 auto *VecTy = ToVectorTy(I->getType(), VF);
4493
4494 // The cost of the select guard to ensure all lanes are well defined
4495 // after we speculate above any internal control flow.
4496 SafeDivisorCost += TTI.getCmpSelInstrCost(
4497 Instruction::Select, VecTy,
4498 ToVectorTy(Type::getInt1Ty(I->getContext()), VF),
4499 CmpInst::BAD_ICMP_PREDICATE, CostKind);
4500
4501 // Certain instructions can be cheaper to vectorize if they have a constant
4502 // second vector operand. One example of this are shifts on x86.
4503 Value *Op2 = I->getOperand(1);
4504 auto Op2Info = TTI.getOperandInfo(Op2);
4505 if (Op2Info.Kind == TargetTransformInfo::OK_AnyValue && Legal->isUniform(Op2))
4506 Op2Info.Kind = TargetTransformInfo::OK_UniformValue;
4507
4508 SmallVector<const Value *, 4> Operands(I->operand_values());
4509 SafeDivisorCost += TTI.getArithmeticInstrCost(
4510 I->getOpcode(), VecTy, CostKind,
4511 {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
4512 Op2Info, Operands, I);
4513 return {ScalarizationCost, SafeDivisorCost};
4514 }
4515
4516 bool LoopVectorizationCostModel::interleavedAccessCanBeWidened(
4517 Instruction *I, ElementCount VF) {
4518 assert(isAccessInterleaved(I) && "Expecting interleaved access.");
4519 assert(getWideningDecision(I, VF) == CM_Unknown &&
4520 "Decision should not be set yet.");
4521 auto *Group = getInterleavedAccessGroup(I);
4522 assert(Group && "Must have a group.");
4523
4524 // If the instruction's allocated size doesn't equal its type size, it
4525 // requires padding and will be scalarized.
4526 auto &DL = I->getModule()->getDataLayout();
4527 auto *ScalarTy = getLoadStoreType(I);
4528 if (hasIrregularType(ScalarTy, DL))
4529 return false;
4530
4531 // If the group involves a non-integral pointer, we may not be able to
4532 // losslessly cast all values to a common type.
4533 unsigned InterleaveFactor = Group->getFactor();
4534 bool ScalarNI = DL.isNonIntegralPointerType(ScalarTy);
4535 for (unsigned i = 0; i < InterleaveFactor; i++) {
4536 Instruction *Member = Group->getMember(i);
4537 if (!Member)
4538 continue;
4539 auto *MemberTy = getLoadStoreType(Member);
4540 bool MemberNI = DL.isNonIntegralPointerType(MemberTy);
4541 // Don't coerce non-integral pointers to integers or vice versa.
4542 if (MemberNI != ScalarNI) {
4543 // TODO: Consider adding special nullptr value case here
4544 return false;
4545 } else if (MemberNI && ScalarNI &&
4546 ScalarTy->getPointerAddressSpace() !=
4547 MemberTy->getPointerAddressSpace()) {
4548 return false;
4549 }
4550 }
4551
4552 // Check if masking is required.
4553 // A Group may need masking for one of two reasons: it resides in a block that
4554 // needs predication, or it was decided to use masking to deal with gaps
4555 // (either a gap at the end of a load-access that may result in a speculative
4556 // load, or any gaps in a store-access).
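// For example (illustrative): a store group with factor 3 where only members
// 0 and 2 exist has a gap at index 1, so the wide store must be masked to
// avoid writing the gap lanes.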
4557 bool PredicatedAccessRequiresMasking =
4558 blockNeedsPredicationForAnyReason(I->getParent()) &&
4559 Legal->isMaskRequired(I);
4560 bool LoadAccessWithGapsRequiresEpilogMasking =
4561 isa<LoadInst>(I) && Group->requiresScalarEpilogue() &&
4562 !isScalarEpilogueAllowed();
4563 bool StoreAccessWithGapsRequiresMasking =
4564 isa<StoreInst>(I) && (Group->getNumMembers() < Group->getFactor());
4565 if (!PredicatedAccessRequiresMasking &&
4566 !LoadAccessWithGapsRequiresEpilogMasking &&
4567 !StoreAccessWithGapsRequiresMasking)
4568 return true;
4569
4570 // If masked interleaving is required, we expect that the user/target had
4571 // enabled it, because otherwise it either wouldn't have been created or
4572 // it should have been invalidated by the CostModel.
4573 assert(useMaskedInterleavedAccesses(TTI) &&
4574 "Masked interleave-groups for predicated accesses are not enabled.");
4575
4576 if (Group->isReverse())
4577 return false;
4578
4579 auto *Ty = getLoadStoreType(I);
4580 const Align Alignment = getLoadStoreAlignment(I);
4581 return isa<LoadInst>(I) ? TTI.isLegalMaskedLoad(Ty, Alignment)
4582 : TTI.isLegalMaskedStore(Ty, Alignment);
4583 }
4584
4585 bool LoopVectorizationCostModel::memoryInstructionCanBeWidened(
4586 Instruction *I, ElementCount VF) {
4587 // Get and ensure we have a valid memory instruction.
4588 assert((isa<LoadInst, StoreInst>(I)) && "Invalid memory instruction");
4589
4590 auto *Ptr = getLoadStorePointerOperand(I);
4591 auto *ScalarTy = getLoadStoreType(I);
4592
4593 // In order to be widened, the pointer should be consecutive, first of all.
4594 if (!Legal->isConsecutivePtr(ScalarTy, Ptr))
4595 return false;
4596
4597 // If the instruction is a store located in a predicated block, it will be
4598 // scalarized.
4599 if (isScalarWithPredication(I, VF))
4600 return false;
4601
4602 // If the instruction's allocated size doesn't equal its type size, it
4603 // requires padding and will be scalarized.
4604 auto &DL = I->getModule()->getDataLayout();
4605 if (hasIrregularType(ScalarTy, DL))
4606 return false;
4607
4608 return true;
4609 }
4610
4611 void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) {
4612 // We should not collect Uniforms more than once per VF. Right now,
4613 // this function is called from collectUniformsAndScalars(), which
4614 // already does this check. Collecting Uniforms for VF=1 does not make any
4615 // sense.
4616
4617 assert(VF.isVector() && Uniforms.find(VF) == Uniforms.end() &&
4618 "This function should not be visited twice for the same VF");
4619
4620 // Visit the list of Uniforms. If we do not find any uniform value, we will
4621 // not analyze again; Uniforms.count(VF) will still return 1.
4622 Uniforms[VF].clear();
4623
4624 // We now know that the loop is vectorizable!
4625 // Collect instructions inside the loop that will remain uniform after
4626 // vectorization.
4627
4628 // Global values, params and instructions outside of current loop are out of
4629 // scope.
4630 auto isOutOfScope = [&](Value *V) -> bool {
4631 Instruction *I = dyn_cast<Instruction>(V);
4632 return (!I || !TheLoop->contains(I));
4633 };
4634
4635 // Worklist containing uniform instructions demanding lane 0.
4636 SetVector<Instruction *> Worklist;
4637 BasicBlock *Latch = TheLoop->getLoopLatch();
4638
4639 // Add uniform instructions demanding lane 0 to the worklist. Instructions
4640 // that are scalar with predication must not be considered uniform after
4641 // vectorization, because that would create an erroneous replicating region
4642 // where only a single instance out of VF should be formed.
4643 // TODO: optimize such seldom cases if found important, see PR40816.
4644 auto addToWorklistIfAllowed = [&](Instruction *I) -> void {
4645 if (isOutOfScope(I)) {
4646 LLVM_DEBUG(dbgs() << "LV: Found not uniform due to scope: "
4647 << *I << "\n");
4648 return;
4649 }
4650 if (isScalarWithPredication(I, VF)) {
4651 LLVM_DEBUG(dbgs() << "LV: Found not uniform being ScalarWithPredication: "
4652 << *I << "\n");
4653 return;
4654 }
4655 LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *I << "\n");
4656 Worklist.insert(I);
4657 };
4658
4659 // Start with the conditional branch. If the branch condition is an
4660 // instruction contained in the loop that is only used by the branch, it is
4661 // uniform.
4662 auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0));
4663 if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse())
4664 addToWorklistIfAllowed(Cmp);
4665
4666 // Return true if all lanes perform the same memory operation, and we can
4667 // thus choose to execute only one.
4668 auto isUniformMemOpUse = [&](Instruction *I) {
4669 if (!Legal->isUniformMemOp(*I))
4670 return false;
4671 if (isa<LoadInst>(I))
4672 // Loading the same address always produces the same result - at least
4673 // assuming aliasing and ordering which have already been checked.
4674 return true;
4675 // Storing the same value on every iteration.
4676 return TheLoop->isLoopInvariant(cast<StoreInst>(I)->getValueOperand());
4677 };
4678
4679 auto isUniformDecision = [&](Instruction *I, ElementCount VF) {
4680 InstWidening WideningDecision = getWideningDecision(I, VF);
4681 assert(WideningDecision != CM_Unknown &&
4682 "Widening decision should be ready at this moment");
4683
4684 if (isUniformMemOpUse(I))
4685 return true;
4686
4687 return (WideningDecision == CM_Widen ||
4688 WideningDecision == CM_Widen_Reverse ||
4689 WideningDecision == CM_Interleave);
4690 };
4691
4692
4693 // Returns true if Ptr is the pointer operand of a memory access instruction
4694 // I, and I is known to not require scalarization.
4695 auto isVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool {
4696 return getLoadStorePointerOperand(I) == Ptr && isUniformDecision(I, VF);
4697 };
4698
4699 // Holds a list of values which are known to have at least one uniform use.
4700 // Note that there may be other uses which aren't uniform. A "uniform use"
4701 // here is something which only demands lane 0 of the unrolled iterations;
4702 // it does not imply that all lanes produce the same value (e.g. this is not
4703 // the usual meaning of uniform)
4704 SetVector<Value *> HasUniformUse;
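// For example (illustrative): the address operand of a uniform load only
// demands lane 0, even if the GEP computing that address is also used by a
// widened load that demands all lanes.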
4705
4706 // Scan the loop for instructions which are either a) known to have only
4707 // lane 0 demanded or b) are uses which demand only lane 0 of their operand.
4708 for (auto *BB : TheLoop->blocks())
4709 for (auto &I : *BB) {
4710 if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(&I)) {
4711 switch (II->getIntrinsicID()) {
4712 case Intrinsic::sideeffect:
4713 case Intrinsic::experimental_noalias_scope_decl:
4714 case Intrinsic::assume:
4715 case Intrinsic::lifetime_start:
4716 case Intrinsic::lifetime_end:
4717 if (TheLoop->hasLoopInvariantOperands(&I))
4718 addToWorklistIfAllowed(&I);
4719 break;
4720 default:
4721 break;
4722 }
4723 }
4724
4725 // ExtractValue instructions must be uniform, because the operands are
4726 // known to be loop-invariant.
4727 if (auto *EVI = dyn_cast<ExtractValueInst>(&I)) {
4728 assert(isOutOfScope(EVI->getAggregateOperand()) &&
4729 "Expected aggregate value to be loop invariant");
4730 addToWorklistIfAllowed(EVI);
4731 continue;
4732 }
4733
4734 // If there's no pointer operand, there's nothing to do.
4735 auto *Ptr = getLoadStorePointerOperand(&I);
4736 if (!Ptr)
4737 continue;
4738
4739 if (isUniformMemOpUse(&I))
4740 addToWorklistIfAllowed(&I);
4741
4742 if (isUniformDecision(&I, VF)) {
4743 assert(isVectorizedMemAccessUse(&I, Ptr) && "consistency check");
4744 HasUniformUse.insert(Ptr);
4745 }
4746 }
4747
4748 // Add to the worklist any operands which have *only* uniform (e.g. lane 0
4749 // demanding) users. Since loops are assumed to be in LCSSA form, this
4750 // disallows uses outside the loop as well.
4751 for (auto *V : HasUniformUse) {
4752 if (isOutOfScope(V))
4753 continue;
4754 auto *I = cast<Instruction>(V);
4755 auto UsersAreMemAccesses =
4756 llvm::all_of(I->users(), [&](User *U) -> bool {
4757 return isVectorizedMemAccessUse(cast<Instruction>(U), V);
4758 });
4759 if (UsersAreMemAccesses)
4760 addToWorklistIfAllowed(I);
4761 }
4762
4763 // Expand Worklist in topological order: whenever a new instruction
4764 // is added, its users should already be inside Worklist. This ensures
4765 // a uniform instruction will only be used by uniform instructions.
4766 unsigned idx = 0;
4767 while (idx != Worklist.size()) {
4768 Instruction *I = Worklist[idx++];
4769
4770 for (auto *OV : I->operand_values()) {
4771 // isOutOfScope operands cannot be uniform instructions.
4772 if (isOutOfScope(OV))
4773 continue;
4774 // First order recurrence Phi's should typically be considered
4775 // non-uniform.
4776 auto *OP = dyn_cast<PHINode>(OV);
4777 if (OP && Legal->isFixedOrderRecurrence(OP))
4778 continue;
4779 // If all the users of the operand are uniform, then add the
4780 // operand into the uniform worklist.
4781 auto *OI = cast<Instruction>(OV);
4782 if (llvm::all_of(OI->users(), [&](User *U) -> bool {
4783 auto *J = cast<Instruction>(U);
4784 return Worklist.count(J) || isVectorizedMemAccessUse(J, OI);
4785 }))
4786 addToWorklistIfAllowed(OI);
4787 }
4788 }
4789
4790 // For an instruction to be added into Worklist above, all its users inside
4791 // the loop should also be in Worklist. However, this condition cannot be
4792 // true for phi nodes that form a cyclic dependence. We must process phi
4793 // nodes separately. An induction variable will remain uniform if all users
4794 // of the induction variable and induction variable update remain uniform.
4795 // The code below handles both pointer and non-pointer induction variables.
4796 for (const auto &Induction : Legal->getInductionVars()) {
4797 auto *Ind = Induction.first;
4798 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
4799
4800 // Determine if all users of the induction variable are uniform after
4801 // vectorization.
4802 auto UniformInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
4803 auto *I = cast<Instruction>(U);
4804 return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) ||
4805 isVectorizedMemAccessUse(I, Ind);
4806 });
4807 if (!UniformInd)
4808 continue;
4809
4810 // Determine if all users of the induction variable update instruction are
4811 // uniform after vectorization.
4812 auto UniformIndUpdate =
4813 llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
4814 auto *I = cast<Instruction>(U);
4815 return I == Ind || !TheLoop->contains(I) || Worklist.count(I) ||
4816 isVectorizedMemAccessUse(I, IndUpdate);
4817 });
4818 if (!UniformIndUpdate)
4819 continue;
4820
4821 // The induction variable and its update instruction will remain uniform.
4822 addToWorklistIfAllowed(Ind);
4823 addToWorklistIfAllowed(IndUpdate);
4824 }
4825
4826 Uniforms[VF].insert(Worklist.begin(), Worklist.end());
4827 }
4828
4829 bool LoopVectorizationCostModel::runtimeChecksRequired() {
4830 LLVM_DEBUG(dbgs() << "LV: Performing code size checks.\n");
4831
4832 if (Legal->getRuntimePointerChecking()->Need) {
4833 reportVectorizationFailure("Runtime ptr check is required with -Os/-Oz",
4834 "runtime pointer checks needed. Enable vectorization of this "
4835 "loop with '#pragma clang loop vectorize(enable)' when "
4836 "compiling with -Os/-Oz",
4837 "CantVersionLoopWithOptForSize", ORE, TheLoop);
4838 return true;
4839 }
4840
4841 if (!PSE.getPredicate().isAlwaysTrue()) {
4842 reportVectorizationFailure("Runtime SCEV check is required with -Os/-Oz",
4843 "runtime SCEV checks needed. Enable vectorization of this "
4844 "loop with '#pragma clang loop vectorize(enable)' when "
4845 "compiling with -Os/-Oz",
4846 "CantVersionLoopWithOptForSize", ORE, TheLoop);
4847 return true;
4848 }
4849
4850 // FIXME: Avoid specializing for stride==1 instead of bailing out.
4851 if (!Legal->getLAI()->getSymbolicStrides().empty()) {
4852 reportVectorizationFailure("Runtime stride check for small trip count",
4853 "runtime stride == 1 checks needed. Enable vectorization of "
4854 "this loop without such check by compiling with -Os/-Oz",
4855 "CantVersionLoopWithOptForSize", ORE, TheLoop);
4856 return true;
4857 }
4858
4859 return false;
4860 }
4861
4862 ElementCount
4863 LoopVectorizationCostModel::getMaxLegalScalableVF(unsigned MaxSafeElements) {
4864 if (!TTI.supportsScalableVectors() && !ForceTargetSupportsScalableVectors)
4865 return ElementCount::getScalable(0);
4866
4867 if (Hints->isScalableVectorizationDisabled()) {
4868 reportVectorizationInfo("Scalable vectorization is explicitly disabled",
4869 "ScalableVectorizationDisabled", ORE, TheLoop);
4870 return ElementCount::getScalable(0);
4871 }
4872
4873 LLVM_DEBUG(dbgs() << "LV: Scalable vectorization is available\n");
4874
4875 auto MaxScalableVF = ElementCount::getScalable(
4876 std::numeric_limits<ElementCount::ScalarTy>::max());
4877
4878 // Test that the loop-vectorizer can legalize all operations for this MaxVF.
4879 // FIXME: While for scalable vectors this is currently sufficient, this should
4880 // be replaced by a more detailed mechanism that filters out specific VFs,
4881 // instead of invalidating vectorization for a whole set of VFs based on the
4882 // MaxVF.
4883
4884 // Disable scalable vectorization if the loop contains unsupported reductions.
4885 if (!canVectorizeReductions(MaxScalableVF)) {
4886 reportVectorizationInfo(
4887 "Scalable vectorization not supported for the reduction "
4888 "operations found in this loop.",
4889 "ScalableVFUnfeasible", ORE, TheLoop);
4890 return ElementCount::getScalable(0);
4891 }
4892
4893 // Disable scalable vectorization if the loop contains any instructions
4894 // with element types not supported for scalable vectors.
4895 if (any_of(ElementTypesInLoop, [&](Type *Ty) {
4896 return !Ty->isVoidTy() &&
4897 !this->TTI.isElementTypeLegalForScalableVector(Ty);
4898 })) {
4899 reportVectorizationInfo("Scalable vectorization is not supported "
4900 "for all element types found in this loop.",
4901 "ScalableVFUnfeasible", ORE, TheLoop);
4902 return ElementCount::getScalable(0);
4903 }
4904
4905 if (Legal->isSafeForAnyVectorWidth())
4906 return MaxScalableVF;
4907
4908 // Limit MaxScalableVF by the maximum safe dependence distance.
4909 std::optional<unsigned> MaxVScale = TTI.getMaxVScale();
4910 if (!MaxVScale && TheFunction->hasFnAttribute(Attribute::VScaleRange))
4911 MaxVScale =
4912 TheFunction->getFnAttribute(Attribute::VScaleRange).getVScaleRangeMax();
4913 MaxScalableVF =
4914 ElementCount::getScalable(MaxVScale ? (MaxSafeElements / *MaxVScale) : 0);
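// For example (illustrative): MaxSafeElements = 32 with a known maximum
// vscale of 16 clamps MaxScalableVF to 'vscale x 2'; if no vscale bound is
// known, the scalable VF is set to 0 (disabled).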
4915 if (!MaxScalableVF)
4916 reportVectorizationInfo(
4917 "Max legal vector width too small, scalable vectorization "
4918 "unfeasible.",
4919 "ScalableVFUnfeasible", ORE, TheLoop);
4920
4921 return MaxScalableVF;
4922 }
4923
4924 FixedScalableVFPair LoopVectorizationCostModel::computeFeasibleMaxVF(
4925 unsigned ConstTripCount, ElementCount UserVF, bool FoldTailByMasking) {
4926 MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI);
4927 unsigned SmallestType, WidestType;
4928 std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes();
4929
4930 // Get the maximum safe dependence distance in bits computed by LAA.
4931 // It is computed by MaxVF * sizeOf(type) * 8, where type is taken from
4932 // the memory access that is most restrictive (involved in the smallest
4933 // dependence distance).
4934 unsigned MaxSafeElements =
4935 PowerOf2Floor(Legal->getMaxSafeVectorWidthInBits() / WidestType);
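// For example (illustrative): a maximum safe dependence distance of 256 bits
// with WidestType = 32 bits yields MaxSafeElements = PowerOf2Floor(256 / 32)
// = 8, i.e. a maximum safe fixed VF of 8.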
4936
4937 auto MaxSafeFixedVF = ElementCount::getFixed(MaxSafeElements);
4938 auto MaxSafeScalableVF = getMaxLegalScalableVF(MaxSafeElements);
4939
4940 LLVM_DEBUG(dbgs() << "LV: The max safe fixed VF is: " << MaxSafeFixedVF
4941 << ".\n");
4942 LLVM_DEBUG(dbgs() << "LV: The max safe scalable VF is: " << MaxSafeScalableVF
4943 << ".\n");
4944
4945 // First analyze the UserVF, fall back if the UserVF should be ignored.
4946 if (UserVF) {
4947 auto MaxSafeUserVF =
4948 UserVF.isScalable() ? MaxSafeScalableVF : MaxSafeFixedVF;
4949
4950 if (ElementCount::isKnownLE(UserVF, MaxSafeUserVF)) {
4951 // If `VF=vscale x N` is safe, then so is `VF=N`
4952 if (UserVF.isScalable())
4953 return FixedScalableVFPair(
4954 ElementCount::getFixed(UserVF.getKnownMinValue()), UserVF);
4955 else
4956 return UserVF;
4957 }
4958
4959 assert(ElementCount::isKnownGT(UserVF, MaxSafeUserVF));
4960
4961 // Only clamp if the UserVF is not scalable. If the UserVF is scalable, it
4962 // is better to ignore the hint and let the compiler choose a suitable VF.
4963 if (!UserVF.isScalable()) {
4964 LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
4965 << " is unsafe, clamping to max safe VF="
4966 << MaxSafeFixedVF << ".\n");
4967 ORE->emit([&]() {
4968 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
4969 TheLoop->getStartLoc(),
4970 TheLoop->getHeader())
4971 << "User-specified vectorization factor "
4972 << ore::NV("UserVectorizationFactor", UserVF)
4973 << " is unsafe, clamping to maximum safe vectorization factor "
4974 << ore::NV("VectorizationFactor", MaxSafeFixedVF);
4975 });
4976 return MaxSafeFixedVF;
4977 }
4978
4979 if (!TTI.supportsScalableVectors() && !ForceTargetSupportsScalableVectors) {
4980 LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
4981 << " is ignored because scalable vectors are not "
4982 "available.\n");
4983 ORE->emit([&]() {
4984 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
4985 TheLoop->getStartLoc(),
4986 TheLoop->getHeader())
4987 << "User-specified vectorization factor "
4988 << ore::NV("UserVectorizationFactor", UserVF)
4989 << " is ignored because the target does not support scalable "
4990 "vectors. The compiler will pick a more suitable value.";
4991 });
4992 } else {
4993 LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
4994 << " is unsafe. Ignoring scalable UserVF.\n");
4995 ORE->emit([&]() {
4996 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
4997 TheLoop->getStartLoc(),
4998 TheLoop->getHeader())
4999 << "User-specified vectorization factor "
5000 << ore::NV("UserVectorizationFactor", UserVF)
5001 << " is unsafe. Ignoring the hint to let the compiler pick a "
5002 "more suitable value.";
5003 });
5004 }
5005 }
5006
5007 LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType
5008 << " / " << WidestType << " bits.\n");
5009
5010 FixedScalableVFPair Result(ElementCount::getFixed(1),
5011 ElementCount::getScalable(0));
5012 if (auto MaxVF =
5013 getMaximizedVFForTarget(ConstTripCount, SmallestType, WidestType,
5014 MaxSafeFixedVF, FoldTailByMasking))
5015 Result.FixedVF = MaxVF;
5016
5017 if (auto MaxVF =
5018 getMaximizedVFForTarget(ConstTripCount, SmallestType, WidestType,
5019 MaxSafeScalableVF, FoldTailByMasking))
5020 if (MaxVF.isScalable()) {
5021 Result.ScalableVF = MaxVF;
5022 LLVM_DEBUG(dbgs() << "LV: Found feasible scalable VF = " << MaxVF
5023 << "\n");
5024 }
5025
5026 return Result;
5027 }
5028
5029 FixedScalableVFPair
5030 LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
5031 if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) {
5032 // TODO: It may be useful to do this, since the check is still likely to be
5033 // dynamically uniform if the target can skip it.
5034 reportVectorizationFailure(
5035 "Not inserting runtime ptr check for divergent target",
5036 "runtime pointer checks needed. Not enabled for divergent target",
5037 "CantVersionLoopWithDivergentTarget", ORE, TheLoop);
5038 return FixedScalableVFPair::getNone();
5039 }
5040
5041 unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
5042 LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n');
5043 if (TC == 1) {
5044 reportVectorizationFailure("Single iteration (non) loop",
5045 "loop trip count is one, irrelevant for vectorization",
5046 "SingleIterationLoop", ORE, TheLoop);
5047 return FixedScalableVFPair::getNone();
5048 }
5049
5050 switch (ScalarEpilogueStatus) {
5051 case CM_ScalarEpilogueAllowed:
5052 return computeFeasibleMaxVF(TC, UserVF, false);
5053 case CM_ScalarEpilogueNotAllowedUsePredicate:
5054 [[fallthrough]];
5055 case CM_ScalarEpilogueNotNeededUsePredicate:
5056 LLVM_DEBUG(
5057 dbgs() << "LV: vector predicate hint/switch found.\n"
5058 << "LV: Not allowing scalar epilogue, creating predicated "
5059 << "vector loop.\n");
5060 break;
5061 case CM_ScalarEpilogueNotAllowedLowTripLoop:
5062 // fallthrough as a special case of OptForSize
5063 case CM_ScalarEpilogueNotAllowedOptSize:
5064 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedOptSize)
5065 LLVM_DEBUG(
5066 dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n");
5067 else
5068 LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to low trip "
5069 << "count.\n");
5070
5071 // Bail if runtime checks are required, which are not good when optimising
5072 // for size.
5073 if (runtimeChecksRequired())
5074 return FixedScalableVFPair::getNone();
5075
5076 break;
5077 }
5078
5079 // The only loops we can vectorize without a scalar epilogue are loops with
5080 // a bottom-test and a single exiting block. We'd have to handle the fact
5081 // that not every instruction executes on the last iteration. This will
5082 // require a lane mask which varies through the vector loop body. (TODO)
5083 if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) {
5084 // If there was a tail-folding hint/switch, but we can't fold the tail by
5085 // masking, fallback to a vectorization with a scalar epilogue.
5086 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) {
5087 LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a "
5088 "scalar epilogue instead.\n");
5089 ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
5090 return computeFeasibleMaxVF(TC, UserVF, false);
5091 }
5092 return FixedScalableVFPair::getNone();
5093 }
5094
5095 // Now try the tail folding
5096
5097 // Invalidate interleave groups that require an epilogue if we can't mask
5098 // the interleave-group.
5099 if (!useMaskedInterleavedAccesses(TTI)) {
5100 assert(WideningDecisions.empty() && Uniforms.empty() && Scalars.empty() &&
5101 "No decisions should have been taken at this point");
5102 // Note: There is no need to invalidate any cost modeling decisions here, as
5103 // none were taken so far.
5104 InterleaveInfo.invalidateGroupsRequiringScalarEpilogue();
5105 }
5106
5107 FixedScalableVFPair MaxFactors = computeFeasibleMaxVF(TC, UserVF, true);
5108 // Avoid tail folding if the trip count is known to be a multiple of any VF
5109 // we chose.
5110 // FIXME: The condition below pessimises the case for fixed-width vectors,
5111 // when scalable VFs are also candidates for vectorization.
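// For example (illustrative numbers, not taken from this code): with a trip
// count known to be 64, MaxFixedVF = 8 and UserIC = 2, the exit count is
// divisible by MaxVFtimesIC = 16, so no tail remains and tail folding is
// unnecessary.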
5112 if (MaxFactors.FixedVF.isVector() && !MaxFactors.ScalableVF) {
5113 ElementCount MaxFixedVF = MaxFactors.FixedVF;
5114 assert((UserVF.isNonZero() || isPowerOf2_32(MaxFixedVF.getFixedValue())) &&
5115 "MaxFixedVF must be a power of 2");
5116 unsigned MaxVFtimesIC = UserIC ? MaxFixedVF.getFixedValue() * UserIC
5117 : MaxFixedVF.getFixedValue();
5118 ScalarEvolution *SE = PSE.getSE();
5119 const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount();
5120 const SCEV *ExitCount = SE->getAddExpr(
5121 BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType()));
5122 const SCEV *Rem = SE->getURemExpr(
5123 SE->applyLoopGuards(ExitCount, TheLoop),
5124 SE->getConstant(BackedgeTakenCount->getType(), MaxVFtimesIC));
5125 if (Rem->isZero()) {
5126 // Accept MaxFixedVF if we do not have a tail.
5127 LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n");
5128 return MaxFactors;
5129 }
5130 }
5131
5132 // If we don't know the precise trip count, or if the trip count that we
5133 // found modulo the vectorization factor is not zero, try to fold the tail
5134 // by masking.
5135 // FIXME: look for a smaller MaxVF that does divide TC rather than masking.
5136 if (Legal->prepareToFoldTailByMasking()) {
5137 FoldTailByMasking = true;
5138 return MaxFactors;
5139 }
5140
5141 // If there was a tail-folding hint/switch, but we can't fold the tail by
5142 // masking, fallback to a vectorization with a scalar epilogue.
5143 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) {
5144 LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a "
5145 "scalar epilogue instead.\n");
5146 ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
5147 return MaxFactors;
5148 }
5149
5150 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedUsePredicate) {
5151 LLVM_DEBUG(dbgs() << "LV: Can't fold tail by masking: don't vectorize\n");
5152 return FixedScalableVFPair::getNone();
5153 }
5154
5155 if (TC == 0) {
5156 reportVectorizationFailure(
5157 "Unable to calculate the loop count due to complex control flow",
5158 "unable to calculate the loop count due to complex control flow",
5159 "UnknownLoopCountComplexCFG", ORE, TheLoop);
5160 return FixedScalableVFPair::getNone();
5161 }
5162
5163 reportVectorizationFailure(
5164 "Cannot optimize for size and vectorize at the same time.",
5165 "cannot optimize for size and vectorize at the same time. "
5166 "Enable vectorization of this loop with '#pragma clang loop "
5167 "vectorize(enable)' when compiling with -Os/-Oz",
5168 "NoTailLoopWithOptForSize", ORE, TheLoop);
5169 return FixedScalableVFPair::getNone();
5170 }
5171
5172 ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget(
5173 unsigned ConstTripCount, unsigned SmallestType, unsigned WidestType,
5174 ElementCount MaxSafeVF, bool FoldTailByMasking) {
5175 bool ComputeScalableMaxVF = MaxSafeVF.isScalable();
5176 const TypeSize WidestRegister = TTI.getRegisterBitWidth(
5177 ComputeScalableMaxVF ? TargetTransformInfo::RGK_ScalableVector
5178 : TargetTransformInfo::RGK_FixedWidthVector);
5179
5180 // Convenience function to return the minimum of two ElementCounts.
5181 auto MinVF = [](const ElementCount &LHS, const ElementCount &RHS) {
5182 assert((LHS.isScalable() == RHS.isScalable()) &&
5183 "Scalable flags must match");
5184 return ElementCount::isKnownLT(LHS, RHS) ? LHS : RHS;
5185 };
5186
5187 // Ensure MaxVF is a power of 2; the dependence distance bound may not be.
5188 // Note that both WidestRegister and WidestType may not be powers of 2.
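// For example (illustrative): a 128-bit fixed-width register with
// WidestType = i32 gives PowerOf2Floor(128 / 32) = 4 lanes, before clamping
// against MaxSafeVF below.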
5189 auto MaxVectorElementCount = ElementCount::get(
5190 PowerOf2Floor(WidestRegister.getKnownMinValue() / WidestType),
5191 ComputeScalableMaxVF);
5192 MaxVectorElementCount = MinVF(MaxVectorElementCount, MaxSafeVF);
5193 LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: "
5194 << (MaxVectorElementCount * WidestType) << " bits.\n");
5195
5196 if (!MaxVectorElementCount) {
5197 LLVM_DEBUG(dbgs() << "LV: The target has no "
5198 << (ComputeScalableMaxVF ? "scalable" : "fixed")
5199 << " vector registers.\n");
5200 return ElementCount::getFixed(1);
5201 }
5202
5203 unsigned WidestRegisterMinEC = MaxVectorElementCount.getKnownMinValue();
5204 if (MaxVectorElementCount.isScalable() &&
5205 TheFunction->hasFnAttribute(Attribute::VScaleRange)) {
5206 auto Attr = TheFunction->getFnAttribute(Attribute::VScaleRange);
5207 auto Min = Attr.getVScaleRangeMin();
5208 WidestRegisterMinEC *= Min;
5209 }
5210 if (ConstTripCount && ConstTripCount <= WidestRegisterMinEC &&
5211 (!FoldTailByMasking || isPowerOf2_32(ConstTripCount))) {
5212 // If loop trip count (TC) is known at compile time there is no point in
5213 // choosing VF greater than TC (as done in the loop below). Select maximum
5214 // power of two which doesn't exceed TC.
5215 // If MaxVectorElementCount is scalable, we only fall back on a fixed VF
5216 // when the TC is less than or equal to the known number of lanes.
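// For example (illustrative): a constant trip count of 5 with a maximum of 8
// lanes clamps the chosen VF down to PowerOf2Floor(5) = 4 (assuming the tail
// is not folded by masking).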
5217 auto ClampedConstTripCount = PowerOf2Floor(ConstTripCount);
5218 LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to maximum power of two not "
5219 "exceeding the constant trip count: "
5220 << ClampedConstTripCount << "\n");
5221 return ElementCount::getFixed(ClampedConstTripCount);
5222 }
5223
5224 TargetTransformInfo::RegisterKind RegKind =
5225 ComputeScalableMaxVF ? TargetTransformInfo::RGK_ScalableVector
5226 : TargetTransformInfo::RGK_FixedWidthVector;
5227 ElementCount MaxVF = MaxVectorElementCount;
5228 if (MaximizeBandwidth || (MaximizeBandwidth.getNumOccurrences() == 0 &&
5229 TTI.shouldMaximizeVectorBandwidth(RegKind))) {
5230 auto MaxVectorElementCountMaxBW = ElementCount::get(
5231 PowerOf2Floor(WidestRegister.getKnownMinValue() / SmallestType),
5232 ComputeScalableMaxVF);
5233 MaxVectorElementCountMaxBW = MinVF(MaxVectorElementCountMaxBW, MaxSafeVF);
5234
5235 // Collect all viable vectorization factors larger than the default MaxVF
5236 // (i.e. MaxVectorElementCount).
5237 SmallVector<ElementCount, 8> VFs;
5238 for (ElementCount VS = MaxVectorElementCount * 2;
5239 ElementCount::isKnownLE(VS, MaxVectorElementCountMaxBW); VS *= 2)
5240 VFs.push_back(VS);
5241
5242 // For each VF calculate its register usage.
5243 auto RUs = calculateRegisterUsage(VFs);
5244
5245 // Select the largest VF which doesn't require more registers than existing
5246 // ones.
5247 for (int i = RUs.size() - 1; i >= 0; --i) {
5248 bool Selected = true;
5249 for (auto &pair : RUs[i].MaxLocalUsers) {
5250 unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first);
5251 if (pair.second > TargetNumRegisters)
5252 Selected = false;
5253 }
5254 if (Selected) {
5255 MaxVF = VFs[i];
5256 break;
5257 }
5258 }
5259 if (ElementCount MinVF =
5260 TTI.getMinimumVF(SmallestType, ComputeScalableMaxVF)) {
5261 if (ElementCount::isKnownLT(MaxVF, MinVF)) {
5262 LLVM_DEBUG(dbgs() << "LV: Overriding calculated MaxVF(" << MaxVF
5263 << ") with target's minimum: " << MinVF << '\n');
5264 MaxVF = MinVF;
5265 }
5266 }
5267
5268 // Invalidate any widening decisions we might have made, in case the loop
5269 // requires prediction (decided later), but we have already made some
5270 // load/store widening decisions.
5271 invalidateCostModelingDecisions();
5272 }
5273 return MaxVF;
5274 }
5275
5276 std::optional<unsigned> LoopVectorizationCostModel::getVScaleForTuning() const {
5277 if (TheFunction->hasFnAttribute(Attribute::VScaleRange)) {
5278 auto Attr = TheFunction->getFnAttribute(Attribute::VScaleRange);
5279 auto Min = Attr.getVScaleRangeMin();
5280 auto Max = Attr.getVScaleRangeMax();
5281 if (Max && Min == Max)
5282 return Max;
5283 }
5284
5285 return TTI.getVScaleForTuning();
5286 }
5287
5288 bool LoopVectorizationCostModel::isMoreProfitable(
5289 const VectorizationFactor &A, const VectorizationFactor &B) const {
5290 InstructionCost CostA = A.Cost;
5291 InstructionCost CostB = B.Cost;
5292
5293 unsigned MaxTripCount = PSE.getSE()->getSmallConstantMaxTripCount(TheLoop);
5294
5295 if (!A.Width.isScalable() && !B.Width.isScalable() && FoldTailByMasking &&
5296 MaxTripCount) {
5297 // If we are folding the tail and the trip count is a known (possibly small)
5298 // constant, the trip count will be rounded up to an integer number of
5299 // iterations. The total cost will be PerIterationCost*ceil(TripCount/VF),
5300 // which we compare directly. When not folding the tail, the total cost will
5301 // be PerIterationCost*floor(TC/VF) + Scalar remainder cost, and so is
5302 // approximated with the per-lane cost below instead of using the tripcount
5303 // as here.
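// For example (illustrative): with MaxTripCount = 10, a VF-4 plan costing 8
// needs ceil(10/4) = 3 iterations (total 24), while a VF-8 plan costing 14
// needs ceil(10/8) = 2 iterations (total 28), so the VF-4 plan is preferred.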
5304 auto RTCostA = CostA * divideCeil(MaxTripCount, A.Width.getFixedValue());
5305 auto RTCostB = CostB * divideCeil(MaxTripCount, B.Width.getFixedValue());
5306 return RTCostA < RTCostB;
5307 }
5308
5309 // Improve estimate for the vector width if it is scalable.
5310 unsigned EstimatedWidthA = A.Width.getKnownMinValue();
5311 unsigned EstimatedWidthB = B.Width.getKnownMinValue();
5312 if (std::optional<unsigned> VScale = getVScaleForTuning()) {
5313 if (A.Width.isScalable())
5314 EstimatedWidthA *= *VScale;
5315 if (B.Width.isScalable())
5316 EstimatedWidthB *= *VScale;
5317 }
5318
5319 // Assume vscale may be larger than 1 (or the value being tuned for),
5320 // so that scalable vectorization is slightly favorable over fixed-width
5321 // vectorization.
5322 if (A.Width.isScalable() && !B.Width.isScalable())
5323 return (CostA * B.Width.getFixedValue()) <= (CostB * EstimatedWidthA);
5324
5325 // To avoid the need for FP division:
5326 // (CostA / A.Width) < (CostB / B.Width)
5327 // <=> (CostA * B.Width) < (CostB * A.Width)
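// For example (illustrative): CostA = 4 at width 8 versus CostB = 3 at
// width 4 gives 4 * 4 = 16 < 3 * 8 = 24 (i.e. 0.5 vs 0.75 per lane), so A is
// considered more profitable.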
5328 return (CostA * EstimatedWidthB) < (CostB * EstimatedWidthA);
5329 }
5330
5331 VectorizationFactor LoopVectorizationCostModel::selectVectorizationFactor(
5332 const ElementCountSet &VFCandidates) {
5333 InstructionCost ExpectedCost = expectedCost(ElementCount::getFixed(1)).first;
5334 LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << ExpectedCost << ".\n");
5335 assert(ExpectedCost.isValid() && "Unexpected invalid cost for scalar loop");
5336 assert(VFCandidates.count(ElementCount::getFixed(1)) &&
5337 "Expected Scalar VF to be a candidate");
5338
5339 const VectorizationFactor ScalarCost(ElementCount::getFixed(1), ExpectedCost,
5340 ExpectedCost);
5341 VectorizationFactor ChosenFactor = ScalarCost;
5342
5343 bool ForceVectorization = Hints->getForce() == LoopVectorizeHints::FK_Enabled;
5344 if (ForceVectorization && VFCandidates.size() > 1) {
5345 // Ignore scalar width, because the user explicitly wants vectorization.
5346 // Initialize cost to max so that VF = 2 is, at least, chosen during cost
5347 // evaluation.
5348 ChosenFactor.Cost = InstructionCost::getMax();
5349 }
5350
5351 SmallVector<InstructionVFPair> InvalidCosts;
5352 for (const auto &i : VFCandidates) {
5353 // The cost for scalar VF=1 is already calculated, so ignore it.
5354 if (i.isScalar())
5355 continue;
5356
5357 VectorizationCostTy C = expectedCost(i, &InvalidCosts);
5358 VectorizationFactor Candidate(i, C.first, ScalarCost.ScalarCost);
5359
5360 #ifndef NDEBUG
5361 unsigned AssumedMinimumVscale = 1;
5362 if (std::optional<unsigned> VScale = getVScaleForTuning())
5363 AssumedMinimumVscale = *VScale;
5364 unsigned Width =
5365 Candidate.Width.isScalable()
5366 ? Candidate.Width.getKnownMinValue() * AssumedMinimumVscale
5367 : Candidate.Width.getFixedValue();
5368 LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << i
5369 << " costs: " << (Candidate.Cost / Width));
5370 if (i.isScalable())
5371 LLVM_DEBUG(dbgs() << " (assuming a minimum vscale of "
5372 << AssumedMinimumVscale << ")");
5373 LLVM_DEBUG(dbgs() << ".\n");
5374 #endif
5375
5376 if (!C.second && !ForceVectorization) {
5377 LLVM_DEBUG(
5378 dbgs() << "LV: Not considering vector loop of width " << i
5379 << " because it will not generate any vector instructions.\n");
5380 continue;
5381 }
5382
5383 // If profitable add it to ProfitableVF list.
5384 if (isMoreProfitable(Candidate, ScalarCost))
5385 ProfitableVFs.push_back(Candidate);
5386
5387 if (isMoreProfitable(Candidate, ChosenFactor))
5388 ChosenFactor = Candidate;
5389 }
5390
5391 // Emit a report of VFs with invalid costs in the loop.
5392 if (!InvalidCosts.empty()) {
5393 // Group the remarks per instruction, keeping the instruction order from
5394 // InvalidCosts.
5395 std::map<Instruction *, unsigned> Numbering;
5396 unsigned I = 0;
5397 for (auto &Pair : InvalidCosts)
5398 if (!Numbering.count(Pair.first))
5399 Numbering[Pair.first] = I++;
5400
5401 // Sort the list, first on instruction(number) then on VF.
5402 llvm::sort(InvalidCosts,
5403 [&Numbering](InstructionVFPair &A, InstructionVFPair &B) {
5404 if (Numbering[A.first] != Numbering[B.first])
5405 return Numbering[A.first] < Numbering[B.first];
5406 ElementCountComparator ECC;
5407 return ECC(A.second, B.second);
5408 });
5409
5410 // For a list of ordered instruction-vf pairs:
5411 // [(load, vf1), (load, vf2), (store, vf1)]
5412 // Group the instructions together to emit separate remarks for:
5413 // load (vf1, vf2)
5414 // store (vf1)
5415 auto Tail = ArrayRef<InstructionVFPair>(InvalidCosts);
5416 auto Subset = ArrayRef<InstructionVFPair>();
5417 do {
5418 if (Subset.empty())
5419 Subset = Tail.take_front(1);
5420
5421 Instruction *I = Subset.front().first;
5422
5423 // If the next instruction is different, or if there are no other pairs,
5424 // emit a remark for the collated subset. e.g.
5425 // [(load, vf1), (load, vf2)]
5426 // to emit:
5427 // remark: invalid costs for 'load' at VF=(vf1, vf2)
5428 if (Subset == Tail || Tail[Subset.size()].first != I) {
5429 std::string OutString;
5430 raw_string_ostream OS(OutString);
5431 assert(!Subset.empty() && "Unexpected empty range");
5432 OS << "Instruction with invalid costs prevented vectorization at VF=(";
5433 for (const auto &Pair : Subset)
5434 OS << (Pair.second == Subset.front().second ? "" : ", ")
5435 << Pair.second;
5436 OS << "):";
5437 if (auto *CI = dyn_cast<CallInst>(I))
5438 OS << " call to " << CI->getCalledFunction()->getName();
5439 else
5440 OS << " " << I->getOpcodeName();
5441 OS.flush();
5442 reportVectorizationInfo(OutString, "InvalidCost", ORE, TheLoop, I);
5443 Tail = Tail.drop_front(Subset.size());
5444 Subset = {};
5445 } else
5446 // Grow the subset by one element
5447 Subset = Tail.take_front(Subset.size() + 1);
5448 } while (!Tail.empty());
5449 }
5450
5451 if (!EnableCondStoresVectorization && NumPredStores) {
5452 reportVectorizationFailure("There are conditional stores.",
5453 "store that is conditionally executed prevents vectorization",
5454 "ConditionalStore", ORE, TheLoop);
5455 ChosenFactor = ScalarCost;
5456 }
5457
5458 LLVM_DEBUG(if (ForceVectorization && !ChosenFactor.Width.isScalar() &&
5459 !isMoreProfitable(ChosenFactor, ScalarCost)) dbgs()
5460 << "LV: Vectorization seems to be not beneficial, "
5461 << "but was forced by a user.\n");
5462 LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << ChosenFactor.Width << ".\n");
5463 return ChosenFactor;
5464 }
5465
5466 bool LoopVectorizationCostModel::isCandidateForEpilogueVectorization(
5467 const Loop &L, ElementCount VF) const {
5468 // Cross iteration phis such as reductions need special handling and are
5469 // currently unsupported.
5470 if (any_of(L.getHeader()->phis(),
5471 [&](PHINode &Phi) { return Legal->isFixedOrderRecurrence(&Phi); }))
5472 return false;
5473
5474 // Phis with uses outside of the loop require special handling and are
5475 // currently unsupported.
5476 for (const auto &Entry : Legal->getInductionVars()) {
5477 // Look for uses of the value of the induction at the last iteration.
5478 Value *PostInc = Entry.first->getIncomingValueForBlock(L.getLoopLatch());
5479 for (User *U : PostInc->users())
5480 if (!L.contains(cast<Instruction>(U)))
5481 return false;
5482 // Look for uses of penultimate value of the induction.
5483 for (User *U : Entry.first->users())
5484 if (!L.contains(cast<Instruction>(U)))
5485 return false;
5486 }
5487
5488 // Epilogue vectorization code has not been audited to ensure it handles
5489 // non-latch exits properly. It may be fine, but it needs to be audited and
5490 // tested.
5491 if (L.getExitingBlock() != L.getLoopLatch())
5492 return false;
5493
5494 return true;
5495 }
5496
5497 bool LoopVectorizationCostModel::isEpilogueVectorizationProfitable(
5498 const ElementCount VF) const {
5499 // FIXME: We need a much better cost-model to take different parameters such
5500 // as register pressure, code size increase and cost of extra branches into
5501 // account. For now we apply a very crude heuristic and only consider loops
5502 // with vectorization factors larger than a certain value.
5503
5504 // Allow the target to opt out entirely.
5505 if (!TTI.preferEpilogueVectorization())
5506 return false;
5507
5508 // We also consider epilogue vectorization unprofitable for targets that don't
5509 // consider interleaving beneficial (e.g. MVE).
5510 if (TTI.getMaxInterleaveFactor(VF.getKnownMinValue()) <= 1)
5511 return false;
5512 // FIXME: We should consider changing the threshold for scalable
5513 // vectors to take VScaleForTuning into account.
5514 if (VF.getKnownMinValue() >= EpilogueVectorizationMinVF)
5515 return true;
5516 return false;
5517 }
5518
5519 VectorizationFactor
5520 LoopVectorizationCostModel::selectEpilogueVectorizationFactor(
5521 const ElementCount MainLoopVF, const LoopVectorizationPlanner &LVP) {
5522 VectorizationFactor Result = VectorizationFactor::Disabled();
5523 if (!EnableEpilogueVectorization) {
5524 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is disabled.\n";);
5525 return Result;
5526 }
5527
5528 if (!isScalarEpilogueAllowed()) {
5529 LLVM_DEBUG(
5530 dbgs() << "LEV: Unable to vectorize epilogue because no epilogue is "
5531 "allowed.\n";);
5532 return Result;
5533 }
5534
5535 // Not really a cost consideration, but check for unsupported cases here to
5536 // simplify the logic.
5537 if (!isCandidateForEpilogueVectorization(*TheLoop, MainLoopVF)) {
5538 LLVM_DEBUG(
5539 dbgs() << "LEV: Unable to vectorize epilogue because the loop is "
5540 "not a supported candidate.\n";);
5541 return Result;
5542 }
5543
5544 if (EpilogueVectorizationForceVF > 1) {
5545 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization factor is forced.\n";);
5546 ElementCount ForcedEC = ElementCount::getFixed(EpilogueVectorizationForceVF);
5547 if (LVP.hasPlanWithVF(ForcedEC))
5548 return {ForcedEC, 0, 0};
5549 else {
5550 LLVM_DEBUG(
5551 dbgs()
5552 << "LEV: Epilogue vectorization forced factor is not viable.\n";);
5553 return Result;
5554 }
5555 }
5556
5557 if (TheLoop->getHeader()->getParent()->hasOptSize() ||
5558 TheLoop->getHeader()->getParent()->hasMinSize()) {
5559 LLVM_DEBUG(
5560 dbgs()
5561 << "LEV: Epilogue vectorization skipped due to opt for size.\n";);
5562 return Result;
5563 }
5564
5565 if (!isEpilogueVectorizationProfitable(MainLoopVF)) {
5566 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is not profitable for "
5567 "this loop\n");
5568 return Result;
5569 }
5570
5571 // If MainLoopVF = vscale x 2, and vscale is expected to be 4, then we know
5572 // the main loop handles 8 lanes per iteration. We could still benefit from
5573 // vectorizing the epilogue loop with VF=4.
5574 ElementCount EstimatedRuntimeVF = MainLoopVF;
5575 if (MainLoopVF.isScalable()) {
5576 EstimatedRuntimeVF = ElementCount::getFixed(MainLoopVF.getKnownMinValue());
5577 if (std::optional<unsigned> VScale = getVScaleForTuning())
5578 EstimatedRuntimeVF *= *VScale;
5579 }
5580
5581 for (auto &NextVF : ProfitableVFs)
5582 if (((!NextVF.Width.isScalable() && MainLoopVF.isScalable() &&
5583 ElementCount::isKnownLT(NextVF.Width, EstimatedRuntimeVF)) ||
5584 ElementCount::isKnownLT(NextVF.Width, MainLoopVF)) &&
5585 (Result.Width.isScalar() || isMoreProfitable(NextVF, Result)) &&
5586 LVP.hasPlanWithVF(NextVF.Width))
5587 Result = NextVF;
5588
5589 if (Result != VectorizationFactor::Disabled())
5590 LLVM_DEBUG(dbgs() << "LEV: Vectorizing epilogue loop with VF = "
5591 << Result.Width << "\n";);
5592 return Result;
5593 }
5594
5595 std::pair<unsigned, unsigned>
5596 LoopVectorizationCostModel::getSmallestAndWidestTypes() {
5597 unsigned MinWidth = -1U;
5598 unsigned MaxWidth = 8;
5599 const DataLayout &DL = TheFunction->getParent()->getDataLayout();
5600 // For in-loop reductions, no element types are added to ElementTypesInLoop
5601 // if there are no loads/stores in the loop. In this case, check through the
5602 // reduction variables to determine the maximum width.
5603 if (ElementTypesInLoop.empty() && !Legal->getReductionVars().empty()) {
5604 // Reset MaxWidth so that we can find the smallest type used by recurrences
5605 // in the loop.
5606 MaxWidth = -1U;
5607 for (const auto &PhiDescriptorPair : Legal->getReductionVars()) {
5608 const RecurrenceDescriptor &RdxDesc = PhiDescriptorPair.second;
5609 // When finding the min width used by the recurrence we need to account
5610 // for casts on the input operands of the recurrence.
5611 MaxWidth = std::min<unsigned>(
5612 MaxWidth, std::min<unsigned>(
5613 RdxDesc.getMinWidthCastToRecurrenceTypeInBits(),
5614 RdxDesc.getRecurrenceType()->getScalarSizeInBits()));
5615 }
5616 } else {
5617 for (Type *T : ElementTypesInLoop) {
5618 MinWidth = std::min<unsigned>(
5619 MinWidth, DL.getTypeSizeInBits(T->getScalarType()).getFixedValue());
5620 MaxWidth = std::max<unsigned>(
5621 MaxWidth, DL.getTypeSizeInBits(T->getScalarType()).getFixedValue());
5622 }
5623 }
5624 return {MinWidth, MaxWidth};
5625 }
5626
5627 void LoopVectorizationCostModel::collectElementTypesForWidening() {
5628 ElementTypesInLoop.clear();
5629 // For each block.
5630 for (BasicBlock *BB : TheLoop->blocks()) {
5631 // For each instruction in the loop.
5632 for (Instruction &I : BB->instructionsWithoutDebug()) {
5633 Type *T = I.getType();
5634
5635 // Skip ignored values.
5636 if (ValuesToIgnore.count(&I))
5637 continue;
5638
5639 // Only examine Loads, Stores and PHINodes.
5640 if (!isa<LoadInst>(I) && !isa<StoreInst>(I) && !isa<PHINode>(I))
5641 continue;
5642
5643 // Examine PHI nodes that are reduction variables. Update the type to
5644 // account for the recurrence type.
5645 if (auto *PN = dyn_cast<PHINode>(&I)) {
5646 if (!Legal->isReductionVariable(PN))
5647 continue;
5648 const RecurrenceDescriptor &RdxDesc =
5649 Legal->getReductionVars().find(PN)->second;
5650 if (PreferInLoopReductions || useOrderedReductions(RdxDesc) ||
5651 TTI.preferInLoopReduction(RdxDesc.getOpcode(),
5652 RdxDesc.getRecurrenceType(),
5653 TargetTransformInfo::ReductionFlags()))
5654 continue;
5655 T = RdxDesc.getRecurrenceType();
5656 }
5657
5658 // Examine the stored values.
5659 if (auto *ST = dyn_cast<StoreInst>(&I))
5660 T = ST->getValueOperand()->getType();
5661
5662 assert(T->isSized() &&
5663 "Expected the load/store/recurrence type to be sized");
5664
5665 ElementTypesInLoop.insert(T);
5666 }
5667 }
5668 }
5669
5670 unsigned
5671 LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF,
5672 InstructionCost LoopCost) {
5673 // -- The interleave heuristics --
5674 // We interleave the loop in order to expose ILP and reduce the loop overhead.
5675 // There are many micro-architectural considerations that we can't predict
5676 // at this level. For example, frontend pressure (on decode or fetch) due to
5677 // code size, or the number and capabilities of the execution ports.
5678 //
5679 // We use the following heuristics to select the interleave count:
5680 // 1. If the code has reductions, then we interleave to break the cross
5681 // iteration dependency.
5682 // 2. If the loop is really small, then we interleave to reduce the loop
5683 // overhead.
5684 // 3. We don't interleave if we think that we will spill registers to memory
5685 // due to the increased register pressure.
5686
5687 if (!isScalarEpilogueAllowed())
5688 return 1;
5689
5690 // We used the distance for the interleave count.
5691 if (Legal->getMaxSafeDepDistBytes() != -1U)
5692 return 1;
5693
5694 auto BestKnownTC = getSmallBestKnownTC(*PSE.getSE(), TheLoop);
5695 const bool HasReductions = !Legal->getReductionVars().empty();
5696 // Do not interleave loops with a relatively small known or estimated trip
5697 // count. But we will interleave when InterleaveSmallLoopScalarReduction is
5698 // enabled, and the code has scalar reductions (HasReductions && VF == 1),
5699 // because with the above conditions interleaving can expose ILP and break
5700 // cross iteration dependences for reductions.
5701 if (BestKnownTC && (*BestKnownTC < TinyTripCountInterleaveThreshold) &&
5702 !(InterleaveSmallLoopScalarReduction && HasReductions && VF.isScalar()))
5703 return 1;
5704
5705 // If we did not calculate the cost for VF (because the user selected the VF)
5706 // then we calculate the cost of VF here.
5707 if (LoopCost == 0) {
5708 LoopCost = expectedCost(VF).first;
5709 assert(LoopCost.isValid() && "Expected to have chosen a VF with valid cost");
5710
5711 // Loop body is free and there is no need for interleaving.
5712 if (LoopCost == 0)
5713 return 1;
5714 }
5715
5716 RegisterUsage R = calculateRegisterUsage({VF})[0];
5717 // We divide by these constants so assume that we have at least one
5718 // instruction that uses at least one register.
5719 for (auto& pair : R.MaxLocalUsers) {
5720 pair.second = std::max(pair.second, 1U);
5721 }
5722
5723 // We calculate the interleave count using the following formula.
5724 // Subtract the number of loop invariants from the number of available
5725 // registers. These registers are used by all of the interleaved instances.
5726 // Next, divide the remaining registers by the number of registers that is
5727 // required by the loop, in order to estimate how many parallel instances
5728 // fit without causing spills. All of this is rounded down if necessary to be
5729 // a power of two. We want power of two interleave count to simplify any
5730 // addressing operations or alignment considerations.
5731 // We also want power of two interleave counts to ensure that the induction
5732 // variable of the vector loop wraps to zero, when tail is folded by masking;
5733 // this currently happens when OptForSize, in which case IC is set to 1 above.
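// For example (illustrative): with 32 registers in a class, 2 loop-invariant
// values and at most 6 simultaneously live values, the estimate below (before
// the induction-variable adjustment) is PowerOf2Floor((32 - 2) / 6) = 4.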
5734 unsigned IC = UINT_MAX;
5735
5736 for (auto& pair : R.MaxLocalUsers) {
5737 unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first);
5738 LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters
5739 << " registers of "
5740 << TTI.getRegisterClassName(pair.first) << " register class\n");
5741 if (VF.isScalar()) {
5742 if (ForceTargetNumScalarRegs.getNumOccurrences() > 0)
5743 TargetNumRegisters = ForceTargetNumScalarRegs;
5744 } else {
5745 if (ForceTargetNumVectorRegs.getNumOccurrences() > 0)
5746 TargetNumRegisters = ForceTargetNumVectorRegs;
5747 }
5748 unsigned MaxLocalUsers = pair.second;
5749 unsigned LoopInvariantRegs = 0;
5750 if (R.LoopInvariantRegs.find(pair.first) != R.LoopInvariantRegs.end())
5751 LoopInvariantRegs = R.LoopInvariantRegs[pair.first];
5752
5753 unsigned TmpIC = PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs) / MaxLocalUsers);
5754 // Don't count the induction variable as interleaved.
5755 if (EnableIndVarRegisterHeur) {
5756 TmpIC =
5757 PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs - 1) /
5758 std::max(1U, (MaxLocalUsers - 1)));
5759 }
5760
5761 IC = std::min(IC, TmpIC);
5762 }
5763
5764 // Clamp the interleave ranges to reasonable counts.
5765 unsigned MaxInterleaveCount =
5766 TTI.getMaxInterleaveFactor(VF.getKnownMinValue());
5767
5768 // Check if the user has overridden the max.
5769 if (VF.isScalar()) {
5770 if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0)
5771 MaxInterleaveCount = ForceTargetMaxScalarInterleaveFactor;
5772 } else {
5773 if (ForceTargetMaxVectorInterleaveFactor.getNumOccurrences() > 0)
5774 MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor;
5775 }
5776
5777 // If trip count is known or estimated compile time constant, limit the
5778 // interleave count to be less than the trip count divided by VF, provided it
5779 // is at least 1.
5780 //
5781 // For scalable vectors we can't know if interleaving is beneficial. It may
5782 // not be beneficial for small loops if none of the lanes in the second vector
5783 // iteration is enabled. However, for larger loops, there is likely to be a
5784 // similar benefit as for fixed-width vectors. For now, we choose to leave
5785 // the InterleaveCount as if vscale is '1', although if some information about
5786 // the vector is known (e.g. min vector size), we can make a better decision.
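// For example (illustrative): an estimated trip count of 24 with VF = 8
// limits the interleave count to 24 / 8 = 3, even if the target would allow
// more.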
5787 if (BestKnownTC) {
5788 MaxInterleaveCount =
5789 std::min(*BestKnownTC / VF.getKnownMinValue(), MaxInterleaveCount);
5790 // Make sure MaxInterleaveCount is greater than 0.
5791 MaxInterleaveCount = std::max(1u, MaxInterleaveCount);
5792 }
5793
5794 assert(MaxInterleaveCount > 0 &&
5795 "Maximum interleave count must be greater than 0");
5796
5797 // Clamp the calculated IC to be between 1 and the max interleave count
5798 // that the target and trip count allows.
5799 if (IC > MaxInterleaveCount)
5800 IC = MaxInterleaveCount;
5801 else
5802 // Make sure IC is greater than 0.
5803 IC = std::max(1u, IC);
5804
5805 assert(IC > 0 && "Interleave count must be greater than 0.");
5806
5807 // Interleave if we vectorized this loop and there is a reduction that could
5808 // benefit from interleaving.
5809 if (VF.isVector() && HasReductions) {
5810 LLVM_DEBUG(dbgs() << "LV: Interleaving because of reductions.\n");
5811 return IC;
5812 }
5813
5814 // For any scalar loop that either requires runtime checks or predication we
5815 // are better off leaving this to the unroller. Note that if we've already
5816 // vectorized the loop we will have done the runtime check and so interleaving
5817 // won't require further checks.
5818 bool ScalarInterleavingRequiresPredication =
5819 (VF.isScalar() && any_of(TheLoop->blocks(), [this](BasicBlock *BB) {
5820 return Legal->blockNeedsPredication(BB);
5821 }));
5822 bool ScalarInterleavingRequiresRuntimePointerCheck =
5823 (VF.isScalar() && Legal->getRuntimePointerChecking()->Need);
5824
5825 // We want to interleave small loops in order to reduce the loop overhead and
5826 // potentially expose ILP opportunities.
5827 LLVM_DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n'
5828 << "LV: IC is " << IC << '\n'
5829 << "LV: VF is " << VF << '\n');
5830 const bool AggressivelyInterleaveReductions =
5831 TTI.enableAggressiveInterleaving(HasReductions);
5832 if (!ScalarInterleavingRequiresRuntimePointerCheck &&
5833 !ScalarInterleavingRequiresPredication && LoopCost < SmallLoopCost) {
5834 // We assume that the cost overhead is 1 and we use the cost model
5835 // to estimate the cost of the loop and interleave until the cost of the
5836 // loop overhead is about 5% of the cost of the loop.
5837 unsigned SmallIC = std::min(
5838 IC, (unsigned)PowerOf2Floor(SmallLoopCost / *LoopCost.getValue()));
5839
5840 // Interleave until store/load ports (estimated by max interleave count) are
5841 // saturated.
5842 unsigned NumStores = Legal->getNumStores();
5843 unsigned NumLoads = Legal->getNumLoads();
5844 unsigned StoresIC = IC / (NumStores ? NumStores : 1);
5845 unsigned LoadsIC = IC / (NumLoads ? NumLoads : 1);
5846
5847 // There is little point in interleaving for reductions containing selects
5848 // and compares when VF=1 since it may just create more overhead than it's
5849 // worth for loops with small trip counts. This is because we still have to
5850 // do the final reduction after the loop.
5851 bool HasSelectCmpReductions =
5852 HasReductions &&
5853 any_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool {
5854 const RecurrenceDescriptor &RdxDesc = Reduction.second;
5855 return RecurrenceDescriptor::isSelectCmpRecurrenceKind(
5856 RdxDesc.getRecurrenceKind());
5857 });
5858 if (HasSelectCmpReductions) {
5859 LLVM_DEBUG(dbgs() << "LV: Not interleaving select-cmp reductions.\n");
5860 return 1;
5861 }
5862
5863 // If we have a scalar reduction (vector reductions are already dealt with
5864 // by this point), we can increase the critical path length if the loop
5865 // we're interleaving is inside another loop. For tree-wise reductions
5866 // set the limit to 2, and for ordered reductions it's best to disable
5867 // interleaving entirely.
5868 if (HasReductions && TheLoop->getLoopDepth() > 1) {
5869 bool HasOrderedReductions =
5870 any_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool {
5871 const RecurrenceDescriptor &RdxDesc = Reduction.second;
5872 return RdxDesc.isOrdered();
5873 });
5874 if (HasOrderedReductions) {
5875 LLVM_DEBUG(
5876 dbgs() << "LV: Not interleaving scalar ordered reductions.\n");
5877 return 1;
5878 }
5879
5880 unsigned F = static_cast<unsigned>(MaxNestedScalarReductionIC);
5881 SmallIC = std::min(SmallIC, F);
5882 StoresIC = std::min(StoresIC, F);
5883 LoadsIC = std::min(LoadsIC, F);
5884 }
5885
5886 if (EnableLoadStoreRuntimeInterleave &&
5887 std::max(StoresIC, LoadsIC) > SmallIC) {
5888 LLVM_DEBUG(
5889 dbgs() << "LV: Interleaving to saturate store or load ports.\n");
5890 return std::max(StoresIC, LoadsIC);
5891 }
5892
5893 // If there are scalar reductions and TTI has enabled aggressive
5894 // interleaving for reductions, we will interleave to expose ILP.
5895 if (InterleaveSmallLoopScalarReduction && VF.isScalar() &&
5896 AggressivelyInterleaveReductions) {
5897 LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
5898 // Interleave no less than SmallIC but not as aggressive as the normal IC
5899 // to satisfy the rare situation when resources are too limited.
5900 return std::max(IC / 2, SmallIC);
5901 } else {
5902 LLVM_DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n");
5903 return SmallIC;
5904 }
5905 }
5906
5907 // Interleave if this is a large loop (small loops are already dealt with by
5908 // this point) that could benefit from interleaving.
5909 if (AggressivelyInterleaveReductions) {
5910 LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
5911 return IC;
5912 }
5913
5914 LLVM_DEBUG(dbgs() << "LV: Not Interleaving.\n");
5915 return 1;
5916 }
5917
5918 SmallVector<LoopVectorizationCostModel::RegisterUsage, 8>
5919 LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<ElementCount> VFs) {
5920 // This function calculates the register usage by measuring the highest number
5921 // of values that are alive at a single location. Obviously, this is a very
5922 // rough estimation. We scan the loop in topological order and
5923 // assign a number to each instruction. We use RPO to ensure that defs are
5924 // met before their users. We assume that each instruction that has in-loop
5925 // users starts an interval. We record every time that an in-loop value is
5926 // used, so we have a list of the first and last occurrences of each
5927 // instruction. Next, we transpose this data structure into a multi map that
5928 // holds the list of intervals that *end* at a specific location. This multi
5929 // map allows us to perform a linear search. We scan the instructions linearly
5930 // and record each time that a new interval starts, by placing it in a set.
5931 // If we find this value in the multi-map then we remove it from the set.
5932 // The max register usage is the maximum size of the set.
5933 // We also search for instructions that are defined outside the loop, but are
5934 // used inside the loop. We need this number separately from the max-interval
5935 // usage number because when we unroll, loop-invariant values do not take
5936 // more registers.
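// As a rough illustration (not taken from any particular loop): for the chain
//   %a = load ...; %b = add %a, 1; store %b
// the interval for %a ends at its use in %b, so %a and %b are never counted
// as live at the same point and the maximum local usage is one register.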
5937 LoopBlocksDFS DFS(TheLoop);
5938 DFS.perform(LI);
5939
5940 RegisterUsage RU;
5941
5942 // Each 'key' in the map opens a new interval. The values
5943 // of the map are the index of the 'last seen' usage of the
5944 // instruction that is the key.
5945 using IntervalMap = DenseMap<Instruction *, unsigned>;
5946
5947 // Maps instruction to its index.
5948 SmallVector<Instruction *, 64> IdxToInstr;
5949 // Marks the end of each interval.
5950 IntervalMap EndPoint;
5951 // Saves the list of instruction indices that are used in the loop.
5952 SmallPtrSet<Instruction *, 8> Ends;
5953 // Saves the list of values that are used in the loop but are defined outside
5954 // the loop (not including non-instruction values such as arguments and
5955 // constants).
5956 SmallPtrSet<Instruction *, 8> LoopInvariants;
5957
5958 for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
5959 for (Instruction &I : BB->instructionsWithoutDebug()) {
5960 IdxToInstr.push_back(&I);
5961
5962 // Save the end location of each USE.
5963 for (Value *U : I.operands()) {
5964 auto *Instr = dyn_cast<Instruction>(U);
5965
5966 // Ignore non-instruction values such as arguments, constants, etc.
5967 // FIXME: Might need some motivation why these values are ignored. If
5968 // for example an argument is used inside the loop it will increase the
5969 // register pressure (so shouldn't we add it to LoopInvariants?).
5970 if (!Instr)
5971 continue;
5972
5973 // If this instruction is outside the loop then record it and continue.
5974 if (!TheLoop->contains(Instr)) {
5975 LoopInvariants.insert(Instr);
5976 continue;
5977 }
5978
5979 // Overwrite previous end points.
5980 EndPoint[Instr] = IdxToInstr.size();
5981 Ends.insert(Instr);
5982 }
5983 }
5984 }
5985
5986 // Saves the list of intervals that end with the index in 'key'.
5987 using InstrList = SmallVector<Instruction *, 2>;
5988 DenseMap<unsigned, InstrList> TransposeEnds;
5989
5990 // Transpose the EndPoints to a list of values that end at each index.
5991 for (auto &Interval : EndPoint)
5992 TransposeEnds[Interval.second].push_back(Interval.first);
5993
5994 SmallPtrSet<Instruction *, 8> OpenIntervals;
5995 SmallVector<RegisterUsage, 8> RUs(VFs.size());
5996 SmallVector<SmallMapVector<unsigned, unsigned, 4>, 8> MaxUsages(VFs.size());
5997
5998 LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n");
5999
6000 const auto &TTICapture = TTI;
6001 auto GetRegUsage = [&TTICapture](Type *Ty, ElementCount VF) -> unsigned {
6002 if (Ty->isTokenTy() || !VectorType::isValidElementType(Ty))
6003 return 0;
6004 return TTICapture.getRegUsageForType(VectorType::get(Ty, VF));
6005 };
6006
6007 for (unsigned int i = 0, s = IdxToInstr.size(); i < s; ++i) {
6008 Instruction *I = IdxToInstr[i];
6009
6010 // Remove all of the instructions that end at this location.
6011 InstrList &List = TransposeEnds[i];
6012 for (Instruction *ToRemove : List)
6013 OpenIntervals.erase(ToRemove);
6014
6015 // Ignore instructions that are never used within the loop.
6016 if (!Ends.count(I))
6017 continue;
6018
6019 // Skip ignored values.
6020 if (ValuesToIgnore.count(I))
6021 continue;
6022
6023 // For each VF find the maximum usage of registers.
6024 for (unsigned j = 0, e = VFs.size(); j < e; ++j) {
6025 // Count the number of registers used, per register class, given all open
6026 // intervals.
6027 // Note that elements in this SmallMapVector will be default constructed
6028 // as 0. So we can use "RegUsage[ClassID] += n" in the code below even if
6029 // there is no previous entry for ClassID.
6030 SmallMapVector<unsigned, unsigned, 4> RegUsage;
6031
6032 if (VFs[j].isScalar()) {
6033 for (auto *Inst : OpenIntervals) {
6034 unsigned ClassID =
6035 TTI.getRegisterClassForType(false, Inst->getType());
6036 // FIXME: The target might use more than one register for the type
6037 // even in the scalar case.
6038 RegUsage[ClassID] += 1;
6039 }
6040 } else {
6041 collectUniformsAndScalars(VFs[j]);
6042 for (auto *Inst : OpenIntervals) {
6043 // Skip ignored values for VF > 1.
6044 if (VecValuesToIgnore.count(Inst))
6045 continue;
6046 if (isScalarAfterVectorization(Inst, VFs[j])) {
6047 unsigned ClassID =
6048 TTI.getRegisterClassForType(false, Inst->getType());
6049 // FIXME: The target might use more than one register for the type
6050 // even in the scalar case.
6051 RegUsage[ClassID] += 1;
6052 } else {
6053 unsigned ClassID =
6054 TTI.getRegisterClassForType(true, Inst->getType());
6055 RegUsage[ClassID] += GetRegUsage(Inst->getType(), VFs[j]);
6056 }
6057 }
6058 }
6059
6060 for (auto& pair : RegUsage) {
6061 auto &Entry = MaxUsages[j][pair.first];
6062 Entry = std::max(Entry, pair.second);
6063 }
6064 }
6065
6066 LLVM_DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # "
6067 << OpenIntervals.size() << '\n');
6068
6069 // Add the current instruction to the list of open intervals.
6070 OpenIntervals.insert(I);
6071 }
6072
6073 for (unsigned i = 0, e = VFs.size(); i < e; ++i) {
6074 // Note that elements in this SmallMapVector will be default constructed
6075 // as 0. So we can use "Invariant[ClassID] += n" in the code below even if
6076 // there is no previous entry for ClassID.
6077 SmallMapVector<unsigned, unsigned, 4> Invariant;
6078
6079 for (auto *Inst : LoopInvariants) {
6080 // FIXME: The target might use more than one register for the type
6081 // even in the scalar case.
6082 bool IsScalar = all_of(Inst->users(), [&](User *U) {
6083 auto *I = cast<Instruction>(U);
6084 return TheLoop != LI->getLoopFor(I->getParent()) ||
6085 isScalarAfterVectorization(I, VFs[i]);
6086 });
6087
6088 ElementCount VF = IsScalar ? ElementCount::getFixed(1) : VFs[i];
6089 unsigned ClassID =
6090 TTI.getRegisterClassForType(VF.isVector(), Inst->getType());
6091 Invariant[ClassID] += GetRegUsage(Inst->getType(), VF);
6092 }
6093
6094 LLVM_DEBUG({
6095 dbgs() << "LV(REG): VF = " << VFs[i] << '\n';
6096 dbgs() << "LV(REG): Found max usage: " << MaxUsages[i].size()
6097 << " item\n";
6098 for (const auto &pair : MaxUsages[i]) {
6099 dbgs() << "LV(REG): RegisterClass: "
6100 << TTI.getRegisterClassName(pair.first) << ", " << pair.second
6101 << " registers\n";
6102 }
6103 dbgs() << "LV(REG): Found invariant usage: " << Invariant.size()
6104 << " item\n";
6105 for (const auto &pair : Invariant) {
6106 dbgs() << "LV(REG): RegisterClass: "
6107 << TTI.getRegisterClassName(pair.first) << ", " << pair.second
6108 << " registers\n";
6109 }
6110 });
6111
6112 RU.LoopInvariantRegs = Invariant;
6113 RU.MaxLocalUsers = MaxUsages[i];
6114 RUs[i] = RU;
6115 }
6116
6117 return RUs;
6118 }
6119
6120 bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I,
6121 ElementCount VF) {
6122 // TODO: Cost model for emulated masked load/store is completely
6123 // broken. This hack guides the cost model to use an artificially
6124 // high enough value to practically disable vectorization with such
6125 // operations, except where previously deployed legality hack allowed
6126 // using very low cost values. This is to avoid regressions coming simply
6127 // from moving "masked load/store" check from legality to cost model.
6128 // Masked Load/Gather emulation was previously never allowed.
6129 // A limited number of masked store/scatter emulations was allowed.
6130 assert((isPredicatedInst(I)) &&
6131 "Expecting a scalar emulated instruction");
6132 return isa<LoadInst>(I) ||
6133 (isa<StoreInst>(I) &&
6134 NumPredStores > NumberOfStoresToPredicate);
6135 }
6136
6137 void LoopVectorizationCostModel::collectInstsToScalarize(ElementCount VF) {
6138 // If we aren't vectorizing the loop, or if we've already collected the
6139 // instructions to scalarize, there's nothing to do. Collection may already
6140 // have occurred if we have a user-selected VF and are now computing the
6141 // expected cost for interleaving.
6142 if (VF.isScalar() || VF.isZero() ||
6143 InstsToScalarize.find(VF) != InstsToScalarize.end())
6144 return;
6145
6146 // Initialize a mapping for VF in InstsToScalarize. If we find that it's
6147 // not profitable to scalarize any instructions, the presence of VF in the
6148 // map will indicate that we've analyzed it already.
6149 ScalarCostsTy &ScalarCostsVF = InstsToScalarize[VF];
6150
6151 PredicatedBBsAfterVectorization[VF].clear();
6152
6153 // Find all the instructions that are scalar with predication in the loop and
6154 // determine if it would be better to not if-convert the blocks they are in.
6155 // If so, we also record the instructions to scalarize.
6156 for (BasicBlock *BB : TheLoop->blocks()) {
6157 if (!blockNeedsPredicationForAnyReason(BB))
6158 continue;
6159 for (Instruction &I : *BB)
6160 if (isScalarWithPredication(&I, VF)) {
6161 ScalarCostsTy ScalarCosts;
6162 // Do not apply discount if scalable, because that would lead to
6163 // invalid scalarization costs.
6164 // Do not apply discount logic if hacked cost is needed
6165 // for emulated masked memrefs.
6166 if (!VF.isScalable() && !useEmulatedMaskMemRefHack(&I, VF) &&
6167 computePredInstDiscount(&I, ScalarCosts, VF) >= 0)
6168 ScalarCostsVF.insert(ScalarCosts.begin(), ScalarCosts.end());
6169 // Remember that BB will remain after vectorization.
6170 PredicatedBBsAfterVectorization[VF].insert(BB);
6171 }
6172 }
6173 }
6174
6175 InstructionCost LoopVectorizationCostModel::computePredInstDiscount(
6176 Instruction *PredInst, ScalarCostsTy &ScalarCosts, ElementCount VF) {
6177 assert(!isUniformAfterVectorization(PredInst, VF) &&
6178 "Instruction marked uniform-after-vectorization will be predicated");
6179
6180 // Initialize the discount to zero, meaning that the scalar version and the
6181 // vector version cost the same.
6182 InstructionCost Discount = 0;
6183
6184 // Holds instructions to analyze. The instructions we visit are mapped in
6185 // ScalarCosts. Those instructions are the ones that would be scalarized if
6186 // we find that the scalar version costs less.
6187 SmallVector<Instruction *, 8> Worklist;
6188
6189 // Returns true if the given instruction can be scalarized.
6190 auto canBeScalarized = [&](Instruction *I) -> bool {
6191 // We only attempt to scalarize instructions forming a single-use chain
6192 // from the original predicated block that would otherwise be vectorized.
6193 // Although not strictly necessary, we give up on instructions we know will
6194 // already be scalar to avoid traversing chains that are unlikely to be
6195 // beneficial.
6196 if (!I->hasOneUse() || PredInst->getParent() != I->getParent() ||
6197 isScalarAfterVectorization(I, VF))
6198 return false;
6199
6200 // If the instruction is scalar with predication, it will be analyzed
6201 // separately. We ignore it within the context of PredInst.
6202 if (isScalarWithPredication(I, VF))
6203 return false;
6204
6205 // If any of the instruction's operands are uniform after vectorization,
6206 // the instruction cannot be scalarized. This prevents, for example, a
6207 // masked load from being scalarized.
6208 //
6209 // We assume we will only emit a value for lane zero of an instruction
6210 // marked uniform after vectorization, rather than VF identical values.
6211 // Thus, if we scalarize an instruction that uses a uniform, we would
6212 // create uses of values corresponding to the lanes we aren't emitting code
6213 // for. This behavior can be changed by allowing getScalarValue to clone
6214 // the lane zero values for uniforms rather than asserting.
6215 for (Use &U : I->operands())
6216 if (auto *J = dyn_cast<Instruction>(U.get()))
6217 if (isUniformAfterVectorization(J, VF))
6218 return false;
6219
6220 // Otherwise, we can scalarize the instruction.
6221 return true;
6222 };
6223
6224 // Compute the expected cost discount from scalarizing the entire expression
6225 // feeding the predicated instruction. We currently only consider expressions
6226 // that are single-use instruction chains.
6227 Worklist.push_back(PredInst);
6228 while (!Worklist.empty()) {
6229 Instruction *I = Worklist.pop_back_val();
6230
6231 // If we've already analyzed the instruction, there's nothing to do.
6232 if (ScalarCosts.find(I) != ScalarCosts.end())
6233 continue;
6234
6235 // Compute the cost of the vector instruction. Note that this cost already
6236 // includes the scalarization overhead of the predicated instruction.
6237 InstructionCost VectorCost = getInstructionCost(I, VF).first;
6238
6239 // Compute the cost of the scalarized instruction. This cost is the cost of
6240 // the instruction as if it wasn't if-converted and instead remained in the
6241 // predicated block. We will scale this cost by block probability after
6242 // computing the scalarization overhead.
6243 InstructionCost ScalarCost =
6244 VF.getFixedValue() *
6245 getInstructionCost(I, ElementCount::getFixed(1)).first;
6246
6247 // Compute the scalarization overhead of needed insertelement instructions
6248 // and phi nodes.
6249 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
6250 if (isScalarWithPredication(I, VF) && !I->getType()->isVoidTy()) {
6251 ScalarCost += TTI.getScalarizationOverhead(
6252 cast<VectorType>(ToVectorTy(I->getType(), VF)),
6253 APInt::getAllOnes(VF.getFixedValue()), /*Insert*/ true,
6254 /*Extract*/ false, CostKind);
6255 ScalarCost +=
6256 VF.getFixedValue() * TTI.getCFInstrCost(Instruction::PHI, CostKind);
6257 }
6258
6259 // Compute the scalarization overhead of needed extractelement
6260 // instructions. For each of the instruction's operands, if the operand can
6261 // be scalarized, add it to the worklist; otherwise, account for the
6262 // overhead.
6263 for (Use &U : I->operands())
6264 if (auto *J = dyn_cast<Instruction>(U.get())) {
6265 assert(VectorType::isValidElementType(J->getType()) &&
6266 "Instruction has non-scalar type");
6267 if (canBeScalarized(J))
6268 Worklist.push_back(J);
6269 else if (needsExtract(J, VF)) {
6270 ScalarCost += TTI.getScalarizationOverhead(
6271 cast<VectorType>(ToVectorTy(J->getType(), VF)),
6272 APInt::getAllOnes(VF.getFixedValue()), /*Insert*/ false,
6273 /*Extract*/ true, CostKind);
6274 }
6275 }
6276
6277 // Scale the total scalar cost by block probability.
6278 ScalarCost /= getReciprocalPredBlockProb();
6279
6280 // Compute the discount. A non-negative discount means the vector version
6281 // of the instruction costs more, and scalarizing would be beneficial.
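// For example (illustrative): if the vector form costs 10 and the
// probability-scaled scalar form costs 6, the discount grows by 4, making
// scalarization look beneficial for this chain.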
6282 Discount += VectorCost - ScalarCost;
6283 ScalarCosts[I] = ScalarCost;
6284 }
6285
6286 return Discount;
6287 }
6288
6289 LoopVectorizationCostModel::VectorizationCostTy
6290 LoopVectorizationCostModel::expectedCost(
6291 ElementCount VF, SmallVectorImpl<InstructionVFPair> *Invalid) {
6292 VectorizationCostTy Cost;
6293
6294 // For each block.
6295 for (BasicBlock *BB : TheLoop->blocks()) {
6296 VectorizationCostTy BlockCost;
6297
6298 // For each instruction in the old loop.
6299 for (Instruction &I : BB->instructionsWithoutDebug()) {
6300 // Skip ignored values.
6301 if (ValuesToIgnore.count(&I) ||
6302 (VF.isVector() && VecValuesToIgnore.count(&I)))
6303 continue;
6304
6305 VectorizationCostTy C = getInstructionCost(&I, VF);
6306
6307 // Check if we should override the cost.
6308 if (C.first.isValid() &&
6309 ForceTargetInstructionCost.getNumOccurrences() > 0)
6310 C.first = InstructionCost(ForceTargetInstructionCost);
6311
6312 // Keep a list of instructions with invalid costs.
6313 if (Invalid && !C.first.isValid())
6314 Invalid->emplace_back(&I, VF);
6315
6316 BlockCost.first += C.first;
6317 BlockCost.second |= C.second;
6318 LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C.first
6319 << " for VF " << VF << " For instruction: " << I
6320 << '\n');
6321 }
6322
6323 // If we are vectorizing a predicated block, it will have been
6324 // if-converted. This means that the block's instructions (aside from
6325 // stores and instructions that may divide by zero) will now be
6326 // unconditionally executed. For the scalar case, we may not always execute
6327 // the predicated block, if it is an if-else block. Thus, scale the block's
6328 // cost by the probability of executing it. blockNeedsPredication from
6329 // Legal is used so as to not include all blocks in tail folded loops.
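// For example (illustrative): an if-converted block with scalar cost 8 and a
// reciprocal block probability of 2 contributes 4 to the scalar loop cost.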
6330 if (VF.isScalar() && Legal->blockNeedsPredication(BB))
6331 BlockCost.first /= getReciprocalPredBlockProb();
6332
6333 Cost.first += BlockCost.first;
6334 Cost.second |= BlockCost.second;
6335 }
6336
6337 return Cost;
6338 }
6339
6340 /// Gets Address Access SCEV after verifying that the access pattern
6341 /// is loop invariant except the induction variable dependence.
6342 ///
6343 /// This SCEV can be sent to the Target in order to estimate the address
6344 /// calculation cost.
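///
/// For example (illustrative): for a 'getelementptr %A, %inv, %ind' where only
/// %ind is an induction variable, the pointer's SCEV is returned; if any other
/// index is loop-variant and not an induction, nullptr is returned instead.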
6345 static const SCEV *getAddressAccessSCEV(
6346 Value *Ptr,
6347 LoopVectorizationLegality *Legal,
6348 PredicatedScalarEvolution &PSE,
6349 const Loop *TheLoop) {
6350
6351 auto *Gep = dyn_cast<GetElementPtrInst>(Ptr);
6352 if (!Gep)
6353 return nullptr;
6354
6355 // We are looking for a gep with all loop invariant indices except for one
6356 // which should be an induction variable.
6357 auto SE = PSE.getSE();
6358 unsigned NumOperands = Gep->getNumOperands();
6359 for (unsigned i = 1; i < NumOperands; ++i) {
6360 Value *Opd = Gep->getOperand(i);
6361 if (!SE->isLoopInvariant(SE->getSCEV(Opd), TheLoop) &&
6362 !Legal->isInductionVariable(Opd))
6363 return nullptr;
6364 }
6365
6366 // Now we know we have a GEP ptr, %inv, %ind, %inv. Return the Ptr SCEV.
6367 return PSE.getSCEV(Ptr);
6368 }
6369
6370 static bool isStrideMul(Instruction *I, LoopVectorizationLegality *Legal) {
6371 return Legal->hasStride(I->getOperand(0)) ||
6372 Legal->hasStride(I->getOperand(1));
6373 }
6374
6375 InstructionCost
6376 LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I,
6377 ElementCount VF) {
6378 assert(VF.isVector() &&
6379 "Scalarization cost of instruction implies vectorization.");
6380 if (VF.isScalable())
6381 return InstructionCost::getInvalid();
6382
6383 Type *ValTy = getLoadStoreType(I);
6384 auto SE = PSE.getSE();
6385
6386 unsigned AS = getLoadStoreAddressSpace(I);
6387 Value *Ptr = getLoadStorePointerOperand(I);
6388 Type *PtrTy = ToVectorTy(Ptr->getType(), VF);
6389 // NOTE: PtrTy is a vector to signal `TTI::getAddressComputationCost`
6390 // that it is being called from this specific place.
6391
6392 // Figure out whether the access is strided and get the stride value
6393   // if it's known at compile time.
6394 const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, Legal, PSE, TheLoop);
6395
6396 // Get the cost of the scalar memory instruction and address computation.
6397 InstructionCost Cost =
6398 VF.getKnownMinValue() * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV);
6399
6400 // Don't pass *I here, since it is scalar but will actually be part of a
6401 // vectorized loop where the user of it is a vectorized instruction.
6402 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
6403 const Align Alignment = getLoadStoreAlignment(I);
6404 Cost += VF.getKnownMinValue() * TTI.getMemoryOpCost(I->getOpcode(),
6405 ValTy->getScalarType(),
6406 Alignment, AS, CostKind);
6407
6408 // Get the overhead of the extractelement and insertelement instructions
6409 // we might create due to scalarization.
6410 Cost += getScalarizationOverhead(I, VF, CostKind);
6411
6412 // If we have a predicated load/store, it will need extra i1 extracts and
6413 // conditional branches, but may not be executed for each vector lane. Scale
6414 // the cost by the probability of executing the predicated block.
6415 if (isPredicatedInst(I)) {
6416 Cost /= getReciprocalPredBlockProb();
6417
6418 // Add the cost of an i1 extract and a branch
6419 auto *Vec_i1Ty =
6420 VectorType::get(IntegerType::getInt1Ty(ValTy->getContext()), VF);
6421 Cost += TTI.getScalarizationOverhead(
6422 Vec_i1Ty, APInt::getAllOnes(VF.getKnownMinValue()),
6423 /*Insert=*/false, /*Extract=*/true, CostKind);
6424 Cost += TTI.getCFInstrCost(Instruction::Br, CostKind);
6425
6426 if (useEmulatedMaskMemRefHack(I, VF))
6427 // Artificially setting to a high enough value to practically disable
6428 // vectorization with such operations.
6429 Cost = 3000000;
6430 }
6431
6432 return Cost;
6433 }
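// Rough shape of the scalarization cost above, for illustration only (the
// actual numbers come from TTI): with VF = 4 and a non-predicated access,
//   Cost ~= 4 * AddressComputation + 4 * ScalarMemOp + Insert/ExtractOverhead;
// for a predicated access that sum is first divided by the predicated-block
// probability factor, then the i1 extract and branch costs are added, and
// emulated masked accesses may be forced to a prohibitively large cost.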
6434
6435 InstructionCost
6436 LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I,
6437 ElementCount VF) {
6438 Type *ValTy = getLoadStoreType(I);
6439 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
6440 Value *Ptr = getLoadStorePointerOperand(I);
6441 unsigned AS = getLoadStoreAddressSpace(I);
6442 int ConsecutiveStride = Legal->isConsecutivePtr(ValTy, Ptr);
6443 enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
6444
6445 assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
6446 "Stride should be 1 or -1 for consecutive memory access");
6447 const Align Alignment = getLoadStoreAlignment(I);
6448 InstructionCost Cost = 0;
6449 if (Legal->isMaskRequired(I)) {
6450 Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS,
6451 CostKind);
6452 } else {
6453 TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(I->getOperand(0));
6454 Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS,
6455 CostKind, OpInfo, I);
6456 }
6457
6458 bool Reverse = ConsecutiveStride < 0;
6459 if (Reverse)
6460 Cost += TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy,
6461 std::nullopt, CostKind, 0);
6462 return Cost;
6463 }
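// Conceptually (a sketch of what the cost above models, not generated code):
// a consecutive access with stride 1 is a single widened (possibly masked)
// load or store, while a stride of -1 additionally pays for one SK_Reverse
// shuffle to put the lanes back into vector order.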
6464
6465 InstructionCost
6466 LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I,
6467 ElementCount VF) {
6468 assert(Legal->isUniformMemOp(*I));
6469
6470 Type *ValTy = getLoadStoreType(I);
6471 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
6472 const Align Alignment = getLoadStoreAlignment(I);
6473 unsigned AS = getLoadStoreAddressSpace(I);
6474 enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
6475 if (isa<LoadInst>(I)) {
6476 return TTI.getAddressComputationCost(ValTy) +
6477 TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS,
6478 CostKind) +
6479 TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VectorTy);
6480 }
6481 StoreInst *SI = cast<StoreInst>(I);
6482
6483 bool isLoopInvariantStoreValue = Legal->isUniform(SI->getValueOperand());
6484 return TTI.getAddressComputationCost(ValTy) +
6485 TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS,
6486 CostKind) +
6487 (isLoopInvariantStoreValue
6488 ? 0
6489 : TTI.getVectorInstrCost(Instruction::ExtractElement, VectorTy,
6490 CostKind, VF.getKnownMinValue() - 1));
6491 }
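// Conceptually (a sketch of the lowering this cost models): a uniform load
// becomes one scalar load plus a broadcast shuffle, and a uniform store
// becomes one scalar store, plus an extract of the last vector lane when the
// stored value is not loop invariant.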
6492
6493 InstructionCost
6494 LoopVectorizationCostModel::getGatherScatterCost(Instruction *I,
6495 ElementCount VF) {
6496 Type *ValTy = getLoadStoreType(I);
6497 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
6498 const Align Alignment = getLoadStoreAlignment(I);
6499 const Value *Ptr = getLoadStorePointerOperand(I);
6500
6501 return TTI.getAddressComputationCost(VectorTy) +
6502 TTI.getGatherScatterOpCost(
6503 I->getOpcode(), VectorTy, Ptr, Legal->isMaskRequired(I), Alignment,
6504 TargetTransformInfo::TCK_RecipThroughput, I);
6505 }
6506
6507 InstructionCost
6508 LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I,
6509 ElementCount VF) {
6510 // TODO: Once we have support for interleaving with scalable vectors
6511 // we can calculate the cost properly here.
6512 if (VF.isScalable())
6513 return InstructionCost::getInvalid();
6514
6515 Type *ValTy = getLoadStoreType(I);
6516 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
6517 unsigned AS = getLoadStoreAddressSpace(I);
6518 enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
6519
6520 auto Group = getInterleavedAccessGroup(I);
6521 assert(Group && "Fail to get an interleaved access group.");
6522
6523 unsigned InterleaveFactor = Group->getFactor();
6524 auto *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor);
6525
6526 // Holds the indices of existing members in the interleaved group.
6527 SmallVector<unsigned, 4> Indices;
6528 for (unsigned IF = 0; IF < InterleaveFactor; IF++)
6529 if (Group->getMember(IF))
6530 Indices.push_back(IF);
6531
6532 // Calculate the cost of the whole interleaved group.
6533 bool UseMaskForGaps =
6534 (Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed()) ||
6535 (isa<StoreInst>(I) && (Group->getNumMembers() < Group->getFactor()));
6536 InstructionCost Cost = TTI.getInterleavedMemoryOpCost(
6537 I->getOpcode(), WideVecTy, Group->getFactor(), Indices, Group->getAlign(),
6538 AS, CostKind, Legal->isMaskRequired(I), UseMaskForGaps);
6539
6540 if (Group->isReverse()) {
6541 // TODO: Add support for reversed masked interleaved access.
6542 assert(!Legal->isMaskRequired(I) &&
6543 "Reverse masked interleaved access not supported.");
6544 Cost += Group->getNumMembers() *
6545 TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy,
6546 std::nullopt, CostKind, 0);
6547 }
6548 return Cost;
6549 }
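// Illustrative example for getInterleaveGroupCost above (hypothetical loop,
// not from a test case):
//   for (i = 0; i < n; ++i) Sum += A[2 * i] + A[2 * i + 1];
// The two loads form an interleave group with factor 2 and member indices
// {0, 1}; the group is costed as one wide access of VF * 2 elements, with the
// de-interleaving shuffles folded into TTI::getInterleavedMemoryOpCost, plus
// reverse shuffles per member when the group is reversed.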
6550
6551 std::optional<InstructionCost>
6552 LoopVectorizationCostModel::getReductionPatternCost(
6553 Instruction *I, ElementCount VF, Type *Ty, TTI::TargetCostKind CostKind) {
6554 using namespace llvm::PatternMatch;
6555 // Early exit for no inloop reductions
6556 if (InLoopReductionChains.empty() || VF.isScalar() || !isa<VectorType>(Ty))
6557 return std::nullopt;
6558 auto *VectorTy = cast<VectorType>(Ty);
6559
6560   // We are looking for one of the following patterns, taking the minimal acceptable cost:
6561 // reduce(mul(ext(A), ext(B))) or
6562 // reduce(mul(A, B)) or
6563 // reduce(ext(A)) or
6564 // reduce(A).
6565 // The basic idea is that we walk down the tree to do that, finding the root
6566 // reduction instruction in InLoopReductionImmediateChains. From there we find
6567 // the pattern of mul/ext and test the cost of the entire pattern vs the cost
6568 // of the components. If the reduction cost is lower then we return it for the
6569 // reduction instruction and 0 for the other instructions in the pattern. If
6570   // it is not, we return an invalid cost specifying that the original cost method
6571 // should be used.
6572 Instruction *RetI = I;
6573 if (match(RetI, m_ZExtOrSExt(m_Value()))) {
6574 if (!RetI->hasOneUser())
6575 return std::nullopt;
6576 RetI = RetI->user_back();
6577 }
6578
6579 if (match(RetI, m_OneUse(m_Mul(m_Value(), m_Value()))) &&
6580 RetI->user_back()->getOpcode() == Instruction::Add) {
6581 RetI = RetI->user_back();
6582 }
6583
6584 // Test if the found instruction is a reduction, and if not return an invalid
6585 // cost specifying the parent to use the original cost modelling.
6586 if (!InLoopReductionImmediateChains.count(RetI))
6587 return std::nullopt;
6588
6589 // Find the reduction this chain is a part of and calculate the basic cost of
6590 // the reduction on its own.
6591 Instruction *LastChain = InLoopReductionImmediateChains[RetI];
6592 Instruction *ReductionPhi = LastChain;
6593 while (!isa<PHINode>(ReductionPhi))
6594 ReductionPhi = InLoopReductionImmediateChains[ReductionPhi];
6595
6596 const RecurrenceDescriptor &RdxDesc =
6597 Legal->getReductionVars().find(cast<PHINode>(ReductionPhi))->second;
6598
6599 InstructionCost BaseCost = TTI.getArithmeticReductionCost(
6600 RdxDesc.getOpcode(), VectorTy, RdxDesc.getFastMathFlags(), CostKind);
6601
6602 // For a call to the llvm.fmuladd intrinsic we need to add the cost of a
6603 // normal fmul instruction to the cost of the fadd reduction.
6604 if (RdxDesc.getRecurrenceKind() == RecurKind::FMulAdd)
6605 BaseCost +=
6606 TTI.getArithmeticInstrCost(Instruction::FMul, VectorTy, CostKind);
6607
6608 // If we're using ordered reductions then we can just return the base cost
6609 // here, since getArithmeticReductionCost calculates the full ordered
6610 // reduction cost when FP reassociation is not allowed.
6611 if (useOrderedReductions(RdxDesc))
6612 return BaseCost;
6613
6614 // Get the operand that was not the reduction chain and match it to one of the
6615 // patterns, returning the better cost if it is found.
6616 Instruction *RedOp = RetI->getOperand(1) == LastChain
6617 ? dyn_cast<Instruction>(RetI->getOperand(0))
6618 : dyn_cast<Instruction>(RetI->getOperand(1));
6619
6620 VectorTy = VectorType::get(I->getOperand(0)->getType(), VectorTy);
6621
6622 Instruction *Op0, *Op1;
6623 if (RedOp && RdxDesc.getOpcode() == Instruction::Add &&
6624 match(RedOp,
6625 m_ZExtOrSExt(m_Mul(m_Instruction(Op0), m_Instruction(Op1)))) &&
6626 match(Op0, m_ZExtOrSExt(m_Value())) &&
6627 Op0->getOpcode() == Op1->getOpcode() &&
6628 Op0->getOperand(0)->getType() == Op1->getOperand(0)->getType() &&
6629 !TheLoop->isLoopInvariant(Op0) && !TheLoop->isLoopInvariant(Op1) &&
6630 (Op0->getOpcode() == RedOp->getOpcode() || Op0 == Op1)) {
6631
6632 // Matched reduce.add(ext(mul(ext(A), ext(B)))
6633 // Note that the extend opcodes need to all match, or if A==B they will have
6634 // been converted to zext(mul(sext(A), sext(A))) as it is known positive,
6635 // which is equally fine.
6636 bool IsUnsigned = isa<ZExtInst>(Op0);
6637 auto *ExtType = VectorType::get(Op0->getOperand(0)->getType(), VectorTy);
6638 auto *MulType = VectorType::get(Op0->getType(), VectorTy);
6639
6640 InstructionCost ExtCost =
6641 TTI.getCastInstrCost(Op0->getOpcode(), MulType, ExtType,
6642 TTI::CastContextHint::None, CostKind, Op0);
6643 InstructionCost MulCost =
6644 TTI.getArithmeticInstrCost(Instruction::Mul, MulType, CostKind);
6645 InstructionCost Ext2Cost =
6646 TTI.getCastInstrCost(RedOp->getOpcode(), VectorTy, MulType,
6647 TTI::CastContextHint::None, CostKind, RedOp);
6648
6649 InstructionCost RedCost = TTI.getMulAccReductionCost(
6650 IsUnsigned, RdxDesc.getRecurrenceType(), ExtType, CostKind);
6651
6652 if (RedCost.isValid() &&
6653 RedCost < ExtCost * 2 + MulCost + Ext2Cost + BaseCost)
6654 return I == RetI ? RedCost : 0;
6655 } else if (RedOp && match(RedOp, m_ZExtOrSExt(m_Value())) &&
6656 !TheLoop->isLoopInvariant(RedOp)) {
6657 // Matched reduce(ext(A))
6658 bool IsUnsigned = isa<ZExtInst>(RedOp);
6659 auto *ExtType = VectorType::get(RedOp->getOperand(0)->getType(), VectorTy);
6660 InstructionCost RedCost = TTI.getExtendedReductionCost(
6661 RdxDesc.getOpcode(), IsUnsigned, RdxDesc.getRecurrenceType(), ExtType,
6662 RdxDesc.getFastMathFlags(), CostKind);
6663
6664 InstructionCost ExtCost =
6665 TTI.getCastInstrCost(RedOp->getOpcode(), VectorTy, ExtType,
6666 TTI::CastContextHint::None, CostKind, RedOp);
6667 if (RedCost.isValid() && RedCost < BaseCost + ExtCost)
6668 return I == RetI ? RedCost : 0;
6669 } else if (RedOp && RdxDesc.getOpcode() == Instruction::Add &&
6670 match(RedOp, m_Mul(m_Instruction(Op0), m_Instruction(Op1)))) {
6671 if (match(Op0, m_ZExtOrSExt(m_Value())) &&
6672 Op0->getOpcode() == Op1->getOpcode() &&
6673 !TheLoop->isLoopInvariant(Op0) && !TheLoop->isLoopInvariant(Op1)) {
6674 bool IsUnsigned = isa<ZExtInst>(Op0);
6675 Type *Op0Ty = Op0->getOperand(0)->getType();
6676 Type *Op1Ty = Op1->getOperand(0)->getType();
6677 Type *LargestOpTy =
6678 Op0Ty->getIntegerBitWidth() < Op1Ty->getIntegerBitWidth() ? Op1Ty
6679 : Op0Ty;
6680 auto *ExtType = VectorType::get(LargestOpTy, VectorTy);
6681
6682 // Matched reduce.add(mul(ext(A), ext(B))), where the two ext may be of
6683 // different sizes. We take the largest type as the ext to reduce, and add
6684 // the remaining cost as, for example reduce(mul(ext(ext(A)), ext(B))).
6685 InstructionCost ExtCost0 = TTI.getCastInstrCost(
6686 Op0->getOpcode(), VectorTy, VectorType::get(Op0Ty, VectorTy),
6687 TTI::CastContextHint::None, CostKind, Op0);
6688 InstructionCost ExtCost1 = TTI.getCastInstrCost(
6689 Op1->getOpcode(), VectorTy, VectorType::get(Op1Ty, VectorTy),
6690 TTI::CastContextHint::None, CostKind, Op1);
6691 InstructionCost MulCost =
6692 TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind);
6693
6694 InstructionCost RedCost = TTI.getMulAccReductionCost(
6695 IsUnsigned, RdxDesc.getRecurrenceType(), ExtType, CostKind);
6696 InstructionCost ExtraExtCost = 0;
6697 if (Op0Ty != LargestOpTy || Op1Ty != LargestOpTy) {
6698 Instruction *ExtraExtOp = (Op0Ty != LargestOpTy) ? Op0 : Op1;
6699 ExtraExtCost = TTI.getCastInstrCost(
6700 ExtraExtOp->getOpcode(), ExtType,
6701 VectorType::get(ExtraExtOp->getOperand(0)->getType(), VectorTy),
6702 TTI::CastContextHint::None, CostKind, ExtraExtOp);
6703 }
6704
6705 if (RedCost.isValid() &&
6706 (RedCost + ExtraExtCost) < (ExtCost0 + ExtCost1 + MulCost + BaseCost))
6707 return I == RetI ? RedCost : 0;
6708 } else if (!match(I, m_ZExtOrSExt(m_Value()))) {
6709 // Matched reduce.add(mul())
6710 InstructionCost MulCost =
6711 TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind);
6712
6713 InstructionCost RedCost = TTI.getMulAccReductionCost(
6714 true, RdxDesc.getRecurrenceType(), VectorTy, CostKind);
6715
6716 if (RedCost.isValid() && RedCost < MulCost + BaseCost)
6717 return I == RetI ? RedCost : 0;
6718 }
6719 }
6720
6721 return I == RetI ? std::optional<InstructionCost>(BaseCost) : std::nullopt;
6722 }
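// Illustrative example for getReductionPatternCost above (hypothetical C
// source, not from a test case): with i8 arrays a and b and an i32 Sum,
//   for (i = 0; i < n; ++i) Sum += (int)a[i] * (int)b[i];
// matches reduce.add(mul(ext(A), ext(B))). If the target reports a cheap
// TTI::getMulAccReductionCost (e.g. a dot-product style instruction), the
// reduction instruction is given that cost and the ext/mul feeding it are
// given a cost of 0, as returned above.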
6723
6724 InstructionCost
6725 LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I,
6726 ElementCount VF) {
6727 // Calculate scalar cost only. Vectorization cost should be ready at this
6728 // moment.
6729 if (VF.isScalar()) {
6730 Type *ValTy = getLoadStoreType(I);
6731 const Align Alignment = getLoadStoreAlignment(I);
6732 unsigned AS = getLoadStoreAddressSpace(I);
6733
6734 TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(I->getOperand(0));
6735 return TTI.getAddressComputationCost(ValTy) +
6736 TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS,
6737 TTI::TCK_RecipThroughput, OpInfo, I);
6738 }
6739 return getWideningCost(I, VF);
6740 }
6741
6742 LoopVectorizationCostModel::VectorizationCostTy
6743 LoopVectorizationCostModel::getInstructionCost(Instruction *I,
6744 ElementCount VF) {
6745 // If we know that this instruction will remain uniform, check the cost of
6746 // the scalar version.
6747 if (isUniformAfterVectorization(I, VF))
6748 VF = ElementCount::getFixed(1);
6749
6750 if (VF.isVector() && isProfitableToScalarize(I, VF))
6751 return VectorizationCostTy(InstsToScalarize[VF][I], false);
6752
6753 // Forced scalars do not have any scalarization overhead.
6754 auto ForcedScalar = ForcedScalars.find(VF);
6755 if (VF.isVector() && ForcedScalar != ForcedScalars.end()) {
6756 auto InstSet = ForcedScalar->second;
6757 if (InstSet.count(I))
6758 return VectorizationCostTy(
6759 (getInstructionCost(I, ElementCount::getFixed(1)).first *
6760 VF.getKnownMinValue()),
6761 false);
6762 }
6763
6764 Type *VectorTy;
6765 InstructionCost C = getInstructionCost(I, VF, VectorTy);
6766
6767 bool TypeNotScalarized = false;
6768 if (VF.isVector() && VectorTy->isVectorTy()) {
6769 if (unsigned NumParts = TTI.getNumberOfParts(VectorTy)) {
6770 if (VF.isScalable())
6771 // <vscale x 1 x iN> is assumed to be profitable over iN because
6772 // scalable registers are a distinct register class from scalar ones.
6773 // If we ever find a target which wants to lower scalable vectors
6774 // back to scalars, we'll need to update this code to explicitly
6775 // ask TTI about the register class uses for each part.
6776 TypeNotScalarized = NumParts <= VF.getKnownMinValue();
6777 else
6778 TypeNotScalarized = NumParts < VF.getKnownMinValue();
6779 } else
6780 C = InstructionCost::getInvalid();
6781 }
6782 return VectorizationCostTy(C, TypeNotScalarized);
6783 }
6784
6785 InstructionCost LoopVectorizationCostModel::getScalarizationOverhead(
6786 Instruction *I, ElementCount VF, TTI::TargetCostKind CostKind) const {
6787
6788 // There is no mechanism yet to create a scalable scalarization loop,
6789 // so this is currently Invalid.
6790 if (VF.isScalable())
6791 return InstructionCost::getInvalid();
6792
6793 if (VF.isScalar())
6794 return 0;
6795
6796 InstructionCost Cost = 0;
6797 Type *RetTy = ToVectorTy(I->getType(), VF);
6798 if (!RetTy->isVoidTy() &&
6799 (!isa<LoadInst>(I) || !TTI.supportsEfficientVectorElementLoadStore()))
6800 Cost += TTI.getScalarizationOverhead(
6801 cast<VectorType>(RetTy), APInt::getAllOnes(VF.getKnownMinValue()),
6802 /*Insert*/ true,
6803 /*Extract*/ false, CostKind);
6804
6805 // Some targets keep addresses scalar.
6806 if (isa<LoadInst>(I) && !TTI.prefersVectorizedAddressing())
6807 return Cost;
6808
6809 // Some targets support efficient element stores.
6810 if (isa<StoreInst>(I) && TTI.supportsEfficientVectorElementLoadStore())
6811 return Cost;
6812
6813 // Collect operands to consider.
6814 CallInst *CI = dyn_cast<CallInst>(I);
6815 Instruction::op_range Ops = CI ? CI->args() : I->operands();
6816
6817 // Skip operands that do not require extraction/scalarization and do not incur
6818 // any overhead.
6819 SmallVector<Type *> Tys;
6820 for (auto *V : filterExtractingOperands(Ops, VF))
6821 Tys.push_back(MaybeVectorizeType(V->getType(), VF));
6822 return Cost + TTI.getOperandsScalarizationOverhead(
6823 filterExtractingOperands(Ops, VF), Tys, CostKind);
6824 }
6825
6826 void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) {
6827 if (VF.isScalar())
6828 return;
6829 NumPredStores = 0;
6830 for (BasicBlock *BB : TheLoop->blocks()) {
6831 // For each instruction in the old loop.
6832 for (Instruction &I : *BB) {
6833 Value *Ptr = getLoadStorePointerOperand(&I);
6834 if (!Ptr)
6835 continue;
6836
6837 // TODO: We should generate better code and update the cost model for
6838 // predicated uniform stores. Today they are treated as any other
6839 // predicated store (see added test cases in
6840 // invariant-store-vectorization.ll).
6841 if (isa<StoreInst>(&I) && isScalarWithPredication(&I, VF))
6842 NumPredStores++;
6843
6844 if (Legal->isUniformMemOp(I)) {
6845 auto isLegalToScalarize = [&]() {
6846 if (!VF.isScalable())
6847 // Scalarization of fixed length vectors "just works".
6848 return true;
6849
6850 // We have dedicated lowering for unpredicated uniform loads and
6851 // stores. Note that even with tail folding we know that at least
6852 // one lane is active (i.e. generalized predication is not possible
6853 // here), and the logic below depends on this fact.
6854 if (!foldTailByMasking())
6855 return true;
6856
6857 // For scalable vectors, a uniform memop load is always
6858 // uniform-by-parts and we know how to scalarize that.
6859 if (isa<LoadInst>(I))
6860 return true;
6861
6862         // A uniform store isn't necessarily uniform-by-parts
6863 // and we can't assume scalarization.
6864 auto &SI = cast<StoreInst>(I);
6865 return TheLoop->isLoopInvariant(SI.getValueOperand());
6866 };
6867
6868 const InstructionCost GatherScatterCost =
6869 isLegalGatherOrScatter(&I, VF) ?
6870 getGatherScatterCost(&I, VF) : InstructionCost::getInvalid();
6871
6872 // Load: Scalar load + broadcast
6873 // Store: Scalar store + isLoopInvariantStoreValue ? 0 : extract
6874 // FIXME: This cost is a significant under-estimate for tail folded
6875 // memory ops.
6876 const InstructionCost ScalarizationCost = isLegalToScalarize() ?
6877 getUniformMemOpCost(&I, VF) : InstructionCost::getInvalid();
6878
6879         // Choose the better solution for the current VF. Note that invalid
6880         // costs compare as maximally large. If both are invalid, the resulting
6881         // invalid cost signals a failure and a vectorization abort.
6882 if (GatherScatterCost < ScalarizationCost)
6883 setWideningDecision(&I, VF, CM_GatherScatter, GatherScatterCost);
6884 else
6885 setWideningDecision(&I, VF, CM_Scalarize, ScalarizationCost);
6886 continue;
6887 }
6888
6889 // We assume that widening is the best solution when possible.
6890 if (memoryInstructionCanBeWidened(&I, VF)) {
6891 InstructionCost Cost = getConsecutiveMemOpCost(&I, VF);
6892 int ConsecutiveStride = Legal->isConsecutivePtr(
6893 getLoadStoreType(&I), getLoadStorePointerOperand(&I));
6894 assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
6895 "Expected consecutive stride.");
6896 InstWidening Decision =
6897 ConsecutiveStride == 1 ? CM_Widen : CM_Widen_Reverse;
6898 setWideningDecision(&I, VF, Decision, Cost);
6899 continue;
6900 }
6901
6902 // Choose between Interleaving, Gather/Scatter or Scalarization.
6903 InstructionCost InterleaveCost = InstructionCost::getInvalid();
6904 unsigned NumAccesses = 1;
6905 if (isAccessInterleaved(&I)) {
6906 auto Group = getInterleavedAccessGroup(&I);
6907 assert(Group && "Fail to get an interleaved access group.");
6908
6909 // Make one decision for the whole group.
6910 if (getWideningDecision(&I, VF) != CM_Unknown)
6911 continue;
6912
6913 NumAccesses = Group->getNumMembers();
6914 if (interleavedAccessCanBeWidened(&I, VF))
6915 InterleaveCost = getInterleaveGroupCost(&I, VF);
6916 }
6917
6918 InstructionCost GatherScatterCost =
6919 isLegalGatherOrScatter(&I, VF)
6920 ? getGatherScatterCost(&I, VF) * NumAccesses
6921 : InstructionCost::getInvalid();
6922
6923 InstructionCost ScalarizationCost =
6924 getMemInstScalarizationCost(&I, VF) * NumAccesses;
6925
6926       // Choose the best option for the current VF, record this decision,
6927       // and use it during vectorization.
6928 InstructionCost Cost;
6929 InstWidening Decision;
6930 if (InterleaveCost <= GatherScatterCost &&
6931 InterleaveCost < ScalarizationCost) {
6932 Decision = CM_Interleave;
6933 Cost = InterleaveCost;
6934 } else if (GatherScatterCost < ScalarizationCost) {
6935 Decision = CM_GatherScatter;
6936 Cost = GatherScatterCost;
6937 } else {
6938 Decision = CM_Scalarize;
6939 Cost = ScalarizationCost;
6940 }
6941       // If the instruction belongs to an interleave group, the whole group
6942 // receives the same decision. The whole group receives the cost, but
6943 // the cost will actually be assigned to one instruction.
6944 if (auto Group = getInterleavedAccessGroup(&I))
6945 setWideningDecision(Group, VF, Decision, Cost);
6946 else
6947 setWideningDecision(&I, VF, Decision, Cost);
6948 }
6949 }
6950
6951 // Make sure that any load of address and any other address computation
6952 // remains scalar unless there is gather/scatter support. This avoids
6953 // inevitable extracts into address registers, and also has the benefit of
6954 // activating LSR more, since that pass can't optimize vectorized
6955 // addresses.
6956 if (TTI.prefersVectorizedAddressing())
6957 return;
6958
6959 // Start with all scalar pointer uses.
6960 SmallPtrSet<Instruction *, 8> AddrDefs;
6961 for (BasicBlock *BB : TheLoop->blocks())
6962 for (Instruction &I : *BB) {
6963 Instruction *PtrDef =
6964 dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I));
6965 if (PtrDef && TheLoop->contains(PtrDef) &&
6966 getWideningDecision(&I, VF) != CM_GatherScatter)
6967 AddrDefs.insert(PtrDef);
6968 }
6969
6970 // Add all instructions used to generate the addresses.
6971 SmallVector<Instruction *, 4> Worklist;
6972 append_range(Worklist, AddrDefs);
6973 while (!Worklist.empty()) {
6974 Instruction *I = Worklist.pop_back_val();
6975 for (auto &Op : I->operands())
6976 if (auto *InstOp = dyn_cast<Instruction>(Op))
6977 if ((InstOp->getParent() == I->getParent()) && !isa<PHINode>(InstOp) &&
6978 AddrDefs.insert(InstOp).second)
6979 Worklist.push_back(InstOp);
6980 }
6981
6982 for (auto *I : AddrDefs) {
6983 if (isa<LoadInst>(I)) {
6984       // Setting the desired widening decision should ideally be handled by
6985       // the cost functions, but since this involves the task of finding out
6986 // if the loaded register is involved in an address computation, it is
6987 // instead changed here when we know this is the case.
6988 InstWidening Decision = getWideningDecision(I, VF);
6989 if (Decision == CM_Widen || Decision == CM_Widen_Reverse)
6990 // Scalarize a widened load of address.
6991 setWideningDecision(
6992 I, VF, CM_Scalarize,
6993 (VF.getKnownMinValue() *
6994 getMemoryInstructionCost(I, ElementCount::getFixed(1))));
6995 else if (auto Group = getInterleavedAccessGroup(I)) {
6996 // Scalarize an interleave group of address loads.
6997 for (unsigned I = 0; I < Group->getFactor(); ++I) {
6998 if (Instruction *Member = Group->getMember(I))
6999 setWideningDecision(
7000 Member, VF, CM_Scalarize,
7001 (VF.getKnownMinValue() *
7002 getMemoryInstructionCost(Member, ElementCount::getFixed(1))));
7003 }
7004 }
7005 } else
7006       // Make sure I gets scalarized and is given a cost estimate without
7007       // scalarization overhead.
7008 ForcedScalars[VF].insert(I);
7009 }
7010 }
7011
7012 InstructionCost
7013 LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF,
7014 Type *&VectorTy) {
7015 Type *RetTy = I->getType();
7016 if (canTruncateToMinimalBitwidth(I, VF))
7017 RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]);
7018 auto SE = PSE.getSE();
7019 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
7020
7021 auto hasSingleCopyAfterVectorization = [this](Instruction *I,
7022 ElementCount VF) -> bool {
7023 if (VF.isScalar())
7024 return true;
7025
7026 auto Scalarized = InstsToScalarize.find(VF);
7027 assert(Scalarized != InstsToScalarize.end() &&
7028 "VF not yet analyzed for scalarization profitability");
7029 return !Scalarized->second.count(I) &&
7030 llvm::all_of(I->users(), [&](User *U) {
7031 auto *UI = cast<Instruction>(U);
7032 return !Scalarized->second.count(UI);
7033 });
7034 };
7035 (void) hasSingleCopyAfterVectorization;
7036
7037 if (isScalarAfterVectorization(I, VF)) {
7038 // With the exception of GEPs and PHIs, after scalarization there should
7039 // only be one copy of the instruction generated in the loop. This is
7040 // because the VF is either 1, or any instructions that need scalarizing
7041     // have already been dealt with by the time we get here. As a result,
7042 // it means we don't have to multiply the instruction cost by VF.
7043 assert(I->getOpcode() == Instruction::GetElementPtr ||
7044 I->getOpcode() == Instruction::PHI ||
7045 (I->getOpcode() == Instruction::BitCast &&
7046 I->getType()->isPointerTy()) ||
7047 hasSingleCopyAfterVectorization(I, VF));
7048 VectorTy = RetTy;
7049 } else
7050 VectorTy = ToVectorTy(RetTy, VF);
7051
7052 // TODO: We need to estimate the cost of intrinsic calls.
7053 switch (I->getOpcode()) {
7054 case Instruction::GetElementPtr:
7055 // We mark this instruction as zero-cost because the cost of GEPs in
7056 // vectorized code depends on whether the corresponding memory instruction
7057 // is scalarized or not. Therefore, we handle GEPs with the memory
7058 // instruction cost.
7059 return 0;
7060 case Instruction::Br: {
7061 // In cases of scalarized and predicated instructions, there will be VF
7062 // predicated blocks in the vectorized loop. Each branch around these
7063     // blocks also requires an extract of its vector compare i1 element.
7064 bool ScalarPredicatedBB = false;
7065 BranchInst *BI = cast<BranchInst>(I);
7066 if (VF.isVector() && BI->isConditional() &&
7067 (PredicatedBBsAfterVectorization[VF].count(BI->getSuccessor(0)) ||
7068 PredicatedBBsAfterVectorization[VF].count(BI->getSuccessor(1))))
7069 ScalarPredicatedBB = true;
7070
7071 if (ScalarPredicatedBB) {
7072       // It is not possible to scalarize a scalable vector with predicated instructions.
7073 if (VF.isScalable())
7074 return InstructionCost::getInvalid();
7075 // Return cost for branches around scalarized and predicated blocks.
7076 auto *Vec_i1Ty =
7077 VectorType::get(IntegerType::getInt1Ty(RetTy->getContext()), VF);
7078 return (
7079 TTI.getScalarizationOverhead(
7080 Vec_i1Ty, APInt::getAllOnes(VF.getFixedValue()),
7081 /*Insert*/ false, /*Extract*/ true, CostKind) +
7082 (TTI.getCFInstrCost(Instruction::Br, CostKind) * VF.getFixedValue()));
7083 } else if (I->getParent() == TheLoop->getLoopLatch() || VF.isScalar())
7084 // The back-edge branch will remain, as will all scalar branches.
7085 return TTI.getCFInstrCost(Instruction::Br, CostKind);
7086 else
7087 // This branch will be eliminated by if-conversion.
7088 return 0;
7089 // Note: We currently assume zero cost for an unconditional branch inside
7090 // a predicated block since it will become a fall-through, although we
7091 // may decide in the future to call TTI for all branches.
7092 }
7093 case Instruction::PHI: {
7094 auto *Phi = cast<PHINode>(I);
7095
7096 // First-order recurrences are replaced by vector shuffles inside the loop.
7097 if (VF.isVector() && Legal->isFixedOrderRecurrence(Phi)) {
7098 SmallVector<int> Mask(VF.getKnownMinValue());
7099 std::iota(Mask.begin(), Mask.end(), VF.getKnownMinValue() - 1);
7100 return TTI.getShuffleCost(TargetTransformInfo::SK_Splice,
7101 cast<VectorType>(VectorTy), Mask, CostKind,
7102 VF.getKnownMinValue() - 1);
7103 }
7104
7105 // Phi nodes in non-header blocks (not inductions, reductions, etc.) are
7106 // converted into select instructions. We require N - 1 selects per phi
7107 // node, where N is the number of incoming values.
7108 if (VF.isVector() && Phi->getParent() != TheLoop->getHeader())
7109 return (Phi->getNumIncomingValues() - 1) *
7110 TTI.getCmpSelInstrCost(
7111 Instruction::Select, ToVectorTy(Phi->getType(), VF),
7112 ToVectorTy(Type::getInt1Ty(Phi->getContext()), VF),
7113 CmpInst::BAD_ICMP_PREDICATE, CostKind);
7114
7115 return TTI.getCFInstrCost(Instruction::PHI, CostKind);
7116 }
7117 case Instruction::UDiv:
7118 case Instruction::SDiv:
7119 case Instruction::URem:
7120 case Instruction::SRem:
7121 if (VF.isVector() && isPredicatedInst(I)) {
7122 const auto [ScalarCost, SafeDivisorCost] = getDivRemSpeculationCost(I, VF);
7123 return isDivRemScalarWithPredication(ScalarCost, SafeDivisorCost) ?
7124 ScalarCost : SafeDivisorCost;
7125 }
7126 // We've proven all lanes safe to speculate, fall through.
7127 [[fallthrough]];
7128 case Instruction::Add:
7129 case Instruction::FAdd:
7130 case Instruction::Sub:
7131 case Instruction::FSub:
7132 case Instruction::Mul:
7133 case Instruction::FMul:
7134 case Instruction::FDiv:
7135 case Instruction::FRem:
7136 case Instruction::Shl:
7137 case Instruction::LShr:
7138 case Instruction::AShr:
7139 case Instruction::And:
7140 case Instruction::Or:
7141 case Instruction::Xor: {
7142 // Since we will replace the stride by 1 the multiplication should go away.
7143 if (I->getOpcode() == Instruction::Mul && isStrideMul(I, Legal))
7144 return 0;
7145
7146 // Detect reduction patterns
7147 if (auto RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind))
7148 return *RedCost;
7149
7150 // Certain instructions can be cheaper to vectorize if they have a constant
7151 // second vector operand. One example of this are shifts on x86.
7152 Value *Op2 = I->getOperand(1);
7153 auto Op2Info = TTI.getOperandInfo(Op2);
7154 if (Op2Info.Kind == TargetTransformInfo::OK_AnyValue && Legal->isUniform(Op2))
7155 Op2Info.Kind = TargetTransformInfo::OK_UniformValue;
7156
7157 SmallVector<const Value *, 4> Operands(I->operand_values());
7158 return TTI.getArithmeticInstrCost(
7159 I->getOpcode(), VectorTy, CostKind,
7160 {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
7161 Op2Info, Operands, I);
7162 }
7163 case Instruction::FNeg: {
7164 return TTI.getArithmeticInstrCost(
7165 I->getOpcode(), VectorTy, CostKind,
7166 {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
7167 {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
7168 I->getOperand(0), I);
7169 }
7170 case Instruction::Select: {
7171 SelectInst *SI = cast<SelectInst>(I);
7172 const SCEV *CondSCEV = SE->getSCEV(SI->getCondition());
7173 bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop));
7174
7175 const Value *Op0, *Op1;
7176 using namespace llvm::PatternMatch;
7177 if (!ScalarCond && (match(I, m_LogicalAnd(m_Value(Op0), m_Value(Op1))) ||
7178 match(I, m_LogicalOr(m_Value(Op0), m_Value(Op1))))) {
7179 // select x, y, false --> x & y
7180 // select x, true, y --> x | y
7181 const auto [Op1VK, Op1VP] = TTI::getOperandInfo(Op0);
7182 const auto [Op2VK, Op2VP] = TTI::getOperandInfo(Op1);
7183 assert(Op0->getType()->getScalarSizeInBits() == 1 &&
7184 Op1->getType()->getScalarSizeInBits() == 1);
7185
7186 SmallVector<const Value *, 2> Operands{Op0, Op1};
7187 return TTI.getArithmeticInstrCost(
7188 match(I, m_LogicalOr()) ? Instruction::Or : Instruction::And, VectorTy,
7189 CostKind, {Op1VK, Op1VP}, {Op2VK, Op2VP}, Operands, I);
7190 }
7191
7192 Type *CondTy = SI->getCondition()->getType();
7193 if (!ScalarCond)
7194 CondTy = VectorType::get(CondTy, VF);
7195
7196 CmpInst::Predicate Pred = CmpInst::BAD_ICMP_PREDICATE;
7197 if (auto *Cmp = dyn_cast<CmpInst>(SI->getCondition()))
7198 Pred = Cmp->getPredicate();
7199 return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy, Pred,
7200 CostKind, I);
7201 }
7202 case Instruction::ICmp:
7203 case Instruction::FCmp: {
7204 Type *ValTy = I->getOperand(0)->getType();
7205 Instruction *Op0AsInstruction = dyn_cast<Instruction>(I->getOperand(0));
7206 if (canTruncateToMinimalBitwidth(Op0AsInstruction, VF))
7207 ValTy = IntegerType::get(ValTy->getContext(), MinBWs[Op0AsInstruction]);
7208 VectorTy = ToVectorTy(ValTy, VF);
7209 return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, nullptr,
7210 cast<CmpInst>(I)->getPredicate(), CostKind,
7211 I);
7212 }
7213 case Instruction::Store:
7214 case Instruction::Load: {
7215 ElementCount Width = VF;
7216 if (Width.isVector()) {
7217 InstWidening Decision = getWideningDecision(I, Width);
7218 assert(Decision != CM_Unknown &&
7219 "CM decision should be taken at this point");
7220 if (getWideningCost(I, VF) == InstructionCost::getInvalid())
7221 return InstructionCost::getInvalid();
7222 if (Decision == CM_Scalarize)
7223 Width = ElementCount::getFixed(1);
7224 }
7225 VectorTy = ToVectorTy(getLoadStoreType(I), Width);
7226 return getMemoryInstructionCost(I, VF);
7227 }
7228 case Instruction::BitCast:
7229 if (I->getType()->isPointerTy())
7230 return 0;
7231 [[fallthrough]];
7232 case Instruction::ZExt:
7233 case Instruction::SExt:
7234 case Instruction::FPToUI:
7235 case Instruction::FPToSI:
7236 case Instruction::FPExt:
7237 case Instruction::PtrToInt:
7238 case Instruction::IntToPtr:
7239 case Instruction::SIToFP:
7240 case Instruction::UIToFP:
7241 case Instruction::Trunc:
7242 case Instruction::FPTrunc: {
7243 // Computes the CastContextHint from a Load/Store instruction.
7244 auto ComputeCCH = [&](Instruction *I) -> TTI::CastContextHint {
7245 assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
7246 "Expected a load or a store!");
7247
7248 if (VF.isScalar() || !TheLoop->contains(I))
7249 return TTI::CastContextHint::Normal;
7250
7251 switch (getWideningDecision(I, VF)) {
7252 case LoopVectorizationCostModel::CM_GatherScatter:
7253 return TTI::CastContextHint::GatherScatter;
7254 case LoopVectorizationCostModel::CM_Interleave:
7255 return TTI::CastContextHint::Interleave;
7256 case LoopVectorizationCostModel::CM_Scalarize:
7257 case LoopVectorizationCostModel::CM_Widen:
7258 return Legal->isMaskRequired(I) ? TTI::CastContextHint::Masked
7259 : TTI::CastContextHint::Normal;
7260 case LoopVectorizationCostModel::CM_Widen_Reverse:
7261 return TTI::CastContextHint::Reversed;
7262 case LoopVectorizationCostModel::CM_Unknown:
7263 llvm_unreachable("Instr did not go through cost modelling?");
7264 }
7265
7266 llvm_unreachable("Unhandled case!");
7267 };
7268
7269 unsigned Opcode = I->getOpcode();
7270 TTI::CastContextHint CCH = TTI::CastContextHint::None;
7271 // For Trunc, the context is the only user, which must be a StoreInst.
7272 if (Opcode == Instruction::Trunc || Opcode == Instruction::FPTrunc) {
7273 if (I->hasOneUse())
7274 if (StoreInst *Store = dyn_cast<StoreInst>(*I->user_begin()))
7275 CCH = ComputeCCH(Store);
7276 }
7277 // For Z/Sext, the context is the operand, which must be a LoadInst.
7278 else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt ||
7279 Opcode == Instruction::FPExt) {
7280 if (LoadInst *Load = dyn_cast<LoadInst>(I->getOperand(0)))
7281 CCH = ComputeCCH(Load);
7282 }
7283
7284 // We optimize the truncation of induction variables having constant
7285 // integer steps. The cost of these truncations is the same as the scalar
7286 // operation.
7287 if (isOptimizableIVTruncate(I, VF)) {
7288 auto *Trunc = cast<TruncInst>(I);
7289 return TTI.getCastInstrCost(Instruction::Trunc, Trunc->getDestTy(),
7290 Trunc->getSrcTy(), CCH, CostKind, Trunc);
7291 }
7292
7293 // Detect reduction patterns
7294 if (auto RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind))
7295 return *RedCost;
7296
7297 Type *SrcScalarTy = I->getOperand(0)->getType();
7298 Type *SrcVecTy =
7299 VectorTy->isVectorTy() ? ToVectorTy(SrcScalarTy, VF) : SrcScalarTy;
7300 if (canTruncateToMinimalBitwidth(I, VF)) {
7301 // This cast is going to be shrunk. This may remove the cast or it might
7302       // turn it into a slightly different cast. For example, if MinBW == 16,
7303 // "zext i8 %1 to i32" becomes "zext i8 %1 to i16".
7304 //
7305 // Calculate the modified src and dest types.
7306 Type *MinVecTy = VectorTy;
7307 if (Opcode == Instruction::Trunc) {
7308 SrcVecTy = smallestIntegerVectorType(SrcVecTy, MinVecTy);
7309 VectorTy =
7310 largestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy);
7311 } else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt) {
7312 SrcVecTy = largestIntegerVectorType(SrcVecTy, MinVecTy);
7313 VectorTy =
7314 smallestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy);
7315 }
7316 }
7317
7318 return TTI.getCastInstrCost(Opcode, VectorTy, SrcVecTy, CCH, CostKind, I);
7319 }
7320 case Instruction::Call: {
7321 if (RecurrenceDescriptor::isFMulAddIntrinsic(I))
7322 if (auto RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind))
7323 return *RedCost;
7324 bool NeedToScalarize;
7325 CallInst *CI = cast<CallInst>(I);
7326 InstructionCost CallCost = getVectorCallCost(CI, VF, NeedToScalarize);
7327 if (getVectorIntrinsicIDForCall(CI, TLI)) {
7328 InstructionCost IntrinsicCost = getVectorIntrinsicCost(CI, VF);
7329 return std::min(CallCost, IntrinsicCost);
7330 }
7331 return CallCost;
7332 }
7333 case Instruction::ExtractValue:
7334 return TTI.getInstructionCost(I, TTI::TCK_RecipThroughput);
7335 case Instruction::Alloca:
7336 // We cannot easily widen alloca to a scalable alloca, as
7337 // the result would need to be a vector of pointers.
7338 if (VF.isScalable())
7339 return InstructionCost::getInvalid();
7340 [[fallthrough]];
7341 default:
7342 // This opcode is unknown. Assume that it is the same as 'mul'.
7343 return TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind);
7344 } // end of switch.
7345 }
7346
7347 char LoopVectorize::ID = 0;
7348
7349 static const char lv_name[] = "Loop Vectorization";
7350
7351 INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false)
7352 INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
7353 INITIALIZE_PASS_DEPENDENCY(BasicAAWrapperPass)
7354 INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass)
7355 INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
7356 INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass)
7357 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
7358 INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
7359 INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
7360 INITIALIZE_PASS_DEPENDENCY(LoopAccessLegacyAnalysis)
7361 INITIALIZE_PASS_DEPENDENCY(DemandedBitsWrapperPass)
7362 INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)
7363 INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass)
7364 INITIALIZE_PASS_DEPENDENCY(InjectTLIMappingsLegacy)
7365 INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false)
7366
7367 namespace llvm {
7368
7369 Pass *createLoopVectorizePass() { return new LoopVectorize(); }
7370
7371 Pass *createLoopVectorizePass(bool InterleaveOnlyWhenForced,
7372 bool VectorizeOnlyWhenForced) {
7373 return new LoopVectorize(InterleaveOnlyWhenForced, VectorizeOnlyWhenForced);
7374 }
7375
7376 } // end namespace llvm
7377
7378 void LoopVectorizationCostModel::collectValuesToIgnore() {
7379 // Ignore ephemeral values.
7380 CodeMetrics::collectEphemeralValues(TheLoop, AC, ValuesToIgnore);
7381
7382 // Find all stores to invariant variables. Since they are going to sink
7383   // outside the loop, we do not need to calculate their cost.
7384 for (BasicBlock *BB : TheLoop->blocks())
7385 for (Instruction &I : *BB) {
7386 StoreInst *SI;
7387 if ((SI = dyn_cast<StoreInst>(&I)) &&
7388 Legal->isInvariantAddressOfReduction(SI->getPointerOperand()))
7389 ValuesToIgnore.insert(&I);
7390 }
7391
7392 // Ignore type-promoting instructions we identified during reduction
7393 // detection.
7394 for (const auto &Reduction : Legal->getReductionVars()) {
7395 const RecurrenceDescriptor &RedDes = Reduction.second;
7396 const SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts();
7397 VecValuesToIgnore.insert(Casts.begin(), Casts.end());
7398 }
7399 // Ignore type-casting instructions we identified during induction
7400 // detection.
7401 for (const auto &Induction : Legal->getInductionVars()) {
7402 const InductionDescriptor &IndDes = Induction.second;
7403 const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts();
7404 VecValuesToIgnore.insert(Casts.begin(), Casts.end());
7405 }
7406 }
7407
7408 void LoopVectorizationCostModel::collectInLoopReductions() {
7409 for (const auto &Reduction : Legal->getReductionVars()) {
7410 PHINode *Phi = Reduction.first;
7411 const RecurrenceDescriptor &RdxDesc = Reduction.second;
7412
7413 // We don't collect reductions that are type promoted (yet).
7414 if (RdxDesc.getRecurrenceType() != Phi->getType())
7415 continue;
7416
7417 // If the target would prefer this reduction to happen "in-loop", then we
7418 // want to record it as such.
7419 unsigned Opcode = RdxDesc.getOpcode();
7420 if (!PreferInLoopReductions && !useOrderedReductions(RdxDesc) &&
7421 !TTI.preferInLoopReduction(Opcode, Phi->getType(),
7422 TargetTransformInfo::ReductionFlags()))
7423 continue;
7424
7425 // Check that we can correctly put the reductions into the loop, by
7426 // finding the chain of operations that leads from the phi to the loop
7427 // exit value.
7428 SmallVector<Instruction *, 4> ReductionOperations =
7429 RdxDesc.getReductionOpChain(Phi, TheLoop);
7430 bool InLoop = !ReductionOperations.empty();
7431 if (InLoop) {
7432 InLoopReductionChains[Phi] = ReductionOperations;
7433 // Add the elements to InLoopReductionImmediateChains for cost modelling.
7434 Instruction *LastChain = Phi;
7435 for (auto *I : ReductionOperations) {
7436 InLoopReductionImmediateChains[I] = LastChain;
7437 LastChain = I;
7438 }
7439 }
7440 LLVM_DEBUG(dbgs() << "LV: Using " << (InLoop ? "inloop" : "out of loop")
7441 << " reduction for phi: " << *Phi << "\n");
7442 }
7443 }
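// Illustrative example for collectInLoopReductions above (hypothetical IR
// names): for an in-loop chain %phi -> %add1 -> %add2, the code records
// InLoopReductionChains[%phi] = {%add1, %add2} and maps %add1 -> %phi and
// %add2 -> %add1 in InLoopReductionImmediateChains, which is the mapping
// getReductionPatternCost walks back to find the reduction phi.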
7444
7445 // TODO: we could return a pair of values that specify the max VF and
7446 // min VF, to be used in `buildVPlans(MinVF, MaxVF)` instead of
7447 // `buildVPlans(VF, VF)`. We cannot do it because VPlan at the moment
7448 // doesn't have a cost model that can choose which plan to execute if
7449 // more than one is generated.
7450 static unsigned determineVPlanVF(const unsigned WidestVectorRegBits,
7451 LoopVectorizationCostModel &CM) {
7452 unsigned WidestType;
7453 std::tie(std::ignore, WidestType) = CM.getSmallestAndWidestTypes();
7454 return WidestVectorRegBits / WidestType;
7455 }
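// Illustrative example for determineVPlanVF above: with 256-bit vector
// registers and a widest scalar type of 32 bits, the returned VF is
// 256 / 32 = 8.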
7456
7457 VectorizationFactor
7458 LoopVectorizationPlanner::planInVPlanNativePath(ElementCount UserVF) {
7459 assert(!UserVF.isScalable() && "scalable vectors not yet supported");
7460 ElementCount VF = UserVF;
7461   // Outer loop handling: outer loops may require CFG and instruction-level
7462 // transformations before even evaluating whether vectorization is profitable.
7463 // Since we cannot modify the incoming IR, we need to build VPlan upfront in
7464 // the vectorization pipeline.
7465 if (!OrigLoop->isInnermost()) {
7466 // If the user doesn't provide a vectorization factor, determine a
7467 // reasonable one.
7468 if (UserVF.isZero()) {
7469 VF = ElementCount::getFixed(determineVPlanVF(
7470 TTI->getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector)
7471 .getFixedValue(),
7472 CM));
7473 LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n");
7474
7475 // Make sure we have a VF > 1 for stress testing.
7476 if (VPlanBuildStressTest && (VF.isScalar() || VF.isZero())) {
7477 LLVM_DEBUG(dbgs() << "LV: VPlan stress testing: "
7478 << "overriding computed VF.\n");
7479 VF = ElementCount::getFixed(4);
7480 }
7481 }
7482 assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
7483 assert(isPowerOf2_32(VF.getKnownMinValue()) &&
7484 "VF needs to be a power of two");
7485 LLVM_DEBUG(dbgs() << "LV: Using " << (!UserVF.isZero() ? "user " : "")
7486 << "VF " << VF << " to build VPlans.\n");
7487 buildVPlans(VF, VF);
7488
7489 // For VPlan build stress testing, we bail out after VPlan construction.
7490 if (VPlanBuildStressTest)
7491 return VectorizationFactor::Disabled();
7492
7493 return {VF, 0 /*Cost*/, 0 /* ScalarCost */};
7494 }
7495
7496 LLVM_DEBUG(
7497 dbgs() << "LV: Not vectorizing. Inner loops aren't supported in the "
7498 "VPlan-native path.\n");
7499 return VectorizationFactor::Disabled();
7500 }
7501
7502 std::optional<VectorizationFactor>
7503 LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
7504 assert(OrigLoop->isInnermost() && "Inner loop expected.");
7505 FixedScalableVFPair MaxFactors = CM.computeMaxVF(UserVF, UserIC);
7506   if (!MaxFactors) // Cases that should not be vectorized or interleaved.
7507 return std::nullopt;
7508
7509 // Invalidate interleave groups if all blocks of loop will be predicated.
7510 if (CM.blockNeedsPredicationForAnyReason(OrigLoop->getHeader()) &&
7511 !useMaskedInterleavedAccesses(*TTI)) {
7512 LLVM_DEBUG(
7513 dbgs()
7514 << "LV: Invalidate all interleaved groups due to fold-tail by masking "
7515 "which requires masked-interleaved support.\n");
7516 if (CM.InterleaveInfo.invalidateGroups())
7517 // Invalidating interleave groups also requires invalidating all decisions
7518 // based on them, which includes widening decisions and uniform and scalar
7519 // values.
7520 CM.invalidateCostModelingDecisions();
7521 }
7522
7523 ElementCount MaxUserVF =
7524 UserVF.isScalable() ? MaxFactors.ScalableVF : MaxFactors.FixedVF;
7525 bool UserVFIsLegal = ElementCount::isKnownLE(UserVF, MaxUserVF);
7526 if (!UserVF.isZero() && UserVFIsLegal) {
7527 assert(isPowerOf2_32(UserVF.getKnownMinValue()) &&
7528 "VF needs to be a power of two");
7529 // Collect the instructions (and their associated costs) that will be more
7530 // profitable to scalarize.
7531 if (CM.selectUserVectorizationFactor(UserVF)) {
7532 LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n");
7533 CM.collectInLoopReductions();
7534 buildVPlansWithVPRecipes(UserVF, UserVF);
7535 LLVM_DEBUG(printPlans(dbgs()));
7536 return {{UserVF, 0, 0}};
7537 } else
7538 reportVectorizationInfo("UserVF ignored because of invalid costs.",
7539 "InvalidCost", ORE, OrigLoop);
7540 }
7541
7542 // Populate the set of Vectorization Factor Candidates.
7543 ElementCountSet VFCandidates;
7544 for (auto VF = ElementCount::getFixed(1);
7545 ElementCount::isKnownLE(VF, MaxFactors.FixedVF); VF *= 2)
7546 VFCandidates.insert(VF);
7547 for (auto VF = ElementCount::getScalable(1);
7548 ElementCount::isKnownLE(VF, MaxFactors.ScalableVF); VF *= 2)
7549 VFCandidates.insert(VF);
7550
7551 for (const auto &VF : VFCandidates) {
7552 // Collect Uniform and Scalar instructions after vectorization with VF.
7553 CM.collectUniformsAndScalars(VF);
7554
7555 // Collect the instructions (and their associated costs) that will be more
7556 // profitable to scalarize.
7557 if (VF.isVector())
7558 CM.collectInstsToScalarize(VF);
7559 }
7560
7561 CM.collectInLoopReductions();
7562 buildVPlansWithVPRecipes(ElementCount::getFixed(1), MaxFactors.FixedVF);
7563 buildVPlansWithVPRecipes(ElementCount::getScalable(1), MaxFactors.ScalableVF);
7564
7565 LLVM_DEBUG(printPlans(dbgs()));
7566 if (!MaxFactors.hasVector())
7567 return VectorizationFactor::Disabled();
7568
7569 // Select the optimal vectorization factor.
7570 VectorizationFactor VF = CM.selectVectorizationFactor(VFCandidates);
7571 assert((VF.Width.isScalar() || VF.ScalarCost > 0) && "when vectorizing, the scalar cost must be non-zero.");
7572 return VF;
7573 }
7574
7575 VPlan &LoopVectorizationPlanner::getBestPlanFor(ElementCount VF) const {
7576 assert(count_if(VPlans,
7577 [VF](const VPlanPtr &Plan) { return Plan->hasVF(VF); }) ==
7578 1 &&
7579 "Best VF has not a single VPlan.");
7580
7581 for (const VPlanPtr &Plan : VPlans) {
7582 if (Plan->hasVF(VF))
7583 return *Plan.get();
7584 }
7585 llvm_unreachable("No plan found!");
7586 }
7587
7588 static void AddRuntimeUnrollDisableMetaData(Loop *L) {
7589 SmallVector<Metadata *, 4> MDs;
7590 // Reserve first location for self reference to the LoopID metadata node.
7591 MDs.push_back(nullptr);
7592 bool IsUnrollMetadata = false;
7593 MDNode *LoopID = L->getLoopID();
7594 if (LoopID) {
7595 // First find existing loop unrolling disable metadata.
7596 for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) {
7597 auto *MD = dyn_cast<MDNode>(LoopID->getOperand(i));
7598 if (MD) {
7599 const auto *S = dyn_cast<MDString>(MD->getOperand(0));
7600 IsUnrollMetadata =
7601 S && S->getString().startswith("llvm.loop.unroll.disable");
7602 }
7603 MDs.push_back(LoopID->getOperand(i));
7604 }
7605 }
7606
7607 if (!IsUnrollMetadata) {
7608 // Add runtime unroll disable metadata.
7609 LLVMContext &Context = L->getHeader()->getContext();
7610 SmallVector<Metadata *, 1> DisableOperands;
7611 DisableOperands.push_back(
7612 MDString::get(Context, "llvm.loop.unroll.runtime.disable"));
7613 MDNode *DisableNode = MDNode::get(Context, DisableOperands);
7614 MDs.push_back(DisableNode);
7615 MDNode *NewLoopID = MDNode::get(Context, MDs);
7616 // Set operand 0 to refer to the loop id itself.
7617 NewLoopID->replaceOperandWith(0, NewLoopID);
7618 L->setLoopID(NewLoopID);
7619 }
7620 }
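// Sketch of the resulting metadata shape (illustrative, operand lists
// abbreviated):
//   !llvm.loop !0
//   !0 = distinct !{!0, ..., !1}
//   !1 = !{!"llvm.loop.unroll.runtime.disable"}
// i.e. the new LoopID keeps a self-reference in operand 0 and gains the
// runtime-unroll-disable string when it was not already present.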
7621
7622 void LoopVectorizationPlanner::executePlan(ElementCount BestVF, unsigned BestUF,
7623 VPlan &BestVPlan,
7624 InnerLoopVectorizer &ILV,
7625 DominatorTree *DT,
7626 bool IsEpilogueVectorization) {
7627 assert(BestVPlan.hasVF(BestVF) &&
7628 "Trying to execute plan with unsupported VF");
7629 assert(BestVPlan.hasUF(BestUF) &&
7630 "Trying to execute plan with unsupported UF");
7631
7632 LLVM_DEBUG(dbgs() << "Executing best plan with VF=" << BestVF << ", UF=" << BestUF
7633 << '\n');
7634
7635 // Workaround! Compute the trip count of the original loop and cache it
7636 // before we start modifying the CFG. This code has a systemic problem
7637 // wherein it tries to run analysis over partially constructed IR; this is
7638 // wrong, and not simply for SCEV. The trip count of the original loop
7639 // simply happens to be prone to hitting this in practice. In theory, we
7640 // can hit the same issue for any SCEV, or ValueTracking query done during
7641 // mutation. See PR49900.
7642 ILV.getOrCreateTripCount(OrigLoop->getLoopPreheader());
7643
7644 if (!IsEpilogueVectorization)
7645 VPlanTransforms::optimizeForVFAndUF(BestVPlan, BestVF, BestUF, PSE);
7646
7647 // Perform the actual loop transformation.
7648
7649 // 1. Set up the skeleton for vectorization, including vector pre-header and
7650 // middle block. The vector loop is created during VPlan execution.
7651 VPTransformState State{BestVF, BestUF, LI, DT, ILV.Builder, &ILV, &BestVPlan};
7652 Value *CanonicalIVStartValue;
7653 std::tie(State.CFG.PrevBB, CanonicalIVStartValue) =
7654 ILV.createVectorizedLoopSkeleton();
7655
7656 // Only use noalias metadata when using memory checks guaranteeing no overlap
7657 // across all iterations.
7658 const LoopAccessInfo *LAI = ILV.Legal->getLAI();
7659 if (LAI && !LAI->getRuntimePointerChecking()->getChecks().empty() &&
7660 !LAI->getRuntimePointerChecking()->getDiffChecks()) {
7661
7662 // We currently don't use LoopVersioning for the actual loop cloning but we
7663 // still use it to add the noalias metadata.
7664 // TODO: Find a better way to re-use LoopVersioning functionality to add
7665 // metadata.
7666 State.LVer = std::make_unique<LoopVersioning>(
7667 *LAI, LAI->getRuntimePointerChecking()->getChecks(), OrigLoop, LI, DT,
7668 PSE.getSE());
7669 State.LVer->prepareNoAliasMetadata();
7670 }
7671
7672 ILV.collectPoisonGeneratingRecipes(State);
7673
7674 ILV.printDebugTracesAtStart();
7675
7676 //===------------------------------------------------===//
7677 //
7678   // Notice: any optimization or new instruction that goes
7679 // into the code below should also be implemented in
7680 // the cost-model.
7681 //
7682 //===------------------------------------------------===//
7683
7684 // 2. Copy and widen instructions from the old loop into the new loop.
7685 BestVPlan.prepareToExecute(ILV.getOrCreateTripCount(nullptr),
7686 ILV.getOrCreateVectorTripCount(nullptr),
7687 CanonicalIVStartValue, State,
7688 IsEpilogueVectorization);
7689
7690 BestVPlan.execute(&State);
7691
7692 // Keep all loop hints from the original loop on the vector loop (we'll
7693 // replace the vectorizer-specific hints below).
7694 MDNode *OrigLoopID = OrigLoop->getLoopID();
7695
7696 std::optional<MDNode *> VectorizedLoopID =
7697 makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
7698 LLVMLoopVectorizeFollowupVectorized});
7699
7700 VPBasicBlock *HeaderVPBB =
7701 BestVPlan.getVectorLoopRegion()->getEntryBasicBlock();
7702 Loop *L = LI->getLoopFor(State.CFG.VPBB2IRBB[HeaderVPBB]);
7703 if (VectorizedLoopID)
7704 L->setLoopID(*VectorizedLoopID);
7705 else {
7706 // Keep all loop hints from the original loop on the vector loop (we'll
7707 // replace the vectorizer-specific hints below).
7708 if (MDNode *LID = OrigLoop->getLoopID())
7709 L->setLoopID(LID);
7710
7711 LoopVectorizeHints Hints(L, true, *ORE);
7712 Hints.setAlreadyVectorized();
7713 }
7714 AddRuntimeUnrollDisableMetaData(L);
7715
7716 // 3. Fix the vectorized code: take care of header phi's, live-outs,
7717 // predication, updating analyses.
7718 ILV.fixVectorizedLoop(State, BestVPlan);
7719
7720 ILV.printDebugTracesAtEnd();
7721 }
7722
7723 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
7724 void LoopVectorizationPlanner::printPlans(raw_ostream &O) {
7725 for (const auto &Plan : VPlans)
7726 if (PrintVPlansInDotFormat)
7727 Plan->printDOT(O);
7728 else
7729 Plan->print(O);
7730 }
7731 #endif
7732
7733 Value *InnerLoopUnroller::getBroadcastInstrs(Value *V) { return V; }
7734
7735 //===--------------------------------------------------------------------===//
7736 // EpilogueVectorizerMainLoop
7737 //===--------------------------------------------------------------------===//
7738
7739 /// This function is partially responsible for generating the control flow
7740 /// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization.
7741 std::pair<BasicBlock *, Value *>
7742 EpilogueVectorizerMainLoop::createEpilogueVectorizedLoopSkeleton() {
7743 createVectorLoopSkeleton("");
7744
7745 // Generate the code to check the minimum iteration count of the vector
7746 // epilogue (see below).
7747 EPI.EpilogueIterationCountCheck =
7748 emitIterationCountCheck(LoopScalarPreHeader, true);
7749 EPI.EpilogueIterationCountCheck->setName("iter.check");
7750
7751 // Generate the code to check any assumptions that we've made for SCEV
7752 // expressions.
7753 EPI.SCEVSafetyCheck = emitSCEVChecks(LoopScalarPreHeader);
7754
7755 // Generate the code that checks at runtime if arrays overlap. We put the
7756 // checks into a separate block to make the more common case of few elements
7757 // faster.
7758 EPI.MemSafetyCheck = emitMemRuntimeChecks(LoopScalarPreHeader);
7759
7760 // Generate the iteration count check for the main loop, *after* the check
7761 // for the epilogue loop, so that the path-length is shorter for the case
7762 // that goes directly through the vector epilogue. The longer-path length for
7763   // the main loop is compensated for by the gain from vectorizing the larger
7764 // trip count. Note: the branch will get updated later on when we vectorize
7765 // the epilogue.
7766 EPI.MainLoopIterationCountCheck =
7767 emitIterationCountCheck(LoopScalarPreHeader, false);
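  // At this point the guard chain is, roughly:
  //   iter.check -> [scev.check] -> [mem.check] -> vector.main.loop.iter.check
  //   -> vector.ph (main vector loop)
  // with each check able to bypass to the scalar preheader; the check branches
  // are updated in the second pass once the epilogue loop is created.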
7768
7769 // Generate the induction variable.
7770 EPI.VectorTripCount = getOrCreateVectorTripCount(LoopVectorPreHeader);
7771
7772 // Skip induction resume value creation here because they will be created in
7773 // the second pass for the scalar loop. The induction resume values for the
7774 // inductions in the epilogue loop are created before executing the plan for
7775 // the epilogue loop.
7776
7777 return {completeLoopSkeleton(), nullptr};
7778 }
7779
7780 void EpilogueVectorizerMainLoop::printDebugTracesAtStart() {
7781 LLVM_DEBUG({
7782 dbgs() << "Create Skeleton for epilogue vectorized loop (first pass)\n"
7783 << "Main Loop VF:" << EPI.MainLoopVF
7784 << ", Main Loop UF:" << EPI.MainLoopUF
7785 << ", Epilogue Loop VF:" << EPI.EpilogueVF
7786 << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n";
7787 });
7788 }
7789
7790 void EpilogueVectorizerMainLoop::printDebugTracesAtEnd() {
7791 DEBUG_WITH_TYPE(VerboseDebug, {
7792 dbgs() << "intermediate fn:\n"
7793 << *OrigLoop->getHeader()->getParent() << "\n";
7794 });
7795 }
7796
7797 BasicBlock *
7798 EpilogueVectorizerMainLoop::emitIterationCountCheck(BasicBlock *Bypass,
7799 bool ForEpilogue) {
7800 assert(Bypass && "Expected valid bypass basic block.");
7801 ElementCount VFactor = ForEpilogue ? EPI.EpilogueVF : VF;
7802 unsigned UFactor = ForEpilogue ? EPI.EpilogueUF : UF;
7803 Value *Count = getOrCreateTripCount(LoopVectorPreHeader);
7804 // Reuse existing vector loop preheader for TC checks.
7805 // Note that new preheader block is generated for vector loop.
7806 BasicBlock *const TCCheckBlock = LoopVectorPreHeader;
7807 IRBuilder<> Builder(TCCheckBlock->getTerminator());
7808
7809 // Generate code to check if the loop's trip count is less than VF * UF of the
7810 // main vector loop.
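  // Note: when a scalar epilogue is required, ULE (rather than ULT) ensures
  // that a trip count exactly equal to VF * UF still takes the bypass, leaving
  // at least one iteration for the scalar remainder loop.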
7811 auto P = Cost->requiresScalarEpilogue(ForEpilogue ? EPI.EpilogueVF : VF) ?
7812 ICmpInst::ICMP_ULE : ICmpInst::ICMP_ULT;
7813
7814 Value *CheckMinIters = Builder.CreateICmp(
7815 P, Count, createStepForVF(Builder, Count->getType(), VFactor, UFactor),
7816 "min.iters.check");
7817
7818 if (!ForEpilogue)
7819 TCCheckBlock->setName("vector.main.loop.iter.check");
7820
7821 // Create new preheader for vector loop.
7822 LoopVectorPreHeader = SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(),
7823 DT, LI, nullptr, "vector.ph");
7824
7825 if (ForEpilogue) {
7826 assert(DT->properlyDominates(DT->getNode(TCCheckBlock),
7827 DT->getNode(Bypass)->getIDom()) &&
7828 "TC check is expected to dominate Bypass");
7829
7830 // Update dominator for Bypass & LoopExit.
7831 DT->changeImmediateDominator(Bypass, TCCheckBlock);
7832 if (!Cost->requiresScalarEpilogue(EPI.EpilogueVF))
7833 // For loops with multiple exits, there's no edge from the middle block
7834 // to exit blocks (as the epilogue must run) and thus no need to update
7835 // the immediate dominator of the exit blocks.
7836 DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock);
7837
7838 LoopBypassBlocks.push_back(TCCheckBlock);
7839
7840 // Save the trip count so we don't have to regenerate it in the
7841 // vec.epilog.iter.check. This is safe to do because the trip count
7842 // generated here dominates the vector epilog iter check.
7843 EPI.TripCount = Count;
7844 }
7845
7846 ReplaceInstWithInst(
7847 TCCheckBlock->getTerminator(),
7848 BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters));
7849
7850 return TCCheckBlock;
7851 }
7852
7853 //===--------------------------------------------------------------------===//
7854 // EpilogueVectorizerEpilogueLoop
7855 //===--------------------------------------------------------------------===//
7856
7857 /// This function is partially responsible for generating the control flow
7858 /// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization.
7859 std::pair<BasicBlock *, Value *>
7860 EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton() {
7861 createVectorLoopSkeleton("vec.epilog.");
7862
7863 // Now, compare the remaining count and if there aren't enough iterations to
7864 // execute the vectorized epilogue skip to the scalar part.
7865 BasicBlock *VecEpilogueIterationCountCheck = LoopVectorPreHeader;
7866 VecEpilogueIterationCountCheck->setName("vec.epilog.iter.check");
7867 LoopVectorPreHeader =
7868 SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
7869 LI, nullptr, "vec.epilog.ph");
7870 emitMinimumVectorEpilogueIterCountCheck(LoopScalarPreHeader,
7871 VecEpilogueIterationCountCheck);
7872
7873 // Adjust the control flow taking the state info from the main loop
7874 // vectorization into account.
7875 assert(EPI.MainLoopIterationCountCheck && EPI.EpilogueIterationCountCheck &&
7876 "expected this to be saved from the previous pass.");
7877 EPI.MainLoopIterationCountCheck->getTerminator()->replaceUsesOfWith(
7878 VecEpilogueIterationCountCheck, LoopVectorPreHeader);
7879
7880 DT->changeImmediateDominator(LoopVectorPreHeader,
7881 EPI.MainLoopIterationCountCheck);
7882
7883 EPI.EpilogueIterationCountCheck->getTerminator()->replaceUsesOfWith(
7884 VecEpilogueIterationCountCheck, LoopScalarPreHeader);
7885
7886 if (EPI.SCEVSafetyCheck)
7887 EPI.SCEVSafetyCheck->getTerminator()->replaceUsesOfWith(
7888 VecEpilogueIterationCountCheck, LoopScalarPreHeader);
7889 if (EPI.MemSafetyCheck)
7890 EPI.MemSafetyCheck->getTerminator()->replaceUsesOfWith(
7891 VecEpilogueIterationCountCheck, LoopScalarPreHeader);
7892
7893 DT->changeImmediateDominator(
7894 VecEpilogueIterationCountCheck,
7895 VecEpilogueIterationCountCheck->getSinglePredecessor());
7896
7897 DT->changeImmediateDominator(LoopScalarPreHeader,
7898 EPI.EpilogueIterationCountCheck);
7899 if (!Cost->requiresScalarEpilogue(EPI.EpilogueVF))
7900 // If there is an epilogue which must run, there's no edge from the
7901 // middle block to exit blocks and thus no need to update the immediate
7902 // dominator of the exit blocks.
7903 DT->changeImmediateDominator(LoopExitBlock,
7904 EPI.EpilogueIterationCountCheck);
7905
7906 // Keep track of bypass blocks, as they feed start values to the induction and
7907 // reduction phis in the scalar loop preheader.
7908 if (EPI.SCEVSafetyCheck)
7909 LoopBypassBlocks.push_back(EPI.SCEVSafetyCheck);
7910 if (EPI.MemSafetyCheck)
7911 LoopBypassBlocks.push_back(EPI.MemSafetyCheck);
7912 LoopBypassBlocks.push_back(EPI.EpilogueIterationCountCheck);
7913
7914 // The vec.epilog.iter.check block may contain Phi nodes from inductions or
7915 // reductions which merge control-flow from the latch block and the middle
7916 // block. Update the incoming values here and move the Phi into the preheader.
7917 SmallVector<PHINode *, 4> PhisInBlock;
7918 for (PHINode &Phi : VecEpilogueIterationCountCheck->phis())
7919 PhisInBlock.push_back(&Phi);
7920
7921 for (PHINode *Phi : PhisInBlock) {
7922 Phi->moveBefore(LoopVectorPreHeader->getFirstNonPHI());
7923 Phi->replaceIncomingBlockWith(
7924 VecEpilogueIterationCountCheck->getSinglePredecessor(),
7925 VecEpilogueIterationCountCheck);
7926
7927 // If the phi doesn't have an incoming value from the
7928 // EpilogueIterationCountCheck, we are done. Otherwise remove the incoming
7929 // value and also those from other check blocks. This is needed for
7930 // reduction phis only.
7931 if (none_of(Phi->blocks(), [&](BasicBlock *IncB) {
7932 return EPI.EpilogueIterationCountCheck == IncB;
7933 }))
7934 continue;
7935 Phi->removeIncomingValue(EPI.EpilogueIterationCountCheck);
7936 if (EPI.SCEVSafetyCheck)
7937 Phi->removeIncomingValue(EPI.SCEVSafetyCheck);
7938 if (EPI.MemSafetyCheck)
7939 Phi->removeIncomingValue(EPI.MemSafetyCheck);
7940 }
7941
7942 // Generate a resume induction for the vector epilogue and put it in the
7943 // vector epilogue preheader
7944 Type *IdxTy = Legal->getWidestInductionType();
7945 PHINode *EPResumeVal = PHINode::Create(IdxTy, 2, "vec.epilog.resume.val",
7946 LoopVectorPreHeader->getFirstNonPHI());
7947 EPResumeVal->addIncoming(EPI.VectorTripCount, VecEpilogueIterationCountCheck);
7948 EPResumeVal->addIncoming(ConstantInt::get(IdxTy, 0),
7949 EPI.MainLoopIterationCountCheck);
7950
7951 // Generate induction resume values. These variables save the new starting
7952 // indexes for the scalar loop. They are used to test if there are any tail
7953 // iterations left once the vector loop has completed.
7954 // Note that when the vectorized epilogue is skipped due to iteration count
7955 // check, then the resume value for the induction variable comes from
7956 // the trip count of the main vector loop, hence passing the AdditionalBypass
7957 // argument.
7958 createInductionResumeValues({VecEpilogueIterationCountCheck,
7959 EPI.VectorTripCount} /* AdditionalBypass */);
7960
7961 return {completeLoopSkeleton(), EPResumeVal};
7962 }
7963
7964 BasicBlock *
7965 EpilogueVectorizerEpilogueLoop::emitMinimumVectorEpilogueIterCountCheck(
7966 BasicBlock *Bypass, BasicBlock *Insert) {
7967
7968 assert(EPI.TripCount &&
7969 "Expected trip count to have been safed in the first pass.");
7970 assert(
7971 (!isa<Instruction>(EPI.TripCount) ||
7972 DT->dominates(cast<Instruction>(EPI.TripCount)->getParent(), Insert)) &&
7973 "saved trip count does not dominate insertion point.");
7974 Value *TC = EPI.TripCount;
7975 IRBuilder<> Builder(Insert->getTerminator());
7976 Value *Count = Builder.CreateSub(TC, EPI.VectorTripCount, "n.vec.remaining");
7977
7978 // Generate code to check if the loop's trip count is less than VF * UF of the
7979 // vector epilogue loop.
7980 auto P = Cost->requiresScalarEpilogue(EPI.EpilogueVF) ?
7981 ICmpInst::ICMP_ULE : ICmpInst::ICMP_ULT;
7982
7983 Value *CheckMinIters =
7984 Builder.CreateICmp(P, Count,
7985 createStepForVF(Builder, Count->getType(),
7986 EPI.EpilogueVF, EPI.EpilogueUF),
7987 "min.epilog.iters.check");
7988
7989 ReplaceInstWithInst(
7990 Insert->getTerminator(),
7991 BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters));
7992
7993 LoopBypassBlocks.push_back(Insert);
7994 return Insert;
7995 }
7996
7997 void EpilogueVectorizerEpilogueLoop::printDebugTracesAtStart() {
7998 LLVM_DEBUG({
7999 dbgs() << "Create Skeleton for epilogue vectorized loop (second pass)\n"
8000 << "Epilogue Loop VF:" << EPI.EpilogueVF
8001 << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n";
8002 });
8003 }
8004
8005 void EpilogueVectorizerEpilogueLoop::printDebugTracesAtEnd() {
8006 DEBUG_WITH_TYPE(VerboseDebug, {
8007 dbgs() << "final fn:\n" << *OrigLoop->getHeader()->getParent() << "\n";
8008 });
8009 }
8010
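// For example, given Range = {4, 32} (i.e. testing VFs 4, 8 and 16) and a
// predicate that holds for VFs 4 and 8 but flips at VF 16, Range.End is
// clamped to 16 and the function returns the predicate's value at VF 4.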
8011 bool LoopVectorizationPlanner::getDecisionAndClampRange(
8012 const std::function<bool(ElementCount)> &Predicate, VFRange &Range) {
8013 assert(!Range.isEmpty() && "Trying to test an empty VF range.");
8014 bool PredicateAtRangeStart = Predicate(Range.Start);
8015
8016 for (ElementCount TmpVF = Range.Start * 2;
8017 ElementCount::isKnownLT(TmpVF, Range.End); TmpVF *= 2)
8018 if (Predicate(TmpVF) != PredicateAtRangeStart) {
8019 Range.End = TmpVF;
8020 break;
8021 }
8022
8023 return PredicateAtRangeStart;
8024 }
8025
8026 /// Build VPlans for the full range of feasible VF's = {\p MinVF, 2 * \p MinVF,
8027 /// 4 * \p MinVF, ..., \p MaxVF} by repeatedly building a VPlan for a sub-range
8028 /// of VF's starting at a given VF and extending it as much as possible. Each
8029 /// vectorization decision can potentially shorten this sub-range during
8030 /// buildVPlan().
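/// For example, with MinVF = 2 and MaxVF = 16 this covers VFs {2, 4, 8, 16};
/// if no decision splits the range, a single VPlan for all four VFs is built,
/// otherwise separate VPlans are built for each resulting sub-range.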
8031 void LoopVectorizationPlanner::buildVPlans(ElementCount MinVF,
8032 ElementCount MaxVF) {
8033 auto MaxVFPlusOne = MaxVF.getWithIncrement(1);
8034 for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFPlusOne);) {
8035 VFRange SubRange = {VF, MaxVFPlusOne};
8036 VPlans.push_back(buildVPlan(SubRange));
8037 VF = SubRange.End;
8038 }
8039 }
8040
8041 VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst,
8042 VPlanPtr &Plan) {
8043 assert(is_contained(predecessors(Dst), Src) && "Invalid edge");
8044
8045 // Look for cached value.
8046 std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst);
8047 EdgeMaskCacheTy::iterator ECEntryIt = EdgeMaskCache.find(Edge);
8048 if (ECEntryIt != EdgeMaskCache.end())
8049 return ECEntryIt->second;
8050
8051 VPValue *SrcMask = createBlockInMask(Src, Plan);
8052
8053 // The terminator has to be a branch inst!
8054 BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator());
8055 assert(BI && "Unexpected terminator found");
8056
8057 if (!BI->isConditional() || BI->getSuccessor(0) == BI->getSuccessor(1))
8058 return EdgeMaskCache[Edge] = SrcMask;
8059
8060 // If source is an exiting block, we know the exit edge is dynamically dead
8061 // in the vector loop, and thus we don't need to restrict the mask. Avoid
8062 // adding uses of an otherwise potentially dead instruction.
8063 if (OrigLoop->isLoopExiting(Src))
8064 return EdgeMaskCache[Edge] = SrcMask;
8065
8066 VPValue *EdgeMask = Plan->getOrAddVPValue(BI->getCondition());
8067 assert(EdgeMask && "No Edge Mask found for condition");
8068
8069 if (BI->getSuccessor(0) != Dst)
8070 EdgeMask = Builder.createNot(EdgeMask, BI->getDebugLoc());
8071
8072 if (SrcMask) { // Otherwise block in-mask is all-one, no need to AND.
8073 // The condition is 'SrcMask && EdgeMask', which is equivalent to
8074 // 'select i1 SrcMask, i1 EdgeMask, i1 false'.
8075 // The select version does not introduce new UB if SrcMask is false and
8076     // EdgeMask is poison; a plain 'and' would propagate that poison instead.
8077 VPValue *False = Plan->getOrAddVPValue(
8078 ConstantInt::getFalse(BI->getCondition()->getType()));
8079 EdgeMask =
8080 Builder.createSelect(SrcMask, EdgeMask, False, BI->getDebugLoc());
8081 }
8082
8083 return EdgeMaskCache[Edge] = EdgeMask;
8084 }
8085
8086 VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) {
8087 assert(OrigLoop->contains(BB) && "Block is not a part of a loop");
8088
8089 // Look for cached value.
8090 BlockMaskCacheTy::iterator BCEntryIt = BlockMaskCache.find(BB);
8091 if (BCEntryIt != BlockMaskCache.end())
8092 return BCEntryIt->second;
8093
8094 // All-one mask is modelled as no-mask following the convention for masked
8095 // load/store/gather/scatter. Initialize BlockMask to no-mask.
8096 VPValue *BlockMask = nullptr;
8097
8098 if (OrigLoop->getHeader() == BB) {
8099 if (!CM.blockNeedsPredicationForAnyReason(BB))
8100 return BlockMaskCache[BB] = BlockMask; // Loop incoming mask is all-one.
8101
8102 assert(CM.foldTailByMasking() && "must fold the tail");
8103
8104 // If we're using the active lane mask for control flow, then we get the
8105 // mask from the active lane mask PHI that is cached in the VPlan.
8106 PredicationStyle EmitGetActiveLaneMask = CM.TTI.emitGetActiveLaneMask();
8107 if (EmitGetActiveLaneMask == PredicationStyle::DataAndControlFlow)
8108 return BlockMaskCache[BB] = Plan->getActiveLaneMaskPhi();
8109
8110 // Introduce the early-exit compare IV <= BTC to form header block mask.
8111 // This is used instead of IV < TC because TC may wrap, unlike BTC. Start by
8112 // constructing the desired canonical IV in the header block as its first
8113 // non-phi instructions.
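    // For illustration: with a trip count of 10 (BTC = 9) and VF = 4, the
    // vector iteration covering lanes {8, 9, 10, 11} yields the mask
    // <1, 1, 0, 0> from the ULE comparison against the splat of BTC.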
8114
8115 VPBasicBlock *HeaderVPBB =
8116 Plan->getVectorLoopRegion()->getEntryBasicBlock();
8117 auto NewInsertionPoint = HeaderVPBB->getFirstNonPhi();
8118 auto *IV = new VPWidenCanonicalIVRecipe(Plan->getCanonicalIV());
8119 HeaderVPBB->insert(IV, HeaderVPBB->getFirstNonPhi());
8120
8121 VPBuilder::InsertPointGuard Guard(Builder);
8122 Builder.setInsertPoint(HeaderVPBB, NewInsertionPoint);
8123 if (EmitGetActiveLaneMask != PredicationStyle::None) {
8124 VPValue *TC = Plan->getOrCreateTripCount();
8125 BlockMask = Builder.createNaryOp(VPInstruction::ActiveLaneMask, {IV, TC},
8126 nullptr, "active.lane.mask");
8127 } else {
8128 VPValue *BTC = Plan->getOrCreateBackedgeTakenCount();
8129 BlockMask = Builder.createNaryOp(VPInstruction::ICmpULE, {IV, BTC});
8130 }
8131 return BlockMaskCache[BB] = BlockMask;
8132 }
8133
8134 // This is the block mask. We OR all incoming edges.
8135 for (auto *Predecessor : predecessors(BB)) {
8136 VPValue *EdgeMask = createEdgeMask(Predecessor, BB, Plan);
8137 if (!EdgeMask) // Mask of predecessor is all-one so mask of block is too.
8138 return BlockMaskCache[BB] = EdgeMask;
8139
8140 if (!BlockMask) { // BlockMask has its initialized nullptr value.
8141 BlockMask = EdgeMask;
8142 continue;
8143 }
8144
8145 BlockMask = Builder.createOr(BlockMask, EdgeMask, {});
8146 }
8147
8148 return BlockMaskCache[BB] = BlockMask;
8149 }
8150
8151 VPRecipeBase *VPRecipeBuilder::tryToWidenMemory(Instruction *I,
8152 ArrayRef<VPValue *> Operands,
8153 VFRange &Range,
8154 VPlanPtr &Plan) {
8155 assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
8156 "Must be called with either a load or store");
8157
8158 auto willWiden = [&](ElementCount VF) -> bool {
8159 LoopVectorizationCostModel::InstWidening Decision =
8160 CM.getWideningDecision(I, VF);
8161 assert(Decision != LoopVectorizationCostModel::CM_Unknown &&
8162 "CM decision should be taken at this point.");
8163 if (Decision == LoopVectorizationCostModel::CM_Interleave)
8164 return true;
8165 if (CM.isScalarAfterVectorization(I, VF) ||
8166 CM.isProfitableToScalarize(I, VF))
8167 return false;
8168 return Decision != LoopVectorizationCostModel::CM_Scalarize;
8169 };
8170
8171 if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range))
8172 return nullptr;
8173
8174 VPValue *Mask = nullptr;
8175 if (Legal->isMaskRequired(I))
8176 Mask = createBlockInMask(I->getParent(), Plan);
8177
8178 // Determine if the pointer operand of the access is either consecutive or
8179 // reverse consecutive.
8180 LoopVectorizationCostModel::InstWidening Decision =
8181 CM.getWideningDecision(I, Range.Start);
8182 bool Reverse = Decision == LoopVectorizationCostModel::CM_Widen_Reverse;
8183 bool Consecutive =
8184 Reverse || Decision == LoopVectorizationCostModel::CM_Widen;
8185
8186 if (LoadInst *Load = dyn_cast<LoadInst>(I))
8187 return new VPWidenMemoryInstructionRecipe(*Load, Operands[0], Mask,
8188 Consecutive, Reverse);
8189
8190 StoreInst *Store = cast<StoreInst>(I);
8191 return new VPWidenMemoryInstructionRecipe(*Store, Operands[1], Operands[0],
8192 Mask, Consecutive, Reverse);
8193 }
8194
8195 /// Creates a VPWidenIntOrFpInductionRecipe for \p Phi. If needed, it will also
8196 /// insert a recipe to expand the step for the induction recipe.
8197 static VPWidenIntOrFpInductionRecipe *createWidenInductionRecipes(
8198 PHINode *Phi, Instruction *PhiOrTrunc, VPValue *Start,
8199 const InductionDescriptor &IndDesc, LoopVectorizationCostModel &CM,
8200 VPlan &Plan, ScalarEvolution &SE, Loop &OrigLoop, VFRange &Range) {
8201 // Returns true if an instruction \p I should be scalarized instead of
8202 // vectorized for the chosen vectorization factor.
8203 auto ShouldScalarizeInstruction = [&CM](Instruction *I, ElementCount VF) {
8204 return CM.isScalarAfterVectorization(I, VF) ||
8205 CM.isProfitableToScalarize(I, VF);
8206 };
8207
8208 bool NeedsScalarIVOnly = LoopVectorizationPlanner::getDecisionAndClampRange(
8209 [&](ElementCount VF) {
8210 return ShouldScalarizeInstruction(PhiOrTrunc, VF);
8211 },
8212 Range);
8213 assert(IndDesc.getStartValue() ==
8214 Phi->getIncomingValueForBlock(OrigLoop.getLoopPreheader()));
8215 assert(SE.isLoopInvariant(IndDesc.getStep(), &OrigLoop) &&
8216 "step must be loop invariant");
8217
8218 VPValue *Step =
8219 vputils::getOrCreateVPValueForSCEVExpr(Plan, IndDesc.getStep(), SE);
8220 if (auto *TruncI = dyn_cast<TruncInst>(PhiOrTrunc)) {
8221 return new VPWidenIntOrFpInductionRecipe(Phi, Start, Step, IndDesc, TruncI,
8222 !NeedsScalarIVOnly);
8223 }
8224 assert(isa<PHINode>(PhiOrTrunc) && "must be a phi node here");
8225 return new VPWidenIntOrFpInductionRecipe(Phi, Start, Step, IndDesc,
8226 !NeedsScalarIVOnly);
8227 }
8228
8229 VPRecipeBase *VPRecipeBuilder::tryToOptimizeInductionPHI(
8230 PHINode *Phi, ArrayRef<VPValue *> Operands, VPlan &Plan, VFRange &Range) {
8231
8232 // Check if this is an integer or fp induction. If so, build the recipe that
8233 // produces its scalar and vector values.
8234 if (auto *II = Legal->getIntOrFpInductionDescriptor(Phi))
8235 return createWidenInductionRecipes(Phi, Phi, Operands[0], *II, CM, Plan,
8236 *PSE.getSE(), *OrigLoop, Range);
8237
8238 // Check if this is pointer induction. If so, build the recipe for it.
8239 if (auto *II = Legal->getPointerInductionDescriptor(Phi)) {
8240 VPValue *Step = vputils::getOrCreateVPValueForSCEVExpr(Plan, II->getStep(),
8241 *PSE.getSE());
8242 assert(isa<SCEVConstant>(II->getStep()));
8243 return new VPWidenPointerInductionRecipe(
8244 Phi, Operands[0], Step, *II,
8245 LoopVectorizationPlanner::getDecisionAndClampRange(
8246 [&](ElementCount VF) {
8247 return CM.isScalarAfterVectorization(Phi, VF);
8248 },
8249 Range));
8250 }
8251 return nullptr;
8252 }
8253
8254 VPWidenIntOrFpInductionRecipe *VPRecipeBuilder::tryToOptimizeInductionTruncate(
8255 TruncInst *I, ArrayRef<VPValue *> Operands, VFRange &Range, VPlan &Plan) {
8256 // Optimize the special case where the source is a constant integer
8257 // induction variable. Notice that we can only optimize the 'trunc' case
8258 // because (a) FP conversions lose precision, (b) sext/zext may wrap, and
8259 // (c) other casts depend on pointer size.
8260
8261 // Determine whether \p K is a truncation based on an induction variable that
8262 // can be optimized.
8263 auto isOptimizableIVTruncate =
8264 [&](Instruction *K) -> std::function<bool(ElementCount)> {
8265 return [=](ElementCount VF) -> bool {
8266 return CM.isOptimizableIVTruncate(K, VF);
8267 };
8268 };
8269
8270 if (LoopVectorizationPlanner::getDecisionAndClampRange(
8271 isOptimizableIVTruncate(I), Range)) {
8272
8273 auto *Phi = cast<PHINode>(I->getOperand(0));
8274 const InductionDescriptor &II = *Legal->getIntOrFpInductionDescriptor(Phi);
8275 VPValue *Start = Plan.getOrAddVPValue(II.getStartValue());
8276 return createWidenInductionRecipes(Phi, I, Start, II, CM, Plan,
8277 *PSE.getSE(), *OrigLoop, Range);
8278 }
8279 return nullptr;
8280 }
8281
8282 VPRecipeOrVPValueTy VPRecipeBuilder::tryToBlend(PHINode *Phi,
8283 ArrayRef<VPValue *> Operands,
8284 VPlanPtr &Plan) {
8285 // If all incoming values are equal, the incoming VPValue can be used directly
8286 // instead of creating a new VPBlendRecipe.
8287 if (llvm::all_equal(Operands))
8288 return Operands[0];
8289
8290 unsigned NumIncoming = Phi->getNumIncomingValues();
8291 // For in-loop reductions, we do not need to create an additional select.
8292 VPValue *InLoopVal = nullptr;
8293 for (unsigned In = 0; In < NumIncoming; In++) {
8294 PHINode *PhiOp =
8295 dyn_cast_or_null<PHINode>(Operands[In]->getUnderlyingValue());
8296 if (PhiOp && CM.isInLoopReduction(PhiOp)) {
8297 assert(!InLoopVal && "Found more than one in-loop reduction!");
8298 InLoopVal = Operands[In];
8299 }
8300 }
8301
8302 assert((!InLoopVal || NumIncoming == 2) &&
8303 "Found an in-loop reduction for PHI with unexpected number of "
8304 "incoming values");
8305 if (InLoopVal)
8306 return Operands[Operands[0] == InLoopVal ? 1 : 0];
8307
8308 // We know that all PHIs in non-header blocks are converted into selects, so
8309 // we don't have to worry about the insertion order and we can just use the
8310 // builder. At this point we generate the predication tree. There may be
8311 // duplications since this is a simple recursive scan, but future
8312 // optimizations will clean it up.
8313 SmallVector<VPValue *, 2> OperandsWithMask;
8314
8315 for (unsigned In = 0; In < NumIncoming; In++) {
8316 VPValue *EdgeMask =
8317 createEdgeMask(Phi->getIncomingBlock(In), Phi->getParent(), Plan);
8318 assert((EdgeMask || NumIncoming == 1) &&
8319 "Multiple predecessors with one having a full mask");
8320 OperandsWithMask.push_back(Operands[In]);
8321 if (EdgeMask)
8322 OperandsWithMask.push_back(EdgeMask);
8323 }
8324 return toVPRecipeResult(new VPBlendRecipe(Phi, OperandsWithMask));
8325 }
8326
8327 VPWidenCallRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI,
8328 ArrayRef<VPValue *> Operands,
8329 VFRange &Range) const {
8330
8331 bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
8332 [this, CI](ElementCount VF) {
8333 return CM.isScalarWithPredication(CI, VF);
8334 },
8335 Range);
8336
8337 if (IsPredicated)
8338 return nullptr;
8339
8340 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
8341 if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end ||
8342 ID == Intrinsic::lifetime_start || ID == Intrinsic::sideeffect ||
8343 ID == Intrinsic::pseudoprobe ||
8344 ID == Intrinsic::experimental_noalias_scope_decl))
8345 return nullptr;
8346
8347 ArrayRef<VPValue *> Ops = Operands.take_front(CI->arg_size());
8348
8349 // Is it beneficial to perform intrinsic call compared to lib call?
8350 bool ShouldUseVectorIntrinsic =
8351 ID && LoopVectorizationPlanner::getDecisionAndClampRange(
8352 [&](ElementCount VF) -> bool {
8353 bool NeedToScalarize = false;
8354 // Is it beneficial to perform intrinsic call compared to lib
8355 // call?
8356 InstructionCost CallCost =
8357 CM.getVectorCallCost(CI, VF, NeedToScalarize);
8358 InstructionCost IntrinsicCost =
8359 CM.getVectorIntrinsicCost(CI, VF);
8360 return IntrinsicCost <= CallCost;
8361 },
8362 Range);
8363 if (ShouldUseVectorIntrinsic)
8364 return new VPWidenCallRecipe(*CI, make_range(Ops.begin(), Ops.end()), ID);
8365
8366   // Is it better to call a vectorized version of the function than to
8367   // scalarize the call?
8368 auto ShouldUseVectorCall = LoopVectorizationPlanner::getDecisionAndClampRange(
8369 [&](ElementCount VF) -> bool {
8370 // The following case may be scalarized depending on the VF.
8371 // The flag shows whether we can use a usual Call for vectorized
8372 // version of the instruction.
8373 bool NeedToScalarize = false;
8374 CM.getVectorCallCost(CI, VF, NeedToScalarize);
8375 return !NeedToScalarize;
8376 },
8377 Range);
8378 if (ShouldUseVectorCall)
8379 return new VPWidenCallRecipe(*CI, make_range(Ops.begin(), Ops.end()),
8380 Intrinsic::not_intrinsic);
8381
8382 return nullptr;
8383 }
8384
8385 bool VPRecipeBuilder::shouldWiden(Instruction *I, VFRange &Range) const {
8386 assert(!isa<BranchInst>(I) && !isa<PHINode>(I) && !isa<LoadInst>(I) &&
8387 !isa<StoreInst>(I) && "Instruction should have been handled earlier");
8388 // Instruction should be widened, unless it is scalar after vectorization,
8389 // scalarization is profitable or it is predicated.
8390 auto WillScalarize = [this, I](ElementCount VF) -> bool {
8391 return CM.isScalarAfterVectorization(I, VF) ||
8392 CM.isProfitableToScalarize(I, VF) ||
8393 CM.isScalarWithPredication(I, VF);
8394 };
8395 return !LoopVectorizationPlanner::getDecisionAndClampRange(WillScalarize,
8396 Range);
8397 }
8398
8399 VPRecipeBase *VPRecipeBuilder::tryToWiden(Instruction *I,
8400 ArrayRef<VPValue *> Operands,
8401 VPBasicBlock *VPBB, VPlanPtr &Plan) {
8402 switch (I->getOpcode()) {
8403 default:
8404 return nullptr;
8405 case Instruction::SDiv:
8406 case Instruction::UDiv:
8407 case Instruction::SRem:
8408 case Instruction::URem: {
8409 // If not provably safe, use a select to form a safe divisor before widening the
8410 // div/rem operation itself. Otherwise fall through to general handling below.
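    // E.g. a masked 'udiv %a, %b' becomes 'udiv %a, (select %mask, %b, 1)',
    // so lanes with a false mask divide by 1 rather than by a potentially
    // trapping divisor.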
8411 if (CM.isPredicatedInst(I)) {
8412 SmallVector<VPValue *> Ops(Operands.begin(), Operands.end());
8413 VPValue *Mask = createBlockInMask(I->getParent(), Plan);
8414 VPValue *One =
8415 Plan->getOrAddExternalDef(ConstantInt::get(I->getType(), 1u, false));
8416 auto *SafeRHS =
8417 new VPInstruction(Instruction::Select, {Mask, Ops[1], One},
8418 I->getDebugLoc());
8419 VPBB->appendRecipe(SafeRHS);
8420 Ops[1] = SafeRHS;
8421 return new VPWidenRecipe(*I, make_range(Ops.begin(), Ops.end()));
8422 }
8423 LLVM_FALLTHROUGH;
8424 }
8425 case Instruction::Add:
8426 case Instruction::And:
8427 case Instruction::AShr:
8428 case Instruction::BitCast:
8429 case Instruction::FAdd:
8430 case Instruction::FCmp:
8431 case Instruction::FDiv:
8432 case Instruction::FMul:
8433 case Instruction::FNeg:
8434 case Instruction::FPExt:
8435 case Instruction::FPToSI:
8436 case Instruction::FPToUI:
8437 case Instruction::FPTrunc:
8438 case Instruction::FRem:
8439 case Instruction::FSub:
8440 case Instruction::ICmp:
8441 case Instruction::IntToPtr:
8442 case Instruction::LShr:
8443 case Instruction::Mul:
8444 case Instruction::Or:
8445 case Instruction::PtrToInt:
8446 case Instruction::Select:
8447 case Instruction::SExt:
8448 case Instruction::Shl:
8449 case Instruction::SIToFP:
8450 case Instruction::Sub:
8451 case Instruction::Trunc:
8452 case Instruction::UIToFP:
8453 case Instruction::Xor:
8454 case Instruction::ZExt:
8455 case Instruction::Freeze:
8456 return new VPWidenRecipe(*I, make_range(Operands.begin(), Operands.end()));
8457 };
8458 }
8459
8460 void VPRecipeBuilder::fixHeaderPhis() {
8461 BasicBlock *OrigLatch = OrigLoop->getLoopLatch();
8462 for (VPHeaderPHIRecipe *R : PhisToFix) {
8463 auto *PN = cast<PHINode>(R->getUnderlyingValue());
8464 VPRecipeBase *IncR =
8465 getRecipe(cast<Instruction>(PN->getIncomingValueForBlock(OrigLatch)));
8466 R->addOperand(IncR->getVPSingleValue());
8467 }
8468 }
8469
8470 VPBasicBlock *VPRecipeBuilder::handleReplication(
8471 Instruction *I, VFRange &Range, VPBasicBlock *VPBB,
8472 VPlanPtr &Plan) {
8473 bool IsUniform = LoopVectorizationPlanner::getDecisionAndClampRange(
8474 [&](ElementCount VF) { return CM.isUniformAfterVectorization(I, VF); },
8475 Range);
8476
8477 bool IsPredicated = CM.isPredicatedInst(I);
8478
8479 // Even if the instruction is not marked as uniform, there are certain
8480 // intrinsic calls that can be effectively treated as such, so we check for
8481 // them here. Conservatively, we only do this for scalable vectors, since
8482 // for fixed-width VFs we can always fall back on full scalarization.
8483 if (!IsUniform && Range.Start.isScalable() && isa<IntrinsicInst>(I)) {
8484 switch (cast<IntrinsicInst>(I)->getIntrinsicID()) {
8485 case Intrinsic::assume:
8486 case Intrinsic::lifetime_start:
8487 case Intrinsic::lifetime_end:
8488 // For scalable vectors if one of the operands is variant then we still
8489 // want to mark as uniform, which will generate one instruction for just
8490 // the first lane of the vector. We can't scalarize the call in the same
8491 // way as for fixed-width vectors because we don't know how many lanes
8492 // there are.
8493 //
8494 // The reasons for doing it this way for scalable vectors are:
8495 // 1. For the assume intrinsic generating the instruction for the first
8496     //    lane is still better than not generating any at all. For
8497 // example, the input may be a splat across all lanes.
8498 // 2. For the lifetime start/end intrinsics the pointer operand only
8499 // does anything useful when the input comes from a stack object,
8500 // which suggests it should always be uniform. For non-stack objects
8501 // the effect is to poison the object, which still allows us to
8502 // remove the call.
8503 IsUniform = true;
8504 break;
8505 default:
8506 break;
8507 }
8508 }
8509
8510 auto *Recipe = new VPReplicateRecipe(I, Plan->mapToVPValues(I->operands()),
8511 IsUniform, IsPredicated);
8512
8513 // Find if I uses a predicated instruction. If so, it will use its scalar
8514 // value. Avoid hoisting the insert-element which packs the scalar value into
8515 // a vector value, as that happens iff all users use the vector value.
8516 for (VPValue *Op : Recipe->operands()) {
8517 auto *PredR =
8518 dyn_cast_or_null<VPPredInstPHIRecipe>(Op->getDefiningRecipe());
8519 if (!PredR)
8520 continue;
8521 auto *RepR = cast<VPReplicateRecipe>(
8522 PredR->getOperand(0)->getDefiningRecipe());
8523 assert(RepR->isPredicated() &&
8524 "expected Replicate recipe to be predicated");
8525 RepR->setAlsoPack(false);
8526 }
8527
8528 // Finalize the recipe for Instr, first if it is not predicated.
8529 if (!IsPredicated) {
8530 LLVM_DEBUG(dbgs() << "LV: Scalarizing:" << *I << "\n");
8531 setRecipe(I, Recipe);
8532 Plan->addVPValue(I, Recipe);
8533 VPBB->appendRecipe(Recipe);
8534 return VPBB;
8535 }
8536 LLVM_DEBUG(dbgs() << "LV: Scalarizing and predicating:" << *I << "\n");
8537
8538 VPBlockBase *SingleSucc = VPBB->getSingleSuccessor();
8539 assert(SingleSucc && "VPBB must have a single successor when handling "
8540 "predicated replication.");
8541 VPBlockUtils::disconnectBlocks(VPBB, SingleSucc);
8542 // Record predicated instructions for above packing optimizations.
8543 VPBlockBase *Region = createReplicateRegion(Recipe, Plan);
8544 VPBlockUtils::insertBlockAfter(Region, VPBB);
8545 auto *RegSucc = new VPBasicBlock();
8546 VPBlockUtils::insertBlockAfter(RegSucc, Region);
8547 VPBlockUtils::connectBlocks(RegSucc, SingleSucc);
8548 return RegSucc;
8549 }
8550
8551 VPRegionBlock *
8552 VPRecipeBuilder::createReplicateRegion(VPReplicateRecipe *PredRecipe,
8553 VPlanPtr &Plan) {
8554 Instruction *Instr = PredRecipe->getUnderlyingInstr();
8555 // Instructions marked for predication are replicated and placed under an
8556 // if-then construct to prevent side-effects.
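  // The region built below has, roughly, the shape:
  //   pred.<opcode>.entry:    BRANCH-ON-MASK (block-in mask)
  //   pred.<opcode>.if:       the replicated, predicated recipe
  //   pred.<opcode>.continue: optional PRED-INST-PHI merging the result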
8557 // Generate recipes to compute the block mask for this region.
8558 VPValue *BlockInMask = createBlockInMask(Instr->getParent(), Plan);
8559
8560 // Build the triangular if-then region.
8561 std::string RegionName = (Twine("pred.") + Instr->getOpcodeName()).str();
8562 assert(Instr->getParent() && "Predicated instruction not in any basic block");
8563 auto *BOMRecipe = new VPBranchOnMaskRecipe(BlockInMask);
8564 auto *Entry = new VPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe);
8565 auto *PHIRecipe = Instr->getType()->isVoidTy()
8566 ? nullptr
8567 : new VPPredInstPHIRecipe(PredRecipe);
8568 if (PHIRecipe) {
8569 setRecipe(Instr, PHIRecipe);
8570 Plan->addVPValue(Instr, PHIRecipe);
8571 } else {
8572 setRecipe(Instr, PredRecipe);
8573 Plan->addVPValue(Instr, PredRecipe);
8574 }
8575
8576 auto *Exiting = new VPBasicBlock(Twine(RegionName) + ".continue", PHIRecipe);
8577 auto *Pred = new VPBasicBlock(Twine(RegionName) + ".if", PredRecipe);
8578 VPRegionBlock *Region = new VPRegionBlock(Entry, Exiting, RegionName, true);
8579
8580 // Note: first set Entry as region entry and then connect successors starting
8581 // from it in order, to propagate the "parent" of each VPBasicBlock.
8582 VPBlockUtils::insertTwoBlocksAfter(Pred, Exiting, Entry);
8583 VPBlockUtils::connectBlocks(Pred, Exiting);
8584
8585 return Region;
8586 }
8587
8588 VPRecipeOrVPValueTy
8589 VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr,
8590 ArrayRef<VPValue *> Operands,
8591 VFRange &Range, VPBasicBlock *VPBB,
8592 VPlanPtr &Plan) {
8593 // First, check for specific widening recipes that deal with inductions, Phi
8594 // nodes, calls and memory operations.
8595 VPRecipeBase *Recipe;
8596 if (auto Phi = dyn_cast<PHINode>(Instr)) {
8597 if (Phi->getParent() != OrigLoop->getHeader())
8598 return tryToBlend(Phi, Operands, Plan);
8599
8600 // Always record recipes for header phis. Later first-order recurrence phis
8601 // can have earlier phis as incoming values.
8602 recordRecipeOf(Phi);
8603
8604 if ((Recipe = tryToOptimizeInductionPHI(Phi, Operands, *Plan, Range)))
8605 return toVPRecipeResult(Recipe);
8606
8607 VPHeaderPHIRecipe *PhiRecipe = nullptr;
8608 assert((Legal->isReductionVariable(Phi) ||
8609 Legal->isFixedOrderRecurrence(Phi)) &&
8610 "can only widen reductions and fixed-order recurrences here");
8611 VPValue *StartV = Operands[0];
8612 if (Legal->isReductionVariable(Phi)) {
8613 const RecurrenceDescriptor &RdxDesc =
8614 Legal->getReductionVars().find(Phi)->second;
8615 assert(RdxDesc.getRecurrenceStartValue() ==
8616 Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader()));
8617 PhiRecipe = new VPReductionPHIRecipe(Phi, RdxDesc, *StartV,
8618 CM.isInLoopReduction(Phi),
8619 CM.useOrderedReductions(RdxDesc));
8620 } else {
8621 // TODO: Currently fixed-order recurrences are modeled as chains of
8622 // first-order recurrences. If there are no users of the intermediate
8623 // recurrences in the chain, the fixed order recurrence should be modeled
8624 // directly, enabling more efficient codegen.
8625 PhiRecipe = new VPFirstOrderRecurrencePHIRecipe(Phi, *StartV);
8626 }
8627
8628 // Record the incoming value from the backedge, so we can add the incoming
8629 // value from the backedge after all recipes have been created.
8630 auto *Inc = cast<Instruction>(
8631 Phi->getIncomingValueForBlock(OrigLoop->getLoopLatch()));
8632 auto RecipeIter = Ingredient2Recipe.find(Inc);
8633 if (RecipeIter == Ingredient2Recipe.end())
8634 recordRecipeOf(Inc);
8635
8636 PhisToFix.push_back(PhiRecipe);
8637 return toVPRecipeResult(PhiRecipe);
8638 }
8639
8640 if (isa<TruncInst>(Instr) &&
8641 (Recipe = tryToOptimizeInductionTruncate(cast<TruncInst>(Instr), Operands,
8642 Range, *Plan)))
8643 return toVPRecipeResult(Recipe);
8644
8645 // All widen recipes below deal only with VF > 1.
8646 if (LoopVectorizationPlanner::getDecisionAndClampRange(
8647 [&](ElementCount VF) { return VF.isScalar(); }, Range))
8648 return nullptr;
8649
8650 if (auto *CI = dyn_cast<CallInst>(Instr))
8651 return toVPRecipeResult(tryToWidenCall(CI, Operands, Range));
8652
8653 if (isa<LoadInst>(Instr) || isa<StoreInst>(Instr))
8654 return toVPRecipeResult(tryToWidenMemory(Instr, Operands, Range, Plan));
8655
8656 if (!shouldWiden(Instr, Range))
8657 return nullptr;
8658
8659 if (auto GEP = dyn_cast<GetElementPtrInst>(Instr))
8660 return toVPRecipeResult(new VPWidenGEPRecipe(
8661 GEP, make_range(Operands.begin(), Operands.end()), OrigLoop));
8662
8663 if (auto *SI = dyn_cast<SelectInst>(Instr)) {
8664 bool InvariantCond =
8665 PSE.getSE()->isLoopInvariant(PSE.getSCEV(SI->getOperand(0)), OrigLoop);
8666 return toVPRecipeResult(new VPWidenSelectRecipe(
8667 *SI, make_range(Operands.begin(), Operands.end()), InvariantCond));
8668 }
8669
8670 return toVPRecipeResult(tryToWiden(Instr, Operands, VPBB, Plan));
8671 }
8672
8673 void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
8674 ElementCount MaxVF) {
8675 assert(OrigLoop->isInnermost() && "Inner loop expected.");
8676
8677 // Add assume instructions we need to drop to DeadInstructions, to prevent
8678 // them from being added to the VPlan.
8679   // TODO: We only need to drop assumes in blocks that get flattened. If the
8680 // control flow is preserved, we should keep them.
8681 SmallPtrSet<Instruction *, 4> DeadInstructions;
8682 auto &ConditionalAssumes = Legal->getConditionalAssumes();
8683 DeadInstructions.insert(ConditionalAssumes.begin(), ConditionalAssumes.end());
8684
8685 MapVector<Instruction *, Instruction *> &SinkAfter = Legal->getSinkAfter();
8686 // Dead instructions do not need sinking. Remove them from SinkAfter.
8687 for (Instruction *I : DeadInstructions)
8688 SinkAfter.erase(I);
8689
8690 // Cannot sink instructions after dead instructions (there won't be any
8691 // recipes for them). Instead, find the first non-dead previous instruction.
8692 for (auto &P : Legal->getSinkAfter()) {
8693 Instruction *SinkTarget = P.second;
8694 Instruction *FirstInst = &*SinkTarget->getParent()->begin();
8695 (void)FirstInst;
8696 while (DeadInstructions.contains(SinkTarget)) {
8697 assert(
8698 SinkTarget != FirstInst &&
8699 "Must find a live instruction (at least the one feeding the "
8700 "fixed-order recurrence PHI) before reaching beginning of the block");
8701 SinkTarget = SinkTarget->getPrevNode();
8702 assert(SinkTarget != P.first &&
8703 "sink source equals target, no sinking required");
8704 }
8705 P.second = SinkTarget;
8706 }
8707
8708 auto MaxVFPlusOne = MaxVF.getWithIncrement(1);
8709 for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFPlusOne);) {
8710 VFRange SubRange = {VF, MaxVFPlusOne};
8711 VPlans.push_back(
8712 buildVPlanWithVPRecipes(SubRange, DeadInstructions, SinkAfter));
8713 VF = SubRange.End;
8714 }
8715 }
8716
8717 // Add the necessary canonical IV and branch recipes required to control the
8718 // loop.
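// For illustration, in the default (non-lane-mask) case the generated control
// is roughly:
//   vector.body:  %index = CANONICAL-INDUCTION [ 0, %index.next ]
//   vector.latch: %index.next = %index + VF * UF
//                 branch-on-count %index.next, vector-trip-count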
8719 static void addCanonicalIVRecipes(VPlan &Plan, Type *IdxTy, DebugLoc DL,
8720 bool HasNUW,
8721 bool UseLaneMaskForLoopControlFlow) {
8722 Value *StartIdx = ConstantInt::get(IdxTy, 0);
8723 auto *StartV = Plan.getOrAddVPValue(StartIdx);
8724
8725 // Add a VPCanonicalIVPHIRecipe starting at 0 to the header.
8726 auto *CanonicalIVPHI = new VPCanonicalIVPHIRecipe(StartV, DL);
8727 VPRegionBlock *TopRegion = Plan.getVectorLoopRegion();
8728 VPBasicBlock *Header = TopRegion->getEntryBasicBlock();
8729 Header->insert(CanonicalIVPHI, Header->begin());
8730
8731 // Add a CanonicalIVIncrement{NUW} VPInstruction to increment the scalar
8732 // IV by VF * UF.
8733 auto *CanonicalIVIncrement =
8734 new VPInstruction(HasNUW ? VPInstruction::CanonicalIVIncrementNUW
8735 : VPInstruction::CanonicalIVIncrement,
8736 {CanonicalIVPHI}, DL, "index.next");
8737 CanonicalIVPHI->addOperand(CanonicalIVIncrement);
8738
8739 VPBasicBlock *EB = TopRegion->getExitingBasicBlock();
8740 EB->appendRecipe(CanonicalIVIncrement);
8741
8742 if (UseLaneMaskForLoopControlFlow) {
8743 // Create the active lane mask instruction in the vplan preheader.
8744 VPBasicBlock *Preheader = Plan.getEntry()->getEntryBasicBlock();
8745
8746 // We can't use StartV directly in the ActiveLaneMask VPInstruction, since
8747 // we have to take unrolling into account. Each part needs to start at
8748 // Part * VF
8749 auto *CanonicalIVIncrementParts =
8750 new VPInstruction(HasNUW ? VPInstruction::CanonicalIVIncrementForPartNUW
8751 : VPInstruction::CanonicalIVIncrementForPart,
8752 {StartV}, DL, "index.part.next");
8753 Preheader->appendRecipe(CanonicalIVIncrementParts);
8754
8755 // Create the ActiveLaneMask instruction using the correct start values.
8756 VPValue *TC = Plan.getOrCreateTripCount();
8757 auto *EntryALM = new VPInstruction(VPInstruction::ActiveLaneMask,
8758 {CanonicalIVIncrementParts, TC}, DL,
8759 "active.lane.mask.entry");
8760 Preheader->appendRecipe(EntryALM);
8761
8762 // Now create the ActiveLaneMaskPhi recipe in the main loop using the
8763 // preheader ActiveLaneMask instruction.
8764 auto *LaneMaskPhi = new VPActiveLaneMaskPHIRecipe(EntryALM, DebugLoc());
8765 Header->insert(LaneMaskPhi, Header->getFirstNonPhi());
8766
8767 // Create the active lane mask for the next iteration of the loop.
8768 CanonicalIVIncrementParts =
8769 new VPInstruction(HasNUW ? VPInstruction::CanonicalIVIncrementForPartNUW
8770 : VPInstruction::CanonicalIVIncrementForPart,
8771 {CanonicalIVIncrement}, DL);
8772 EB->appendRecipe(CanonicalIVIncrementParts);
8773
8774 auto *ALM = new VPInstruction(VPInstruction::ActiveLaneMask,
8775 {CanonicalIVIncrementParts, TC}, DL,
8776 "active.lane.mask.next");
8777 EB->appendRecipe(ALM);
8778 LaneMaskPhi->addOperand(ALM);
8779
8780 // We have to invert the mask here because a true condition means jumping
8781 // to the exit block.
8782 auto *NotMask = new VPInstruction(VPInstruction::Not, ALM, DL);
8783 EB->appendRecipe(NotMask);
8784
8785 VPInstruction *BranchBack =
8786 new VPInstruction(VPInstruction::BranchOnCond, {NotMask}, DL);
8787 EB->appendRecipe(BranchBack);
8788 } else {
8789 // Add the BranchOnCount VPInstruction to the latch.
8790 VPInstruction *BranchBack = new VPInstruction(
8791 VPInstruction::BranchOnCount,
8792 {CanonicalIVIncrement, &Plan.getVectorTripCount()}, DL);
8793 EB->appendRecipe(BranchBack);
8794 }
8795 }
8796
8797 // Add exit values to \p Plan. VPLiveOuts are added for each LCSSA phi in the
8798 // original exit block.
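// E.g. for an LCSSA phi '%res.lcssa = phi [ %res, %loop.exiting ]' in the exit
// block, a VPLiveOut is added that ties %res.lcssa to the VPValue modeling
// %res.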
8799 static void addUsersInExitBlock(VPBasicBlock *HeaderVPBB,
8800 VPBasicBlock *MiddleVPBB, Loop *OrigLoop,
8801 VPlan &Plan) {
8802 BasicBlock *ExitBB = OrigLoop->getUniqueExitBlock();
8803 BasicBlock *ExitingBB = OrigLoop->getExitingBlock();
8804 // Only handle single-exit loops with unique exit blocks for now.
8805 if (!ExitBB || !ExitBB->getSinglePredecessor() || !ExitingBB)
8806 return;
8807
8808 // Introduce VPUsers modeling the exit values.
8809 for (PHINode &ExitPhi : ExitBB->phis()) {
8810 Value *IncomingValue =
8811 ExitPhi.getIncomingValueForBlock(ExitingBB);
8812 VPValue *V = Plan.getOrAddVPValue(IncomingValue, true);
8813 Plan.addLiveOut(&ExitPhi, V);
8814 }
8815 }
8816
8817 VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes(
8818 VFRange &Range, SmallPtrSetImpl<Instruction *> &DeadInstructions,
8819 const MapVector<Instruction *, Instruction *> &SinkAfter) {
8820
8821 SmallPtrSet<const InterleaveGroup<Instruction> *, 1> InterleaveGroups;
8822
8823 VPRecipeBuilder RecipeBuilder(OrigLoop, TLI, Legal, CM, PSE, Builder);
8824
8825 // ---------------------------------------------------------------------------
8826 // Pre-construction: record ingredients whose recipes we'll need to further
8827 // process after constructing the initial VPlan.
8828 // ---------------------------------------------------------------------------
8829
8830 // Mark instructions we'll need to sink later and their targets as
8831 // ingredients whose recipe we'll need to record.
8832 for (const auto &Entry : SinkAfter) {
8833 RecipeBuilder.recordRecipeOf(Entry.first);
8834 RecipeBuilder.recordRecipeOf(Entry.second);
8835 }
8836 for (const auto &Reduction : CM.getInLoopReductionChains()) {
8837 PHINode *Phi = Reduction.first;
8838 RecurKind Kind =
8839 Legal->getReductionVars().find(Phi)->second.getRecurrenceKind();
8840 const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second;
8841
8842 RecipeBuilder.recordRecipeOf(Phi);
8843 for (const auto &R : ReductionOperations) {
8844 RecipeBuilder.recordRecipeOf(R);
8845 // For min/max reductions, where we have a pair of icmp/select, we also
8846 // need to record the ICmp recipe, so it can be removed later.
8847 assert(!RecurrenceDescriptor::isSelectCmpRecurrenceKind(Kind) &&
8848 "Only min/max recurrences allowed for inloop reductions");
8849 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind))
8850 RecipeBuilder.recordRecipeOf(cast<Instruction>(R->getOperand(0)));
8851 }
8852 }
8853
8854 // For each interleave group which is relevant for this (possibly trimmed)
8855 // Range, add it to the set of groups to be later applied to the VPlan and add
8856 // placeholders for its members' Recipes which we'll be replacing with a
8857 // single VPInterleaveRecipe.
8858 for (InterleaveGroup<Instruction> *IG : IAI.getInterleaveGroups()) {
8859 auto applyIG = [IG, this](ElementCount VF) -> bool {
8860 return (VF.isVector() && // Query is illegal for VF == 1
8861 CM.getWideningDecision(IG->getInsertPos(), VF) ==
8862 LoopVectorizationCostModel::CM_Interleave);
8863 };
8864 if (!getDecisionAndClampRange(applyIG, Range))
8865 continue;
8866 InterleaveGroups.insert(IG);
8867 for (unsigned i = 0; i < IG->getFactor(); i++)
8868 if (Instruction *Member = IG->getMember(i))
8869 RecipeBuilder.recordRecipeOf(Member);
8870 };
8871
8872 // ---------------------------------------------------------------------------
8873 // Build initial VPlan: Scan the body of the loop in a topological order to
8874 // visit each basic block after having visited its predecessor basic blocks.
8875 // ---------------------------------------------------------------------------
8876
8877 // Create initial VPlan skeleton, starting with a block for the pre-header,
8878 // followed by a region for the vector loop, followed by the middle block. The
8879 // skeleton vector loop region contains a header and latch block.
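  // The initial skeleton thus has the shape:
  //   vector.ph -> ( vector.body -> vector.latch ) -> middle.block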
8880 VPBasicBlock *Preheader = new VPBasicBlock("vector.ph");
8881 auto Plan = std::make_unique<VPlan>(Preheader);
8882
8883 VPBasicBlock *HeaderVPBB = new VPBasicBlock("vector.body");
8884 VPBasicBlock *LatchVPBB = new VPBasicBlock("vector.latch");
8885 VPBlockUtils::insertBlockAfter(LatchVPBB, HeaderVPBB);
8886 auto *TopRegion = new VPRegionBlock(HeaderVPBB, LatchVPBB, "vector loop");
8887 VPBlockUtils::insertBlockAfter(TopRegion, Preheader);
8888 VPBasicBlock *MiddleVPBB = new VPBasicBlock("middle.block");
8889 VPBlockUtils::insertBlockAfter(MiddleVPBB, TopRegion);
8890
8891 Instruction *DLInst =
8892 getDebugLocFromInstOrOperands(Legal->getPrimaryInduction());
8893 addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(),
8894 DLInst ? DLInst->getDebugLoc() : DebugLoc(),
8895 !CM.foldTailByMasking(),
8896 CM.useActiveLaneMaskForControlFlow());
8897
8898 // Scan the body of the loop in a topological order to visit each basic block
8899 // after having visited its predecessor basic blocks.
8900 LoopBlocksDFS DFS(OrigLoop);
8901 DFS.perform(LI);
8902
8903 VPBasicBlock *VPBB = HeaderVPBB;
8904 SmallVector<VPWidenIntOrFpInductionRecipe *> InductionsToMove;
8905 for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
8906 // Relevant instructions from basic block BB will be grouped into VPRecipe
8907 // ingredients and fill a new VPBasicBlock.
8908 unsigned VPBBsForBB = 0;
8909 if (VPBB != HeaderVPBB)
8910 VPBB->setName(BB->getName());
8911 Builder.setInsertPoint(VPBB);
8912
8913 // Introduce each ingredient into VPlan.
8914 // TODO: Model and preserve debug intrinsics in VPlan.
8915 for (Instruction &I : BB->instructionsWithoutDebug()) {
8916 Instruction *Instr = &I;
8917
8918 // First filter out irrelevant instructions, to ensure no recipes are
8919 // built for them.
8920 if (isa<BranchInst>(Instr) || DeadInstructions.count(Instr))
8921 continue;
8922
8923 SmallVector<VPValue *, 4> Operands;
8924 auto *Phi = dyn_cast<PHINode>(Instr);
8925 if (Phi && Phi->getParent() == OrigLoop->getHeader()) {
8926 Operands.push_back(Plan->getOrAddVPValue(
8927 Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader())));
8928 } else {
8929 auto OpRange = Plan->mapToVPValues(Instr->operands());
8930 Operands = {OpRange.begin(), OpRange.end()};
8931 }
8932
8933       // Invariant stores inside the loop will be deleted, and a single store
8934       // with the final reduction value will be added to the exit block.
8935 StoreInst *SI;
8936 if ((SI = dyn_cast<StoreInst>(&I)) &&
8937 Legal->isInvariantAddressOfReduction(SI->getPointerOperand()))
8938 continue;
8939
8940 if (auto RecipeOrValue = RecipeBuilder.tryToCreateWidenRecipe(
8941 Instr, Operands, Range, VPBB, Plan)) {
8942 // If Instr can be simplified to an existing VPValue, use it.
8943 if (RecipeOrValue.is<VPValue *>()) {
8944 auto *VPV = RecipeOrValue.get<VPValue *>();
8945 Plan->addVPValue(Instr, VPV);
8946 // If the re-used value is a recipe, register the recipe for the
8947 // instruction, in case the recipe for Instr needs to be recorded.
8948 if (VPRecipeBase *R = VPV->getDefiningRecipe())
8949 RecipeBuilder.setRecipe(Instr, R);
8950 continue;
8951 }
8952 // Otherwise, add the new recipe.
8953 VPRecipeBase *Recipe = RecipeOrValue.get<VPRecipeBase *>();
8954 for (auto *Def : Recipe->definedValues()) {
8955 auto *UV = Def->getUnderlyingValue();
8956 Plan->addVPValue(UV, Def);
8957 }
8958
8959 if (isa<VPWidenIntOrFpInductionRecipe>(Recipe) &&
8960 HeaderVPBB->getFirstNonPhi() != VPBB->end()) {
8961 // Keep track of VPWidenIntOrFpInductionRecipes not in the phi section
8962 // of the header block. That can happen for truncates of induction
8963 // variables. Those recipes are moved to the phi section of the header
8964 // block after applying SinkAfter, which relies on the original
8965 // position of the trunc.
8966 assert(isa<TruncInst>(Instr));
8967 InductionsToMove.push_back(
8968 cast<VPWidenIntOrFpInductionRecipe>(Recipe));
8969 }
8970 RecipeBuilder.setRecipe(Instr, Recipe);
8971 VPBB->appendRecipe(Recipe);
8972 continue;
8973 }
8974
8975 // Otherwise, if all widening options failed, Instruction is to be
8976 // replicated. This may create a successor for VPBB.
8977 VPBasicBlock *NextVPBB =
8978 RecipeBuilder.handleReplication(Instr, Range, VPBB, Plan);
8979 if (NextVPBB != VPBB) {
8980 VPBB = NextVPBB;
8981 VPBB->setName(BB->hasName() ? BB->getName() + "." + Twine(VPBBsForBB++)
8982 : "");
8983 }
8984 }
8985
8986 VPBlockUtils::insertBlockAfter(new VPBasicBlock(), VPBB);
8987 VPBB = cast<VPBasicBlock>(VPBB->getSingleSuccessor());
8988 }
8989
8990 // After here, VPBB should not be used.
8991 VPBB = nullptr;
8992
8993 addUsersInExitBlock(HeaderVPBB, MiddleVPBB, OrigLoop, *Plan);
8994
8995 assert(isa<VPRegionBlock>(Plan->getVectorLoopRegion()) &&
8996 !Plan->getVectorLoopRegion()->getEntryBasicBlock()->empty() &&
8997 "entry block must be set to a VPRegionBlock having a non-empty entry "
8998 "VPBasicBlock");
8999 RecipeBuilder.fixHeaderPhis();
9000
9001 // ---------------------------------------------------------------------------
9002 // Transform initial VPlan: Apply previously taken decisions, in order, to
9003 // bring the VPlan to its final state.
9004 // ---------------------------------------------------------------------------
9005
9006 // Apply Sink-After legal constraints.
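  // For illustration: sink-after entries typically come from fixed-order
  // recurrences where a user of the recurrence phi appears before the
  // instruction producing the previous (backedge) value, e.g.
  //   for (i) { b[i] = f(t); t = a[i]; }
  // with the use of t emitted before the load of a[i]. The recipe for the use
  // is then sunk after the recipe for the load, so that the recurrence splice
  // inserted later dominates all of its users.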
9007 auto GetReplicateRegion = [](VPRecipeBase *R) -> VPRegionBlock * {
9008 auto *Region = dyn_cast_or_null<VPRegionBlock>(R->getParent()->getParent());
9009 if (Region && Region->isReplicator()) {
9010 assert(Region->getNumSuccessors() == 1 &&
9011 Region->getNumPredecessors() == 1 && "Expected SESE region!");
9012 assert(R->getParent()->size() == 1 &&
9013 "A recipe in an original replicator region must be the only "
9014 "recipe in its block");
9015 return Region;
9016 }
9017 return nullptr;
9018 };
9019 for (const auto &Entry : SinkAfter) {
9020 VPRecipeBase *Sink = RecipeBuilder.getRecipe(Entry.first);
9021 VPRecipeBase *Target = RecipeBuilder.getRecipe(Entry.second);
9022
9023 auto *TargetRegion = GetReplicateRegion(Target);
9024 auto *SinkRegion = GetReplicateRegion(Sink);
9025 if (!SinkRegion) {
9026 // If the sink source is not a replicate region, sink the recipe directly.
9027 if (TargetRegion) {
9028 // The target is in a replication region, make sure to move Sink to
9029 // the block after it, not into the replication region itself.
9030 VPBasicBlock *NextBlock =
9031 cast<VPBasicBlock>(TargetRegion->getSuccessors().front());
9032 Sink->moveBefore(*NextBlock, NextBlock->getFirstNonPhi());
9033 } else
9034 Sink->moveAfter(Target);
9035 continue;
9036 }
9037
9038 // The sink source is in a replicate region. Unhook the region from the CFG.
9039 auto *SinkPred = SinkRegion->getSinglePredecessor();
9040 auto *SinkSucc = SinkRegion->getSingleSuccessor();
9041 VPBlockUtils::disconnectBlocks(SinkPred, SinkRegion);
9042 VPBlockUtils::disconnectBlocks(SinkRegion, SinkSucc);
9043 VPBlockUtils::connectBlocks(SinkPred, SinkSucc);
9044
9045 if (TargetRegion) {
9046 // The target recipe is also in a replicate region, move the sink region
9047 // after the target region.
9048 auto *TargetSucc = TargetRegion->getSingleSuccessor();
9049 VPBlockUtils::disconnectBlocks(TargetRegion, TargetSucc);
9050 VPBlockUtils::connectBlocks(TargetRegion, SinkRegion);
9051 VPBlockUtils::connectBlocks(SinkRegion, TargetSucc);
9052 } else {
9053       // The sink source is in a replicate region, so we need to move the whole
9054       // replicate region, which should contain only a single recipe in its main
9055       // block.
9056 auto *SplitBlock =
9057 Target->getParent()->splitAt(std::next(Target->getIterator()));
9058
9059 auto *SplitPred = SplitBlock->getSinglePredecessor();
9060
9061 VPBlockUtils::disconnectBlocks(SplitPred, SplitBlock);
9062 VPBlockUtils::connectBlocks(SplitPred, SinkRegion);
9063 VPBlockUtils::connectBlocks(SinkRegion, SplitBlock);
9064 }
9065 }
9066
9067 VPlanTransforms::removeRedundantCanonicalIVs(*Plan);
9068 VPlanTransforms::removeRedundantInductionCasts(*Plan);
9069
9070 // Now that sink-after is done, move induction recipes for optimized truncates
9071 // to the phi section of the header block.
9072 for (VPWidenIntOrFpInductionRecipe *Ind : InductionsToMove)
9073 Ind->moveBefore(*HeaderVPBB, HeaderVPBB->getFirstNonPhi());
9074
9075 // Adjust the recipes for any inloop reductions.
9076 adjustRecipesForReductions(cast<VPBasicBlock>(TopRegion->getExiting()), Plan,
9077 RecipeBuilder, Range.Start);
9078
9079 // Introduce a recipe to combine the incoming and previous values of a
9080 // fixed-order recurrence.
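  // For illustration: for a scalar loop like
  //   for (i) { b[i] = f(t); t = a[i]; }
  // the splice created below combines the recurrence phi and its backedge
  // value so that, per part, lane 0 holds the last value produced by the
  // previous vector iteration (or part) and lanes 1..VF-1 hold lanes 0..VF-2
  // of the current backedge value, which is what the former users of the
  // scalar phi expect.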
9081 for (VPRecipeBase &R :
9082 Plan->getVectorLoopRegion()->getEntryBasicBlock()->phis()) {
9083 auto *RecurPhi = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&R);
9084 if (!RecurPhi)
9085 continue;
9086
9087 VPRecipeBase *PrevRecipe = &RecurPhi->getBackedgeRecipe();
9088 // Fixed-order recurrences do not contain cycles, so this loop is guaranteed
9089 // to terminate.
9090 while (auto *PrevPhi =
9091 dyn_cast<VPFirstOrderRecurrencePHIRecipe>(PrevRecipe))
9092 PrevRecipe = &PrevPhi->getBackedgeRecipe();
9093 VPBasicBlock *InsertBlock = PrevRecipe->getParent();
9094 auto *Region = GetReplicateRegion(PrevRecipe);
9095 if (Region)
9096 InsertBlock = dyn_cast<VPBasicBlock>(Region->getSingleSuccessor());
9097 if (!InsertBlock) {
9098 InsertBlock = new VPBasicBlock(Region->getName() + ".succ");
9099 VPBlockUtils::insertBlockAfter(InsertBlock, Region);
9100 }
9101 if (Region || PrevRecipe->isPhi())
9102 Builder.setInsertPoint(InsertBlock, InsertBlock->getFirstNonPhi());
9103 else
9104 Builder.setInsertPoint(InsertBlock, std::next(PrevRecipe->getIterator()));
9105
9106 auto *RecurSplice = cast<VPInstruction>(
9107 Builder.createNaryOp(VPInstruction::FirstOrderRecurrenceSplice,
9108 {RecurPhi, RecurPhi->getBackedgeValue()}));
9109
9110 RecurPhi->replaceAllUsesWith(RecurSplice);
9111 // Set the first operand of RecurSplice to RecurPhi again, after replacing
9112 // all users.
9113 RecurSplice->setOperand(0, RecurPhi);
9114 }
9115
9116 // Interleave memory: for each Interleave Group we marked earlier as relevant
9117 // for this VPlan, replace the Recipes widening its memory instructions with a
9118 // single VPInterleaveRecipe at its insertion point.
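  // For illustration: for a factor-2 group of loads such as
  //   x = A[2*i]; y = A[2*i+1];
  // the two widened load recipes are replaced by a single VPInterleaveRecipe
  // that emits one wide load of 2*VF elements plus shuffles to de-interleave
  // the even and odd lanes; store groups are handled symmetrically by
  // interleaving the stored values before one wide store.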
9119 for (const auto *IG : InterleaveGroups) {
9120 auto *Recipe = cast<VPWidenMemoryInstructionRecipe>(
9121 RecipeBuilder.getRecipe(IG->getInsertPos()));
9122 SmallVector<VPValue *, 4> StoredValues;
9123 for (unsigned i = 0; i < IG->getFactor(); ++i)
9124 if (auto *SI = dyn_cast_or_null<StoreInst>(IG->getMember(i))) {
9125 auto *StoreR =
9126 cast<VPWidenMemoryInstructionRecipe>(RecipeBuilder.getRecipe(SI));
9127 StoredValues.push_back(StoreR->getStoredValue());
9128 }
9129
9130 auto *VPIG = new VPInterleaveRecipe(IG, Recipe->getAddr(), StoredValues,
9131 Recipe->getMask());
9132 VPIG->insertBefore(Recipe);
9133 unsigned J = 0;
9134 for (unsigned i = 0; i < IG->getFactor(); ++i)
9135 if (Instruction *Member = IG->getMember(i)) {
9136 if (!Member->getType()->isVoidTy()) {
9137 VPValue *OriginalV = Plan->getVPValue(Member);
9138 Plan->removeVPValueFor(Member);
9139 Plan->addVPValue(Member, VPIG->getVPValue(J));
9140 OriginalV->replaceAllUsesWith(VPIG->getVPValue(J));
9141 J++;
9142 }
9143 RecipeBuilder.getRecipe(Member)->eraseFromParent();
9144 }
9145 }
9146
9147 for (ElementCount VF = Range.Start; ElementCount::isKnownLT(VF, Range.End);
9148 VF *= 2)
9149 Plan->addVF(VF);
9150 Plan->setName("Initial VPlan");
9151
9152 // From this point onwards, VPlan-to-VPlan transformations may change the plan
9153 // in ways that accessing values using original IR values is incorrect.
9154 Plan->disableValue2VPValue();
9155
9156 VPlanTransforms::optimizeInductions(*Plan, *PSE.getSE());
9157 VPlanTransforms::removeDeadRecipes(*Plan);
9158
9159 bool ShouldSimplify = true;
9160 while (ShouldSimplify) {
9161 ShouldSimplify = VPlanTransforms::sinkScalarOperands(*Plan);
9162 ShouldSimplify |=
9163 VPlanTransforms::mergeReplicateRegionsIntoSuccessors(*Plan);
9164 ShouldSimplify |= VPlanTransforms::mergeBlocksIntoPredecessors(*Plan);
9165 }
9166
9167 VPlanTransforms::removeRedundantExpandSCEVRecipes(*Plan);
9168 VPlanTransforms::mergeBlocksIntoPredecessors(*Plan);
9169
9170 assert(VPlanVerifier::verifyPlanIsValid(*Plan) && "VPlan is invalid");
9171 return Plan;
9172 }
9173
9174 VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) {
9175   // Outer loop handling: outer loops may require CFG and instruction level
9176   // transformations before even evaluating whether vectorization is profitable.
9177 // Since we cannot modify the incoming IR, we need to build VPlan upfront in
9178 // the vectorization pipeline.
9179 assert(!OrigLoop->isInnermost());
9180 assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
9181
9182 // Create new empty VPlan
9183 auto Plan = std::make_unique<VPlan>();
9184
9185 // Build hierarchical CFG
9186 VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI, *Plan);
9187 HCFGBuilder.buildHierarchicalCFG();
9188
9189 for (ElementCount VF = Range.Start; ElementCount::isKnownLT(VF, Range.End);
9190 VF *= 2)
9191 Plan->addVF(VF);
9192
9193 SmallPtrSet<Instruction *, 1> DeadInstructions;
9194 VPlanTransforms::VPInstructionsToVPRecipes(
9195 OrigLoop, Plan,
9196 [this](PHINode *P) { return Legal->getIntOrFpInductionDescriptor(P); },
9197 DeadInstructions, *PSE.getSE(), *TLI);
9198
9199 // Remove the existing terminator of the exiting block of the top-most region.
9200 // A BranchOnCount will be added instead when adding the canonical IV recipes.
9201 auto *Term =
9202 Plan->getVectorLoopRegion()->getExitingBasicBlock()->getTerminator();
9203 Term->eraseFromParent();
9204
9205 addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(), DebugLoc(),
9206 true, CM.useActiveLaneMaskForControlFlow());
9207 return Plan;
9208 }
9209
9210 // Adjust the recipes for reductions. For in-loop reductions, the chain of
9211 // instructions leading from the loop exit instr to the phi needs to be
9212 // converted to reductions, with one operand being vector and the other being
9213 // the scalar reduction chain. For other reductions, a select is introduced
9214 // between the phi and live-out recipes when folding the tail.
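// For illustration: for an in-loop reduction such as
//   for (i) { sum += a[i]; }
// the chain is the phi for sum followed by the add; the widened add is
// replaced by a VPReductionRecipe whose operands are the scalar chain value
// (ChainOp), the widened a[i] (VecOp) and, if the block needs predication, a
// block-in mask (CondOp).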
9215 void LoopVectorizationPlanner::adjustRecipesForReductions(
9216 VPBasicBlock *LatchVPBB, VPlanPtr &Plan, VPRecipeBuilder &RecipeBuilder,
9217 ElementCount MinVF) {
9218 for (const auto &Reduction : CM.getInLoopReductionChains()) {
9219 PHINode *Phi = Reduction.first;
9220 const RecurrenceDescriptor &RdxDesc =
9221 Legal->getReductionVars().find(Phi)->second;
9222 const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second;
9223
9224 if (MinVF.isScalar() && !CM.useOrderedReductions(RdxDesc))
9225 continue;
9226
9227     // ReductionOperations are ordered top-down from the phi's use to the
9228     // LoopExitValue. We keep track of the previous item (the Chain) to tell
9229     // which of the two operands will remain scalar and which will be reduced.
9230     // For minmax recurrences the chain will be the select instructions.
9231 Instruction *Chain = Phi;
9232 for (Instruction *R : ReductionOperations) {
9233 VPRecipeBase *WidenRecipe = RecipeBuilder.getRecipe(R);
9234 RecurKind Kind = RdxDesc.getRecurrenceKind();
9235
9236 VPValue *ChainOp = Plan->getVPValue(Chain);
9237 unsigned FirstOpId;
9238 assert(!RecurrenceDescriptor::isSelectCmpRecurrenceKind(Kind) &&
9239 "Only min/max recurrences allowed for inloop reductions");
9240 // Recognize a call to the llvm.fmuladd intrinsic.
9241 bool IsFMulAdd = (Kind == RecurKind::FMulAdd);
9242 assert((!IsFMulAdd || RecurrenceDescriptor::isFMulAddIntrinsic(R)) &&
9243 "Expected instruction to be a call to the llvm.fmuladd intrinsic");
9244 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) {
9245 assert(isa<VPWidenSelectRecipe>(WidenRecipe) &&
9246 "Expected to replace a VPWidenSelectSC");
9247 FirstOpId = 1;
9248 } else {
9249 assert((MinVF.isScalar() || isa<VPWidenRecipe>(WidenRecipe) ||
9250 (IsFMulAdd && isa<VPWidenCallRecipe>(WidenRecipe))) &&
9251 "Expected to replace a VPWidenSC");
9252 FirstOpId = 0;
9253 }
9254 unsigned VecOpId =
9255 R->getOperand(FirstOpId) == Chain ? FirstOpId + 1 : FirstOpId;
9256 VPValue *VecOp = Plan->getVPValue(R->getOperand(VecOpId));
9257
9258 VPValue *CondOp = nullptr;
9259 if (CM.blockNeedsPredicationForAnyReason(R->getParent())) {
9260 VPBuilder::InsertPointGuard Guard(Builder);
9261 Builder.setInsertPoint(WidenRecipe->getParent(),
9262 WidenRecipe->getIterator());
9263 CondOp = RecipeBuilder.createBlockInMask(R->getParent(), Plan);
9264 }
9265
9266 if (IsFMulAdd) {
9267 // If the instruction is a call to the llvm.fmuladd intrinsic then we
9268 // need to create an fmul recipe to use as the vector operand for the
9269 // fadd reduction.
9270 VPInstruction *FMulRecipe = new VPInstruction(
9271 Instruction::FMul, {VecOp, Plan->getVPValue(R->getOperand(1))});
9272 FMulRecipe->setFastMathFlags(R->getFastMathFlags());
9273 WidenRecipe->getParent()->insert(FMulRecipe,
9274 WidenRecipe->getIterator());
9275 VecOp = FMulRecipe;
9276 }
9277 VPReductionRecipe *RedRecipe =
9278 new VPReductionRecipe(&RdxDesc, R, ChainOp, VecOp, CondOp, TTI);
9279 WidenRecipe->getVPSingleValue()->replaceAllUsesWith(RedRecipe);
9280 Plan->removeVPValueFor(R);
9281 Plan->addVPValue(R, RedRecipe);
9282       // Append the recipe to the end of the VPBasicBlock because we need to
9283       // ensure that it comes after all of its inputs, including CondOp.
9284 WidenRecipe->getParent()->appendRecipe(RedRecipe);
9285 WidenRecipe->getVPSingleValue()->replaceAllUsesWith(RedRecipe);
9286 WidenRecipe->eraseFromParent();
9287
9288 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) {
9289 VPRecipeBase *CompareRecipe =
9290 RecipeBuilder.getRecipe(cast<Instruction>(R->getOperand(0)));
9291 assert(isa<VPWidenRecipe>(CompareRecipe) &&
9292 "Expected to replace a VPWidenSC");
9293 assert(cast<VPWidenRecipe>(CompareRecipe)->getNumUsers() == 0 &&
9294 "Expected no remaining users");
9295 CompareRecipe->eraseFromParent();
9296 }
9297 Chain = R;
9298 }
9299 }
9300
9301 // If tail is folded by masking, introduce selects between the phi
9302 // and the live-out instruction of each reduction, at the beginning of the
9303 // dedicated latch block.
9304 if (CM.foldTailByMasking()) {
9305 Builder.setInsertPoint(LatchVPBB, LatchVPBB->begin());
9306 for (VPRecipeBase &R :
9307 Plan->getVectorLoopRegion()->getEntryBasicBlock()->phis()) {
9308 VPReductionPHIRecipe *PhiR = dyn_cast<VPReductionPHIRecipe>(&R);
9309 if (!PhiR || PhiR->isInLoop())
9310 continue;
9311 VPValue *Cond =
9312 RecipeBuilder.createBlockInMask(OrigLoop->getHeader(), Plan);
9313 VPValue *Red = PhiR->getBackedgeValue();
9314 assert(Red->getDefiningRecipe()->getParent() != LatchVPBB &&
9315 "reduction recipe must be defined before latch");
9316 Builder.createNaryOp(Instruction::Select, {Cond, Red, PhiR});
9317 }
9318 }
9319 }
9320
9321 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
9322 void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent,
9323 VPSlotTracker &SlotTracker) const {
9324 O << Indent << "INTERLEAVE-GROUP with factor " << IG->getFactor() << " at ";
9325 IG->getInsertPos()->printAsOperand(O, false);
9326 O << ", ";
9327 getAddr()->printAsOperand(O, SlotTracker);
9328 VPValue *Mask = getMask();
9329 if (Mask) {
9330 O << ", ";
9331 Mask->printAsOperand(O, SlotTracker);
9332 }
9333
9334 unsigned OpIdx = 0;
9335 for (unsigned i = 0; i < IG->getFactor(); ++i) {
9336 if (!IG->getMember(i))
9337 continue;
9338 if (getNumStoreOperands() > 0) {
9339 O << "\n" << Indent << " store ";
9340 getOperand(1 + OpIdx)->printAsOperand(O, SlotTracker);
9341 O << " to index " << i;
9342 } else {
9343 O << "\n" << Indent << " ";
9344 getVPValue(OpIdx)->printAsOperand(O, SlotTracker);
9345 O << " = load from index " << i;
9346 }
9347 ++OpIdx;
9348 }
9349 }
9350 #endif
9351
9352 void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) {
9353 assert(!State.Instance && "Int or FP induction being replicated.");
9354
9355 Value *Start = getStartValue()->getLiveInIRValue();
9356 const InductionDescriptor &ID = getInductionDescriptor();
9357 TruncInst *Trunc = getTruncInst();
9358 IRBuilderBase &Builder = State.Builder;
9359 assert(IV->getType() == ID.getStartValue()->getType() && "Types must match");
9360 assert(State.VF.isVector() && "must have vector VF");
9361
9362 // The value from the original loop to which we are mapping the new induction
9363 // variable.
9364 Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : IV;
9365
9366 // Fast-math-flags propagate from the original induction instruction.
9367 IRBuilder<>::FastMathFlagGuard FMFG(Builder);
9368 if (ID.getInductionBinOp() && isa<FPMathOperator>(ID.getInductionBinOp()))
9369 Builder.setFastMathFlags(ID.getInductionBinOp()->getFastMathFlags());
9370
9371 // Now do the actual transformations, and start with fetching the step value.
9372 Value *Step = State.get(getStepValue(), VPIteration(0, 0));
9373
9374 assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) &&
9375 "Expected either an induction phi-node or a truncate of it!");
9376
9377 // Construct the initial value of the vector IV in the vector loop preheader
9378 auto CurrIP = Builder.saveIP();
9379 BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);
9380 Builder.SetInsertPoint(VectorPH->getTerminator());
9381 if (isa<TruncInst>(EntryVal)) {
9382 assert(Start->getType()->isIntegerTy() &&
9383 "Truncation requires an integer type");
9384 auto *TruncType = cast<IntegerType>(EntryVal->getType());
9385 Step = Builder.CreateTrunc(Step, TruncType);
9386 Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType);
9387 }
9388
9389 Value *Zero = getSignedIntOrFpConstant(Start->getType(), 0);
9390 Value *SplatStart = Builder.CreateVectorSplat(State.VF, Start);
9391 Value *SteppedStart = getStepVector(
9392 SplatStart, Zero, Step, ID.getInductionOpcode(), State.VF, State.Builder);
9393
9394 // We create vector phi nodes for both integer and floating-point induction
9395 // variables. Here, we determine the kind of arithmetic we will perform.
9396 Instruction::BinaryOps AddOp;
9397 Instruction::BinaryOps MulOp;
9398 if (Step->getType()->isIntegerTy()) {
9399 AddOp = Instruction::Add;
9400 MulOp = Instruction::Mul;
9401 } else {
9402 AddOp = ID.getInductionOpcode();
9403 MulOp = Instruction::FMul;
9404 }
9405
9406 // Multiply the vectorization factor by the step using integer or
9407 // floating-point arithmetic as appropriate.
9408 Type *StepType = Step->getType();
9409 Value *RuntimeVF;
9410 if (Step->getType()->isFloatingPointTy())
9411 RuntimeVF = getRuntimeVFAsFloat(Builder, StepType, State.VF);
9412 else
9413 RuntimeVF = getRuntimeVF(Builder, StepType, State.VF);
9414 Value *Mul = Builder.CreateBinOp(MulOp, Step, RuntimeVF);
9415
9416 // Create a vector splat to use in the induction update.
9417 //
9418 // FIXME: If the step is non-constant, we create the vector splat with
9419 // IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't
9420 // handle a constant vector splat.
9421 Value *SplatVF = isa<Constant>(Mul)
9422 ? ConstantVector::getSplat(State.VF, cast<Constant>(Mul))
9423 : Builder.CreateVectorSplat(State.VF, Mul);
9424 Builder.restoreIP(CurrIP);
9425
9426 // We may need to add the step a number of times, depending on the unroll
9427 // factor. The last of those goes into the PHI.
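  // For illustration, with VF = 4, UF = 2 and an integer IV starting at %i
  // with step 1:
  //   part 0: vec.ind  = <%i, %i+1, %i+2, %i+3>
  //   part 1: step.add = vec.ind + <4, 4, 4, 4>
  //   vec.ind.next     = step.add + <4, 4, 4, 4>
  // where <4, 4, 4, 4> is SplatVF, i.e. the step multiplied by the runtime VF
  // and splatted across the lanes.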
9428 PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind",
9429 &*State.CFG.PrevBB->getFirstInsertionPt());
9430 VecInd->setDebugLoc(EntryVal->getDebugLoc());
9431 Instruction *LastInduction = VecInd;
9432 for (unsigned Part = 0; Part < State.UF; ++Part) {
9433 State.set(this, LastInduction, Part);
9434
9435 if (isa<TruncInst>(EntryVal))
9436 State.addMetadata(LastInduction, EntryVal);
9437
9438 LastInduction = cast<Instruction>(
9439 Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add"));
9440 LastInduction->setDebugLoc(EntryVal->getDebugLoc());
9441 }
9442
9443 LastInduction->setName("vec.ind.next");
9444 VecInd->addIncoming(SteppedStart, VectorPH);
9445 // Add induction update using an incorrect block temporarily. The phi node
9446 // will be fixed after VPlan execution. Note that at this point the latch
9447 // block cannot be used, as it does not exist yet.
9448 // TODO: Model increment value in VPlan, by turning the recipe into a
9449 // multi-def and a subclass of VPHeaderPHIRecipe.
9450 VecInd->addIncoming(LastInduction, VectorPH);
9451 }
9452
9453 void VPWidenPointerInductionRecipe::execute(VPTransformState &State) {
9454 assert(IndDesc.getKind() == InductionDescriptor::IK_PtrInduction &&
9455 "Not a pointer induction according to InductionDescriptor!");
9456 assert(cast<PHINode>(getUnderlyingInstr())->getType()->isPointerTy() &&
9457 "Unexpected type.");
9458
9459 auto *IVR = getParent()->getPlan()->getCanonicalIV();
9460 PHINode *CanonicalIV = cast<PHINode>(State.get(IVR, 0));
9461
9462 if (onlyScalarsGenerated(State.VF)) {
9463 // This is the normalized GEP that starts counting at zero.
9464 Value *PtrInd = State.Builder.CreateSExtOrTrunc(
9465 CanonicalIV, IndDesc.getStep()->getType());
9466 // Determine the number of scalars we need to generate for each unroll
9467 // iteration. If the instruction is uniform, we only need to generate the
9468 // first lane. Otherwise, we generate all VF values.
9469 bool IsUniform = vputils::onlyFirstLaneUsed(this);
9470 assert((IsUniform || !State.VF.isScalable()) &&
9471 "Cannot scalarize a scalable VF");
9472 unsigned Lanes = IsUniform ? 1 : State.VF.getFixedValue();
9473
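    // For illustration, with UF = 2, VF = 4 and a non-uniform pointer IV, this
    // emits 8 scalar pointers: part 0 uses indices PtrInd + 0..3 and part 1
    // uses indices PtrInd + 4..7, each transformed below into
    // start + index * step.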
9474 for (unsigned Part = 0; Part < State.UF; ++Part) {
9475 Value *PartStart =
9476 createStepForVF(State.Builder, PtrInd->getType(), State.VF, Part);
9477
9478 for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
9479 Value *Idx = State.Builder.CreateAdd(
9480 PartStart, ConstantInt::get(PtrInd->getType(), Lane));
9481 Value *GlobalIdx = State.Builder.CreateAdd(PtrInd, Idx);
9482
9483 Value *Step = State.get(getOperand(1), VPIteration(0, Part));
9484 Value *SclrGep = emitTransformedIndex(
9485 State.Builder, GlobalIdx, IndDesc.getStartValue(), Step, IndDesc);
9486 SclrGep->setName("next.gep");
9487 State.set(this, SclrGep, VPIteration(Part, Lane));
9488 }
9489 }
9490 return;
9491 }
9492
9493 assert(isa<SCEVConstant>(IndDesc.getStep()) &&
9494 "Induction step not a SCEV constant!");
9495 Type *PhiType = IndDesc.getStep()->getType();
9496
9497 // Build a pointer phi
9498 Value *ScalarStartValue = getStartValue()->getLiveInIRValue();
9499 Type *ScStValueType = ScalarStartValue->getType();
9500 PHINode *NewPointerPhi =
9501 PHINode::Create(ScStValueType, 2, "pointer.phi", CanonicalIV);
9502
9503 BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);
9504 NewPointerPhi->addIncoming(ScalarStartValue, VectorPH);
9505
9506 // A pointer induction, performed by using a gep
9507 Instruction *InductionLoc = &*State.Builder.GetInsertPoint();
9508
9509 Value *ScalarStepValue = State.get(getOperand(1), VPIteration(0, 0));
9510 Value *RuntimeVF = getRuntimeVF(State.Builder, PhiType, State.VF);
9511 Value *NumUnrolledElems =
9512 State.Builder.CreateMul(RuntimeVF, ConstantInt::get(PhiType, State.UF));
9513 Value *InductionGEP = GetElementPtrInst::Create(
9514 IndDesc.getElementType(), NewPointerPhi,
9515 State.Builder.CreateMul(ScalarStepValue, NumUnrolledElems), "ptr.ind",
9516 InductionLoc);
9517 // Add induction update using an incorrect block temporarily. The phi node
9518 // will be fixed after VPlan execution. Note that at this point the latch
9519 // block cannot be used, as it does not exist yet.
9520 // TODO: Model increment value in VPlan, by turning the recipe into a
9521 // multi-def and a subclass of VPHeaderPHIRecipe.
9522 NewPointerPhi->addIncoming(InductionGEP, VectorPH);
9523
9524 // Create UF many actual address geps that use the pointer
9525 // phi as base and a vectorized version of the step value
9526 // (<step*0, ..., step*N>) as offset.
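  // For illustration, with VF = 4 and UF = 2 the per-part offsets (before
  // scaling by the scalar step) are:
  //   part 0: <0, 1, 2, 3>
  //   part 1: <4, 5, 6, 7>
  // i.e. splat(RuntimeVF * Part) plus a step vector.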
9527 for (unsigned Part = 0; Part < State.UF; ++Part) {
9528 Type *VecPhiType = VectorType::get(PhiType, State.VF);
9529 Value *StartOffsetScalar =
9530 State.Builder.CreateMul(RuntimeVF, ConstantInt::get(PhiType, Part));
9531 Value *StartOffset =
9532 State.Builder.CreateVectorSplat(State.VF, StartOffsetScalar);
9533 // Create a vector of consecutive numbers from zero to VF.
9534 StartOffset = State.Builder.CreateAdd(
9535 StartOffset, State.Builder.CreateStepVector(VecPhiType));
9536
9537 assert(ScalarStepValue == State.get(getOperand(1), VPIteration(0, Part)) &&
9538 "scalar step must be the same across all parts");
9539 Value *GEP = State.Builder.CreateGEP(
9540 IndDesc.getElementType(), NewPointerPhi,
9541 State.Builder.CreateMul(
9542 StartOffset,
9543 State.Builder.CreateVectorSplat(State.VF, ScalarStepValue),
9544 "vector.gep"));
9545 State.set(this, GEP, Part);
9546 }
9547 }
9548
9549 void VPDerivedIVRecipe::execute(VPTransformState &State) {
9550 assert(!State.Instance && "VPDerivedIVRecipe being replicated.");
9551
9552 // Fast-math-flags propagate from the original induction instruction.
9553 IRBuilder<>::FastMathFlagGuard FMFG(State.Builder);
9554 if (IndDesc.getInductionBinOp() &&
9555 isa<FPMathOperator>(IndDesc.getInductionBinOp()))
9556 State.Builder.setFastMathFlags(
9557 IndDesc.getInductionBinOp()->getFastMathFlags());
9558
9559 Value *Step = State.get(getStepValue(), VPIteration(0, 0));
9560 Value *CanonicalIV = State.get(getCanonicalIV(), VPIteration(0, 0));
9561 Value *DerivedIV =
9562 emitTransformedIndex(State.Builder, CanonicalIV,
9563 getStartValue()->getLiveInIRValue(), Step, IndDesc);
9564 DerivedIV->setName("offset.idx");
9565 if (ResultTy != DerivedIV->getType()) {
9566 assert(Step->getType()->isIntegerTy() &&
9567 "Truncation requires an integer step");
9568 DerivedIV = State.Builder.CreateTrunc(DerivedIV, ResultTy);
9569 }
9570 assert(DerivedIV != CanonicalIV && "IV didn't need transforming?");
9571
9572 State.set(this, DerivedIV, VPIteration(0, 0));
9573 }
9574
9575 void VPScalarIVStepsRecipe::execute(VPTransformState &State) {
9576 // Fast-math-flags propagate from the original induction instruction.
9577 IRBuilder<>::FastMathFlagGuard FMFG(State.Builder);
9578 if (IndDesc.getInductionBinOp() &&
9579 isa<FPMathOperator>(IndDesc.getInductionBinOp()))
9580 State.Builder.setFastMathFlags(
9581 IndDesc.getInductionBinOp()->getFastMathFlags());
9582
9583 Value *BaseIV = State.get(getOperand(0), VPIteration(0, 0));
9584 Value *Step = State.get(getStepValue(), VPIteration(0, 0));
9585
9586 buildScalarSteps(BaseIV, Step, IndDesc, this, State);
9587 }
9588
9589 void VPInterleaveRecipe::execute(VPTransformState &State) {
9590 assert(!State.Instance && "Interleave group being replicated.");
9591 State.ILV->vectorizeInterleaveGroup(IG, definedValues(), State, getAddr(),
9592 getStoredValues(), getMask());
9593 }
9594
9595 void VPReductionRecipe::execute(VPTransformState &State) {
9596 assert(!State.Instance && "Reduction being replicated.");
9597 Value *PrevInChain = State.get(getChainOp(), 0);
9598 RecurKind Kind = RdxDesc->getRecurrenceKind();
9599 bool IsOrdered = State.ILV->useOrderedReductions(*RdxDesc);
9600 // Propagate the fast-math flags carried by the underlying instruction.
9601 IRBuilderBase::FastMathFlagGuard FMFGuard(State.Builder);
9602 State.Builder.setFastMathFlags(RdxDesc->getFastMathFlags());
9603 for (unsigned Part = 0; Part < State.UF; ++Part) {
9604 Value *NewVecOp = State.get(getVecOp(), Part);
9605 if (VPValue *Cond = getCondOp()) {
9606 Value *NewCond = State.get(Cond, Part);
9607 VectorType *VecTy = cast<VectorType>(NewVecOp->getType());
9608 Value *Iden = RdxDesc->getRecurrenceIdentity(
9609 Kind, VecTy->getElementType(), RdxDesc->getFastMathFlags());
9610 Value *IdenVec =
9611 State.Builder.CreateVectorSplat(VecTy->getElementCount(), Iden);
9612 Value *Select = State.Builder.CreateSelect(NewCond, NewVecOp, IdenVec);
9613 NewVecOp = Select;
9614 }
9615 Value *NewRed;
9616 Value *NextInChain;
9617 if (IsOrdered) {
9618 if (State.VF.isVector())
9619 NewRed = createOrderedReduction(State.Builder, *RdxDesc, NewVecOp,
9620 PrevInChain);
9621 else
9622 NewRed = State.Builder.CreateBinOp(
9623 (Instruction::BinaryOps)RdxDesc->getOpcode(Kind), PrevInChain,
9624 NewVecOp);
9625 PrevInChain = NewRed;
9626 } else {
9627 PrevInChain = State.get(getChainOp(), Part);
9628 NewRed = createTargetReduction(State.Builder, TTI, *RdxDesc, NewVecOp);
9629 }
9630 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) {
9631 NextInChain =
9632 createMinMaxOp(State.Builder, RdxDesc->getRecurrenceKind(),
9633 NewRed, PrevInChain);
9634 } else if (IsOrdered)
9635 NextInChain = NewRed;
9636 else
9637 NextInChain = State.Builder.CreateBinOp(
9638 (Instruction::BinaryOps)RdxDesc->getOpcode(Kind), NewRed,
9639 PrevInChain);
9640 State.set(this, NextInChain, Part);
9641 }
9642 }
9643
9644 void VPReplicateRecipe::execute(VPTransformState &State) {
9645 Instruction *UI = getUnderlyingInstr();
9646 if (State.Instance) { // Generate a single instance.
9647 assert(!State.VF.isScalable() && "Can't scalarize a scalable vector");
9648 State.ILV->scalarizeInstruction(UI, this, *State.Instance,
9649 IsPredicated, State);
9650 // Insert scalar instance packing it into a vector.
9651 if (AlsoPack && State.VF.isVector()) {
9652 // If we're constructing lane 0, initialize to start from poison.
9653 if (State.Instance->Lane.isFirstLane()) {
9654 assert(!State.VF.isScalable() && "VF is assumed to be non scalable.");
9655 Value *Poison = PoisonValue::get(
9656 VectorType::get(UI->getType(), State.VF));
9657 State.set(this, Poison, State.Instance->Part);
9658 }
9659 State.ILV->packScalarIntoVectorValue(this, *State.Instance, State);
9660 }
9661 return;
9662 }
9663
9664 if (IsUniform) {
9665 // If the recipe is uniform across all parts (instead of just per VF), only
9666 // generate a single instance.
9667 if ((isa<LoadInst>(UI) || isa<StoreInst>(UI)) &&
9668 all_of(operands(), [](VPValue *Op) {
9669 return Op->isDefinedOutsideVectorRegions();
9670 })) {
9671 State.ILV->scalarizeInstruction(UI, this, VPIteration(0, 0), IsPredicated,
9672 State);
9673 if (user_begin() != user_end()) {
9674 for (unsigned Part = 1; Part < State.UF; ++Part)
9675 State.set(this, State.get(this, VPIteration(0, 0)),
9676 VPIteration(Part, 0));
9677 }
9678 return;
9679 }
9680
9681 // Uniform within VL means we need to generate lane 0 only for each
9682 // unrolled copy.
9683 for (unsigned Part = 0; Part < State.UF; ++Part)
9684 State.ILV->scalarizeInstruction(UI, this, VPIteration(Part, 0),
9685 IsPredicated, State);
9686 return;
9687 }
9688
9689   // A store of a loop-varying value to a loop-invariant address needs only the
9690   // last copy of the store.
9691 if (isa<StoreInst>(UI) && !getOperand(1)->hasDefiningRecipe()) {
9692 auto Lane = VPLane::getLastLaneForVF(State.VF);
9693 State.ILV->scalarizeInstruction(UI, this, VPIteration(State.UF - 1, Lane), IsPredicated,
9694 State);
9695 return;
9696 }
9697
9698 // Generate scalar instances for all VF lanes of all UF parts.
9699 assert(!State.VF.isScalable() && "Can't scalarize a scalable vector");
9700 const unsigned EndLane = State.VF.getKnownMinValue();
9701 for (unsigned Part = 0; Part < State.UF; ++Part)
9702 for (unsigned Lane = 0; Lane < EndLane; ++Lane)
9703 State.ILV->scalarizeInstruction(UI, this, VPIteration(Part, Lane),
9704 IsPredicated, State);
9705 }
9706
9707 void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) {
9708 VPValue *StoredValue = isStore() ? getStoredValue() : nullptr;
9709
9710 // Attempt to issue a wide load.
9711 LoadInst *LI = dyn_cast<LoadInst>(&Ingredient);
9712 StoreInst *SI = dyn_cast<StoreInst>(&Ingredient);
9713
9714 assert((LI || SI) && "Invalid Load/Store instruction");
9715 assert((!SI || StoredValue) && "No stored value provided for widened store");
9716 assert((!LI || !StoredValue) && "Stored value provided for widened load");
9717
9718 Type *ScalarDataTy = getLoadStoreType(&Ingredient);
9719
9720 auto *DataTy = VectorType::get(ScalarDataTy, State.VF);
9721 const Align Alignment = getLoadStoreAlignment(&Ingredient);
9722 bool CreateGatherScatter = !Consecutive;
9723
9724 auto &Builder = State.Builder;
9725 InnerLoopVectorizer::VectorParts BlockInMaskParts(State.UF);
9726 bool isMaskRequired = getMask();
9727 if (isMaskRequired)
9728 for (unsigned Part = 0; Part < State.UF; ++Part)
9729 BlockInMaskParts[Part] = State.get(getMask(), Part);
9730
9731 const auto CreateVecPtr = [&](unsigned Part, Value *Ptr) -> Value * {
9732 // Calculate the pointer for the specific unroll-part.
9733 GetElementPtrInst *PartPtr = nullptr;
9734
9735 bool InBounds = false;
9736 if (auto *gep = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts()))
9737 InBounds = gep->isInBounds();
9738 if (Reverse) {
9739 // If the address is consecutive but reversed, then the
9740 // wide store needs to start at the last vector element.
9741 // RunTimeVF = VScale * VF.getKnownMinValue()
9742 // For fixed-width VScale is 1, then RunTimeVF = VF.getKnownMinValue()
9743 Value *RunTimeVF = getRuntimeVF(Builder, Builder.getInt32Ty(), State.VF);
9744 // NumElt = -Part * RunTimeVF
9745 Value *NumElt = Builder.CreateMul(Builder.getInt32(-Part), RunTimeVF);
9746 // LastLane = 1 - RunTimeVF
9747 Value *LastLane = Builder.CreateSub(Builder.getInt32(1), RunTimeVF);
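      // For illustration, with a fixed VF = 4 (RunTimeVF = 4) and Part = 1:
      // NumElt = -4 and LastLane = -3, so the part pointer ends up at Ptr - 7
      // and the wide access covers elements Ptr[-7..-4], which are then
      // reversed to match the backwards iteration order of the scalar loop.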
9748 PartPtr =
9749 cast<GetElementPtrInst>(Builder.CreateGEP(ScalarDataTy, Ptr, NumElt));
9750 PartPtr->setIsInBounds(InBounds);
9751 PartPtr = cast<GetElementPtrInst>(
9752 Builder.CreateGEP(ScalarDataTy, PartPtr, LastLane));
9753 PartPtr->setIsInBounds(InBounds);
9754 if (isMaskRequired) // Reverse of a null all-one mask is a null mask.
9755 BlockInMaskParts[Part] =
9756 Builder.CreateVectorReverse(BlockInMaskParts[Part], "reverse");
9757 } else {
9758 Value *Increment =
9759 createStepForVF(Builder, Builder.getInt32Ty(), State.VF, Part);
9760 PartPtr = cast<GetElementPtrInst>(
9761 Builder.CreateGEP(ScalarDataTy, Ptr, Increment));
9762 PartPtr->setIsInBounds(InBounds);
9763 }
9764
9765 unsigned AddressSpace = Ptr->getType()->getPointerAddressSpace();
9766 return Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace));
9767 };
9768
9769 // Handle Stores:
9770 if (SI) {
9771 State.setDebugLocFromInst(SI);
9772
9773 for (unsigned Part = 0; Part < State.UF; ++Part) {
9774 Instruction *NewSI = nullptr;
9775 Value *StoredVal = State.get(StoredValue, Part);
9776 if (CreateGatherScatter) {
9777 Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr;
9778 Value *VectorGep = State.get(getAddr(), Part);
9779 NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep, Alignment,
9780 MaskPart);
9781 } else {
9782 if (Reverse) {
9783 // If we store to reverse consecutive memory locations, then we need
9784 // to reverse the order of elements in the stored value.
9785 StoredVal = Builder.CreateVectorReverse(StoredVal, "reverse");
9786 // We don't want to update the value in the map as it might be used in
9787 // another expression. So don't call resetVectorValue(StoredVal).
9788 }
9789 auto *VecPtr =
9790 CreateVecPtr(Part, State.get(getAddr(), VPIteration(0, 0)));
9791 if (isMaskRequired)
9792 NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr, Alignment,
9793 BlockInMaskParts[Part]);
9794 else
9795 NewSI = Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment);
9796 }
9797 State.addMetadata(NewSI, SI);
9798 }
9799 return;
9800 }
9801
9802 // Handle loads.
9803 assert(LI && "Must have a load instruction");
9804 State.setDebugLocFromInst(LI);
9805 for (unsigned Part = 0; Part < State.UF; ++Part) {
9806 Value *NewLI;
9807 if (CreateGatherScatter) {
9808 Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr;
9809 Value *VectorGep = State.get(getAddr(), Part);
9810 NewLI = Builder.CreateMaskedGather(DataTy, VectorGep, Alignment, MaskPart,
9811 nullptr, "wide.masked.gather");
9812 State.addMetadata(NewLI, LI);
9813 } else {
9814 auto *VecPtr =
9815 CreateVecPtr(Part, State.get(getAddr(), VPIteration(0, 0)));
9816 if (isMaskRequired)
9817 NewLI = Builder.CreateMaskedLoad(
9818 DataTy, VecPtr, Alignment, BlockInMaskParts[Part],
9819 PoisonValue::get(DataTy), "wide.masked.load");
9820 else
9821 NewLI =
9822 Builder.CreateAlignedLoad(DataTy, VecPtr, Alignment, "wide.load");
9823
9824 // Add metadata to the load, but setVectorValue to the reverse shuffle.
9825 State.addMetadata(NewLI, LI);
9826 if (Reverse)
9827 NewLI = Builder.CreateVectorReverse(NewLI, "reverse");
9828 }
9829
9830 State.set(getVPSingleValue(), NewLI, Part);
9831 }
9832 }
9833
9834 // Determine how to lower the scalar epilogue, which depends on 1) optimising
9835 // for minimum code-size, 2) predicate compiler options, 3) loop hints forcing
9836 // predication, and 4) a TTI hook that analyses whether the loop is suitable
9837 // for predication.
9838 static ScalarEpilogueLowering getScalarEpilogueLowering(
9839 Function *F, Loop *L, LoopVectorizeHints &Hints, ProfileSummaryInfo *PSI,
9840 BlockFrequencyInfo *BFI, TargetTransformInfo *TTI, TargetLibraryInfo *TLI,
9841 AssumptionCache *AC, LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT,
9842 LoopVectorizationLegality &LVL, InterleavedAccessInfo *IAI) {
9843 // 1) OptSize takes precedence over all other options, i.e. if this is set,
9844 // don't look at hints or options, and don't request a scalar epilogue.
9845 // (For PGSO, as shouldOptimizeForSize isn't currently accessible from
9846 // LoopAccessInfo (due to code dependency and not being able to reliably get
9847 // PSI/BFI from a loop analysis under NPM), we cannot suppress the collection
9848 // of strides in LoopAccessInfo::analyzeLoop() and vectorize without
9849 // versioning when the vectorization is forced, unlike hasOptSize. So revert
9850 // back to the old way and vectorize with versioning when forced. See D81345.)
9851 if (F->hasOptSize() || (llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI,
9852 PGSOQueryType::IRPass) &&
9853 Hints.getForce() != LoopVectorizeHints::FK_Enabled))
9854 return CM_ScalarEpilogueNotAllowedOptSize;
9855
9856 // 2) If set, obey the directives
9857 if (PreferPredicateOverEpilogue.getNumOccurrences()) {
9858 switch (PreferPredicateOverEpilogue) {
9859 case PreferPredicateTy::ScalarEpilogue:
9860 return CM_ScalarEpilogueAllowed;
9861 case PreferPredicateTy::PredicateElseScalarEpilogue:
9862 return CM_ScalarEpilogueNotNeededUsePredicate;
9863 case PreferPredicateTy::PredicateOrDontVectorize:
9864 return CM_ScalarEpilogueNotAllowedUsePredicate;
9865 };
9866 }
9867
9868 // 3) If set, obey the hints
9869 switch (Hints.getPredicate()) {
9870 case LoopVectorizeHints::FK_Enabled:
9871 return CM_ScalarEpilogueNotNeededUsePredicate;
9872 case LoopVectorizeHints::FK_Disabled:
9873 return CM_ScalarEpilogueAllowed;
9874 };
9875
9876 // 4) if the TTI hook indicates this is profitable, request predication.
9877 if (TTI->preferPredicateOverEpilogue(L, LI, *SE, *AC, TLI, DT, &LVL, IAI))
9878 return CM_ScalarEpilogueNotNeededUsePredicate;
9879
9880 return CM_ScalarEpilogueAllowed;
9881 }
9882
9883 Value *VPTransformState::get(VPValue *Def, unsigned Part) {
9884 // If Values have been set for this Def return the one relevant for \p Part.
9885 if (hasVectorValue(Def, Part))
9886 return Data.PerPartOutput[Def][Part];
9887
9888 if (!hasScalarValue(Def, {Part, 0})) {
9889 Value *IRV = Def->getLiveInIRValue();
9890 Value *B = ILV->getBroadcastInstrs(IRV);
9891 set(Def, B, Part);
9892 return B;
9893 }
9894
9895 Value *ScalarValue = get(Def, {Part, 0});
9896 // If we aren't vectorizing, we can just copy the scalar map values over
9897 // to the vector map.
9898 if (VF.isScalar()) {
9899 set(Def, ScalarValue, Part);
9900 return ScalarValue;
9901 }
9902
9903 bool IsUniform = vputils::isUniformAfterVectorization(Def);
9904
9905 unsigned LastLane = IsUniform ? 0 : VF.getKnownMinValue() - 1;
9906 // Check if there is a scalar value for the selected lane.
9907 if (!hasScalarValue(Def, {Part, LastLane})) {
9908 // At the moment, VPWidenIntOrFpInductionRecipes and VPScalarIVStepsRecipes can also be uniform.
9909 assert((isa<VPWidenIntOrFpInductionRecipe>(Def->getDefiningRecipe()) ||
9910 isa<VPScalarIVStepsRecipe>(Def->getDefiningRecipe())) &&
9911 "unexpected recipe found to be invariant");
9912 IsUniform = true;
9913 LastLane = 0;
9914 }
9915
9916 auto *LastInst = cast<Instruction>(get(Def, {Part, LastLane}));
9917 // Set the insert point after the last scalarized instruction or after the
9918 // last PHI, if LastInst is a PHI. This ensures the insertelement sequence
9919 // will directly follow the scalar definitions.
9920 auto OldIP = Builder.saveIP();
9921 auto NewIP =
9922 isa<PHINode>(LastInst)
9923 ? BasicBlock::iterator(LastInst->getParent()->getFirstNonPHI())
9924 : std::next(BasicBlock::iterator(LastInst));
9925 Builder.SetInsertPoint(&*NewIP);
9926
9927 // However, if we are vectorizing, we need to construct the vector values.
9928 // If the value is known to be uniform after vectorization, we can just
9929 // broadcast the scalar value corresponding to lane zero for each unroll
9930 // iteration. Otherwise, we construct the vector values using
9931 // insertelement instructions. Since the resulting vectors are stored in
9932 // State, we will only generate the insertelements once.
9933 Value *VectorValue = nullptr;
9934 if (IsUniform) {
9935 VectorValue = ILV->getBroadcastInstrs(ScalarValue);
9936 set(Def, VectorValue, Part);
9937 } else {
9938 // Initialize packing with insertelements to start from undef.
9939 assert(!VF.isScalable() && "VF is assumed to be non scalable.");
9940 Value *Undef = PoisonValue::get(VectorType::get(LastInst->getType(), VF));
9941 set(Def, Undef, Part);
9942 for (unsigned Lane = 0; Lane < VF.getKnownMinValue(); ++Lane)
9943 ILV->packScalarIntoVectorValue(Def, {Part, Lane}, *this);
9944 VectorValue = get(Def, Part);
9945 }
9946 Builder.restoreIP(OldIP);
9947 return VectorValue;
9948 }
9949
9950 // Process the loop in the VPlan-native vectorization path. This path builds
9951 // VPlan upfront in the vectorization pipeline, which allows to apply
9952 // VPlan-to-VPlan transformations from the very beginning without modifying the
9953 // input LLVM IR.
9954 static bool processLoopInVPlanNativePath(
9955 Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT,
9956 LoopVectorizationLegality *LVL, TargetTransformInfo *TTI,
9957 TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC,
9958 OptimizationRemarkEmitter *ORE, BlockFrequencyInfo *BFI,
9959 ProfileSummaryInfo *PSI, LoopVectorizeHints &Hints,
9960 LoopVectorizationRequirements &Requirements) {
9961
9962 if (isa<SCEVCouldNotCompute>(PSE.getBackedgeTakenCount())) {
9963 LLVM_DEBUG(dbgs() << "LV: cannot compute the outer-loop trip count\n");
9964 return false;
9965 }
9966 assert(EnableVPlanNativePath && "VPlan-native path is disabled.");
9967 Function *F = L->getHeader()->getParent();
9968 InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI());
9969
9970 ScalarEpilogueLowering SEL = getScalarEpilogueLowering(
9971 F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, *LVL, &IAI);
9972
9973 LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F,
9974 &Hints, IAI);
9975 // Use the planner for outer loop vectorization.
9976 // TODO: CM is not used at this point inside the planner. Turn CM into an
9977 // optional argument if we don't need it in the future.
9978 LoopVectorizationPlanner LVP(L, LI, TLI, TTI, LVL, CM, IAI, PSE, Hints, ORE);
9979
9980 // Get user vectorization factor.
9981 ElementCount UserVF = Hints.getWidth();
9982
9983 CM.collectElementTypesForWidening();
9984
9985 // Plan how to best vectorize, return the best VF and its cost.
9986 const VectorizationFactor VF = LVP.planInVPlanNativePath(UserVF);
9987
9988 // If we are stress testing VPlan builds, do not attempt to generate vector
9989 // code. Masked vector code generation support will follow soon.
9990 // Also, do not attempt to vectorize if no vector code will be produced.
9991 if (VPlanBuildStressTest || VectorizationFactor::Disabled() == VF)
9992 return false;
9993
9994 VPlan &BestPlan = LVP.getBestPlanFor(VF.Width);
9995
9996 {
9997 GeneratedRTChecks Checks(*PSE.getSE(), DT, LI, TTI,
9998 F->getParent()->getDataLayout());
9999 InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width,
10000 VF.Width, 1, LVL, &CM, BFI, PSI, Checks);
10001 LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \""
10002 << L->getHeader()->getParent()->getName() << "\"\n");
10003 LVP.executePlan(VF.Width, 1, BestPlan, LB, DT, false);
10004 }
10005
10006 // Mark the loop as already vectorized to avoid vectorizing again.
10007 Hints.setAlreadyVectorized();
10008 assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs()));
10009 return true;
10010 }
10011
10012 // Emit a remark if there are stores to floats that required a floating point
10013 // extension. If the vectorized loop was generated with double precision there
10014 // will be a performance penalty from the conversion overhead and the change in
10015 // the vector width.
10016 static void checkMixedPrecision(Loop *L, OptimizationRemarkEmitter *ORE) {
10017 SmallVector<Instruction *, 4> Worklist;
10018 for (BasicBlock *BB : L->getBlocks()) {
10019 for (Instruction &Inst : *BB) {
10020 if (auto *S = dyn_cast<StoreInst>(&Inst)) {
10021 if (S->getValueOperand()->getType()->isFloatTy())
10022 Worklist.push_back(S);
10023 }
10024 }
10025 }
10026
10027   // Traverse the floating point stores upwards, searching for floating point
10028   // conversions.
10029 SmallPtrSet<const Instruction *, 4> Visited;
10030 SmallPtrSet<const Instruction *, 4> EmittedRemark;
10031 while (!Worklist.empty()) {
10032 auto *I = Worklist.pop_back_val();
10033 if (!L->contains(I))
10034 continue;
10035 if (!Visited.insert(I).second)
10036 continue;
10037
10038 // Emit a remark if the floating point store required a floating
10039 // point conversion.
10040 // TODO: More work could be done to identify the root cause such as a
10041 // constant or a function return type and point the user to it.
10042 if (isa<FPExtInst>(I) && EmittedRemark.insert(I).second)
10043 ORE->emit([&]() {
10044 return OptimizationRemarkAnalysis(LV_NAME, "VectorMixedPrecision",
10045 I->getDebugLoc(), L->getHeader())
10046 << "floating point conversion changes vector width. "
10047 << "Mixed floating point precision requires an up/down "
10048 << "cast that will negatively impact performance.";
10049 });
10050
10051 for (Use &Op : I->operands())
10052 if (auto *OpI = dyn_cast<Instruction>(Op))
10053 Worklist.push_back(OpI);
10054 }
10055 }
10056
10057 static bool areRuntimeChecksProfitable(GeneratedRTChecks &Checks,
10058 VectorizationFactor &VF,
10059 std::optional<unsigned> VScale, Loop *L,
10060 ScalarEvolution &SE) {
10061 InstructionCost CheckCost = Checks.getCost();
10062 if (!CheckCost.isValid())
10063 return false;
10064
10065   // When interleaving only, the scalar and vector costs will be equal, which in
10066   // turn would lead to a divide by 0. Fall back to the hard threshold.
10067 if (VF.Width.isScalar()) {
10068 if (CheckCost > VectorizeMemoryCheckThreshold) {
10069 LLVM_DEBUG(
10070 dbgs()
10071 << "LV: Interleaving only is not profitable due to runtime checks\n");
10072 return false;
10073 }
10074 return true;
10075 }
10076
10077   // The scalar cost should only be 0 when vectorizing with a user-specified
  // VF/IC; in those cases, runtime checks should always be generated.
10078 double ScalarC = *VF.ScalarCost.getValue();
10079 if (ScalarC == 0)
10080 return true;
10081
10082 // First, compute the minimum iteration count required so that the vector
10083 // loop outperforms the scalar loop.
10084 // The total cost of the scalar loop is
10085 // ScalarC * TC
10086 // where
10087 // * TC is the actual trip count of the loop.
10088 // * ScalarC is the cost of a single scalar iteration.
10089 //
10090 // The total cost of the vector loop is
10091 // RtC + VecC * (TC / VF) + EpiC
10092 // where
10093 // * RtC is the cost of the generated runtime checks
10094 // * VecC is the cost of a single vector iteration.
10095 // * TC is the actual trip count of the loop
10096 // * VF is the vectorization factor
10097   //  * EpiC is the cost of the generated epilogue, including the cost
10098 // of the remaining scalar operations.
10099 //
10100 // Vectorization is profitable once the total vector cost is less than the
10101 // total scalar cost:
10102 // RtC + VecC * (TC / VF) + EpiC < ScalarC * TC
10103 //
10104 // Now we can compute the minimum required trip count TC as
10105 // (RtC + EpiC) / (ScalarC - (VecC / VF)) < TC
10106 //
10107 // For now we assume the epilogue cost EpiC = 0 for simplicity. Note that
10108 // the computations are performed on doubles, not integers and the result
10109 // is rounded up, hence we get an upper estimate of the TC.
10110 unsigned IntVF = VF.Width.getKnownMinValue();
10111 if (VF.Width.isScalable()) {
10112 unsigned AssumedMinimumVscale = 1;
10113 if (VScale)
10114 AssumedMinimumVscale = *VScale;
10115 IntVF *= AssumedMinimumVscale;
10116 }
10117 double VecCOverVF = double(*VF.Cost.getValue()) / IntVF;
10118 double RtC = *CheckCost.getValue();
10119 double MinTC1 = RtC / (ScalarC - VecCOverVF);
10120
10121 // Second, compute a minimum iteration count so that the cost of the
10122 // runtime checks is only a fraction of the total scalar loop cost. This
10123 // adds a loop-dependent bound on the overhead incurred if the runtime
10124 // checks fail. In case the runtime checks fail, the cost is RtC + ScalarC
10125 // * TC. To bound the runtime check to be a fraction 1/X of the scalar
10126 // cost, compute
10127 // RtC < ScalarC * TC * (1 / X) ==> RtC * X / ScalarC < TC
10128 double MinTC2 = RtC * 10 / ScalarC;
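  // For illustration (made-up costs): with ScalarC = 4, VecC = 8, a fixed
  // VF = 4 (so VecC / VF = 2) and RtC = 20:
  //   MinTC1 = 20 / (4 - 2) = 10
  //   MinTC2 = 20 * 10 / 4  = 50
  // so at least 50 iterations (rounded up below to a multiple of VF, i.e. 52)
  // are needed for the runtime checks to pay off.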
10129
10130 // Now pick the larger minimum. If it is not a multiple of VF, choose the
10131 // next closest multiple of VF. This should partly compensate for ignoring
10132 // the epilogue cost.
10133 uint64_t MinTC = std::ceil(std::max(MinTC1, MinTC2));
10134 VF.MinProfitableTripCount = ElementCount::getFixed(alignTo(MinTC, IntVF));
10135
10136 LLVM_DEBUG(
10137 dbgs() << "LV: Minimum required TC for runtime checks to be profitable:"
10138 << VF.MinProfitableTripCount << "\n");
10139
10140 // Skip vectorization if the expected trip count is less than the minimum
10141 // required trip count.
10142 if (auto ExpectedTC = getSmallBestKnownTC(SE, L)) {
10143 if (ElementCount::isKnownLT(ElementCount::getFixed(*ExpectedTC),
10144 VF.MinProfitableTripCount)) {
10145 LLVM_DEBUG(dbgs() << "LV: Vectorization is not beneficial: expected "
10146 "trip count < minimum profitable VF ("
10147 << *ExpectedTC << " < " << VF.MinProfitableTripCount
10148 << ")\n");
10149
10150 return false;
10151 }
10152 }
10153 return true;
10154 }
10155
10156 LoopVectorizePass::LoopVectorizePass(LoopVectorizeOptions Opts)
10157 : InterleaveOnlyWhenForced(Opts.InterleaveOnlyWhenForced ||
10158 !EnableLoopInterleaving),
10159 VectorizeOnlyWhenForced(Opts.VectorizeOnlyWhenForced ||
10160 !EnableLoopVectorization) {}
10161
10162 bool LoopVectorizePass::processLoop(Loop *L) {
10163 assert((EnableVPlanNativePath || L->isInnermost()) &&
10164 "VPlan-native path is not enabled. Only process inner loops.");
10165
10166 #ifndef NDEBUG
10167 const std::string DebugLocStr = getDebugLocString(L);
10168 #endif /* NDEBUG */
10169
10170 LLVM_DEBUG(dbgs() << "\nLV: Checking a loop in '"
10171 << L->getHeader()->getParent()->getName() << "' from "
10172 << DebugLocStr << "\n");
10173
10174 LoopVectorizeHints Hints(L, InterleaveOnlyWhenForced, *ORE, TTI);
10175
10176 LLVM_DEBUG(
10177 dbgs() << "LV: Loop hints:"
10178 << " force="
10179 << (Hints.getForce() == LoopVectorizeHints::FK_Disabled
10180 ? "disabled"
10181 : (Hints.getForce() == LoopVectorizeHints::FK_Enabled
10182 ? "enabled"
10183 : "?"))
10184 << " width=" << Hints.getWidth()
10185 << " interleave=" << Hints.getInterleave() << "\n");
10186
10187 // Function containing loop
10188 Function *F = L->getHeader()->getParent();
10189
10190 // Looking at the diagnostic output is the only way to determine if a loop
10191 // was vectorized (other than looking at the IR or machine code), so it
10192 // is important to generate an optimization remark for each loop. Most of
10193 // these messages are generated as OptimizationRemarkAnalysis. Remarks
10194 // generated as OptimizationRemark and OptimizationRemarkMissed are
10195 // less verbose reporting vectorized loops and unvectorized loops that may
10196 // benefit from vectorization, respectively.
10197
10198 if (!Hints.allowVectorization(F, L, VectorizeOnlyWhenForced)) {
10199 LLVM_DEBUG(dbgs() << "LV: Loop hints prevent vectorization.\n");
10200 return false;
10201 }
10202
10203 PredicatedScalarEvolution PSE(*SE, *L);
10204
10205 // Check if it is legal to vectorize the loop.
10206 LoopVectorizationRequirements Requirements;
10207 LoopVectorizationLegality LVL(L, PSE, DT, TTI, TLI, F, *LAIs, LI, ORE,
10208 &Requirements, &Hints, DB, AC, BFI, PSI);
10209 if (!LVL.canVectorize(EnableVPlanNativePath)) {
10210 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n");
10211 Hints.emitRemarkWithHints();
10212 return false;
10213 }
10214
10215 // Entrance to the VPlan-native vectorization path. Outer loops are processed
10216 // here. They may require CFG and instruction level transformations before
10217 // even evaluating whether vectorization is profitable. Since we cannot modify
10218 // the incoming IR, we need to build VPlan upfront in the vectorization
10219 // pipeline.
10220 if (!L->isInnermost())
10221 return processLoopInVPlanNativePath(L, PSE, LI, DT, &LVL, TTI, TLI, DB, AC,
10222 ORE, BFI, PSI, Hints, Requirements);
10223
10224 assert(L->isInnermost() && "Inner loop expected.");
10225
10226 InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL.getLAI());
10227 bool UseInterleaved = TTI->enableInterleavedAccessVectorization();
10228
10229 // If an override option has been passed in for interleaved accesses, use it.
10230 if (EnableInterleavedMemAccesses.getNumOccurrences() > 0)
10231 UseInterleaved = EnableInterleavedMemAccesses;
10232
10233 // Analyze interleaved memory accesses.
10234 if (UseInterleaved)
10235 IAI.analyzeInterleaving(useMaskedInterleavedAccesses(*TTI));
10236
10237 // Check the function attributes and profiles to find out if this function
10238 // should be optimized for size.
10239 ScalarEpilogueLowering SEL = getScalarEpilogueLowering(
10240 F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, LVL, &IAI);
10241
10242 // Check the loop for a trip count threshold: vectorize loops with a tiny trip
10243 // count by optimizing for size, to minimize overheads.
10244 auto ExpectedTC = getSmallBestKnownTC(*SE, L);
10245 if (ExpectedTC && *ExpectedTC < TinyTripCountVectorThreshold) {
10246 LLVM_DEBUG(dbgs() << "LV: Found a loop with a very small trip count. "
10247 << "This loop is worth vectorizing only if no scalar "
10248 << "iteration overheads are incurred.");
10249 if (Hints.getForce() == LoopVectorizeHints::FK_Enabled)
10250 LLVM_DEBUG(dbgs() << " But vectorizing was explicitly forced.\n");
10251 else {
10252 if (*ExpectedTC > TTI->getMinTripCountTailFoldingThreshold()) {
10253 LLVM_DEBUG(dbgs() << "\n");
10254 SEL = CM_ScalarEpilogueNotAllowedLowTripLoop;
10255 } else {
10256 LLVM_DEBUG(dbgs() << " But the target considers the trip count too "
10257 "small to consider vectorizing.\n");
10258 reportVectorizationFailure(
10259             "The trip count is below the minimal threshold value.",
10260 "loop trip count is too low, avoiding vectorization",
10261 "LowTripCount", ORE, L);
10262 Hints.emitRemarkWithHints();
10263 return false;
10264 }
10265 }
10266 }
10267
10268 // Check the function attributes to see if implicit floats or vectors are
10269 // allowed.
10270 if (F->hasFnAttribute(Attribute::NoImplicitFloat)) {
10271 reportVectorizationFailure(
10272 "Can't vectorize when the NoImplicitFloat attribute is used",
10273 "loop not vectorized due to NoImplicitFloat attribute",
10274 "NoImplicitFloat", ORE, L);
10275 Hints.emitRemarkWithHints();
10276 return false;
10277 }
10278
10279 // Check if the target supports potentially unsafe FP vectorization.
10280 // FIXME: Add a check for the type of safety issue (denormal, signaling)
10281 // for the target we're vectorizing for, to make sure none of the
10282 // additional fp-math flags can help.
10283 if (Hints.isPotentiallyUnsafe() &&
10284 TTI->isFPVectorizationPotentiallyUnsafe()) {
10285 reportVectorizationFailure(
10286 "Potentially unsafe FP op prevents vectorization",
10287 "loop not vectorized due to unsafe FP support.",
10288 "UnsafeFP", ORE, L);
10289 Hints.emitRemarkWithHints();
10290 return false;
10291 }
10292
10293 bool AllowOrderedReductions;
10294 // If the flag is set, use that instead and override the TTI behaviour.
10295 if (ForceOrderedReductions.getNumOccurrences() > 0)
10296 AllowOrderedReductions = ForceOrderedReductions;
10297 else
10298 AllowOrderedReductions = TTI->enableOrderedReductions();
10299 if (!LVL.canVectorizeFPMath(AllowOrderedReductions)) {
10300 ORE->emit([&]() {
10301 auto *ExactFPMathInst = Requirements.getExactFPInst();
10302 return OptimizationRemarkAnalysisFPCommute(DEBUG_TYPE, "CantReorderFPOps",
10303 ExactFPMathInst->getDebugLoc(),
10304 ExactFPMathInst->getParent())
10305 << "loop not vectorized: cannot prove it is safe to reorder "
10306 "floating-point operations";
10307 });
10308 LLVM_DEBUG(dbgs() << "LV: loop not vectorized: cannot prove it is safe to "
10309 "reorder floating-point operations\n");
10310 Hints.emitRemarkWithHints();
10311 return false;
10312 }
10313
10314 // Use the cost model.
10315 LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE,
10316 F, &Hints, IAI);
10317 CM.collectValuesToIgnore();
10318 CM.collectElementTypesForWidening();
10319
10320 // Use the planner for vectorization.
10321 LoopVectorizationPlanner LVP(L, LI, TLI, TTI, &LVL, CM, IAI, PSE, Hints, ORE);
10322
10323 // Get user vectorization factor and interleave count.
10324 ElementCount UserVF = Hints.getWidth();
10325 unsigned UserIC = Hints.getInterleave();
10326
10327 // Plan how to best vectorize, return the best VF and its cost.
10328 std::optional<VectorizationFactor> MaybeVF = LVP.plan(UserVF, UserIC);
10329
10330 VectorizationFactor VF = VectorizationFactor::Disabled();
10331 unsigned IC = 1;
10332
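// Holder for any runtime SCEV/memory checks. The check blocks themselves are
// only materialized via Checks.Create() below, once a vector or interleaved
// plan has actually been selected.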
10333 GeneratedRTChecks Checks(*PSE.getSE(), DT, LI, TTI,
10334 F->getParent()->getDataLayout());
10335 if (MaybeVF) {
10336 VF = *MaybeVF;
10337 // Select the interleave count.
10338 IC = CM.selectInterleaveCount(VF.Width, VF.Cost);
10339
10340 unsigned SelectedIC = std::max(IC, UserIC);
10341 // Optimistically generate runtime checks if they are needed. Drop them if
10342 // they turn out to not be profitable.
10343 if (VF.Width.isVector() || SelectedIC > 1)
10344 Checks.Create(L, *LVL.getLAI(), PSE.getPredicate(), VF.Width, SelectedIC);
10345
10346 // Check if it is profitable to vectorize with runtime checks.
10347 bool ForceVectorization =
10348 Hints.getForce() == LoopVectorizeHints::FK_Enabled;
10349 if (!ForceVectorization &&
10350 !areRuntimeChecksProfitable(Checks, VF, CM.getVScaleForTuning(), L,
10351 *PSE.getSE())) {
10352 ORE->emit([&]() {
10353 return OptimizationRemarkAnalysisAliasing(
10354 DEBUG_TYPE, "CantReorderMemOps", L->getStartLoc(),
10355 L->getHeader())
10356 << "loop not vectorized: cannot prove it is safe to reorder "
10357 "memory operations";
10358 });
10359 LLVM_DEBUG(dbgs() << "LV: Too many memory checks needed.\n");
10360 Hints.emitRemarkWithHints();
10361 return false;
10362 }
10363 }
10364
10365 // Identify the diagnostic messages that should be produced.
10366 std::pair<StringRef, std::string> VecDiagMsg, IntDiagMsg;
10367 bool VectorizeLoop = true, InterleaveLoop = true;
10368 if (VF.Width.isScalar()) {
10369 LLVM_DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n");
10370 VecDiagMsg = std::make_pair(
10371 "VectorizationNotBeneficial",
10372 "the cost-model indicates that vectorization is not beneficial");
10373 VectorizeLoop = false;
10374 }
10375
10376 if (!MaybeVF && UserIC > 1) {
10377 // Tell the user interleaving was avoided up-front, despite being explicitly
10378 // requested.
10379 LLVM_DEBUG(dbgs() << "LV: Ignoring UserIC, because vectorization and "
10380 "interleaving should be avoided up front\n");
10381 IntDiagMsg = std::make_pair(
10382 "InterleavingAvoided",
10383 "Ignoring UserIC, because interleaving was avoided up front");
10384 InterleaveLoop = false;
10385 } else if (IC == 1 && UserIC <= 1) {
10386 // Tell the user interleaving is not beneficial.
10387 LLVM_DEBUG(dbgs() << "LV: Interleaving is not beneficial.\n");
10388 IntDiagMsg = std::make_pair(
10389 "InterleavingNotBeneficial",
10390 "the cost-model indicates that interleaving is not beneficial");
10391 InterleaveLoop = false;
10392 if (UserIC == 1) {
10393 IntDiagMsg.first = "InterleavingNotBeneficialAndDisabled";
10394 IntDiagMsg.second +=
10395 " and is explicitly disabled or interleave count is set to 1";
10396 }
10397 } else if (IC > 1 && UserIC == 1) {
10398 // Tell the user interleaving is beneficial, but it is explicitly disabled.
10399 LLVM_DEBUG(
10400 dbgs() << "LV: Interleaving is beneficial but is explicitly disabled.");
10401 IntDiagMsg = std::make_pair(
10402 "InterleavingBeneficialButDisabled",
10403 "the cost-model indicates that interleaving is beneficial "
10404 "but is explicitly disabled or interleave count is set to 1");
10405 InterleaveLoop = false;
10406 }
10407
10408 // Override IC if user provided an interleave count.
10409 IC = UserIC > 0 ? UserIC : IC;
10410
10411 // Emit diagnostic messages, if any.
10412 const char *VAPassName = Hints.vectorizeAnalysisPassName();
10413 if (!VectorizeLoop && !InterleaveLoop) {
10414 // Do not vectorize or interleave the loop.
10415 ORE->emit([&]() {
10416 return OptimizationRemarkMissed(VAPassName, VecDiagMsg.first,
10417 L->getStartLoc(), L->getHeader())
10418 << VecDiagMsg.second;
10419 });
10420 ORE->emit([&]() {
10421 return OptimizationRemarkMissed(LV_NAME, IntDiagMsg.first,
10422 L->getStartLoc(), L->getHeader())
10423 << IntDiagMsg.second;
10424 });
10425 return false;
10426 } else if (!VectorizeLoop && InterleaveLoop) {
10427 LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
10428 ORE->emit([&]() {
10429 return OptimizationRemarkAnalysis(VAPassName, VecDiagMsg.first,
10430 L->getStartLoc(), L->getHeader())
10431 << VecDiagMsg.second;
10432 });
10433 } else if (VectorizeLoop && !InterleaveLoop) {
10434 LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
10435 << ") in " << DebugLocStr << '\n');
10436 ORE->emit([&]() {
10437 return OptimizationRemarkAnalysis(LV_NAME, IntDiagMsg.first,
10438 L->getStartLoc(), L->getHeader())
10439 << IntDiagMsg.second;
10440 });
10441 } else if (VectorizeLoop && InterleaveLoop) {
10442 LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
10443 << ") in " << DebugLocStr << '\n');
10444 LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
10445 }
10446
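// Remember the original loop metadata; it is consulted below when forming the
// follow-up metadata for the remaining scalar loop after the transformation.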
10447 bool DisableRuntimeUnroll = false;
10448 MDNode *OrigLoopID = L->getLoopID();
10449 {
10450 using namespace ore;
10451 if (!VectorizeLoop) {
10452 assert(IC > 1 && "interleave count should not be 1 or 0");
10453 // If we decided that it is not profitable to vectorize the loop, then
10454 // interleave it instead.
10455 InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL,
10456 &CM, BFI, PSI, Checks);
10457
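// Execute the best VPlan for the scalar VF; the interleave count IC is used
// as the unroll factor by the InnerLoopUnroller.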
10458 VPlan &BestPlan = LVP.getBestPlanFor(VF.Width);
10459 LVP.executePlan(VF.Width, IC, BestPlan, Unroller, DT, false);
10460
10461 ORE->emit([&]() {
10462 return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(),
10463 L->getHeader())
10464 << "interleaved loop (interleaved count: "
10465 << NV("InterleaveCount", IC) << ")";
10466 });
10467 } else {
10468 // If we decided that it is *profitable* to vectorize the loop, then do it.
10469
10470 // Consider vectorizing the epilogue too if it's profitable.
10471 VectorizationFactor EpilogueVF =
10472 CM.selectEpilogueVectorizationFactor(VF.Width, LVP);
10473 if (EpilogueVF.Width.isVector()) {
10474
10475 // The first pass vectorizes the main loop and creates a scalar epilogue
10476 // to be vectorized by executing the plan (potentially with a different
10477 // factor) again shortly afterwards.
10478 EpilogueLoopVectorizationInfo EPI(VF.Width, IC, EpilogueVF.Width, 1);
10479 EpilogueVectorizerMainLoop MainILV(L, PSE, LI, DT, TLI, TTI, AC, ORE,
10480 EPI, &LVL, &CM, BFI, PSI, Checks);
10481
10482 VPlan &BestMainPlan = LVP.getBestPlanFor(EPI.MainLoopVF);
10483 LVP.executePlan(EPI.MainLoopVF, EPI.MainLoopUF, BestMainPlan, MainILV,
10484 DT, true);
10485 ++LoopsVectorized;
10486
10487 // Second pass vectorizes the epilogue and adjusts the control flow
10488 // edges from the first pass.
10489 EPI.MainLoopVF = EPI.EpilogueVF;
10490 EPI.MainLoopUF = EPI.EpilogueUF;
10491 EpilogueVectorizerEpilogueLoop EpilogILV(L, PSE, LI, DT, TLI, TTI, AC,
10492 ORE, EPI, &LVL, &CM, BFI, PSI,
10493 Checks);
10494
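// Fetch the plan selected for the epilogue VF and rename its vector loop
// header so the epilogue blocks are distinguishable from the main vector loop.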
10495 VPlan &BestEpiPlan = LVP.getBestPlanFor(EPI.EpilogueVF);
10496 VPRegionBlock *VectorLoop = BestEpiPlan.getVectorLoopRegion();
10497 VPBasicBlock *Header = VectorLoop->getEntryBasicBlock();
10498 Header->setName("vec.epilog.vector.body");
10499
10500 // Ensure that the start values for any VPWidenIntOrFpInductionRecipe,
10501 // VPWidenPointerInductionRecipe and VPReductionPHIRecipes are updated
10502 // before vectorizing the epilogue loop.
10503 for (VPRecipeBase &R : Header->phis()) {
10504 if (isa<VPCanonicalIVPHIRecipe>(&R))
10505 continue;
10506
10507 Value *ResumeV = nullptr;
10508 // TODO: Move setting of resume values to prepareToExecute.
10509 if (auto *ReductionPhi = dyn_cast<VPReductionPHIRecipe>(&R)) {
10510 ResumeV = MainILV.getReductionResumeValue(
10511 ReductionPhi->getRecurrenceDescriptor());
10512 } else {
10513 // Create induction resume values for both widened pointer and
10514 // integer/fp inductions and update the start value of the induction
10515 // recipes to use the resume value.
10516 PHINode *IndPhi = nullptr;
10517 const InductionDescriptor *ID;
10518 if (auto *Ind = dyn_cast<VPWidenPointerInductionRecipe>(&R)) {
10519 IndPhi = cast<PHINode>(Ind->getUnderlyingValue());
10520 ID = &Ind->getInductionDescriptor();
10521 } else {
10522 auto *WidenInd = cast<VPWidenIntOrFpInductionRecipe>(&R);
10523 IndPhi = WidenInd->getPHINode();
10524 ID = &WidenInd->getInductionDescriptor();
10525 }
10526
10527 ResumeV = MainILV.createInductionResumeValue(
10528 IndPhi, *ID, {EPI.MainLoopIterationCountCheck});
10529 }
10530 assert(ResumeV && "Must have a resume value");
10531 VPValue *StartVal = BestEpiPlan.getOrAddExternalDef(ResumeV);
10532 cast<VPHeaderPHIRecipe>(&R)->setStartValue(StartVal);
10533 }
10534
10535 LVP.executePlan(EPI.EpilogueVF, EPI.EpilogueUF, BestEpiPlan, EpilogILV,
10536 DT, true);
10537 ++LoopsEpilogueVectorized;
10538
10539 if (!MainILV.areSafetyChecksAdded())
10540 DisableRuntimeUnroll = true;
10541 } else {
10542 InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width,
10543 VF.MinProfitableTripCount, IC, &LVL, &CM, BFI,
10544 PSI, Checks);
10545
10546 VPlan &BestPlan = LVP.getBestPlanFor(VF.Width);
10547 LVP.executePlan(VF.Width, IC, BestPlan, LB, DT, false);
10548 ++LoopsVectorized;
10549
10550 // Add metadata to disable runtime unrolling a scalar loop when there
10551 // are no runtime checks about strides and memory. A scalar loop that is
10552 // rarely used is not worth unrolling.
10553 if (!LB.areSafetyChecksAdded())
10554 DisableRuntimeUnroll = true;
10555 }
10556 // Report the vectorization decision.
10557 ORE->emit([&]() {
10558 return OptimizationRemark(LV_NAME, "Vectorized", L->getStartLoc(),
10559 L->getHeader())
10560 << "vectorized loop (vectorization width: "
10561 << NV("VectorizationFactor", VF.Width)
10562 << ", interleaved count: " << NV("InterleaveCount", IC) << ")";
10563 });
10564 }
10565
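// When extra analysis remarks are enabled, also report mixed floating-point
// precision patterns in the loop that may pessimize the generated vector code.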
10566 if (ORE->allowExtraAnalysis(LV_NAME))
10567 checkMixedPrecision(L, ORE);
10568 }
10569
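// Transfer any user-provided follow-up metadata (FollowupAll/FollowupEpilogue)
// from the original loop to the remaining scalar loop.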
10570 std::optional<MDNode *> RemainderLoopID =
10571 makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
10572 LLVMLoopVectorizeFollowupEpilogue});
10573 if (RemainderLoopID) {
10574 L->setLoopID(*RemainderLoopID);
10575 } else {
10576 if (DisableRuntimeUnroll)
10577 AddRuntimeUnrollDisableMetaData(L);
10578
10579 // Mark the loop as already vectorized to avoid vectorizing again.
10580 Hints.setAlreadyVectorized();
10581 }
10582
10583 assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs()));
10584 return true;
10585 }
10586
runImpl(Function & F,ScalarEvolution & SE_,LoopInfo & LI_,TargetTransformInfo & TTI_,DominatorTree & DT_,BlockFrequencyInfo & BFI_,TargetLibraryInfo * TLI_,DemandedBits & DB_,AssumptionCache & AC_,LoopAccessInfoManager & LAIs_,OptimizationRemarkEmitter & ORE_,ProfileSummaryInfo * PSI_)10587 LoopVectorizeResult LoopVectorizePass::runImpl(
10588 Function &F, ScalarEvolution &SE_, LoopInfo &LI_, TargetTransformInfo &TTI_,
10589 DominatorTree &DT_, BlockFrequencyInfo &BFI_, TargetLibraryInfo *TLI_,
10590 DemandedBits &DB_, AssumptionCache &AC_, LoopAccessInfoManager &LAIs_,
10591 OptimizationRemarkEmitter &ORE_, ProfileSummaryInfo *PSI_) {
10592 SE = &SE_;
10593 LI = &LI_;
10594 TTI = &TTI_;
10595 DT = &DT_;
10596 BFI = &BFI_;
10597 TLI = TLI_;
10598 AC = &AC_;
10599 LAIs = &LAIs_;
10600 DB = &DB_;
10601 ORE = &ORE_;
10602 PSI = PSI_;
10603
10604 // Don't attempt if
10605 // 1. the target claims to have no vector registers, and
10606 // 2. interleaving won't help ILP.
10607 //
10608 // The second condition is necessary because, even if the target has no
10609 // vector registers, loop vectorization may still enable scalar
10610 // interleaving.
10611 if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true)) &&
10612 TTI->getMaxInterleaveFactor(1) < 2)
10613 return LoopVectorizeResult(false, false);
10614
10615 bool Changed = false, CFGChanged = false;
10616
10617 // The vectorizer requires loops to be in simplified form.
10618 // Since simplification may add new inner loops, it has to run before the
10619 // legality and profitability checks. This means running the loop vectorizer
10620 // will simplify all loops, regardless of whether anything ends up being
10621 // vectorized.
10622 for (const auto &L : *LI)
10623 Changed |= CFGChanged |=
10624 simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */);
10625
10626 // Build up a worklist of inner-loops to vectorize. This is necessary as
10627 // the act of vectorizing or partially unrolling a loop creates new loops
10628 // and can invalidate iterators across the loops.
10629 SmallVector<Loop *, 8> Worklist;
10630
10631 for (Loop *L : *LI)
10632 collectSupportedLoops(*L, LI, ORE, Worklist);
10633
10634 LoopsAnalyzed += Worklist.size();
10635
10636 // Now walk the identified inner loops.
10637 while (!Worklist.empty()) {
10638 Loop *L = Worklist.pop_back_val();
10639
10640 // For the inner loops we actually process, form LCSSA to simplify the
10641 // transform.
10642 Changed |= formLCSSARecursively(*L, *DT, LI, SE);
10643
10644 Changed |= CFGChanged |= processLoop(L);
10645
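// Any transformation may have invalidated the cached LoopAccessInfo results,
// so drop them before processing the next loop.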
10646 if (Changed)
10647 LAIs->clear();
10648 }
10649
10650 // Process each loop nest in the function.
10651 return LoopVectorizeResult(Changed, CFGChanged);
10652 }
10653
run(Function & F,FunctionAnalysisManager & AM)10654 PreservedAnalyses LoopVectorizePass::run(Function &F,
10655 FunctionAnalysisManager &AM) {
10656 auto &LI = AM.getResult<LoopAnalysis>(F);
10657 // There are no loops in the function. Return before computing other expensive
10658 // analyses.
10659 if (LI.empty())
10660 return PreservedAnalyses::all();
10661 auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
10662 auto &TTI = AM.getResult<TargetIRAnalysis>(F);
10663 auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
10664 auto &BFI = AM.getResult<BlockFrequencyAnalysis>(F);
10665 auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
10666 auto &AC = AM.getResult<AssumptionAnalysis>(F);
10667 auto &DB = AM.getResult<DemandedBitsAnalysis>(F);
10668 auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
10669
10670 LoopAccessInfoManager &LAIs = AM.getResult<LoopAccessAnalysis>(F);
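// ProfileSummaryInfo is a module analysis; from a function pass only a cached
// result can be used, so query it through the module analysis manager proxy.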
10671 auto &MAMProxy = AM.getResult<ModuleAnalysisManagerFunctionProxy>(F);
10672 ProfileSummaryInfo *PSI =
10673 MAMProxy.getCachedResult<ProfileSummaryAnalysis>(*F.getParent());
10674 LoopVectorizeResult Result =
10675 runImpl(F, SE, LI, TTI, DT, BFI, &TLI, DB, AC, LAIs, ORE, PSI);
10676 if (!Result.MadeAnyChange)
10677 return PreservedAnalyses::all();
10678 PreservedAnalyses PA;
10679
10680 // We currently do not preserve loopinfo/dominator analyses with outer loop
10681 // vectorization. Until this is addressed, mark these analyses as preserved
10682 // only for the non-VPlan-native path.
10683 // TODO: Preserve Loop and Dominator analyses for VPlan-native path.
10684 if (!EnableVPlanNativePath) {
10685 PA.preserve<LoopAnalysis>();
10686 PA.preserve<DominatorTreeAnalysis>();
10687 }
10688
10689 if (Result.MadeCFGChange) {
10690 // Making CFG changes likely means a loop got vectorized. Indicate that
10691 // extra simplification passes should be run.
10692 // TODO: MadeCFGChange is not a perfect proxy. Extra passes should only
10693 // be run if runtime checks have been added.
10694 AM.getResult<ShouldRunExtraVectorPasses>(F);
10695 PA.preserve<ShouldRunExtraVectorPasses>();
10696 } else {
10697 PA.preserveSet<CFGAnalyses>();
10698 }
10699 return PA;
10700 }
10701
printPipeline(raw_ostream & OS,function_ref<StringRef (StringRef)> MapClassName2PassName)10702 void LoopVectorizePass::printPipeline(
10703 raw_ostream &OS, function_ref<StringRef(StringRef)> MapClassName2PassName) {
10704 static_cast<PassInfoMixin<LoopVectorizePass> *>(this)->printPipeline(
10705 OS, MapClassName2PassName);
10706
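// Print the pass parameters in the '<...>' pipeline syntax, matching what
// -passes=loop-vectorize<...> accepts.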
10707 OS << "<";
10708 OS << (InterleaveOnlyWhenForced ? "" : "no-") << "interleave-forced-only;";
10709 OS << (VectorizeOnlyWhenForced ? "" : "no-") << "vectorize-forced-only;";
10710 OS << ">";
10711 }
10712