//===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
// and generates target-independent LLVM-IR.
// The vectorizer uses the TargetTransformInfo analysis to estimate the costs
// of instructions in order to estimate the profitability of vectorization.
//
// The loop vectorizer combines consecutive loop iterations into a single
// 'wide' iteration. After this transformation the index is incremented
// by the SIMD vector width, and not by one.
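//
// For illustration only (a conceptual sketch, not actual pass output), a loop
// such as
//
//   for (i = 0; i < n; i++)
//     a[i] = b[i] + c[i];
//
// is conceptually rewritten, with a vectorization factor (VF) of 4, into
//
//   for (i = 0; i + 3 < n; i += 4)      // one 'wide' iteration
//     a[i:i+3] = b[i:i+3] + c[i:i+3];   // <4 x ...> vector operations
//   for (; i < n; i++)                  // scalar epilogue for the remainder
//     a[i] = b[i] + c[i];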
//
// This pass has four parts:
// 1. The main loop pass that drives the different parts.
// 2. LoopVectorizationLegality - A unit that checks for the legality
//    of the vectorization.
// 3. InnerLoopVectorizer - A unit that performs the actual
//    widening of instructions.
// 4. LoopVectorizationCostModel - A unit that checks for the profitability
//    of vectorization. It decides on the optimal vector width, which
//    can be one, if vectorization is not profitable.
//
// There is a development effort going on to migrate the loop vectorizer to the
// VPlan infrastructure and to introduce outer loop vectorization support (see
// docs/Proposal/VectorizationPlan.rst and
// http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html). For this
// purpose, we temporarily introduced the VPlan-native vectorization path: an
// alternative vectorization path that is natively implemented on top of the
// VPlan infrastructure. See EnableVPlanNativePath for enabling.
//
//===----------------------------------------------------------------------===//
//
// The reduction-variable vectorization is based on the paper:
//  D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
//
// Variable uniformity checks are inspired by:
//  Karrenberg, R. and Hack, S. Whole Function Vectorization.
//
// The interleaved access vectorization is based on the paper:
//  Dorit Nuzman, Ira Rosen and Ayal Zaks.  Auto-Vectorization of Interleaved
//  Data for SIMD
//
// Other ideas/concepts are from:
//  A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
//
//  S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua.  An Evaluation of
//  Vectorizing Compilers.
//
//===----------------------------------------------------------------------===//

#include "llvm/Transforms/Vectorize/LoopVectorize.h"
#include "LoopVectorizationPlanner.h"
#include "VPRecipeBuilder.h"
#include "VPlan.h"
#include "VPlanHCFGBuilder.h"
#include "VPlanTransforms.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/DenseMapInfo.h"
#include "llvm/ADT/Hashing.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Twine.h"
#include "llvm/ADT/iterator_range.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/BasicAliasAnalysis.h"
#include "llvm/Analysis/BlockFrequencyInfo.h"
#include "llvm/Analysis/CFG.h"
#include "llvm/Analysis/CodeMetrics.h"
#include "llvm/Analysis/DemandedBits.h"
#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/LoopAccessAnalysis.h"
#include "llvm/Analysis/LoopAnalysisManager.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/LoopIterator.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/ProfileSummaryInfo.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/CFG.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DebugInfoMetadata.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/Metadata.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Operator.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Use.h"
#include "llvm/IR/User.h"
#include "llvm/IR/Value.h"
#include "llvm/IR/ValueHandle.h"
#include "llvm/IR/Verifier.h"
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/InstructionCost.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/InjectTLIMappings.h"
#include "llvm/Transforms/Utils/LoopSimplify.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include "llvm/Transforms/Utils/LoopVersioning.h"
#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
#include "llvm/Transforms/Utils/SizeOpts.h"
#include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
#include <algorithm>
#include <cassert>
#include <cmath>
#include <cstdint>
#include <functional>
#include <iterator>
#include <limits>
#include <map>
#include <memory>
#include <string>
#include <tuple>
#include <utility>

using namespace llvm;

#define LV_NAME "loop-vectorize"
#define DEBUG_TYPE LV_NAME

#ifndef NDEBUG
const char VerboseDebug[] = DEBUG_TYPE "-verbose";
#endif

/// @{
/// Metadata attribute names
const char LLVMLoopVectorizeFollowupAll[] = "llvm.loop.vectorize.followup_all";
const char LLVMLoopVectorizeFollowupVectorized[] =
    "llvm.loop.vectorize.followup_vectorized";
const char LLVMLoopVectorizeFollowupEpilogue[] =
    "llvm.loop.vectorize.followup_epilogue";
/// @}

STATISTIC(LoopsVectorized, "Number of loops vectorized");
STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");
STATISTIC(LoopsEpilogueVectorized, "Number of epilogues vectorized");

static cl::opt<bool> EnableEpilogueVectorization(
    "enable-epilogue-vectorization", cl::init(true), cl::Hidden,
    cl::desc("Enable vectorization of epilogue loops."));

static cl::opt<unsigned> EpilogueVectorizationForceVF(
    "epilogue-vectorization-force-VF", cl::init(1), cl::Hidden,
    cl::desc("When epilogue vectorization is enabled, and a value greater than "
             "1 is specified, forces the given VF for all applicable epilogue "
             "loops."));

static cl::opt<unsigned> EpilogueVectorizationMinVF(
    "epilogue-vectorization-minimum-VF", cl::init(16), cl::Hidden,
    cl::desc("Only loops with vectorization factor equal to or larger than "
             "the specified value are considered for epilogue vectorization."));

/// Loops with a known constant trip count below this number are vectorized only
/// if no scalar iteration overheads are incurred.
static cl::opt<unsigned> TinyTripCountVectorThreshold(
    "vectorizer-min-trip-count", cl::init(16), cl::Hidden,
    cl::desc("Loops with a constant trip count that is smaller than this "
             "value are vectorized only if no scalar iteration overheads "
             "are incurred."));

static cl::opt<unsigned> VectorizeMemoryCheckThreshold(
    "vectorize-memory-check-threshold", cl::init(128), cl::Hidden,
    cl::desc("The maximum allowed number of runtime memory checks"));

// Option prefer-predicate-over-epilogue indicates that an epilogue is
// undesired, that predication is preferred, and this lists all options. I.e.,
// the vectorizer will try to fold the tail-loop (epilogue) into the vector body
// and predicate the instructions accordingly. If tail-folding fails, there are
// different fallback strategies depending on these values:
namespace PreferPredicateTy {
  enum Option {
    ScalarEpilogue = 0,
    PredicateElseScalarEpilogue,
    PredicateOrDontVectorize
  };
} // namespace PreferPredicateTy

static cl::opt<PreferPredicateTy::Option> PreferPredicateOverEpilogue(
    "prefer-predicate-over-epilogue",
    cl::init(PreferPredicateTy::ScalarEpilogue),
    cl::Hidden,
    cl::desc("Tail-folding and predication preferences over creating a scalar "
             "epilogue loop."),
    cl::values(clEnumValN(PreferPredicateTy::ScalarEpilogue,
                          "scalar-epilogue",
                          "Don't tail-predicate loops, create scalar epilogue"),
               clEnumValN(PreferPredicateTy::PredicateElseScalarEpilogue,
                          "predicate-else-scalar-epilogue",
                          "Prefer tail-folding, create scalar epilogue if tail "
                          "folding fails."),
               clEnumValN(PreferPredicateTy::PredicateOrDontVectorize,
                          "predicate-dont-vectorize",
                          "Prefer tail-folding, don't attempt vectorization if "
                          "tail-folding fails.")));

static cl::opt<bool> MaximizeBandwidth(
    "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
    cl::desc("Maximize bandwidth when selecting vectorization factor which "
             "will be determined by the smallest type in loop."));

static cl::opt<bool> EnableInterleavedMemAccesses(
    "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden,
    cl::desc("Enable vectorization on interleaved memory accesses in a loop"));

/// An interleave-group may need masking if it resides in a block that needs
/// predication, or in order to mask away gaps.
static cl::opt<bool> EnableMaskedInterleavedMemAccesses(
    "enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden,
    cl::desc(
        "Enable vectorization on masked interleaved memory accesses in a loop"));

static cl::opt<unsigned> TinyTripCountInterleaveThreshold(
    "tiny-trip-count-interleave-threshold", cl::init(128), cl::Hidden,
    cl::desc("We don't interleave loops with an estimated constant trip count "
             "below this number"));

static cl::opt<unsigned> ForceTargetNumScalarRegs(
    "force-target-num-scalar-regs", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's number of scalar registers."));

static cl::opt<unsigned> ForceTargetNumVectorRegs(
    "force-target-num-vector-regs", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's number of vector registers."));

static cl::opt<unsigned> ForceTargetMaxScalarInterleaveFactor(
    "force-target-max-scalar-interleave", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's max interleave factor for "
             "scalar loops."));

static cl::opt<unsigned> ForceTargetMaxVectorInterleaveFactor(
    "force-target-max-vector-interleave", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's max interleave factor for "
             "vectorized loops."));

static cl::opt<unsigned> ForceTargetInstructionCost(
    "force-target-instruction-cost", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's expected cost for "
             "an instruction to a single constant value. Mostly "
             "useful for getting consistent testing."));

static cl::opt<bool> ForceTargetSupportsScalableVectors(
    "force-target-supports-scalable-vectors", cl::init(false), cl::Hidden,
    cl::desc(
        "Pretend that scalable vectors are supported, even if the target does "
        "not support them. This flag should only be used for testing."));

static cl::opt<unsigned> SmallLoopCost(
    "small-loop-cost", cl::init(20), cl::Hidden,
    cl::desc(
        "The cost of a loop that is considered 'small' by the interleaver."));

static cl::opt<bool> LoopVectorizeWithBlockFrequency(
    "loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden,
    cl::desc("Enable the use of the block frequency analysis to access PGO "
             "heuristics minimizing code growth in cold regions and being more "
             "aggressive in hot regions."));

// Runtime interleave loops for load/store throughput.
static cl::opt<bool> EnableLoadStoreRuntimeInterleave(
    "enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden,
    cl::desc(
        "Enable runtime interleaving until load/store ports are saturated"));

/// Interleave small loops with scalar reductions.
static cl::opt<bool> InterleaveSmallLoopScalarReduction(
    "interleave-small-loop-scalar-reduction", cl::init(false), cl::Hidden,
    cl::desc("Enable interleaving for loops with small iteration counts that "
             "contain scalar reductions to expose ILP."));

/// The number of stores in a loop that are allowed to need predication.
static cl::opt<unsigned> NumberOfStoresToPredicate(
    "vectorize-num-stores-pred", cl::init(1), cl::Hidden,
    cl::desc("Max number of stores to be predicated behind an if."));

static cl::opt<bool> EnableIndVarRegisterHeur(
    "enable-ind-var-reg-heur", cl::init(true), cl::Hidden,
    cl::desc("Count the induction variable only once when interleaving"));

static cl::opt<bool> EnableCondStoresVectorization(
    "enable-cond-stores-vec", cl::init(true), cl::Hidden,
    cl::desc("Enable if-predication of stores during vectorization."));

static cl::opt<unsigned> MaxNestedScalarReductionIC(
    "max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden,
    cl::desc("The maximum interleave count to use when interleaving a scalar "
             "reduction in a nested loop."));

static cl::opt<bool>
    PreferInLoopReductions("prefer-inloop-reductions", cl::init(false),
                           cl::Hidden,
                           cl::desc("Prefer in-loop vector reductions, "
                                    "overriding the target's preference."));

static cl::opt<bool> ForceOrderedReductions(
    "force-ordered-reductions", cl::init(false), cl::Hidden,
    cl::desc("Enable the vectorization of loops with in-order (strict) "
             "FP reductions"));

static cl::opt<bool> PreferPredicatedReductionSelect(
    "prefer-predicated-reduction-select", cl::init(false), cl::Hidden,
    cl::desc(
        "Prefer predicating a reduction operation over an after-loop select."));

cl::opt<bool> EnableVPlanNativePath(
    "enable-vplan-native-path", cl::init(false), cl::Hidden,
    cl::desc("Enable VPlan-native vectorization path with "
             "support for outer loop vectorization."));

// This flag enables the stress testing of the VPlan H-CFG construction in the
// VPlan-native vectorization path. It must be used in conjunction with
// -enable-vplan-native-path. -vplan-verify-hcfg can also be used to enable the
// verification of the H-CFGs built.
static cl::opt<bool> VPlanBuildStressTest(
    "vplan-build-stress-test", cl::init(false), cl::Hidden,
    cl::desc(
        "Build VPlan for every supported loop nest in the function and bail "
        "out right after the build (stress test the VPlan H-CFG construction "
        "in the VPlan-native vectorization path)."));

cl::opt<bool> llvm::EnableLoopInterleaving(
    "interleave-loops", cl::init(true), cl::Hidden,
    cl::desc("Enable loop interleaving in Loop vectorization passes"));
cl::opt<bool> llvm::EnableLoopVectorization(
    "vectorize-loops", cl::init(true), cl::Hidden,
    cl::desc("Run the Loop vectorization passes"));

static cl::opt<bool> PrintVPlansInDotFormat(
    "vplan-print-in-dot-format", cl::Hidden,
    cl::desc("Use dot format instead of plain text when dumping VPlans"));

static cl::opt<cl::boolOrDefault> ForceSafeDivisor(
    "force-widen-divrem-via-safe-divisor", cl::Hidden,
    cl::desc(
        "Override cost based safe divisor widening for div/rem instructions"));

/// A helper function that returns true if the given type is irregular. The
/// type is irregular if its allocated size doesn't equal the store size of an
/// element of the corresponding vector type.
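/// For example (an illustrative case; exact sizes depend on the DataLayout):
/// x86_fp80 typically has a type size of 80 bits but an alloc size of 96 or
/// 128 bits, so an array of x86_fp80 contains padding and is not
/// bitcast-compatible with a vector of x86_fp80, whereas i32 (32-bit type and
/// alloc size) is regular.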
static bool hasIrregularType(Type *Ty, const DataLayout &DL) {
  // Determine if an array of N elements of type Ty is "bitcast compatible"
  // with a <N x Ty> vector.
  // This is only true if there is no padding between the array elements.
  return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty);
}

/// A helper function that returns the reciprocal of the block probability of
/// predicated blocks. If we return X, we are assuming the predicated block
/// will execute once for every X iterations of the loop header.
///
/// TODO: We should use actual block probability here, if available. Currently,
///       we always assume predicated blocks have a 50% chance of executing.
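///
/// For example (a sketch of how callers use this value): a predicated block
/// whose instructions cost 8 contributes 8 / getReciprocalPredBlockProb() = 4
/// to the per-iteration cost, reflecting the assumed 50% execution frequency.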
static unsigned getReciprocalPredBlockProb() { return 2; }

/// A helper function that returns an integer or floating-point constant with
/// value C.
static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) {
  return Ty->isIntegerTy() ? ConstantInt::getSigned(Ty, C)
                           : ConstantFP::get(Ty, C);
}

/// Returns "best known" trip count for the specified loop \p L as defined by
/// the following procedure:
///   1) Returns exact trip count if it is known.
///   2) Returns expected trip count according to profile data if any.
///   3) Returns upper bound estimate if it is known.
///   4) Returns std::nullopt if all of the above failed.
static std::optional<unsigned> getSmallBestKnownTC(ScalarEvolution &SE,
                                                   Loop *L) {
  // Check if exact trip count is known.
  if (unsigned ExpectedTC = SE.getSmallConstantTripCount(L))
    return ExpectedTC;

  // Check if there is an expected trip count available from profile data.
  if (LoopVectorizeWithBlockFrequency)
    if (auto EstimatedTC = getLoopEstimatedTripCount(L))
      return *EstimatedTC;

  // Check if upper bound estimate is known.
  if (unsigned ExpectedTC = SE.getSmallConstantMaxTripCount(L))
    return ExpectedTC;

  return std::nullopt;
}

namespace {
// Forward declare GeneratedRTChecks.
class GeneratedRTChecks;
} // namespace

namespace llvm {

AnalysisKey ShouldRunExtraVectorPasses::Key;

/// InnerLoopVectorizer vectorizes loops which contain only one basic
/// block to a specified vectorization factor (VF).
/// This class performs the widening of scalars into vectors, or multiple
/// scalars. This class also implements the following features:
/// * It inserts an epilogue loop for handling loops that don't have iteration
///   counts that are known to be a multiple of the vectorization factor.
/// * It handles the code generation for reduction variables.
/// * Scalarization (implementation using scalars) of un-vectorizable
///   instructions.
/// InnerLoopVectorizer does not perform any vectorization-legality
/// checks, and relies on the caller to check for the different legality
/// aspects. The InnerLoopVectorizer relies on the
/// LoopVectorizationLegality class to provide information about the induction
/// and reduction variables that were found to a given vectorization factor.
class InnerLoopVectorizer {
public:
  InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
                      LoopInfo *LI, DominatorTree *DT,
                      const TargetLibraryInfo *TLI,
                      const TargetTransformInfo *TTI, AssumptionCache *AC,
                      OptimizationRemarkEmitter *ORE, ElementCount VecWidth,
                      ElementCount MinProfitableTripCount,
                      unsigned UnrollFactor, LoopVectorizationLegality *LVL,
                      LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
                      ProfileSummaryInfo *PSI, GeneratedRTChecks &RTChecks)
      : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
        AC(AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor),
        Builder(PSE.getSE()->getContext()), Legal(LVL), Cost(CM), BFI(BFI),
        PSI(PSI), RTChecks(RTChecks) {
    // Query this against the original loop and save it here because the profile
    // of the original loop header may change as the transformation happens.
    OptForSizeBasedOnProfile = llvm::shouldOptimizeForSize(
        OrigLoop->getHeader(), PSI, BFI, PGSOQueryType::IRPass);

    if (MinProfitableTripCount.isZero())
      this->MinProfitableTripCount = VecWidth;
    else
      this->MinProfitableTripCount = MinProfitableTripCount;
  }

  virtual ~InnerLoopVectorizer() = default;

  /// Create a new empty loop that will contain vectorized instructions later
  /// on, while the old loop will be used as the scalar remainder. Control flow
  /// is generated around the vectorized (and scalar epilogue) loops consisting
  /// of various checks and bypasses. Return the pre-header block of the new
  /// loop and the start value for the canonical induction, if it is != 0. The
  /// latter is the case when vectorizing the epilogue loop. In the case of
  /// epilogue vectorization, this function is overridden to handle the more
  /// complex control flow around the loops.
  virtual std::pair<BasicBlock *, Value *> createVectorizedLoopSkeleton();

  /// Fix the vectorized code, taking care of header phis, live-outs, and more.
  void fixVectorizedLoop(VPTransformState &State, VPlan &Plan);

  // Return true if any runtime check is added.
  bool areSafetyChecksAdded() { return AddedSafetyChecks; }

  /// A type for vectorized values in the new loop. Each value from the
  /// original loop, when vectorized, is represented by UF vector values in the
  /// new unrolled loop, where UF is the unroll factor.
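  /// For example (an illustrative sketch): with VF = 4 and UF = 2, a single
  /// i32 value from the original loop is represented by two <4 x i32> values
  /// in the vector loop, one per unrolled part.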
  using VectorParts = SmallVector<Value *, 2>;

  /// A helper function to scalarize a single Instruction in the innermost loop.
  /// Generates a sequence of scalar instances for each lane between \p MinLane
  /// and \p MaxLane, times each part between \p MinPart and \p MaxPart,
  /// inclusive. Uses the VPValue operands from \p RepRecipe instead of \p
  /// Instr's operands.
  void scalarizeInstruction(const Instruction *Instr,
                            VPReplicateRecipe *RepRecipe,
                            const VPIteration &Instance, bool IfPredicateInstr,
                            VPTransformState &State);

  /// Construct the vector value of a scalarized value \p V one lane at a time.
  void packScalarIntoVectorValue(VPValue *Def, const VPIteration &Instance,
                                 VPTransformState &State);

  /// Try to vectorize interleaved access group \p Group with the base address
  /// given in \p Addr, optionally masking the vector operations if \p
  /// BlockInMask is non-null. Use \p State to translate given VPValues to IR
  /// values in the vectorized loop.
  void vectorizeInterleaveGroup(const InterleaveGroup<Instruction> *Group,
                                ArrayRef<VPValue *> VPDefs,
                                VPTransformState &State, VPValue *Addr,
                                ArrayRef<VPValue *> StoredValues,
                                VPValue *BlockInMask = nullptr);

  /// Fix the non-induction PHIs in \p Plan.
  void fixNonInductionPHIs(VPlan &Plan, VPTransformState &State);

  /// Returns true if the reordering of FP operations is not allowed, but we are
  /// able to vectorize with strict in-order reductions for the given RdxDesc.
  bool useOrderedReductions(const RecurrenceDescriptor &RdxDesc);

  /// Create a broadcast instruction. This method generates a broadcast
  /// instruction (shuffle) for loop invariant values and for the induction
  /// value. If this is the induction variable then we extend it to N, N+1, ...
  /// this is needed because each iteration in the loop corresponds to a SIMD
  /// element.
  virtual Value *getBroadcastInstrs(Value *V);

  // Returns the resume value (bc.merge.rdx) for a reduction as
  // generated by fixReduction.
  PHINode *getReductionResumeValue(const RecurrenceDescriptor &RdxDesc);

  /// Create a new phi node for the induction variable \p OrigPhi to resume
  /// iteration count in the scalar epilogue, from where the vectorized loop
  /// left off. In cases where the loop skeleton is more complicated (e.g.
  /// epilogue vectorization) and the resume values can come from an additional
  /// bypass block, the \p AdditionalBypass pair provides information about the
  /// bypass block and the end value on the edge from bypass to this loop.
  PHINode *createInductionResumeValue(
      PHINode *OrigPhi, const InductionDescriptor &ID,
      ArrayRef<BasicBlock *> BypassBlocks,
      std::pair<BasicBlock *, Value *> AdditionalBypass = {nullptr, nullptr});

protected:
  friend class LoopVectorizationPlanner;

  /// A small list of PHINodes.
  using PhiVector = SmallVector<PHINode *, 4>;

  /// A type for scalarized values in the new loop. Each value from the
  /// original loop, when scalarized, is represented by UF x VF scalar values
  /// in the new unrolled loop, where UF is the unroll factor and VF is the
  /// vectorization factor.
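  /// For example (an illustrative sketch): with VF = 4 and UF = 2, a
  /// scalarized value is represented by 2 x 4 = 8 scalar instances, indexed
  /// by (part, lane).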
  using ScalarParts = SmallVector<SmallVector<Value *, 4>, 2>;

  /// Set up the values of the IVs correctly when exiting the vector loop.
  void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II,
                    Value *VectorTripCount, Value *EndValue,
                    BasicBlock *MiddleBlock, BasicBlock *VectorHeader,
                    VPlan &Plan);

  /// Handle all cross-iteration phis in the header.
  void fixCrossIterationPHIs(VPTransformState &State);

  /// Create the exit value of first order recurrences in the middle block and
  /// update their users.
  void fixFixedOrderRecurrence(VPFirstOrderRecurrencePHIRecipe *PhiR,
                               VPTransformState &State);

  /// Create code for the loop exit value of the reduction.
  void fixReduction(VPReductionPHIRecipe *Phi, VPTransformState &State);

  /// Clear NSW/NUW flags from reduction instructions if necessary.
  void clearReductionWrapFlags(VPReductionPHIRecipe *PhiR,
                               VPTransformState &State);

  /// Iteratively sink the scalarized operands of a predicated instruction into
  /// the block that was created for it.
  void sinkScalarOperands(Instruction *PredInst);

  /// Shrinks vector element sizes to the smallest bitwidth they can be legally
  /// represented as.
  void truncateToMinimalBitwidths(VPTransformState &State);

  /// Returns (and creates if needed) the original loop trip count.
  Value *getOrCreateTripCount(BasicBlock *InsertBlock);

  /// Returns (and creates if needed) the trip count of the widened loop.
  Value *getOrCreateVectorTripCount(BasicBlock *InsertBlock);

  /// Returns a bitcasted value to the requested vector type.
  /// Also handles bitcasts of vector<float> <-> vector<pointer> types.
  Value *createBitOrPointerCast(Value *V, VectorType *DstVTy,
                                const DataLayout &DL);

  /// Emit a bypass check to see if the vector trip count is zero, including if
  /// it overflows.
  void emitIterationCountCheck(BasicBlock *Bypass);

  /// Emit a bypass check to see if all of the SCEV assumptions we've
  /// had to make are correct. Returns the block containing the checks or
  /// nullptr if no checks have been added.
  BasicBlock *emitSCEVChecks(BasicBlock *Bypass);

  /// Emit bypass checks to check any memory assumptions we may have made.
  /// Returns the block containing the checks or nullptr if no checks have been
  /// added.
  BasicBlock *emitMemRuntimeChecks(BasicBlock *Bypass);

  /// Emit basic blocks (prefixed with \p Prefix) for the iteration check,
  /// vector loop preheader, middle block and scalar preheader.
  void createVectorLoopSkeleton(StringRef Prefix);

  /// Create new phi nodes for the induction variables to resume iteration count
  /// in the scalar epilogue, from where the vectorized loop left off.
  /// In cases where the loop skeleton is more complicated (e.g. epilogue
  /// vectorization) and the resume values can come from an additional bypass
  /// block, the \p AdditionalBypass pair provides information about the bypass
  /// block and the end value on the edge from bypass to this loop.
  void createInductionResumeValues(
      std::pair<BasicBlock *, Value *> AdditionalBypass = {nullptr, nullptr});

  /// Complete the loop skeleton by adding debug MDs, creating appropriate
  /// conditional branches in the middle block, preparing the builder and
  /// running the verifier. Return the preheader of the completed vector loop.
  BasicBlock *completeLoopSkeleton();

  /// Collect poison-generating recipes that may generate a poison value that is
  /// used after vectorization, even when their operands are not poison. Those
  /// recipes meet the following conditions:
  ///  * Contribute to the address computation of a recipe generating a widen
  ///    memory load/store (VPWidenMemoryInstructionRecipe or
  ///    VPInterleaveRecipe).
  ///  * Such a widen memory load/store has at least one underlying Instruction
  ///    that is in a basic block that needs predication and after vectorization
  ///    the generated instruction won't be predicated.
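  ///
  /// For example (an illustrative sketch): an 'inbounds' GEP that feeds the
  /// address of a conditional load may become unconditional once the load is
  /// widened, so it is then evaluated even for lanes whose predicate is false;
  /// its poison-generating flags must be dropped so the wide address
  /// computation does not introduce poison for those lanes.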
  void collectPoisonGeneratingRecipes(VPTransformState &State);

  /// Allow subclasses to override and print debug traces before/after vplan
  /// execution, when trace information is requested.
  virtual void printDebugTracesAtStart(){};
  virtual void printDebugTracesAtEnd(){};

  /// The original loop.
  Loop *OrigLoop;

  /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies
  /// dynamic knowledge to simplify SCEV expressions and converts them to a
  /// more usable form.
  PredicatedScalarEvolution &PSE;

  /// Loop Info.
  LoopInfo *LI;

  /// Dominator Tree.
  DominatorTree *DT;

  /// Target Library Info.
  const TargetLibraryInfo *TLI;

  /// Target Transform Info.
  const TargetTransformInfo *TTI;

  /// Assumption Cache.
  AssumptionCache *AC;

  /// Interface to emit optimization remarks.
  OptimizationRemarkEmitter *ORE;

  /// The vectorization SIMD factor to use. Each vector will have this many
  /// vector elements.
  ElementCount VF;

  ElementCount MinProfitableTripCount;

  /// The vectorization unroll factor to use. Each scalar is vectorized to this
  /// many different vector instructions.
  unsigned UF;

  /// The builder that we use
  IRBuilder<> Builder;

  // --- Vectorization state ---

  /// The vector-loop preheader.
  BasicBlock *LoopVectorPreHeader;

  /// The scalar-loop preheader.
  BasicBlock *LoopScalarPreHeader;

  /// Middle Block between the vector and the scalar.
  BasicBlock *LoopMiddleBlock;

  /// The unique ExitBlock of the scalar loop if one exists.  Note that
  /// there can be multiple exiting edges reaching this block.
  BasicBlock *LoopExitBlock;

  /// The scalar loop body.
  BasicBlock *LoopScalarBody;

  /// A list of all bypass blocks. The first block is the entry of the loop.
  SmallVector<BasicBlock *, 4> LoopBypassBlocks;

  /// Store instructions that were predicated.
  SmallVector<Instruction *, 4> PredicatedInstructions;

  /// Trip count of the original loop.
  Value *TripCount = nullptr;

  /// Trip count of the widened loop (TripCount - TripCount % (VF*UF))
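  /// For example (an illustrative sketch): with TripCount = 100, VF = 4 and
  /// UF = 2, the vector loop covers 96 iterations and the remaining 4
  /// iterations run in the scalar epilogue.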
  Value *VectorTripCount = nullptr;

  /// The legality analysis.
  LoopVectorizationLegality *Legal;

  /// The profitability analysis.
  LoopVectorizationCostModel *Cost;

  // Record whether runtime checks are added.
  bool AddedSafetyChecks = false;

  // Holds the end values for each induction variable. We save the end values
  // so we can later fix-up the external users of the induction variables.
  DenseMap<PHINode *, Value *> IVEndValues;

  /// BFI and PSI are used to check for profile guided size optimizations.
  BlockFrequencyInfo *BFI;
  ProfileSummaryInfo *PSI;

  // Whether this loop should be optimized for size based on profile guided size
  // optimizations.
  bool OptForSizeBasedOnProfile;

  /// Structure to hold information about generated runtime checks, responsible
  /// for cleaning the checks, if vectorization turns out unprofitable.
  GeneratedRTChecks &RTChecks;

  // Holds the resume values for reductions in the loops, used to set the
  // correct start value of reduction PHIs when vectorizing the epilogue.
  SmallMapVector<const RecurrenceDescriptor *, PHINode *, 4>
      ReductionResumeValues;
};

class InnerLoopUnroller : public InnerLoopVectorizer {
public:
  InnerLoopUnroller(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
                    LoopInfo *LI, DominatorTree *DT,
                    const TargetLibraryInfo *TLI,
                    const TargetTransformInfo *TTI, AssumptionCache *AC,
                    OptimizationRemarkEmitter *ORE, unsigned UnrollFactor,
                    LoopVectorizationLegality *LVL,
                    LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
                    ProfileSummaryInfo *PSI, GeneratedRTChecks &Check)
      : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
                            ElementCount::getFixed(1),
                            ElementCount::getFixed(1), UnrollFactor, LVL, CM,
                            BFI, PSI, Check) {}

private:
  Value *getBroadcastInstrs(Value *V) override;
};

/// Encapsulate information regarding vectorization of a loop and its epilogue.
/// This information is meant to be updated and used across two stages of
/// epilogue vectorization.
struct EpilogueLoopVectorizationInfo {
  ElementCount MainLoopVF = ElementCount::getFixed(0);
  unsigned MainLoopUF = 0;
  ElementCount EpilogueVF = ElementCount::getFixed(0);
  unsigned EpilogueUF = 0;
  BasicBlock *MainLoopIterationCountCheck = nullptr;
  BasicBlock *EpilogueIterationCountCheck = nullptr;
  BasicBlock *SCEVSafetyCheck = nullptr;
  BasicBlock *MemSafetyCheck = nullptr;
  Value *TripCount = nullptr;
  Value *VectorTripCount = nullptr;

  EpilogueLoopVectorizationInfo(ElementCount MVF, unsigned MUF,
                                ElementCount EVF, unsigned EUF)
      : MainLoopVF(MVF), MainLoopUF(MUF), EpilogueVF(EVF), EpilogueUF(EUF) {
    assert(EUF == 1 &&
           "A high UF for the epilogue loop is likely not beneficial.");
  }
};

/// An extension of the inner loop vectorizer that creates a skeleton for a
/// vectorized loop that has its epilogue (residual) also vectorized.
/// The idea is to run the vplan on a given loop twice, firstly to set up the
/// skeleton and vectorize the main loop, and secondly to complete the skeleton
/// from the first step and vectorize the epilogue.  This is achieved by
/// deriving two concrete strategy classes from this base class and invoking
/// them in succession from the loop vectorizer planner.
class InnerLoopAndEpilogueVectorizer : public InnerLoopVectorizer {
public:
  InnerLoopAndEpilogueVectorizer(
      Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
      DominatorTree *DT, const TargetLibraryInfo *TLI,
      const TargetTransformInfo *TTI, AssumptionCache *AC,
      OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
      LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
      BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
      GeneratedRTChecks &Checks)
      : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
                            EPI.MainLoopVF, EPI.MainLoopVF, EPI.MainLoopUF, LVL,
                            CM, BFI, PSI, Checks),
        EPI(EPI) {}

  // Override this function to handle the more complex control flow around the
  // three loops.
  std::pair<BasicBlock *, Value *> createVectorizedLoopSkeleton() final {
    return createEpilogueVectorizedLoopSkeleton();
  }

  /// The interface for creating a vectorized skeleton using one of two
  /// different strategies, each corresponding to one execution of the vplan
  /// as described above.
  virtual std::pair<BasicBlock *, Value *>
  createEpilogueVectorizedLoopSkeleton() = 0;

  /// Holds and updates state information required to vectorize the main loop
  /// and its epilogue in two separate passes. This setup helps us avoid
  /// regenerating and recomputing runtime safety checks. It also helps us to
  /// shorten the iteration-count-check path length for the cases where the
  /// iteration count of the loop is so small that the main vector loop is
  /// completely skipped.
  EpilogueLoopVectorizationInfo &EPI;
};

/// A specialized derived class of inner loop vectorizer that performs
/// vectorization of *main* loops in the process of vectorizing loops and their
/// epilogues.
class EpilogueVectorizerMainLoop : public InnerLoopAndEpilogueVectorizer {
public:
  EpilogueVectorizerMainLoop(
      Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
      DominatorTree *DT, const TargetLibraryInfo *TLI,
      const TargetTransformInfo *TTI, AssumptionCache *AC,
      OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
      LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
      BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
      GeneratedRTChecks &Check)
      : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
                                       EPI, LVL, CM, BFI, PSI, Check) {}
  /// Implements the interface for creating a vectorized skeleton using the
  /// *main loop* strategy (i.e. the first pass of vplan execution).
  std::pair<BasicBlock *, Value *> createEpilogueVectorizedLoopSkeleton() final;

protected:
  /// Emits an iteration count bypass check once for the main loop (when \p
  /// ForEpilogue is false) and once for the epilogue loop (when \p
  /// ForEpilogue is true).
  BasicBlock *emitIterationCountCheck(BasicBlock *Bypass, bool ForEpilogue);
  void printDebugTracesAtStart() override;
  void printDebugTracesAtEnd() override;
};

// A specialized derived class of inner loop vectorizer that performs
// vectorization of *epilogue* loops in the process of vectorizing loops and
// their epilogues.
class EpilogueVectorizerEpilogueLoop : public InnerLoopAndEpilogueVectorizer {
public:
  EpilogueVectorizerEpilogueLoop(
      Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
      DominatorTree *DT, const TargetLibraryInfo *TLI,
      const TargetTransformInfo *TTI, AssumptionCache *AC,
      OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
      LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
      BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
      GeneratedRTChecks &Checks)
      : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
                                       EPI, LVL, CM, BFI, PSI, Checks) {
    TripCount = EPI.TripCount;
  }
  /// Implements the interface for creating a vectorized skeleton using the
  /// *epilogue loop* strategy (i.e. the second pass of vplan execution).
  std::pair<BasicBlock *, Value *> createEpilogueVectorizedLoopSkeleton() final;

protected:
  /// Emits an iteration count bypass check after the main vector loop has
  /// finished to see if there are any iterations left to execute by either
  /// the vector epilogue or the scalar epilogue.
  BasicBlock *emitMinimumVectorEpilogueIterCountCheck(BasicBlock *Bypass,
                                                      BasicBlock *Insert);
  void printDebugTracesAtStart() override;
  void printDebugTracesAtEnd() override;
};
} // end namespace llvm

/// Look for a meaningful debug location on the instruction or its
/// operands.
static Instruction *getDebugLocFromInstOrOperands(Instruction *I) {
  if (!I)
    return I;

  DebugLoc Empty;
  if (I->getDebugLoc() != Empty)
    return I;

  for (Use &Op : I->operands()) {
    if (Instruction *OpInst = dyn_cast<Instruction>(Op))
      if (OpInst->getDebugLoc() != Empty)
        return OpInst;
  }

  return I;
}

/// Write a \p DebugMsg about vectorization to the debug output stream. If \p I
/// is passed, the message relates to that particular instruction.
#ifndef NDEBUG
static void debugVectorizationMessage(const StringRef Prefix,
                                      const StringRef DebugMsg,
                                      Instruction *I) {
  dbgs() << "LV: " << Prefix << DebugMsg;
  if (I != nullptr)
    dbgs() << " " << *I;
  else
    dbgs() << '.';
  dbgs() << '\n';
}
#endif

/// Create an analysis remark that explains why vectorization failed
///
/// \p PassName is the name of the pass (e.g. can be AlwaysPrint).  \p
/// RemarkName is the identifier for the remark.  If \p I is passed it is an
/// instruction that prevents vectorization.  Otherwise \p TheLoop is used for
/// the location of the remark.  \return the remark object that can be
/// streamed to.
static OptimizationRemarkAnalysis createLVAnalysis(const char *PassName,
    StringRef RemarkName, Loop *TheLoop, Instruction *I) {
  Value *CodeRegion = TheLoop->getHeader();
  DebugLoc DL = TheLoop->getStartLoc();

  if (I) {
    CodeRegion = I->getParent();
    // If there is no debug location attached to the instruction, revert back to
    // using the loop's.
    if (I->getDebugLoc())
      DL = I->getDebugLoc();
  }

  return OptimizationRemarkAnalysis(PassName, RemarkName, DL, CodeRegion);
}

namespace llvm {

/// Return a value for Step multiplied by VF.
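/// For example (an illustrative sketch): with Ty = i64, a fixed VF of 4 and
/// Step = 2, this returns the constant 8; with VF = vscale x 4 and Step = 2,
/// it returns 8 * vscale via CreateVScale.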
Value *createStepForVF(IRBuilderBase &B, Type *Ty, ElementCount VF,
                       int64_t Step) {
  assert(Ty->isIntegerTy() && "Expected an integer step");
  Constant *StepVal = ConstantInt::get(Ty, Step * VF.getKnownMinValue());
  return VF.isScalable() ? B.CreateVScale(StepVal) : StepVal;
}

/// Return the runtime value for VF.
Value *getRuntimeVF(IRBuilderBase &B, Type *Ty, ElementCount VF) {
  Constant *EC = ConstantInt::get(Ty, VF.getKnownMinValue());
  return VF.isScalable() ? B.CreateVScale(EC) : EC;
}

const SCEV *createTripCountSCEV(Type *IdxTy, PredicatedScalarEvolution &PSE) {
  const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount();
  assert(!isa<SCEVCouldNotCompute>(BackedgeTakenCount) && "Invalid loop count");

  ScalarEvolution &SE = *PSE.getSE();

  // The exit count might have the type of i64 while the phi is i32. This can
  // happen if we have an induction variable that is sign extended before the
  // compare. The only way that we get a backedge taken count is that the
  // induction variable was signed and as such will not overflow. In such a case
  // truncation is legal.
  if (SE.getTypeSizeInBits(BackedgeTakenCount->getType()) >
      IdxTy->getPrimitiveSizeInBits())
    BackedgeTakenCount = SE.getTruncateOrNoop(BackedgeTakenCount, IdxTy);
  BackedgeTakenCount = SE.getNoopOrZeroExtend(BackedgeTakenCount, IdxTy);

  // Get the total trip count from the count by adding 1.
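  // For example (an illustrative sketch): for `for (i = 0; i < n; i++)` the
  // backedge-taken count is n - 1, so the trip count returned here is n.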
983   return SE.getAddExpr(BackedgeTakenCount,
984                        SE.getOne(BackedgeTakenCount->getType()));
985 }
986 
getRuntimeVFAsFloat(IRBuilderBase & B,Type * FTy,ElementCount VF)987 static Value *getRuntimeVFAsFloat(IRBuilderBase &B, Type *FTy,
988                                   ElementCount VF) {
989   assert(FTy->isFloatingPointTy() && "Expected floating point type!");
990   Type *IntTy = IntegerType::get(FTy->getContext(), FTy->getScalarSizeInBits());
991   Value *RuntimeVF = getRuntimeVF(B, IntTy, VF);
992   return B.CreateUIToFP(RuntimeVF, FTy);
993 }
994 
reportVectorizationFailure(const StringRef DebugMsg,const StringRef OREMsg,const StringRef ORETag,OptimizationRemarkEmitter * ORE,Loop * TheLoop,Instruction * I)995 void reportVectorizationFailure(const StringRef DebugMsg,
996                                 const StringRef OREMsg, const StringRef ORETag,
997                                 OptimizationRemarkEmitter *ORE, Loop *TheLoop,
998                                 Instruction *I) {
999   LLVM_DEBUG(debugVectorizationMessage("Not vectorizing: ", DebugMsg, I));
1000   LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
1001   ORE->emit(
1002       createLVAnalysis(Hints.vectorizeAnalysisPassName(), ORETag, TheLoop, I)
1003       << "loop not vectorized: " << OREMsg);
1004 }
1005 
reportVectorizationInfo(const StringRef Msg,const StringRef ORETag,OptimizationRemarkEmitter * ORE,Loop * TheLoop,Instruction * I)1006 void reportVectorizationInfo(const StringRef Msg, const StringRef ORETag,
1007                              OptimizationRemarkEmitter *ORE, Loop *TheLoop,
1008                              Instruction *I) {
1009   LLVM_DEBUG(debugVectorizationMessage("", Msg, I));
1010   LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
1011   ORE->emit(
1012       createLVAnalysis(Hints.vectorizeAnalysisPassName(), ORETag, TheLoop, I)
1013       << Msg);
1014 }
1015 
1016 } // end namespace llvm
1017 
1018 #ifndef NDEBUG
1019 /// \return string containing a file name and a line # for the given loop.
getDebugLocString(const Loop * L)1020 static std::string getDebugLocString(const Loop *L) {
1021   std::string Result;
1022   if (L) {
1023     raw_string_ostream OS(Result);
1024     if (const DebugLoc LoopDbgLoc = L->getStartLoc())
1025       LoopDbgLoc.print(OS);
1026     else
1027       // Just print the module name.
1028       OS << L->getHeader()->getParent()->getParent()->getModuleIdentifier();
1029     OS.flush();
1030   }
1031   return Result;
1032 }
1033 #endif
1034 
collectPoisonGeneratingRecipes(VPTransformState & State)1035 void InnerLoopVectorizer::collectPoisonGeneratingRecipes(
1036     VPTransformState &State) {
1037 
1038   // Collect recipes in the backward slice of `Root` that may generate a poison
1039   // value that is used after vectorization.
1040   SmallPtrSet<VPRecipeBase *, 16> Visited;
1041   auto collectPoisonGeneratingInstrsInBackwardSlice([&](VPRecipeBase *Root) {
1042     SmallVector<VPRecipeBase *, 16> Worklist;
1043     Worklist.push_back(Root);
1044 
1045     // Traverse the backward slice of Root through its use-def chain.
1046     while (!Worklist.empty()) {
1047       VPRecipeBase *CurRec = Worklist.back();
1048       Worklist.pop_back();
1049 
1050       if (!Visited.insert(CurRec).second)
1051         continue;
1052 
1053       // Prune search if we find another recipe generating a widen memory
1054       // instruction. Widen memory instructions involved in address computation
1055       // will lead to gather/scatter instructions, which don't need to be
1056       // handled.
1057       if (isa<VPWidenMemoryInstructionRecipe>(CurRec) ||
1058           isa<VPInterleaveRecipe>(CurRec) ||
1059           isa<VPScalarIVStepsRecipe>(CurRec) ||
1060           isa<VPCanonicalIVPHIRecipe>(CurRec) ||
1061           isa<VPActiveLaneMaskPHIRecipe>(CurRec))
1062         continue;
1063 
1064       // This recipe contributes to the address computation of a widen
1065       // load/store. Collect recipe if its underlying instruction has
1066       // poison-generating flags.
1067       Instruction *Instr = CurRec->getUnderlyingInstr();
1068       if (Instr && Instr->hasPoisonGeneratingFlags())
1069         State.MayGeneratePoisonRecipes.insert(CurRec);
1070 
1071       // Add new definitions to the worklist.
1072       for (VPValue *operand : CurRec->operands())
1073         if (VPRecipeBase *OpDef = operand->getDefiningRecipe())
1074           Worklist.push_back(OpDef);
1075     }
1076   });
1077 
1078   // Traverse all the recipes in the VPlan and collect the poison-generating
1079   // recipes in the backward slice starting at the address of a VPWidenRecipe or
1080   // VPInterleaveRecipe.
1081   auto Iter = vp_depth_first_deep(State.Plan->getEntry());
1082   for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) {
1083     for (VPRecipeBase &Recipe : *VPBB) {
1084       if (auto *WidenRec = dyn_cast<VPWidenMemoryInstructionRecipe>(&Recipe)) {
1085         Instruction &UnderlyingInstr = WidenRec->getIngredient();
1086         VPRecipeBase *AddrDef = WidenRec->getAddr()->getDefiningRecipe();
1087         if (AddrDef && WidenRec->isConsecutive() &&
1088             Legal->blockNeedsPredication(UnderlyingInstr.getParent()))
1089           collectPoisonGeneratingInstrsInBackwardSlice(AddrDef);
1090       } else if (auto *InterleaveRec = dyn_cast<VPInterleaveRecipe>(&Recipe)) {
1091         VPRecipeBase *AddrDef = InterleaveRec->getAddr()->getDefiningRecipe();
1092         if (AddrDef) {
1093           // Check if any member of the interleave group needs predication.
1094           const InterleaveGroup<Instruction> *InterGroup =
1095               InterleaveRec->getInterleaveGroup();
1096           bool NeedPredication = false;
1097           for (int I = 0, NumMembers = InterGroup->getNumMembers();
1098                I < NumMembers; ++I) {
1099             Instruction *Member = InterGroup->getMember(I);
1100             if (Member)
1101               NeedPredication |=
1102                   Legal->blockNeedsPredication(Member->getParent());
1103           }
1104 
1105           if (NeedPredication)
1106             collectPoisonGeneratingInstrsInBackwardSlice(AddrDef);
1107         }
1108       }
1109     }
1110   }
1111 }
1112 
getReductionResumeValue(const RecurrenceDescriptor & RdxDesc)1113 PHINode *InnerLoopVectorizer::getReductionResumeValue(
1114     const RecurrenceDescriptor &RdxDesc) {
1115   auto It = ReductionResumeValues.find(&RdxDesc);
1116   assert(It != ReductionResumeValues.end() &&
1117          "Expected to find a resume value for the reduction.");
1118   return It->second;
1119 }
1120 
1121 namespace llvm {
1122 
1123 // Loop vectorization cost-model hints how the scalar epilogue loop should be
1124 // lowered.
1125 enum ScalarEpilogueLowering {
1126 
1127   // The default: allowing scalar epilogues.
1128   CM_ScalarEpilogueAllowed,
1129 
1130   // Vectorization with OptForSize: don't allow epilogues.
1131   CM_ScalarEpilogueNotAllowedOptSize,
1132 
1133   // A special case of vectorisation with OptForSize: loops with a very small
1134   // trip count are considered for vectorization under OptForSize, thereby
1135   // making sure the cost of their loop body is dominant, free of runtime
1136   // guards and scalar iteration overheads.
1137   CM_ScalarEpilogueNotAllowedLowTripLoop,
1138 
1139   // Loop hint predicate indicating an epilogue is undesired.
1140   CM_ScalarEpilogueNotNeededUsePredicate,
1141 
1142   // Directive indicating we must either tail fold or not vectorize.
1143   CM_ScalarEpilogueNotAllowedUsePredicate
1144 };
1145 
1146 /// ElementCountComparator creates a total ordering for ElementCount
1147 /// for the purposes of using it in a set structure.
1148 struct ElementCountComparator {
1149   bool operator()(const ElementCount &LHS, const ElementCount &RHS) const {
1150     return std::make_tuple(LHS.isScalable(), LHS.getKnownMinValue()) <
1151            std::make_tuple(RHS.isScalable(), RHS.getKnownMinValue());
1152   }
1153 };
1154 using ElementCountSet = SmallSet<ElementCount, 16, ElementCountComparator>;
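// For illustration: with this comparator, fixed-width factors sort before
// scalable ones, and within each kind by the known minimum element count, e.g.
//   Fixed(4) < Fixed(8) < Scalable(2) < Scalable(4).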
1155 
1156 /// LoopVectorizationCostModel - estimates the expected speedups due to
1157 /// vectorization.
1158 /// In many cases vectorization is not profitable. This can happen for a
1159 /// number of reasons. In this class we mainly attempt to predict the
1160 /// expected speedup/slowdowns due to the supported instruction set. We use the
1161 /// TargetTransformInfo to query the different backends for the cost of
1162 /// different operations.
1163 class LoopVectorizationCostModel {
1164 public:
1165   LoopVectorizationCostModel(ScalarEpilogueLowering SEL, Loop *L,
1166                              PredicatedScalarEvolution &PSE, LoopInfo *LI,
1167                              LoopVectorizationLegality *Legal,
1168                              const TargetTransformInfo &TTI,
1169                              const TargetLibraryInfo *TLI, DemandedBits *DB,
1170                              AssumptionCache *AC,
1171                              OptimizationRemarkEmitter *ORE, const Function *F,
1172                              const LoopVectorizeHints *Hints,
1173                              InterleavedAccessInfo &IAI)
1174       : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal),
1175         TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F),
1176         Hints(Hints), InterleaveInfo(IAI) {}
1177 
1178   /// \return An upper bound for the vectorization factors (both fixed and
1179   /// scalable). If the factors are 0, vectorization and interleaving should be
1180   /// avoided up front.
1181   FixedScalableVFPair computeMaxVF(ElementCount UserVF, unsigned UserIC);
1182 
1183   /// \return True if runtime checks are required for vectorization, and false
1184   /// otherwise.
1185   bool runtimeChecksRequired();
1186 
1187   /// \return The most profitable vectorization factor and the cost of that VF.
1188   /// This method checks every VF in \p CandidateVFs. If UserVF is not ZERO
1189   /// then this vectorization factor will be selected if vectorization is
1190   /// possible.
1191   VectorizationFactor
1192   selectVectorizationFactor(const ElementCountSet &CandidateVFs);
1193 
1194   VectorizationFactor
1195   selectEpilogueVectorizationFactor(const ElementCount MaxVF,
1196                                     const LoopVectorizationPlanner &LVP);
1197 
1198   /// Setup cost-based decisions for user vectorization factor.
1199   /// \return true if the UserVF is a feasible VF to be chosen.
1200   bool selectUserVectorizationFactor(ElementCount UserVF) {
1201     collectUniformsAndScalars(UserVF);
1202     collectInstsToScalarize(UserVF);
1203     return expectedCost(UserVF).first.isValid();
1204   }
1205 
1206   /// \return The size (in bits) of the smallest and widest types in the code
1207   /// that needs to be vectorized. We ignore values that remain scalar such as
1208   /// 64 bit loop indices.
1209   std::pair<unsigned, unsigned> getSmallestAndWidestTypes();
1210 
1211   /// \return The desired interleave count.
1212   /// If interleave count has been specified by metadata it will be returned.
1213   /// Otherwise, the interleave count is computed and returned. VF and LoopCost
1214   /// are the selected vectorization factor and the cost of the selected VF.
1215   unsigned selectInterleaveCount(ElementCount VF, InstructionCost LoopCost);
1216 
1217   /// A memory access instruction may be vectorized in more than one way.
1218   /// The form of the instruction after vectorization depends on its cost.
1219   /// This function takes cost-based decisions for Load/Store instructions
1220   /// and collects them in a map. This decision map is used for building
1221   /// the lists of loop-uniform and loop-scalar instructions.
1222   /// The calculated cost is saved with the widening decision in order to
1223   /// avoid redundant calculations.
1224   void setCostBasedWideningDecision(ElementCount VF);
1225 
1226   /// A struct that represents some properties of the register usage
1227   /// of a loop.
1228   struct RegisterUsage {
1229     /// Holds the number of loop invariant values that are used in the loop.
1230     /// The key is ClassID of target-provided register class.
1231     SmallMapVector<unsigned, unsigned, 4> LoopInvariantRegs;
1232     /// Holds the maximum number of concurrent live intervals in the loop.
1233     /// The key is ClassID of target-provided register class.
1234     SmallMapVector<unsigned, unsigned, 4> MaxLocalUsers;
1235   };
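  // Illustrative example only (register class IDs and names are
  // target-defined): a loop keeping one invariant base pointer and at most
  // five vector values live at once might record
  //   LoopInvariantRegs = {GPR -> 1} and MaxLocalUsers = {Vector -> 5}.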
1236 
1237   /// \return Information about the register usage of the loop for the
1238   /// given vectorization factors.
1239   SmallVector<RegisterUsage, 8>
1240   calculateRegisterUsage(ArrayRef<ElementCount> VFs);
1241 
1242   /// Collect values we want to ignore in the cost model.
1243   void collectValuesToIgnore();
1244 
1245   /// Collect all element types in the loop for which widening is needed.
1246   void collectElementTypesForWidening();
1247 
1248   /// Split reductions into those that happen in the loop, and those that happen
1249   /// outside. In-loop reductions are collected into InLoopReductionChains.
1250   void collectInLoopReductions();
1251 
1252   /// Returns true if we should use strict in-order reductions for the given
1253   /// RdxDesc. This is true if the -enable-strict-reductions flag is passed,
1254   /// the IsOrdered flag of RdxDesc is set and we do not allow reordering
1255   /// of FP operations.
1256   bool useOrderedReductions(const RecurrenceDescriptor &RdxDesc) const {
1257     return !Hints->allowReordering() && RdxDesc.isOrdered();
1258   }
1259 
1260   /// \returns The smallest bitwidth each instruction can be represented with.
1261   /// The vector equivalents of these instructions should be truncated to this
1262   /// type.
1263   const MapVector<Instruction *, uint64_t> &getMinimalBitwidths() const {
1264     return MinBWs;
1265   }
1266 
1267   /// \returns True if it is more profitable to scalarize instruction \p I for
1268   /// vectorization factor \p VF.
1269   bool isProfitableToScalarize(Instruction *I, ElementCount VF) const {
1270     assert(VF.isVector() &&
1271            "Profitable to scalarize relevant only for VF > 1.");
1272 
1273     // Cost model is not run in the VPlan-native path - return conservative
1274     // result until this changes.
1275     if (EnableVPlanNativePath)
1276       return false;
1277 
1278     auto Scalars = InstsToScalarize.find(VF);
1279     assert(Scalars != InstsToScalarize.end() &&
1280            "VF not yet analyzed for scalarization profitability");
1281     return Scalars->second.find(I) != Scalars->second.end();
1282   }
1283 
1284   /// Returns true if \p I is known to be uniform after vectorization.
1285   bool isUniformAfterVectorization(Instruction *I, ElementCount VF) const {
1286     if (VF.isScalar())
1287       return true;
1288 
1289     // Cost model is not run in the VPlan-native path - return conservative
1290     // result until this changes.
1291     if (EnableVPlanNativePath)
1292       return false;
1293 
1294     auto UniformsPerVF = Uniforms.find(VF);
1295     assert(UniformsPerVF != Uniforms.end() &&
1296            "VF not yet analyzed for uniformity");
1297     return UniformsPerVF->second.count(I);
1298   }
1299 
1300   /// Returns true if \p I is known to be scalar after vectorization.
1301   bool isScalarAfterVectorization(Instruction *I, ElementCount VF) const {
1302     if (VF.isScalar())
1303       return true;
1304 
1305     // Cost model is not run in the VPlan-native path - return conservative
1306     // result until this changes.
1307     if (EnableVPlanNativePath)
1308       return false;
1309 
1310     auto ScalarsPerVF = Scalars.find(VF);
1311     assert(ScalarsPerVF != Scalars.end() &&
1312            "Scalar values are not calculated for VF");
1313     return ScalarsPerVF->second.count(I);
1314   }
1315 
1316   /// \returns True if instruction \p I can be truncated to a smaller bitwidth
1317   /// for vectorization factor \p VF.
1318   bool canTruncateToMinimalBitwidth(Instruction *I, ElementCount VF) const {
1319     return VF.isVector() && MinBWs.find(I) != MinBWs.end() &&
1320            !isProfitableToScalarize(I, VF) &&
1321            !isScalarAfterVectorization(I, VF);
1322   }
1323 
1324   /// Decision that was taken during cost calculation for memory instruction.
1325   enum InstWidening {
1326     CM_Unknown,
1327     CM_Widen,         // For consecutive accesses with stride +1.
1328     CM_Widen_Reverse, // For consecutive accesses with stride -1.
1329     CM_Interleave,
1330     CM_GatherScatter,
1331     CM_Scalarize
1332   };
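  // Illustrative mapping from access patterns to decisions, assuming a loop
  // counter `i` with unit step (the actual choice is cost- and legality-based):
  //   a[i]                        -> CM_Widen          (consecutive, stride +1)
  //   a[n - i]                    -> CM_Widen_Reverse  (consecutive, stride -1)
  //   a[b[i]]                     -> CM_GatherScatter  (needs gather/scatter)
  //   a[2 * i] with a[2 * i + 1]  -> CM_Interleave     (interleaved group)
  //   otherwise                   -> CM_Scalarize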
1333 
1334   /// Save vectorization decision \p W and \p Cost taken by the cost model for
1335   /// instruction \p I and vector width \p VF.
1336   void setWideningDecision(Instruction *I, ElementCount VF, InstWidening W,
1337                            InstructionCost Cost) {
1338     assert(VF.isVector() && "Expected VF >=2");
1339     WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
1340   }
1341 
1342   /// Save vectorization decision \p W and \p Cost taken by the cost model for
1343   /// interleaving group \p Grp and vector width \p VF.
1344   void setWideningDecision(const InterleaveGroup<Instruction> *Grp,
1345                            ElementCount VF, InstWidening W,
1346                            InstructionCost Cost) {
1347     assert(VF.isVector() && "Expected VF >=2");
1348     /// Broadcast this decision to all instructions inside the group.
1349     /// But the cost will be assigned to one instruction only.
1350     for (unsigned i = 0; i < Grp->getFactor(); ++i) {
1351       if (auto *I = Grp->getMember(i)) {
1352         if (Grp->getInsertPos() == I)
1353           WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
1354         else
1355           WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, 0);
1356       }
1357     }
1358   }
1359 
1360   /// Return the cost model decision for the given instruction \p I and vector
1361   /// width \p VF. Return CM_Unknown if this instruction did not pass
1362   /// through the cost modeling.
1363   InstWidening getWideningDecision(Instruction *I, ElementCount VF) const {
1364     assert(VF.isVector() && "Expected VF to be a vector VF");
1365     // Cost model is not run in the VPlan-native path - return conservative
1366     // result until this changes.
1367     if (EnableVPlanNativePath)
1368       return CM_GatherScatter;
1369 
1370     std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
1371     auto Itr = WideningDecisions.find(InstOnVF);
1372     if (Itr == WideningDecisions.end())
1373       return CM_Unknown;
1374     return Itr->second.first;
1375   }
1376 
1377   /// Return the vectorization cost for the given instruction \p I and vector
1378   /// width \p VF.
1379   InstructionCost getWideningCost(Instruction *I, ElementCount VF) {
1380     assert(VF.isVector() && "Expected VF >=2");
1381     std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
1382     assert(WideningDecisions.find(InstOnVF) != WideningDecisions.end() &&
1383            "The cost is not calculated");
1384     return WideningDecisions[InstOnVF].second;
1385   }
1386 
1387   /// Return True if instruction \p I is an optimizable truncate whose operand
1388   /// is an induction variable. Such a truncate will be removed by adding a new
1389   /// induction variable with the destination type.
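  /// For example (hypothetical IR): for a primary induction `%iv` of type i64,
  /// a user `%t = trunc i64 %iv to i32` can be optimized by introducing a
  /// parallel i32 induction variable that produces the values of `%t` directly.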
1390   bool isOptimizableIVTruncate(Instruction *I, ElementCount VF) {
1391     // If the instruction is not a truncate, return false.
1392     auto *Trunc = dyn_cast<TruncInst>(I);
1393     if (!Trunc)
1394       return false;
1395 
1396     // Get the source and destination types of the truncate.
1397     Type *SrcTy = ToVectorTy(cast<CastInst>(I)->getSrcTy(), VF);
1398     Type *DestTy = ToVectorTy(cast<CastInst>(I)->getDestTy(), VF);
1399 
1400     // If the truncate is free for the given types, return false. Replacing a
1401     // free truncate with an induction variable would add an induction variable
1402     // update instruction to each iteration of the loop. We exclude from this
1403     // check the primary induction variable since it will need an update
1404     // instruction regardless.
1405     Value *Op = Trunc->getOperand(0);
1406     if (Op != Legal->getPrimaryInduction() && TTI.isTruncateFree(SrcTy, DestTy))
1407       return false;
1408 
1409     // If the truncated value is not an induction variable, return false.
1410     return Legal->isInductionPhi(Op);
1411   }
1412 
1413   /// Collects the instructions to scalarize for each predicated instruction in
1414   /// the loop.
1415   void collectInstsToScalarize(ElementCount VF);
1416 
1417   /// Collect Uniform and Scalar values for the given \p VF.
1418   /// The sets depend on CM decision for Load/Store instructions
1419   /// that may be vectorized as interleave, gather-scatter or scalarized.
1420   void collectUniformsAndScalars(ElementCount VF) {
1421     // Do the analysis once.
1422     if (VF.isScalar() || Uniforms.find(VF) != Uniforms.end())
1423       return;
1424     setCostBasedWideningDecision(VF);
1425     collectLoopUniforms(VF);
1426     collectLoopScalars(VF);
1427   }
1428 
1429   /// Returns true if the target machine supports masked store operation
1430   /// for the given \p DataType and kind of access to \p Ptr.
1431   bool isLegalMaskedStore(Type *DataType, Value *Ptr, Align Alignment) const {
1432     return Legal->isConsecutivePtr(DataType, Ptr) &&
1433            TTI.isLegalMaskedStore(DataType, Alignment);
1434   }
1435 
1436   /// Returns true if the target machine supports masked load operation
1437   /// for the given \p DataType and kind of access to \p Ptr.
1438   bool isLegalMaskedLoad(Type *DataType, Value *Ptr, Align Alignment) const {
1439     return Legal->isConsecutivePtr(DataType, Ptr) &&
1440            TTI.isLegalMaskedLoad(DataType, Alignment);
1441   }
1442 
1443   /// Returns true if the target machine can represent \p V as a masked gather
1444   /// or scatter operation.
1445   bool isLegalGatherOrScatter(Value *V,
1446                               ElementCount VF = ElementCount::getFixed(1)) {
1447     bool LI = isa<LoadInst>(V);
1448     bool SI = isa<StoreInst>(V);
1449     if (!LI && !SI)
1450       return false;
1451     auto *Ty = getLoadStoreType(V);
1452     Align Align = getLoadStoreAlignment(V);
1453     if (VF.isVector())
1454       Ty = VectorType::get(Ty, VF);
1455     return (LI && TTI.isLegalMaskedGather(Ty, Align)) ||
1456            (SI && TTI.isLegalMaskedScatter(Ty, Align));
1457   }
1458 
1459   /// Returns true if the target machine supports all of the reduction
1460   /// variables found for the given VF.
1461   bool canVectorizeReductions(ElementCount VF) const {
1462     return (all_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool {
1463       const RecurrenceDescriptor &RdxDesc = Reduction.second;
1464       return TTI.isLegalToVectorizeReduction(RdxDesc, VF);
1465     }));
1466   }
1467 
1468   /// Given costs for both strategies, return true if the scalar predication
1469   /// lowering should be used for div/rem.  This incorporates an override
1470   /// option so it is not simply a cost comparison.
1471   bool isDivRemScalarWithPredication(InstructionCost ScalarCost,
1472                                      InstructionCost SafeDivisorCost) const {
1473     switch (ForceSafeDivisor) {
1474     case cl::BOU_UNSET:
1475       return ScalarCost < SafeDivisorCost;
1476     case cl::BOU_TRUE:
1477       return false;
1478     case cl::BOU_FALSE:
1479       return true;
1480     };
1481     llvm_unreachable("impossible case value");
1482   }
1483 
1484   /// Returns true if \p I is an instruction which requires predication and
1485   /// for which our chosen predication strategy is scalarization (i.e. we
1486   /// don't have an alternate strategy such as masking available).
1487   /// \p VF is the vectorization factor that will be used to vectorize \p I.
1488   bool isScalarWithPredication(Instruction *I, ElementCount VF) const;
1489 
1490   /// Returns true if \p I is an instruction that needs to be predicated
1491   /// at runtime.  The result is independent of the predication mechanism.
1492   /// Superset of instructions that return true for isScalarWithPredication.
1493   bool isPredicatedInst(Instruction *I) const;
1494 
1495   /// Return the costs for our two available strategies for lowering a
1496   /// div/rem operation which requires speculating at least one lane.
1497   /// First result is for scalarization (will be invalid for scalable
1498   /// vectors); second is for the safe-divisor strategy.
1499   std::pair<InstructionCost, InstructionCost>
1500   getDivRemSpeculationCost(Instruction *I,
1501                            ElementCount VF) const;
1502 
1503   /// Returns true if \p I is a memory instruction with consecutive memory
1504   /// access that can be widened.
1505   bool memoryInstructionCanBeWidened(Instruction *I, ElementCount VF);
1506 
1507   /// Returns true if \p I is a memory instruction in an interleaved-group
1508   /// of memory accesses that can be vectorized with wide vector loads/stores
1509   /// and shuffles.
1510   bool interleavedAccessCanBeWidened(Instruction *I, ElementCount VF);
1511 
1512   /// Check if \p Instr belongs to any interleaved access group.
1513   bool isAccessInterleaved(Instruction *Instr) {
1514     return InterleaveInfo.isInterleaved(Instr);
1515   }
1516 
1517   /// Get the interleaved access group that \p Instr belongs to.
1518   const InterleaveGroup<Instruction> *
1519   getInterleavedAccessGroup(Instruction *Instr) {
1520     return InterleaveInfo.getInterleaveGroup(Instr);
1521   }
1522 
1523   /// Returns true if we're required to use a scalar epilogue for at least
1524   /// the final iteration of the original loop.
1525   bool requiresScalarEpilogue(ElementCount VF) const {
1526     if (!isScalarEpilogueAllowed())
1527       return false;
1528     // If we might exit from anywhere but the latch, we must run the exiting
1529     // iteration in scalar form.
1530     if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch())
1531       return true;
1532     return VF.isVector() && InterleaveInfo.requiresScalarEpilogue();
1533   }
1534 
1535   /// Returns true if a scalar epilogue is not allowed due to optsize or a
1536   /// loop hint annotation.
1537   bool isScalarEpilogueAllowed() const {
1538     return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed;
1539   }
1540 
1541   /// Returns true if all loop blocks should be masked to fold tail loop.
1542   bool foldTailByMasking() const { return FoldTailByMasking; }
1543 
1544   /// Returns true if we're tail-folding and want to use the active lane mask
1545   /// for vector loop control flow.
1546   bool useActiveLaneMaskForControlFlow() const {
1547     return FoldTailByMasking &&
1548            TTI.emitGetActiveLaneMask() == PredicationStyle::DataAndControlFlow;
1549   }
1550 
1551   /// Returns true if the instructions in this block require predication
1552   /// for any reason, e.g. because tail folding now requires a predicate
1553   /// or because the block in the original loop was predicated.
1554   bool blockNeedsPredicationForAnyReason(BasicBlock *BB) const {
1555     return foldTailByMasking() || Legal->blockNeedsPredication(BB);
1556   }
1557 
1558   /// A SmallMapVector to store the InLoop reduction op chains, mapping phi
1559   /// nodes to the chain of instructions representing the reductions. Uses a
1560   /// MapVector to ensure deterministic iteration order.
1561   using ReductionChainMap =
1562       SmallMapVector<PHINode *, SmallVector<Instruction *, 4>, 4>;
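  // For example, for a simple in-loop sum `s += a[i]` the chain mapped to the
  // phi of `s` contains just the add; a reduction like `s = s + a[i] + b[i]`
  // maps to both adds, in program order.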
1563 
1564   /// Return the chain of instructions representing an inloop reduction.
1565   const ReductionChainMap &getInLoopReductionChains() const {
1566     return InLoopReductionChains;
1567   }
1568 
1569   /// Returns true if the Phi is part of an inloop reduction.
1570   bool isInLoopReduction(PHINode *Phi) const {
1571     return InLoopReductionChains.count(Phi);
1572   }
1573 
1574   /// Estimate cost of an intrinsic call instruction CI if it were vectorized
1575   /// with factor VF.  Return the cost of the instruction, including
1576   /// scalarization overhead if it's needed.
1577   InstructionCost getVectorIntrinsicCost(CallInst *CI, ElementCount VF) const;
1578 
1579   /// Estimate cost of a call instruction CI if it were vectorized with factor
1580   /// VF. Return the cost of the instruction, including scalarization overhead
1581   /// if it's needed. The flag NeedToScalarize shows if the call needs to be
1582   /// scalarized -
1583   /// i.e. either a vector version isn't available, or it is too expensive.
1584   InstructionCost getVectorCallCost(CallInst *CI, ElementCount VF,
1585                                     bool &NeedToScalarize) const;
1586 
1587   /// Returns true if the per-lane cost of VectorizationFactor A is lower than
1588   /// that of B.
1589   bool isMoreProfitable(const VectorizationFactor &A,
1590                         const VectorizationFactor &B) const;
1591 
1592   /// Invalidates decisions already taken by the cost model.
1593   void invalidateCostModelingDecisions() {
1594     WideningDecisions.clear();
1595     Uniforms.clear();
1596     Scalars.clear();
1597   }
1598 
1599   /// Convenience function that returns the value of vscale_range iff
1600   /// vscale_range.min == vscale_range.max, or otherwise returns the value
1601   /// returned by the corresponding TTI method.
1602   std::optional<unsigned> getVScaleForTuning() const;
1603 
1604 private:
1605   unsigned NumPredStores = 0;
1606 
1607   /// \return An upper bound for the vectorization factors for both
1608   /// fixed and scalable vectorization, where the minimum-known number of
1609   /// elements is a power-of-2 larger than zero. If scalable vectorization is
1610   /// disabled or unsupported, then the scalable part will be equal to
1611   /// ElementCount::getScalable(0).
1612   FixedScalableVFPair computeFeasibleMaxVF(unsigned ConstTripCount,
1613                                            ElementCount UserVF,
1614                                            bool FoldTailByMasking);
1615 
1616   /// \return the maximized element count based on the target's vector
1617   /// registers and the loop trip-count, but limited to a maximum safe VF.
1618   /// This is a helper function of computeFeasibleMaxVF.
1619   ElementCount getMaximizedVFForTarget(unsigned ConstTripCount,
1620                                        unsigned SmallestType,
1621                                        unsigned WidestType,
1622                                        ElementCount MaxSafeVF,
1623                                        bool FoldTailByMasking);
1624 
1625   /// \return the maximum legal scalable VF, based on the safe max number
1626   /// of elements.
1627   ElementCount getMaxLegalScalableVF(unsigned MaxSafeElements);
1628 
1629   /// The vectorization cost is a combination of the cost itself and a boolean
1630   /// indicating whether any of the contributing operations will actually
1631   /// operate on vector values after type legalization in the backend. If this
1632   /// latter value is false, then all operations will be scalarized (i.e. no
1633   /// vectorization has actually taken place).
1634   using VectorizationCostTy = std::pair<InstructionCost, bool>;
1635 
1636   /// Returns the expected execution cost. The unit of the cost does
1637   /// not matter because we use the 'cost' units to compare different
1638   /// vector widths. The cost that is returned is *not* normalized by
1639   /// the factor width. If \p Invalid is not nullptr, this function
1640   /// will add a pair(Instruction*, ElementCount) to \p Invalid for
1641   /// each instruction that has an Invalid cost for the given VF.
1642   using InstructionVFPair = std::pair<Instruction *, ElementCount>;
1643   VectorizationCostTy
1644   expectedCost(ElementCount VF,
1645                SmallVectorImpl<InstructionVFPair> *Invalid = nullptr);
1646 
1647   /// Returns the execution time cost of an instruction for a given vector
1648   /// width. Vector width of one means scalar.
1649   VectorizationCostTy getInstructionCost(Instruction *I, ElementCount VF);
1650 
1651   /// The cost-computation logic from getInstructionCost which provides
1652   /// the vector type as an output parameter.
1653   InstructionCost getInstructionCost(Instruction *I, ElementCount VF,
1654                                      Type *&VectorTy);
1655 
1656   /// Return the cost of instructions in an inloop reduction pattern, if I is
1657   /// part of that pattern.
1658   std::optional<InstructionCost>
1659   getReductionPatternCost(Instruction *I, ElementCount VF, Type *VectorTy,
1660                           TTI::TargetCostKind CostKind);
1661 
1662   /// Calculate vectorization cost of memory instruction \p I.
1663   InstructionCost getMemoryInstructionCost(Instruction *I, ElementCount VF);
1664 
1665   /// The cost computation for scalarized memory instruction.
1666   InstructionCost getMemInstScalarizationCost(Instruction *I, ElementCount VF);
1667 
1668   /// The cost computation for interleaving group of memory instructions.
1669   InstructionCost getInterleaveGroupCost(Instruction *I, ElementCount VF);
1670 
1671   /// The cost computation for Gather/Scatter instruction.
1672   InstructionCost getGatherScatterCost(Instruction *I, ElementCount VF);
1673 
1674   /// The cost computation for widening instruction \p I with consecutive
1675   /// memory access.
1676   InstructionCost getConsecutiveMemOpCost(Instruction *I, ElementCount VF);
1677 
1678   /// The cost calculation for Load/Store instruction \p I with uniform pointer -
1679   /// Load: scalar load + broadcast.
1680   /// Store: scalar store + (loop invariant value stored? 0 : extract of last
1681   /// element)
1682   InstructionCost getUniformMemOpCost(Instruction *I, ElementCount VF);
1683 
1684   /// Estimate the overhead of scalarizing an instruction. This is a
1685   /// convenience wrapper for the type-based getScalarizationOverhead API.
1686   InstructionCost getScalarizationOverhead(Instruction *I, ElementCount VF,
1687                                            TTI::TargetCostKind CostKind) const;
1688 
1689   /// Returns true if an artificially high cost for emulated masked memrefs
1690   /// should be used.
1691   bool useEmulatedMaskMemRefHack(Instruction *I, ElementCount VF);
1692 
1693   /// Map of scalar integer values to the smallest bitwidth they can be legally
1694   /// represented as. The vector equivalents of these values should be truncated
1695   /// to this type.
1696   MapVector<Instruction *, uint64_t> MinBWs;
1697 
1698   /// A type representing the costs for instructions if they were to be
1699   /// scalarized rather than vectorized. The entries are Instruction-Cost
1700   /// pairs.
1701   using ScalarCostsTy = DenseMap<Instruction *, InstructionCost>;
1702 
1703   /// Per-VF sets of BasicBlocks that are known to be present after
1704   /// vectorization as predicated blocks.
1705   DenseMap<ElementCount, SmallPtrSet<BasicBlock *, 4>>
1706       PredicatedBBsAfterVectorization;
1707 
1708   /// Records whether it is allowed to have the original scalar loop execute at
1709   /// least once. This may be needed as a fallback loop in case runtime
1710   /// aliasing/dependence checks fail, or to handle the tail/remainder
1711   /// iterations when the trip count is unknown or doesn't divide by the VF,
1712   /// or as a peel-loop to handle gaps in interleave-groups.
1713   /// Under optsize and when the trip count is very small we don't allow any
1714   /// iterations to execute in the scalar loop.
1715   ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
1716 
1717   /// All blocks of loop are to be masked to fold tail of scalar iterations.
1718   bool FoldTailByMasking = false;
1719 
1720   /// A map holding scalar costs for different vectorization factors. The
1721   /// presence of a cost for an instruction in the mapping indicates that the
1722   /// instruction will be scalarized when vectorizing with the associated
1723   /// vectorization factor. The entries are VF-ScalarCostTy pairs.
1724   DenseMap<ElementCount, ScalarCostsTy> InstsToScalarize;
1725 
1726   /// Holds the instructions known to be uniform after vectorization.
1727   /// The data is collected per VF.
1728   DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Uniforms;
1729 
1730   /// Holds the instructions known to be scalar after vectorization.
1731   /// The data is collected per VF.
1732   DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Scalars;
1733 
1734   /// Holds the instructions (address computations) that are forced to be
1735   /// scalarized.
1736   DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> ForcedScalars;
1737 
1738   /// PHINodes of the reductions that should be expanded in-loop along with
1739   /// their associated chains of reduction operations, in program order from top
1740   /// (PHI) to bottom.
1741   ReductionChainMap InLoopReductionChains;
1742 
1743   /// A Map of inloop reduction operations and their immediate chain operand.
1744   /// FIXME: This can be removed once reductions can be costed correctly in
1745   /// vplan. This was added to allow quick lookup to the inloop operations,
1746   /// without having to loop through InLoopReductionChains.
1747   DenseMap<Instruction *, Instruction *> InLoopReductionImmediateChains;
1748 
1749   /// Returns the expected difference in cost from scalarizing the expression
1750   /// feeding a predicated instruction \p PredInst. The instructions to
1751   /// scalarize and their scalar costs are collected in \p ScalarCosts. A
1752   /// non-negative return value implies the expression will be scalarized.
1753   /// Currently, only single-use chains are considered for scalarization.
1754   InstructionCost computePredInstDiscount(Instruction *PredInst,
1755                                           ScalarCostsTy &ScalarCosts,
1756                                           ElementCount VF);
1757 
1758   /// Collect the instructions that are uniform after vectorization. An
1759   /// instruction is uniform if we represent it with a single scalar value in
1760   /// the vectorized loop corresponding to each vector iteration. Examples of
1761   /// uniform instructions include pointer operands of consecutive or
1762   /// interleaved memory accesses. Note that although uniformity implies an
1763   /// instruction will be scalar, the reverse is not true. In general, a
1764   /// scalarized instruction will be represented by VF scalar values in the
1765   /// vectorized loop, each corresponding to an iteration of the original
1766   /// scalar loop.
1767   void collectLoopUniforms(ElementCount VF);
1768 
1769   /// Collect the instructions that are scalar after vectorization. An
1770   /// instruction is scalar if it is known to be uniform or will be scalarized
1771   /// during vectorization. collectLoopScalars should only add non-uniform nodes
1772   /// to the list if they are used by a load/store instruction that is marked as
1773   /// CM_Scalarize. Non-uniform scalarized instructions will be represented by
1774   /// VF values in the vectorized loop, each corresponding to an iteration of
1775   /// the original scalar loop.
1776   void collectLoopScalars(ElementCount VF);
1777 
1778   /// Keeps cost model vectorization decision and cost for instructions.
1779   /// Right now it is used for memory instructions only.
1780   using DecisionList = DenseMap<std::pair<Instruction *, ElementCount>,
1781                                 std::pair<InstWidening, InstructionCost>>;
1782 
1783   DecisionList WideningDecisions;
1784 
1785   /// Returns true if \p V is expected to be vectorized and it needs to be
1786   /// extracted.
1787   bool needsExtract(Value *V, ElementCount VF) const {
1788     Instruction *I = dyn_cast<Instruction>(V);
1789     if (VF.isScalar() || !I || !TheLoop->contains(I) ||
1790         TheLoop->isLoopInvariant(I))
1791       return false;
1792 
1793     // Assume we can vectorize V (and hence we need extraction) if the
1794     // scalars are not computed yet. This can happen, because it is called
1795     // via getScalarizationOverhead from setCostBasedWideningDecision, before
1796     // the scalars are collected. That should be a safe assumption in most
1797     // cases, because we check if the operands have vectorizable types
1798     // beforehand in LoopVectorizationLegality.
1799     return Scalars.find(VF) == Scalars.end() ||
1800            !isScalarAfterVectorization(I, VF);
1801   };
1802 
1803   /// Returns a range containing only operands needing to be extracted.
1804   SmallVector<Value *, 4> filterExtractingOperands(Instruction::op_range Ops,
1805                                                    ElementCount VF) const {
1806     return SmallVector<Value *, 4>(make_filter_range(
1807         Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); }));
1808   }
1809 
1810   /// Determines if we have the infrastructure to vectorize loop \p L and its
1811   /// epilogue, assuming the main loop is vectorized by \p VF.
1812   bool isCandidateForEpilogueVectorization(const Loop &L,
1813                                            const ElementCount VF) const;
1814 
1815   /// Returns true if epilogue vectorization is considered profitable, and
1816   /// false otherwise.
1817   /// \p VF is the vectorization factor chosen for the original loop.
1818   bool isEpilogueVectorizationProfitable(const ElementCount VF) const;
1819 
1820 public:
1821   /// The loop that we evaluate.
1822   Loop *TheLoop;
1823 
1824   /// Predicated scalar evolution analysis.
1825   PredicatedScalarEvolution &PSE;
1826 
1827   /// Loop Info analysis.
1828   LoopInfo *LI;
1829 
1830   /// Vectorization legality.
1831   LoopVectorizationLegality *Legal;
1832 
1833   /// Vector target information.
1834   const TargetTransformInfo &TTI;
1835 
1836   /// Target Library Info.
1837   const TargetLibraryInfo *TLI;
1838 
1839   /// Demanded bits analysis.
1840   DemandedBits *DB;
1841 
1842   /// Assumption cache.
1843   AssumptionCache *AC;
1844 
1845   /// Interface to emit optimization remarks.
1846   OptimizationRemarkEmitter *ORE;
1847 
1848   const Function *TheFunction;
1849 
1850   /// Loop Vectorize Hint.
1851   const LoopVectorizeHints *Hints;
1852 
1853   /// The interleave access information contains groups of interleaved accesses
1854   /// with the same stride and close to each other.
1855   InterleavedAccessInfo &InterleaveInfo;
1856 
1857   /// Values to ignore in the cost model.
1858   SmallPtrSet<const Value *, 16> ValuesToIgnore;
1859 
1860   /// Values to ignore in the cost model when VF > 1.
1861   SmallPtrSet<const Value *, 16> VecValuesToIgnore;
1862 
1863   /// All element types found in the loop.
1864   SmallPtrSet<Type *, 16> ElementTypesInLoop;
1865 
1866   /// Profitable vector factors.
1867   SmallVector<VectorizationFactor, 8> ProfitableVFs;
1868 };
1869 } // end namespace llvm
1870 
1871 namespace {
1872 /// Helper struct to manage generating runtime checks for vectorization.
1873 ///
1874 /// The runtime checks are created up-front in temporary blocks, un-linked from
1875 /// the existing IR, to allow their cost to be estimated accurately. After
1876 /// deciding to vectorize, the checks are moved back. If deciding not to
1877 /// vectorize, the temporary blocks are completely removed.
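///
/// A rough usage sketch (variable names here are illustrative only):
///   GeneratedRTChecks Checks(SE, DT, LI, TTI, DL);
///   Checks.Create(L, LAI, UnionPred, VF, IC);  // build checks up-front
///   InstructionCost C = Checks.getCost();      // feed into the cost model
///   // If we decide to vectorize:
///   Checks.emitSCEVChecks(Bypass, LoopVectorPreHeader, LoopExitBlock);
///   Checks.emitMemRuntimeChecks(Bypass, LoopVectorPreHeader);
///   // Otherwise the destructor removes the unused check blocks.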
1878 class GeneratedRTChecks {
1879   /// Basic block which contains the generated SCEV checks, if any.
1880   BasicBlock *SCEVCheckBlock = nullptr;
1881 
1882   /// The value representing the result of the generated SCEV checks. If it is
1883   /// nullptr, either no SCEV checks have been generated or they have been used.
1884   Value *SCEVCheckCond = nullptr;
1885 
1886   /// Basic block which contains the generated memory runtime checks, if any.
1887   BasicBlock *MemCheckBlock = nullptr;
1888 
1889   /// The value representing the result of the generated memory runtime checks.
1890   /// If it is nullptr, either no memory runtime checks have been generated or
1891   /// they have been used.
1892   Value *MemRuntimeCheckCond = nullptr;
1893 
1894   DominatorTree *DT;
1895   LoopInfo *LI;
1896   TargetTransformInfo *TTI;
1897 
1898   SCEVExpander SCEVExp;
1899   SCEVExpander MemCheckExp;
1900 
1901   bool CostTooHigh = false;
1902 
1903 public:
1904   GeneratedRTChecks(ScalarEvolution &SE, DominatorTree *DT, LoopInfo *LI,
1905                     TargetTransformInfo *TTI, const DataLayout &DL)
1906       : DT(DT), LI(LI), TTI(TTI), SCEVExp(SE, DL, "scev.check"),
1907         MemCheckExp(SE, DL, "scev.check") {}
1908 
1909   /// Generate runtime checks in SCEVCheckBlock and MemCheckBlock, so we can
1910   /// accurately estimate the cost of the runtime checks. The blocks are
1911   /// un-linked from the IR and are added back during vector code generation. If
1912   /// there is no vector code generation, the check blocks are removed
1913   /// completely.
1914   void Create(Loop *L, const LoopAccessInfo &LAI,
1915               const SCEVPredicate &UnionPred, ElementCount VF, unsigned IC) {
1916 
1917     // Hard cutoff to limit compile-time increase in case a very large number of
1918     // runtime checks needs to be generated.
1919     // TODO: Skip cutoff if the loop is guaranteed to execute, e.g. due to
1920     // profile info.
1921     CostTooHigh =
1922         LAI.getNumRuntimePointerChecks() > VectorizeMemoryCheckThreshold;
1923     if (CostTooHigh)
1924       return;
1925 
1926     BasicBlock *LoopHeader = L->getHeader();
1927     BasicBlock *Preheader = L->getLoopPreheader();
1928 
1929     // Use SplitBlock to create blocks for SCEV & memory runtime checks to
1930     // ensure the blocks are properly added to LoopInfo & DominatorTree. Those
1931     // may be used by SCEVExpander. The blocks will be un-linked from their
1932     // predecessors and removed from LI & DT at the end of the function.
1933     if (!UnionPred.isAlwaysTrue()) {
1934       SCEVCheckBlock = SplitBlock(Preheader, Preheader->getTerminator(), DT, LI,
1935                                   nullptr, "vector.scevcheck");
1936 
1937       SCEVCheckCond = SCEVExp.expandCodeForPredicate(
1938           &UnionPred, SCEVCheckBlock->getTerminator());
1939     }
1940 
1941     const auto &RtPtrChecking = *LAI.getRuntimePointerChecking();
1942     if (RtPtrChecking.Need) {
1943       auto *Pred = SCEVCheckBlock ? SCEVCheckBlock : Preheader;
1944       MemCheckBlock = SplitBlock(Pred, Pred->getTerminator(), DT, LI, nullptr,
1945                                  "vector.memcheck");
1946 
1947       auto DiffChecks = RtPtrChecking.getDiffChecks();
1948       if (DiffChecks) {
1949         Value *RuntimeVF = nullptr;
1950         MemRuntimeCheckCond = addDiffRuntimeChecks(
1951             MemCheckBlock->getTerminator(), *DiffChecks, MemCheckExp,
1952             [VF, &RuntimeVF](IRBuilderBase &B, unsigned Bits) {
1953               if (!RuntimeVF)
1954                 RuntimeVF = getRuntimeVF(B, B.getIntNTy(Bits), VF);
1955               return RuntimeVF;
1956             },
1957             IC);
1958       } else {
1959         MemRuntimeCheckCond =
1960             addRuntimeChecks(MemCheckBlock->getTerminator(), L,
1961                              RtPtrChecking.getChecks(), MemCheckExp);
1962       }
1963       assert(MemRuntimeCheckCond &&
1964              "no RT checks generated although RtPtrChecking "
1965              "claimed checks are required");
1966     }
1967 
1968     if (!MemCheckBlock && !SCEVCheckBlock)
1969       return;
1970 
1971     // Unhook the temporary block with the checks, update various places
1972     // accordingly.
1973     if (SCEVCheckBlock)
1974       SCEVCheckBlock->replaceAllUsesWith(Preheader);
1975     if (MemCheckBlock)
1976       MemCheckBlock->replaceAllUsesWith(Preheader);
1977 
1978     if (SCEVCheckBlock) {
1979       SCEVCheckBlock->getTerminator()->moveBefore(Preheader->getTerminator());
1980       new UnreachableInst(Preheader->getContext(), SCEVCheckBlock);
1981       Preheader->getTerminator()->eraseFromParent();
1982     }
1983     if (MemCheckBlock) {
1984       MemCheckBlock->getTerminator()->moveBefore(Preheader->getTerminator());
1985       new UnreachableInst(Preheader->getContext(), MemCheckBlock);
1986       Preheader->getTerminator()->eraseFromParent();
1987     }
1988 
1989     DT->changeImmediateDominator(LoopHeader, Preheader);
1990     if (MemCheckBlock) {
1991       DT->eraseNode(MemCheckBlock);
1992       LI->removeBlock(MemCheckBlock);
1993     }
1994     if (SCEVCheckBlock) {
1995       DT->eraseNode(SCEVCheckBlock);
1996       LI->removeBlock(SCEVCheckBlock);
1997     }
1998   }
1999 
2000   InstructionCost getCost() {
2001     if (SCEVCheckBlock || MemCheckBlock)
2002       LLVM_DEBUG(dbgs() << "Calculating cost of runtime checks:\n");
2003 
2004     if (CostTooHigh) {
2005       InstructionCost Cost;
2006       Cost.setInvalid();
2007       LLVM_DEBUG(dbgs() << "  number of checks exceeded threshold\n");
2008       return Cost;
2009     }
2010 
2011     InstructionCost RTCheckCost = 0;
2012     if (SCEVCheckBlock)
2013       for (Instruction &I : *SCEVCheckBlock) {
2014         if (SCEVCheckBlock->getTerminator() == &I)
2015           continue;
2016         InstructionCost C =
2017             TTI->getInstructionCost(&I, TTI::TCK_RecipThroughput);
2018         LLVM_DEBUG(dbgs() << "  " << C << "  for " << I << "\n");
2019         RTCheckCost += C;
2020       }
2021     if (MemCheckBlock)
2022       for (Instruction &I : *MemCheckBlock) {
2023         if (MemCheckBlock->getTerminator() == &I)
2024           continue;
2025         InstructionCost C =
2026             TTI->getInstructionCost(&I, TTI::TCK_RecipThroughput);
2027         LLVM_DEBUG(dbgs() << "  " << C << "  for " << I << "\n");
2028         RTCheckCost += C;
2029       }
2030 
2031     if (SCEVCheckBlock || MemCheckBlock)
2032       LLVM_DEBUG(dbgs() << "Total cost of runtime checks: " << RTCheckCost
2033                         << "\n");
2034 
2035     return RTCheckCost;
2036   }
2037 
2038   /// Remove the created SCEV & memory runtime check blocks & instructions, if
2039   /// unused.
2040   ~GeneratedRTChecks() {
2041     SCEVExpanderCleaner SCEVCleaner(SCEVExp);
2042     SCEVExpanderCleaner MemCheckCleaner(MemCheckExp);
2043     if (!SCEVCheckCond)
2044       SCEVCleaner.markResultUsed();
2045 
2046     if (!MemRuntimeCheckCond)
2047       MemCheckCleaner.markResultUsed();
2048 
2049     if (MemRuntimeCheckCond) {
2050       auto &SE = *MemCheckExp.getSE();
2051       // Memory runtime check generation creates compares that use expanded
2052       // values. Remove them before running the SCEVExpanderCleaners.
2053       for (auto &I : make_early_inc_range(reverse(*MemCheckBlock))) {
2054         if (MemCheckExp.isInsertedInstruction(&I))
2055           continue;
2056         SE.forgetValue(&I);
2057         I.eraseFromParent();
2058       }
2059     }
2060     MemCheckCleaner.cleanup();
2061     SCEVCleaner.cleanup();
2062 
2063     if (SCEVCheckCond)
2064       SCEVCheckBlock->eraseFromParent();
2065     if (MemRuntimeCheckCond)
2066       MemCheckBlock->eraseFromParent();
2067   }
2068 
2069   /// Adds the generated SCEVCheckBlock before \p LoopVectorPreHeader and
2070   /// adjusts the branches to branch to the vector preheader or \p Bypass,
2071   /// depending on the generated condition.
2072   BasicBlock *emitSCEVChecks(BasicBlock *Bypass,
2073                              BasicBlock *LoopVectorPreHeader,
2074                              BasicBlock *LoopExitBlock) {
2075     if (!SCEVCheckCond)
2076       return nullptr;
2077 
2078     Value *Cond = SCEVCheckCond;
2079     // Mark the check as used, to prevent it from being removed during cleanup.
2080     SCEVCheckCond = nullptr;
2081     if (auto *C = dyn_cast<ConstantInt>(Cond))
2082       if (C->isZero())
2083         return nullptr;
2084 
2085     auto *Pred = LoopVectorPreHeader->getSinglePredecessor();
2086 
2087     BranchInst::Create(LoopVectorPreHeader, SCEVCheckBlock);
2088     // Create new preheader for vector loop.
2089     if (auto *PL = LI->getLoopFor(LoopVectorPreHeader))
2090       PL->addBasicBlockToLoop(SCEVCheckBlock, *LI);
2091 
2092     SCEVCheckBlock->getTerminator()->eraseFromParent();
2093     SCEVCheckBlock->moveBefore(LoopVectorPreHeader);
2094     Pred->getTerminator()->replaceSuccessorWith(LoopVectorPreHeader,
2095                                                 SCEVCheckBlock);
2096 
2097     DT->addNewBlock(SCEVCheckBlock, Pred);
2098     DT->changeImmediateDominator(LoopVectorPreHeader, SCEVCheckBlock);
2099 
2100     ReplaceInstWithInst(SCEVCheckBlock->getTerminator(),
2101                         BranchInst::Create(Bypass, LoopVectorPreHeader, Cond));
2102     return SCEVCheckBlock;
2103   }
2104 
2105   /// Adds the generated MemCheckBlock before \p LoopVectorPreHeader and adjusts
2106   /// the branches to branch to the vector preheader or \p Bypass, depending on
2107   /// the generated condition.
2108   BasicBlock *emitMemRuntimeChecks(BasicBlock *Bypass,
2109                                    BasicBlock *LoopVectorPreHeader) {
2110     // Check if we generated code that checks at runtime whether arrays overlap.
2111     if (!MemRuntimeCheckCond)
2112       return nullptr;
2113 
2114     auto *Pred = LoopVectorPreHeader->getSinglePredecessor();
2115     Pred->getTerminator()->replaceSuccessorWith(LoopVectorPreHeader,
2116                                                 MemCheckBlock);
2117 
2118     DT->addNewBlock(MemCheckBlock, Pred);
2119     DT->changeImmediateDominator(LoopVectorPreHeader, MemCheckBlock);
2120     MemCheckBlock->moveBefore(LoopVectorPreHeader);
2121 
2122     if (auto *PL = LI->getLoopFor(LoopVectorPreHeader))
2123       PL->addBasicBlockToLoop(MemCheckBlock, *LI);
2124 
2125     ReplaceInstWithInst(
2126         MemCheckBlock->getTerminator(),
2127         BranchInst::Create(Bypass, LoopVectorPreHeader, MemRuntimeCheckCond));
2128     MemCheckBlock->getTerminator()->setDebugLoc(
2129         Pred->getTerminator()->getDebugLoc());
2130 
2131     // Mark the check as used, to prevent it from being removed during cleanup.
2132     MemRuntimeCheckCond = nullptr;
2133     return MemCheckBlock;
2134   }
2135 };
2136 } // namespace
2137 
2138 // Return true if \p OuterLp is an outer loop annotated with hints for explicit
2139 // vectorization. The loop needs to be annotated with #pragma omp simd
2140 // simdlen(#) or #pragma clang vectorize(enable) vectorize_width(#). If the
2141 // vector length information is not provided, vectorization is not considered
2142 // explicit. Interleave hints are not allowed either. These limitations will be
2143 // relaxed in the future.
2144 // Please note that we are currently forced to abuse the pragma 'clang
2145 // vectorize' semantics. This pragma provides *auto-vectorization hints*
2146 // (i.e., LV must check that vectorization is legal) whereas pragma 'omp simd'
2147 // provides *explicit vectorization hints* (LV can bypass legal checks and
2148 // assume that vectorization is legal). However, both hints are implemented
2149 // using the same metadata (llvm.loop.vectorize, processed by
2150 // LoopVectorizeHints). This will be fixed in the future when the native IR
2151 // representation for pragma 'omp simd' is introduced.
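//
// For example, an outer loop annotated like the following (source-level
// sketch) carries the kind of explicit vectorization hint this function looks
// for:
//   #pragma clang loop vectorize(enable) vectorize_width(4)
//   for (int i = 0; i < N; ++i)
//     for (int j = 0; j < M; ++j)
//       ...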
2152 static bool isExplicitVecOuterLoop(Loop *OuterLp,
2153                                    OptimizationRemarkEmitter *ORE) {
2154   assert(!OuterLp->isInnermost() && "This is not an outer loop");
2155   LoopVectorizeHints Hints(OuterLp, true /*DisableInterleaving*/, *ORE);
2156 
2157   // Only outer loops with an explicit vectorization hint are supported.
2158   // Unannotated outer loops are ignored.
2159   if (Hints.getForce() == LoopVectorizeHints::FK_Undefined)
2160     return false;
2161 
2162   Function *Fn = OuterLp->getHeader()->getParent();
2163   if (!Hints.allowVectorization(Fn, OuterLp,
2164                                 true /*VectorizeOnlyWhenForced*/)) {
2165     LLVM_DEBUG(dbgs() << "LV: Loop hints prevent outer loop vectorization.\n");
2166     return false;
2167   }
2168 
2169   if (Hints.getInterleave() > 1) {
2170     // TODO: Interleave support is future work.
2171     LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for "
2172                          "outer loops.\n");
2173     Hints.emitRemarkWithHints();
2174     return false;
2175   }
2176 
2177   return true;
2178 }
2179 
2180 static void collectSupportedLoops(Loop &L, LoopInfo *LI,
2181                                   OptimizationRemarkEmitter *ORE,
2182                                   SmallVectorImpl<Loop *> &V) {
2183   // Collect inner loops and outer loops without irreducible control flow. For
2184   // now, only collect outer loops that have explicit vectorization hints. If we
2185   // are stress testing the VPlan H-CFG construction, we collect the outermost
2186   // loop of every loop nest.
2187   if (L.isInnermost() || VPlanBuildStressTest ||
2188       (EnableVPlanNativePath && isExplicitVecOuterLoop(&L, ORE))) {
2189     LoopBlocksRPO RPOT(&L);
2190     RPOT.perform(LI);
2191     if (!containsIrreducibleCFG<const BasicBlock *>(RPOT, *LI)) {
2192       V.push_back(&L);
2193       // TODO: Collect inner loops inside marked outer loops in case
2194       // vectorization fails for the outer loop. Do not invoke
2195       // 'containsIrreducibleCFG' again for inner loops when the outer loop is
2196       // already known to be reducible. We can use an inherited attribute for
2197       // that.
2198       return;
2199     }
2200   }
2201   for (Loop *InnerL : L)
2202     collectSupportedLoops(*InnerL, LI, ORE, V);
2203 }
2204 
2205 namespace {
2206 
2207 /// The LoopVectorize Pass.
2208 struct LoopVectorize : public FunctionPass {
2209   /// Pass identification, replacement for typeid
2210   static char ID;
2211 
2212   LoopVectorizePass Impl;
2213 
2214   explicit LoopVectorize(bool InterleaveOnlyWhenForced = false,
2215                          bool VectorizeOnlyWhenForced = false)
2216       : FunctionPass(ID),
2217         Impl({InterleaveOnlyWhenForced, VectorizeOnlyWhenForced}) {
2218     initializeLoopVectorizePass(*PassRegistry::getPassRegistry());
2219   }
2220 
2221   bool runOnFunction(Function &F) override {
2222     if (skipFunction(F))
2223       return false;
2224 
2225     auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
2226     auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
2227     auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
2228     auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
2229     auto *BFI = &getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI();
2230     auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
2231     auto *TLI = TLIP ? &TLIP->getTLI(F) : nullptr;
2232     auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
2233     auto &LAIs = getAnalysis<LoopAccessLegacyAnalysis>().getLAIs();
2234     auto *DB = &getAnalysis<DemandedBitsWrapperPass>().getDemandedBits();
2235     auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
2236     auto *PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
2237 
2238     return Impl
2239         .runImpl(F, *SE, *LI, *TTI, *DT, *BFI, TLI, *DB, *AC, LAIs, *ORE, PSI)
2240         .MadeAnyChange;
2241   }
2242 
2243   void getAnalysisUsage(AnalysisUsage &AU) const override {
2244     AU.addRequired<AssumptionCacheTracker>();
2245     AU.addRequired<BlockFrequencyInfoWrapperPass>();
2246     AU.addRequired<DominatorTreeWrapperPass>();
2247     AU.addRequired<LoopInfoWrapperPass>();
2248     AU.addRequired<ScalarEvolutionWrapperPass>();
2249     AU.addRequired<TargetTransformInfoWrapperPass>();
2250     AU.addRequired<LoopAccessLegacyAnalysis>();
2251     AU.addRequired<DemandedBitsWrapperPass>();
2252     AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
2253     AU.addRequired<InjectTLIMappingsLegacy>();
2254 
2255     // We currently do not preserve loopinfo/dominator analyses with outer loop
2256     // vectorization. Until this is addressed, mark these analyses as preserved
2257     // only for non-VPlan-native path.
2258     // TODO: Preserve Loop and Dominator analyses for VPlan-native path.
2259     if (!EnableVPlanNativePath) {
2260       AU.addPreserved<LoopInfoWrapperPass>();
2261       AU.addPreserved<DominatorTreeWrapperPass>();
2262     }
2263 
2264     AU.addPreserved<BasicAAWrapperPass>();
2265     AU.addPreserved<GlobalsAAWrapperPass>();
2266     AU.addRequired<ProfileSummaryInfoWrapperPass>();
2267   }
2268 };
2269 
2270 } // end anonymous namespace
2271 
2272 //===----------------------------------------------------------------------===//
2273 // Implementation of LoopVectorizationLegality, InnerLoopVectorizer and
2274 // LoopVectorizationCostModel and LoopVectorizationPlanner.
2275 //===----------------------------------------------------------------------===//
2276 
2277 Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) {
2278   // We need to place the broadcast of invariant variables outside the loop,
2279   // but only if it's proven safe to do so. Else, broadcast will be inside
2280   // vector loop body.
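  // For illustration only (assuming a fixed VF of 4 and an i32 invariant %x),
  // the splat created below typically expands to IR along the lines of:
  //   %broadcast.splatinsert = insertelement <4 x i32> poison, i32 %x, i64 0
  //   %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert,
  //                                    <4 x i32> poison, <4 x i32> zeroinitializer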
2281   Instruction *Instr = dyn_cast<Instruction>(V);
2282   bool SafeToHoist = OrigLoop->isLoopInvariant(V) &&
2283                      (!Instr ||
2284                       DT->dominates(Instr->getParent(), LoopVectorPreHeader));
2285   // Place the code for broadcasting invariant variables in the new preheader.
2286   IRBuilder<>::InsertPointGuard Guard(Builder);
2287   if (SafeToHoist)
2288     Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
2289 
2290   // Broadcast the scalar into all locations in the vector.
2291   Value *Shuf = Builder.CreateVectorSplat(VF, V, "broadcast");
2292 
2293   return Shuf;
2294 }
2295 
2296 /// This function adds
2297 /// (StartIdx * Step, (StartIdx + 1) * Step, (StartIdx + 2) * Step, ...)
2298 /// to each vector element of Val. The sequence starts at StartIdx.
2299 /// \p BinOp is relevant for FP induction variables.
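/// For example, with VF = 4, StartIdx = 0 and Step = 2, an integer induction
/// adds <0, 2, 4, 6> to Val, i.e. lane L receives Val[L] + L * 2.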
2300 static Value *getStepVector(Value *Val, Value *StartIdx, Value *Step,
2301                             Instruction::BinaryOps BinOp, ElementCount VF,
2302                             IRBuilderBase &Builder) {
2303   assert(VF.isVector() && "only vector VFs are supported");
2304 
2305   // Create and check the types.
2306   auto *ValVTy = cast<VectorType>(Val->getType());
2307   ElementCount VLen = ValVTy->getElementCount();
2308 
2309   Type *STy = Val->getType()->getScalarType();
2310   assert((STy->isIntegerTy() || STy->isFloatingPointTy()) &&
2311          "Induction Step must be an integer or FP");
2312   assert(Step->getType() == STy && "Step has wrong type");
2313 
2314   SmallVector<Constant *, 8> Indices;
2315 
2316   // Create a vector of consecutive numbers from zero to VF.
2317   VectorType *InitVecValVTy = ValVTy;
2318   if (STy->isFloatingPointTy()) {
2319     Type *InitVecValSTy =
2320         IntegerType::get(STy->getContext(), STy->getScalarSizeInBits());
2321     InitVecValVTy = VectorType::get(InitVecValSTy, VLen);
2322   }
2323   Value *InitVec = Builder.CreateStepVector(InitVecValVTy);
2324 
2325   // Splat the StartIdx
2326   Value *StartIdxSplat = Builder.CreateVectorSplat(VLen, StartIdx);
2327 
2328   if (STy->isIntegerTy()) {
2329     InitVec = Builder.CreateAdd(InitVec, StartIdxSplat);
2330     Step = Builder.CreateVectorSplat(VLen, Step);
2331     assert(Step->getType() == Val->getType() && "Invalid step vec");
2332     // FIXME: The newly created binary instructions should contain nsw/nuw
2333     // flags, which can be found from the original scalar operations.
2334     Step = Builder.CreateMul(InitVec, Step);
2335     return Builder.CreateAdd(Val, Step, "induction");
2336   }
2337 
2338   // Floating point induction.
2339   assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) &&
2340          "Binary Opcode should be specified for FP induction");
2341   InitVec = Builder.CreateUIToFP(InitVec, ValVTy);
2342   InitVec = Builder.CreateFAdd(InitVec, StartIdxSplat);
2343 
2344   Step = Builder.CreateVectorSplat(VLen, Step);
2345   Value *MulOp = Builder.CreateFMul(InitVec, Step);
2346   return Builder.CreateBinOp(BinOp, Val, MulOp, "induction");
2347 }
2348 
2349 /// Compute scalar induction steps. \p ScalarIV is the scalar induction
2350 /// variable on which to base the steps, and \p Step is the size of the step.
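/// For illustration, with a fixed VF = 4, UF = 2 and an integer IV, the value
/// generated for unroll part P and lane L is ScalarIV + (P * 4 + L) * Step,
/// e.g. part 1, lane 2 receives ScalarIV + 6 * Step.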
2351 static void buildScalarSteps(Value *ScalarIV, Value *Step,
2352                              const InductionDescriptor &ID, VPValue *Def,
2353                              VPTransformState &State) {
2354   IRBuilderBase &Builder = State.Builder;
2355 
2356   // Ensure step has the same type as that of scalar IV.
2357   Type *ScalarIVTy = ScalarIV->getType()->getScalarType();
2358   if (ScalarIVTy != Step->getType()) {
2359     // TODO: Also use VPDerivedIVRecipe when only the step needs truncating, to
2360     // avoid separate truncate here.
2361     assert(Step->getType()->isIntegerTy() &&
2362            "Truncation requires an integer step");
2363     Step = State.Builder.CreateTrunc(Step, ScalarIVTy);
2364   }
2365 
2366   // We build scalar steps for both integer and floating-point induction
2367   // variables. Here, we determine the kind of arithmetic we will perform.
2368   Instruction::BinaryOps AddOp;
2369   Instruction::BinaryOps MulOp;
2370   if (ScalarIVTy->isIntegerTy()) {
2371     AddOp = Instruction::Add;
2372     MulOp = Instruction::Mul;
2373   } else {
2374     AddOp = ID.getInductionOpcode();
2375     MulOp = Instruction::FMul;
2376   }
2377 
2378   // Determine the number of scalars we need to generate for each unroll
2379   // iteration.
2380   bool FirstLaneOnly = vputils::onlyFirstLaneUsed(Def);
2381   // Compute the scalar steps and save the results in State.
2382   Type *IntStepTy = IntegerType::get(ScalarIVTy->getContext(),
2383                                      ScalarIVTy->getScalarSizeInBits());
2384   Type *VecIVTy = nullptr;
2385   Value *UnitStepVec = nullptr, *SplatStep = nullptr, *SplatIV = nullptr;
2386   if (!FirstLaneOnly && State.VF.isScalable()) {
2387     VecIVTy = VectorType::get(ScalarIVTy, State.VF);
2388     UnitStepVec =
2389         Builder.CreateStepVector(VectorType::get(IntStepTy, State.VF));
2390     SplatStep = Builder.CreateVectorSplat(State.VF, Step);
2391     SplatIV = Builder.CreateVectorSplat(State.VF, ScalarIV);
2392   }
2393 
2394   unsigned StartPart = 0;
2395   unsigned EndPart = State.UF;
2396   unsigned StartLane = 0;
2397   unsigned EndLane = FirstLaneOnly ? 1 : State.VF.getKnownMinValue();
2398   if (State.Instance) {
2399     StartPart = State.Instance->Part;
2400     EndPart = StartPart + 1;
2401     StartLane = State.Instance->Lane.getKnownLane();
2402     EndLane = StartLane + 1;
2403   }
2404   for (unsigned Part = StartPart; Part < EndPart; ++Part) {
2405     Value *StartIdx0 = createStepForVF(Builder, IntStepTy, State.VF, Part);
2406 
2407     if (!FirstLaneOnly && State.VF.isScalable()) {
2408       auto *SplatStartIdx = Builder.CreateVectorSplat(State.VF, StartIdx0);
2409       auto *InitVec = Builder.CreateAdd(SplatStartIdx, UnitStepVec);
2410       if (ScalarIVTy->isFloatingPointTy())
2411         InitVec = Builder.CreateSIToFP(InitVec, VecIVTy);
2412       auto *Mul = Builder.CreateBinOp(MulOp, InitVec, SplatStep);
2413       auto *Add = Builder.CreateBinOp(AddOp, SplatIV, Mul);
2414       State.set(Def, Add, Part);
2415       // It's also useful to record the lane values for the known minimum number
2416       // of elements, so we do that below as well. This improves the code quality
2417       // when extracting the first element, for example.
2418     }
2419 
2420     if (ScalarIVTy->isFloatingPointTy())
2421       StartIdx0 = Builder.CreateSIToFP(StartIdx0, ScalarIVTy);
2422 
2423     for (unsigned Lane = StartLane; Lane < EndLane; ++Lane) {
2424       Value *StartIdx = Builder.CreateBinOp(
2425           AddOp, StartIdx0, getSignedIntOrFpConstant(ScalarIVTy, Lane));
2426       // The step returned by `createStepForVF` is a runtime-evaluated value
2427       // when VF is scalable. Otherwise, it should be folded into a Constant.
2428       assert((State.VF.isScalable() || isa<Constant>(StartIdx)) &&
2429              "Expected StartIdx to be folded to a constant when VF is not "
2430              "scalable");
2431       auto *Mul = Builder.CreateBinOp(MulOp, StartIdx, Step);
2432       auto *Add = Builder.CreateBinOp(AddOp, ScalarIV, Mul);
2433       State.set(Def, Add, VPIteration(Part, Lane));
2434     }
2435   }
2436 }
2437 
2438 // Generate code for the induction step. Note that induction steps are
2439 // required to be loop-invariant.
2440 static Value *CreateStepValue(const SCEV *Step, ScalarEvolution &SE,
2441                               Instruction *InsertBefore,
2442                               Loop *OrigLoop = nullptr) {
2443   const DataLayout &DL = SE.getDataLayout();
2444   assert((!OrigLoop || SE.isLoopInvariant(Step, OrigLoop)) &&
2445          "Induction step should be loop invariant");
2446   if (auto *E = dyn_cast<SCEVUnknown>(Step))
2447     return E->getValue();
2448 
2449   SCEVExpander Exp(SE, DL, "induction");
2450   return Exp.expandCodeFor(Step, Step->getType(), InsertBefore);
2451 }
2452 
2453 /// Compute the transformed value of Index at offset StartValue using step
2454 /// StepValue.
2455 /// For integer induction, returns StartValue + Index * StepValue.
2456 /// For pointer induction, returns StartValue[Index * StepValue].
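/// For example, an integer induction with StartValue = 10, StepValue = 3 and
/// Index = 4 yields 10 + 4 * 3 = 22, while a pointer induction with the same
/// Step and Index yields the address &StartValue[12].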
2457 /// FIXME: The newly created binary instructions should contain nsw/nuw
2458 /// flags, which can be found from the original scalar operations.
2459 static Value *emitTransformedIndex(IRBuilderBase &B, Value *Index,
2460                                    Value *StartValue, Value *Step,
2461                                    const InductionDescriptor &ID) {
2462   Type *StepTy = Step->getType();
2463   Value *CastedIndex = StepTy->isIntegerTy()
2464                            ? B.CreateSExtOrTrunc(Index, StepTy)
2465                            : B.CreateCast(Instruction::SIToFP, Index, StepTy);
2466   if (CastedIndex != Index) {
2467     CastedIndex->setName(CastedIndex->getName() + ".cast");
2468     Index = CastedIndex;
2469   }
2470 
2471   // Note: the IR at this point is broken. We cannot use SE to create any new
2472   // SCEV and then expand it, hoping that SCEV's simplification will give us
2473   // a more optimal code. Unfortunately, attempt of doing so on invalid IR may
2474   // lead to various SCEV crashes. So all we can do is to use builder and rely
2475   // on InstCombine for future simplifications. Here we handle some trivial
2476   // cases only.
2477   auto CreateAdd = [&B](Value *X, Value *Y) {
2478     assert(X->getType() == Y->getType() && "Types don't match!");
2479     if (auto *CX = dyn_cast<ConstantInt>(X))
2480       if (CX->isZero())
2481         return Y;
2482     if (auto *CY = dyn_cast<ConstantInt>(Y))
2483       if (CY->isZero())
2484         return X;
2485     return B.CreateAdd(X, Y);
2486   };
2487 
2488   // We allow X to be a vector type, in which case Y will potentially be
2489   // splatted into a vector with the same element count.
2490   auto CreateMul = [&B](Value *X, Value *Y) {
2491     assert(X->getType()->getScalarType() == Y->getType() &&
2492            "Types don't match!");
2493     if (auto *CX = dyn_cast<ConstantInt>(X))
2494       if (CX->isOne())
2495         return Y;
2496     if (auto *CY = dyn_cast<ConstantInt>(Y))
2497       if (CY->isOne())
2498         return X;
2499     VectorType *XVTy = dyn_cast<VectorType>(X->getType());
2500     if (XVTy && !isa<VectorType>(Y->getType()))
2501       Y = B.CreateVectorSplat(XVTy->getElementCount(), Y);
2502     return B.CreateMul(X, Y);
2503   };
2504 
2505   switch (ID.getKind()) {
2506   case InductionDescriptor::IK_IntInduction: {
2507     assert(!isa<VectorType>(Index->getType()) &&
2508            "Vector indices not supported for integer inductions yet");
2509     assert(Index->getType() == StartValue->getType() &&
2510            "Index type does not match StartValue type");
2511     if (isa<ConstantInt>(Step) && cast<ConstantInt>(Step)->isMinusOne())
2512       return B.CreateSub(StartValue, Index);
2513     auto *Offset = CreateMul(Index, Step);
2514     return CreateAdd(StartValue, Offset);
2515   }
2516   case InductionDescriptor::IK_PtrInduction: {
2517     assert(isa<Constant>(Step) &&
2518            "Expected constant step for pointer induction");
2519     return B.CreateGEP(ID.getElementType(), StartValue, CreateMul(Index, Step));
2520   }
2521   case InductionDescriptor::IK_FpInduction: {
2522     assert(!isa<VectorType>(Index->getType()) &&
2523            "Vector indices not supported for FP inductions yet");
2524     assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value");
2525     auto InductionBinOp = ID.getInductionBinOp();
2526     assert(InductionBinOp &&
2527            (InductionBinOp->getOpcode() == Instruction::FAdd ||
2528             InductionBinOp->getOpcode() == Instruction::FSub) &&
2529            "Original bin op should be defined for FP induction");
2530 
2531     Value *MulExp = B.CreateFMul(Step, Index);
2532     return B.CreateBinOp(InductionBinOp->getOpcode(), StartValue, MulExp,
2533                          "induction");
2534   }
2535   case InductionDescriptor::IK_NoInduction:
2536     return nullptr;
2537   }
2538   llvm_unreachable("invalid enum");
2539 }
2540 
2541 void InnerLoopVectorizer::packScalarIntoVectorValue(VPValue *Def,
2542                                                     const VPIteration &Instance,
2543                                                     VPTransformState &State) {
2544   Value *ScalarInst = State.get(Def, Instance);
2545   Value *VectorValue = State.get(Def, Instance.Part);
2546   VectorValue = Builder.CreateInsertElement(
2547       VectorValue, ScalarInst,
2548       Instance.Lane.getAsRuntimeExpr(State.Builder, VF));
2549   State.set(Def, VectorValue, Instance.Part);
2550 }
2551 
2552 // Return whether we allow using masked interleave-groups (for dealing with
2553 // strided loads/stores that reside in predicated blocks, or for dealing
2554 // with gaps).
2555 static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) {
2556   // If an override option has been passed in for interleaved accesses, use it.
2557   if (EnableMaskedInterleavedMemAccesses.getNumOccurrences() > 0)
2558     return EnableMaskedInterleavedMemAccesses;
2559 
2560   return TTI.enableMaskedInterleavedAccessVectorization();
2561 }
2562 
2563 // Try to vectorize the interleave group that \p Instr belongs to.
2564 //
2565 // E.g. Translate following interleaved load group (factor = 3):
2566 //   for (i = 0; i < N; i+=3) {
2567 //     R = Pic[i];             // Member of index 0
2568 //     G = Pic[i+1];           // Member of index 1
2569 //     B = Pic[i+2];           // Member of index 2
2570 //     ... // do something to R, G, B
2571 //   }
2572 // To:
2573 //   %wide.vec = load <12 x i32>                       ; Read 4 tuples of R,G,B
2574 //   %R.vec = shuffle %wide.vec, poison, <0, 3, 6, 9>   ; R elements
2575 //   %G.vec = shuffle %wide.vec, poison, <1, 4, 7, 10>  ; G elements
2576 //   %B.vec = shuffle %wide.vec, poison, <2, 5, 8, 11>  ; B elements
2577 //
2578 // Or translate following interleaved store group (factor = 3):
2579 //   for (i = 0; i < N; i+=3) {
2580 //     ... do something to R, G, B
2581 //     Pic[i]   = R;           // Member of index 0
2582 //     Pic[i+1] = G;           // Member of index 1
2583 //     Pic[i+2] = B;           // Member of index 2
2584 //   }
2585 // To:
2586 //   %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7>
2587 //   %B_U.vec = shuffle %B.vec, poison, <0, 1, 2, 3, u, u, u, u>
2588 //   %interleaved.vec = shuffle %R_G.vec, %B_U.vec,
2589 //        <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11>    ; Interleave R,G,B elements
2590 //   store <12 x i32> %interleaved.vec              ; Write 4 tuples of R,G,B
2591 void InnerLoopVectorizer::vectorizeInterleaveGroup(
2592     const InterleaveGroup<Instruction> *Group, ArrayRef<VPValue *> VPDefs,
2593     VPTransformState &State, VPValue *Addr, ArrayRef<VPValue *> StoredValues,
2594     VPValue *BlockInMask) {
2595   Instruction *Instr = Group->getInsertPos();
2596   const DataLayout &DL = Instr->getModule()->getDataLayout();
2597 
2598   // Prepare for the vector type of the interleaved load/store.
2599   Type *ScalarTy = getLoadStoreType(Instr);
2600   unsigned InterleaveFactor = Group->getFactor();
2601   assert(!VF.isScalable() && "scalable vectors not yet supported.");
2602   auto *VecTy = VectorType::get(ScalarTy, VF * InterleaveFactor);
2603 
2604   // Prepare for the new pointers.
2605   SmallVector<Value *, 2> AddrParts;
2606   unsigned Index = Group->getIndex(Instr);
2607 
2608   // TODO: extend the masked interleaved-group support to reversed access.
2609   assert((!BlockInMask || !Group->isReverse()) &&
2610          "Reversed masked interleave-group not supported.");
2611 
2612   // If the group is reverse, adjust the index to refer to the last vector lane
2613   // instead of the first. We adjust the index from the first vector lane,
2614   // rather than directly getting the pointer for lane VF - 1, because the
2615   // pointer operand of the interleaved access is supposed to be uniform. For
2616   // uniform instructions, we're only required to generate a value for the
2617   // first vector lane in each unroll iteration.
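  // For example, with a fixed VF = 4 and an interleave factor of 3, the index
  // is advanced by (4 - 1) * 3 = 9 elements.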
2618   if (Group->isReverse())
2619     Index += (VF.getKnownMinValue() - 1) * Group->getFactor();
2620 
2621   for (unsigned Part = 0; Part < UF; Part++) {
2622     Value *AddrPart = State.get(Addr, VPIteration(Part, 0));
2623     State.setDebugLocFromInst(AddrPart);
2624 
2625     // Note that the current instruction could be at any member index, so the
2626     // address needs to be adjusted to that of the member at index 0.
2627     //
2628     // E.g.  a = A[i+1];     // Member of index 1 (Current instruction)
2629     //       b = A[i];       // Member of index 0
2630     // The current pointer points to A[i+1]; adjust it to A[i].
2631     //
2632     // E.g.  A[i+1] = a;     // Member of index 1
2633     //       A[i]   = b;     // Member of index 0
2634     //       A[i+2] = c;     // Member of index 2 (Current instruction)
2635     // The current pointer points to A[i+2]; adjust it to A[i].
2636 
2637     bool InBounds = false;
2638     if (auto *gep = dyn_cast<GetElementPtrInst>(AddrPart->stripPointerCasts()))
2639       InBounds = gep->isInBounds();
2640     AddrPart = Builder.CreateGEP(ScalarTy, AddrPart, Builder.getInt32(-Index));
2641     cast<GetElementPtrInst>(AddrPart)->setIsInBounds(InBounds);
2642 
2643     // Cast to the vector pointer type.
2644     unsigned AddressSpace = AddrPart->getType()->getPointerAddressSpace();
2645     Type *PtrTy = VecTy->getPointerTo(AddressSpace);
2646     AddrParts.push_back(Builder.CreateBitCast(AddrPart, PtrTy));
2647   }
2648 
2649   State.setDebugLocFromInst(Instr);
2650   Value *PoisonVec = PoisonValue::get(VecTy);
2651 
2652   Value *MaskForGaps = nullptr;
2653   if (Group->requiresScalarEpilogue() && !Cost->isScalarEpilogueAllowed()) {
2654     MaskForGaps = createBitMaskForGaps(Builder, VF.getKnownMinValue(), *Group);
2655     assert(MaskForGaps && "Mask for Gaps is required but it is null");
2656   }
2657 
2658   // Vectorize the interleaved load group.
2659   if (isa<LoadInst>(Instr)) {
2660     // For each unroll part, create a wide load for the group.
2661     SmallVector<Value *, 2> NewLoads;
2662     for (unsigned Part = 0; Part < UF; Part++) {
2663       Instruction *NewLoad;
2664       if (BlockInMask || MaskForGaps) {
2665         assert(useMaskedInterleavedAccesses(*TTI) &&
2666                "masked interleaved groups are not allowed.");
2667         Value *GroupMask = MaskForGaps;
2668         if (BlockInMask) {
2669           Value *BlockInMaskPart = State.get(BlockInMask, Part);
2670           Value *ShuffledMask = Builder.CreateShuffleVector(
2671               BlockInMaskPart,
2672               createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()),
2673               "interleaved.mask");
2674           GroupMask = MaskForGaps
2675                           ? Builder.CreateBinOp(Instruction::And, ShuffledMask,
2676                                                 MaskForGaps)
2677                           : ShuffledMask;
2678         }
2679         NewLoad =
2680             Builder.CreateMaskedLoad(VecTy, AddrParts[Part], Group->getAlign(),
2681                                      GroupMask, PoisonVec, "wide.masked.vec");
2682       }
2683       else
2684         NewLoad = Builder.CreateAlignedLoad(VecTy, AddrParts[Part],
2685                                             Group->getAlign(), "wide.vec");
2686       Group->addMetadata(NewLoad);
2687       NewLoads.push_back(NewLoad);
2688     }
2689 
2690     // For each member in the group, shuffle out the appropriate data from the
2691     // wide loads.
2692     unsigned J = 0;
2693     for (unsigned I = 0; I < InterleaveFactor; ++I) {
2694       Instruction *Member = Group->getMember(I);
2695 
2696       // Skip the gaps in the group.
2697       if (!Member)
2698         continue;
2699 
2700       auto StrideMask =
2701           createStrideMask(I, InterleaveFactor, VF.getKnownMinValue());
2702       for (unsigned Part = 0; Part < UF; Part++) {
2703         Value *StridedVec = Builder.CreateShuffleVector(
2704             NewLoads[Part], StrideMask, "strided.vec");
2705 
2706         // If this member has a different type, cast the result to it.
2707         if (Member->getType() != ScalarTy) {
2708           assert(!VF.isScalable() && "VF is assumed to be non scalable.");
2709           VectorType *OtherVTy = VectorType::get(Member->getType(), VF);
2710           StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL);
2711         }
2712 
2713         if (Group->isReverse())
2714           StridedVec = Builder.CreateVectorReverse(StridedVec, "reverse");
2715 
2716         State.set(VPDefs[J], StridedVec, Part);
2717       }
2718       ++J;
2719     }
2720     return;
2721   }
2722 
2723   // The subvector type for the current instruction.
2724   auto *SubVT = VectorType::get(ScalarTy, VF);
2725 
2726   // Vectorize the interleaved store group.
2727   MaskForGaps = createBitMaskForGaps(Builder, VF.getKnownMinValue(), *Group);
2728   assert((!MaskForGaps || useMaskedInterleavedAccesses(*TTI)) &&
2729          "masked interleaved groups are not allowed.");
2730   assert((!MaskForGaps || !VF.isScalable()) &&
2731          "masking gaps for scalable vectors is not yet supported.");
2732   for (unsigned Part = 0; Part < UF; Part++) {
2733     // Collect the stored vector from each member.
2734     SmallVector<Value *, 4> StoredVecs;
2735     unsigned StoredIdx = 0;
2736     for (unsigned i = 0; i < InterleaveFactor; i++) {
2737       assert((Group->getMember(i) || MaskForGaps) &&
2738              "Fail to get a member from an interleaved store group");
2739       Instruction *Member = Group->getMember(i);
2740 
2741       // Skip the gaps in the group.
2742       if (!Member) {
2743         Value *Undef = PoisonValue::get(SubVT);
2744         StoredVecs.push_back(Undef);
2745         continue;
2746       }
2747 
2748       Value *StoredVec = State.get(StoredValues[StoredIdx], Part);
2749       ++StoredIdx;
2750 
2751       if (Group->isReverse())
2752         StoredVec = Builder.CreateVectorReverse(StoredVec, "reverse");
2753 
2754       // If this member has a different type, cast it to the unified type.
2755 
2756       if (StoredVec->getType() != SubVT)
2757         StoredVec = createBitOrPointerCast(StoredVec, SubVT, DL);
2758 
2759       StoredVecs.push_back(StoredVec);
2760     }
2761 
2762     // Concatenate all vectors into a wide vector.
2763     Value *WideVec = concatenateVectors(Builder, StoredVecs);
2764 
2765     // Interleave the elements in the wide vector.
2766     Value *IVec = Builder.CreateShuffleVector(
2767         WideVec, createInterleaveMask(VF.getKnownMinValue(), InterleaveFactor),
2768         "interleaved.vec");
2769 
2770     Instruction *NewStoreInstr;
2771     if (BlockInMask || MaskForGaps) {
2772       Value *GroupMask = MaskForGaps;
2773       if (BlockInMask) {
2774         Value *BlockInMaskPart = State.get(BlockInMask, Part);
2775         Value *ShuffledMask = Builder.CreateShuffleVector(
2776             BlockInMaskPart,
2777             createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()),
2778             "interleaved.mask");
2779         GroupMask = MaskForGaps ? Builder.CreateBinOp(Instruction::And,
2780                                                       ShuffledMask, MaskForGaps)
2781                                 : ShuffledMask;
2782       }
2783       NewStoreInstr = Builder.CreateMaskedStore(IVec, AddrParts[Part],
2784                                                 Group->getAlign(), GroupMask);
2785     } else
2786       NewStoreInstr =
2787           Builder.CreateAlignedStore(IVec, AddrParts[Part], Group->getAlign());
2788 
2789     Group->addMetadata(NewStoreInstr);
2790   }
2791 }
2792 
2793 void InnerLoopVectorizer::scalarizeInstruction(const Instruction *Instr,
2794                                                VPReplicateRecipe *RepRecipe,
2795                                                const VPIteration &Instance,
2796                                                bool IfPredicateInstr,
2797                                                VPTransformState &State) {
2798   assert(!Instr->getType()->isAggregateType() && "Can't handle vectors");
2799 
2800   // llvm.experimental.noalias.scope.decl intrinsics must only be duplicated for
2801   // the first lane and part.
2802   if (isa<NoAliasScopeDeclInst>(Instr))
2803     if (!Instance.isFirstIteration())
2804       return;
2805 
2806   // Does this instruction return a value?
2807   bool IsVoidRetTy = Instr->getType()->isVoidTy();
2808 
2809   Instruction *Cloned = Instr->clone();
2810   if (!IsVoidRetTy)
2811     Cloned->setName(Instr->getName() + ".cloned");
2812 
2813   // If the scalarized instruction contributes to the address computation of a
2814   // widened masked load/store which was in a basic block that needed
2815   // predication and is not predicated after vectorization, we can't propagate
2816   // poison-generating flags (nuw/nsw, exact, inbounds, etc.). The scalarized
2817   // instruction could feed a poison value to the base address of the widened
2818   // load/store.
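  // For example (hypothetical IR), an address computation such as
  //   %gep = getelementptr inbounds i32, ptr %base, i64 %idx
  // that was conditionally executed in the original loop has its 'inbounds'
  // flag dropped here, because after vectorization it may also execute for
  // lanes that the mask disables.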
2819   if (State.MayGeneratePoisonRecipes.contains(RepRecipe))
2820     Cloned->dropPoisonGeneratingFlags();
2821 
2822   if (Instr->getDebugLoc())
2823     State.setDebugLocFromInst(Instr);
2824 
2825   // Replace the operands of the cloned instructions with their scalar
2826   // equivalents in the new loop.
2827   for (const auto &I : enumerate(RepRecipe->operands())) {
2828     auto InputInstance = Instance;
2829     VPValue *Operand = I.value();
2830     if (vputils::isUniformAfterVectorization(Operand))
2831       InputInstance.Lane = VPLane::getFirstLane();
2832     Cloned->setOperand(I.index(), State.get(Operand, InputInstance));
2833   }
2834   State.addNewMetadata(Cloned, Instr);
2835 
2836   // Place the cloned scalar in the new loop.
2837   State.Builder.Insert(Cloned);
2838 
2839   State.set(RepRecipe, Cloned, Instance);
2840 
2841   // If we just cloned a new assumption, add it to the assumption cache.
2842   if (auto *II = dyn_cast<AssumeInst>(Cloned))
2843     AC->registerAssumption(II);
2844 
2845   // End if-block.
2846   if (IfPredicateInstr)
2847     PredicatedInstructions.push_back(Cloned);
2848 }
2849 
2850 Value *InnerLoopVectorizer::getOrCreateTripCount(BasicBlock *InsertBlock) {
2851   if (TripCount)
2852     return TripCount;
2853 
2854   assert(InsertBlock);
2855   IRBuilder<> Builder(InsertBlock->getTerminator());
2856   // Find the loop boundaries.
2857   Type *IdxTy = Legal->getWidestInductionType();
2858   assert(IdxTy && "No type for induction");
2859   const SCEV *ExitCount = createTripCountSCEV(IdxTy, PSE);
2860 
2861   const DataLayout &DL = InsertBlock->getModule()->getDataLayout();
2862 
2863   // Expand the trip count and place the new instructions in the preheader.
2864   // Notice that the pre-header does not change, only the loop body.
2865   SCEVExpander Exp(*PSE.getSE(), DL, "induction");
2866 
2867   // Count holds the overall loop count (N).
2868   TripCount = Exp.expandCodeFor(ExitCount, ExitCount->getType(),
2869                                 InsertBlock->getTerminator());
2870 
2871   if (TripCount->getType()->isPointerTy())
2872     TripCount =
2873         CastInst::CreatePointerCast(TripCount, IdxTy, "exitcount.ptrcnt.to.int",
2874                                     InsertBlock->getTerminator());
2875 
2876   return TripCount;
2877 }
2878 
2879 Value *
2880 InnerLoopVectorizer::getOrCreateVectorTripCount(BasicBlock *InsertBlock) {
2881   if (VectorTripCount)
2882     return VectorTripCount;
2883 
2884   Value *TC = getOrCreateTripCount(InsertBlock);
2885   IRBuilder<> Builder(InsertBlock->getTerminator());
2886 
2887   Type *Ty = TC->getType();
2888   // This is where we can make the step a runtime constant.
2889   Value *Step = createStepForVF(Builder, Ty, VF, UF);
2890 
2891   // If the tail is to be folded by masking, round the number of iterations N
2892   // up to a multiple of Step instead of rounding down. This is done by first
2893   // adding Step-1 and then rounding down. Note that it's ok if this addition
2894   // overflows: the vector induction variable will eventually wrap to zero given
2895   // that it starts at zero and its Step is a power of two; the loop will then
2896   // exit, with the last early-exit vector comparison also producing all-true.
2897   // For scalable vectors the VF is not guaranteed to be a power of 2, but this
2898   // is accounted for in emitIterationCountCheck that adds an overflow check.
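  // For example, with a trip count of 13 and VF * UF = 8, the trip count is
  // first bumped to 13 + 7 = 20; the rounding down below then gives a vector
  // trip count of 20 - (20 % 8) = 16, i.e. two masked vector iterations that
  // cover all 13 scalar iterations.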
2899   if (Cost->foldTailByMasking()) {
2900     assert(isPowerOf2_32(VF.getKnownMinValue() * UF) &&
2901            "VF*UF must be a power of 2 when folding tail by masking");
2902     Value *NumLanes = getRuntimeVF(Builder, Ty, VF * UF);
2903     TC = Builder.CreateAdd(
2904         TC, Builder.CreateSub(NumLanes, ConstantInt::get(Ty, 1)), "n.rnd.up");
2905   }
2906 
2907   // Now we need to generate the expression for the part of the loop that the
2908   // vectorized body will execute. This is equal to N - (N % Step) if scalar
2909   // iterations are not required for correctness, or N - Step, otherwise. Step
2910   // is equal to the vectorization factor (number of SIMD elements) times the
2911   // unroll factor (number of SIMD instructions).
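  // For example, without tail folding and with N = 13, VF = 4 and UF = 2
  // (Step = 8), n.mod.vf is 13 % 8 = 5 and the vector trip count becomes
  // 13 - 5 = 8, leaving 5 iterations for the scalar remainder loop.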
2912   Value *R = Builder.CreateURem(TC, Step, "n.mod.vf");
2913 
2914   // There are cases where we *must* run at least one iteration in the remainder
2915   // loop.  See the cost model for when this can happen.  If the step evenly
2916   // divides the trip count, we set the remainder to be equal to the step. If
2917   // the step does not evenly divide the trip count, no adjustment is necessary
2918   // since there will already be scalar iterations. Note that the minimum
2919   // iterations check ensures that N >= Step.
2920   if (Cost->requiresScalarEpilogue(VF)) {
2921     auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0));
2922     R = Builder.CreateSelect(IsZero, Step, R);
2923   }
2924 
2925   VectorTripCount = Builder.CreateSub(TC, R, "n.vec");
2926 
2927   return VectorTripCount;
2928 }
2929 
2930 Value *InnerLoopVectorizer::createBitOrPointerCast(Value *V, VectorType *DstVTy,
2931                                                    const DataLayout &DL) {
2932   // Verify that V is a vector type with same number of elements as DstVTy.
2933   auto *DstFVTy = cast<FixedVectorType>(DstVTy);
2934   unsigned VF = DstFVTy->getNumElements();
2935   auto *SrcVecTy = cast<FixedVectorType>(V->getType());
2936   assert((VF == SrcVecTy->getNumElements()) && "Vector dimensions do not match");
2937   Type *SrcElemTy = SrcVecTy->getElementType();
2938   Type *DstElemTy = DstFVTy->getElementType();
2939   assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) &&
2940          "Vector elements must have same size");
2941 
2942   // Do a direct cast if element types are castable.
2943   if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) {
2944     return Builder.CreateBitOrPointerCast(V, DstFVTy);
2945   }
2946   // V cannot be directly casted to desired vector type.
2947   // May happen when V is a floating point vector but DstVTy is a vector of
2948   // pointers or vice-versa. Handle this using a two-step bitcast using an
2949   // intermediate Integer type for the bitcast i.e. Ptr <-> Int <-> Float.
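  // For example (assuming 64-bit pointers), a <4 x double> source is first
  // bitcast to <4 x i64> and the result is then cast to the destination
  // vector-of-pointers type.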
2950   assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) &&
2951          "Only one type should be a pointer type");
2952   assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) &&
2953          "Only one type should be a floating point type");
2954   Type *IntTy =
2955       IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy));
2956   auto *VecIntTy = FixedVectorType::get(IntTy, VF);
2957   Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy);
2958   return Builder.CreateBitOrPointerCast(CastVal, DstFVTy);
2959 }
2960 
2961 void InnerLoopVectorizer::emitIterationCountCheck(BasicBlock *Bypass) {
2962   Value *Count = getOrCreateTripCount(LoopVectorPreHeader);
2963   // Reuse existing vector loop preheader for TC checks.
2964   // Note that a new preheader block is generated for the vector loop.
2965   BasicBlock *const TCCheckBlock = LoopVectorPreHeader;
2966   IRBuilder<> Builder(TCCheckBlock->getTerminator());
2967 
2968   // Generate code to check if the loop's trip count is less than VF * UF, or
2969   // equal to it in case a scalar epilogue is required; this implies that the
2970   // vector trip count is zero. This check also covers the case where adding one
2971   // to the backedge-taken count overflowed leading to an incorrect trip count
2972   // of zero. In this case we will also jump to the scalar loop.
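  // For example, with VF = 4, UF = 2, a required scalar epilogue, and a
  // MinProfitableTripCount no larger than 8, the check below branches to the
  // scalar loop whenever the trip count is <= 8, so at least one scalar
  // iteration always remains after the vector loop.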
2973   auto P = Cost->requiresScalarEpilogue(VF) ? ICmpInst::ICMP_ULE
2974                                             : ICmpInst::ICMP_ULT;
2975 
2976   // If tail is to be folded, vector loop takes care of all iterations.
2977   Type *CountTy = Count->getType();
2978   Value *CheckMinIters = Builder.getFalse();
2979   auto CreateStep = [&]() -> Value * {
2980     // Create step with max(MinProfitableTripCount, UF * VF).
2981     if (UF * VF.getKnownMinValue() >= MinProfitableTripCount.getKnownMinValue())
2982       return createStepForVF(Builder, CountTy, VF, UF);
2983 
2984     Value *MinProfTC =
2985         createStepForVF(Builder, CountTy, MinProfitableTripCount, 1);
2986     if (!VF.isScalable())
2987       return MinProfTC;
2988     return Builder.CreateBinaryIntrinsic(
2989         Intrinsic::umax, MinProfTC, createStepForVF(Builder, CountTy, VF, UF));
2990   };
2991 
2992   if (!Cost->foldTailByMasking())
2993     CheckMinIters =
2994         Builder.CreateICmp(P, Count, CreateStep(), "min.iters.check");
2995   else if (VF.isScalable()) {
2996     // vscale is not necessarily a power-of-2, which means we cannot guarantee
2997     // an overflow to zero when updating induction variables and so an
2998     // additional overflow check is required before entering the vector loop.
2999 
3000     // Get the maximum unsigned value for the type.
3001     Value *MaxUIntTripCount =
3002         ConstantInt::get(CountTy, cast<IntegerType>(CountTy)->getMask());
3003     Value *LHS = Builder.CreateSub(MaxUIntTripCount, Count);
3004 
3005     // Don't execute the vector loop if (UMax - n) < (VF * UF).
3006     CheckMinIters = Builder.CreateICmp(ICmpInst::ICMP_ULT, LHS, CreateStep());
3007   }
3008 
3009   // Create new preheader for vector loop.
3010   LoopVectorPreHeader =
3011       SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), DT, LI, nullptr,
3012                  "vector.ph");
3013 
3014   assert(DT->properlyDominates(DT->getNode(TCCheckBlock),
3015                                DT->getNode(Bypass)->getIDom()) &&
3016          "TC check is expected to dominate Bypass");
3017 
3018   // Update dominator for Bypass & LoopExit (if needed).
3019   DT->changeImmediateDominator(Bypass, TCCheckBlock);
3020   if (!Cost->requiresScalarEpilogue(VF))
3021     // If there is an epilogue which must run, there's no edge from the
3022     // middle block to exit blocks and thus no need to update the immediate
3023     // dominator of the exit blocks.
3024     DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock);
3025 
3026   ReplaceInstWithInst(
3027       TCCheckBlock->getTerminator(),
3028       BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters));
3029   LoopBypassBlocks.push_back(TCCheckBlock);
3030 }
3031 
3032 BasicBlock *InnerLoopVectorizer::emitSCEVChecks(BasicBlock *Bypass) {
3033   BasicBlock *const SCEVCheckBlock =
3034       RTChecks.emitSCEVChecks(Bypass, LoopVectorPreHeader, LoopExitBlock);
3035   if (!SCEVCheckBlock)
3036     return nullptr;
3037 
3038   assert(!(SCEVCheckBlock->getParent()->hasOptSize() ||
3039            (OptForSizeBasedOnProfile &&
3040             Cost->Hints->getForce() != LoopVectorizeHints::FK_Enabled)) &&
3041          "Cannot SCEV check stride or overflow when optimizing for size");
3042 
3043 
3044   // Update dominator only if this is the first RT check.
3045   if (LoopBypassBlocks.empty()) {
3046     DT->changeImmediateDominator(Bypass, SCEVCheckBlock);
3047     if (!Cost->requiresScalarEpilogue(VF))
3048       // If there is an epilogue which must run, there's no edge from the
3049       // middle block to exit blocks and thus no need to update the immediate
3050       // dominator of the exit blocks.
3051       DT->changeImmediateDominator(LoopExitBlock, SCEVCheckBlock);
3052   }
3053 
3054   LoopBypassBlocks.push_back(SCEVCheckBlock);
3055   AddedSafetyChecks = true;
3056   return SCEVCheckBlock;
3057 }
3058 
3059 BasicBlock *InnerLoopVectorizer::emitMemRuntimeChecks(BasicBlock *Bypass) {
3060   // VPlan-native path does not do any analysis for runtime checks currently.
3061   if (EnableVPlanNativePath)
3062     return nullptr;
3063 
3064   BasicBlock *const MemCheckBlock =
3065       RTChecks.emitMemRuntimeChecks(Bypass, LoopVectorPreHeader);
3066 
3067   // Check if we generated code that checks at runtime if arrays overlap. We put
3068   // the checks into a separate block to make the more common case of few
3069   // elements faster.
3070   if (!MemCheckBlock)
3071     return nullptr;
3072 
3073   if (MemCheckBlock->getParent()->hasOptSize() || OptForSizeBasedOnProfile) {
3074     assert(Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled &&
3075            "Cannot emit memory checks when optimizing for size, unless forced "
3076            "to vectorize.");
3077     ORE->emit([&]() {
3078       return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationCodeSize",
3079                                         OrigLoop->getStartLoc(),
3080                                         OrigLoop->getHeader())
3081              << "Code-size may be reduced by not forcing "
3082                 "vectorization, or by source-code modifications "
3083                 "eliminating the need for runtime checks "
3084                 "(e.g., adding 'restrict').";
3085     });
3086   }
3087 
3088   LoopBypassBlocks.push_back(MemCheckBlock);
3089 
3090   AddedSafetyChecks = true;
3091 
3092   return MemCheckBlock;
3093 }
3094 
3095 void InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) {
3096   LoopScalarBody = OrigLoop->getHeader();
3097   LoopVectorPreHeader = OrigLoop->getLoopPreheader();
3098   assert(LoopVectorPreHeader && "Invalid loop structure");
3099   LoopExitBlock = OrigLoop->getUniqueExitBlock(); // may be nullptr
3100   assert((LoopExitBlock || Cost->requiresScalarEpilogue(VF)) &&
3101          "multiple exit loop without required epilogue?");
3102 
3103   LoopMiddleBlock =
3104       SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
3105                  LI, nullptr, Twine(Prefix) + "middle.block");
3106   LoopScalarPreHeader =
3107       SplitBlock(LoopMiddleBlock, LoopMiddleBlock->getTerminator(), DT, LI,
3108                  nullptr, Twine(Prefix) + "scalar.ph");
3109 
3110   auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator();
3111 
3112   // Set up the middle block terminator.  Two cases:
3113   // 1) If we know that we must execute the scalar epilogue, emit an
3114   //    unconditional branch.
3115   // 2) Otherwise, we must have a single unique exit block (due to how we
3116   //    implement the multiple exit case).  In this case, set up a conditional
3117   //    branch from the middle block to the loop scalar preheader, and the
3118   //    exit block.  completeLoopSkeleton will update the condition to use an
3119   //    iteration check, if required to decide whether to execute the remainder.
3120   BranchInst *BrInst = Cost->requiresScalarEpilogue(VF) ?
3121     BranchInst::Create(LoopScalarPreHeader) :
3122     BranchInst::Create(LoopExitBlock, LoopScalarPreHeader,
3123                        Builder.getTrue());
3124   BrInst->setDebugLoc(ScalarLatchTerm->getDebugLoc());
3125   ReplaceInstWithInst(LoopMiddleBlock->getTerminator(), BrInst);
3126 
3127   // Update dominator for loop exit. During skeleton creation, only the vector
3128   // pre-header and the middle block are created. The vector loop is entirely
3129   // created during VPlan execution.
3130   if (!Cost->requiresScalarEpilogue(VF))
3131     // If there is an epilogue which must run, there's no edge from the
3132     // middle block to exit blocks and thus no need to update the immediate
3133     // dominator of the exit blocks.
3134     DT->changeImmediateDominator(LoopExitBlock, LoopMiddleBlock);
3135 }
3136 
3137 PHINode *InnerLoopVectorizer::createInductionResumeValue(
3138     PHINode *OrigPhi, const InductionDescriptor &II,
3139     ArrayRef<BasicBlock *> BypassBlocks,
3140     std::pair<BasicBlock *, Value *> AdditionalBypass) {
3141   Value *VectorTripCount = getOrCreateVectorTripCount(LoopVectorPreHeader);
3142   assert(VectorTripCount && "Expected valid arguments");
3143 
3144   Instruction *OldInduction = Legal->getPrimaryInduction();
3145   Value *&EndValue = IVEndValues[OrigPhi];
3146   Value *EndValueFromAdditionalBypass = AdditionalBypass.second;
3147   if (OrigPhi == OldInduction) {
3148     // We know what the end value is.
3149     EndValue = VectorTripCount;
3150   } else {
3151     IRBuilder<> B(LoopVectorPreHeader->getTerminator());
3152 
3153     // Fast-math-flags propagate from the original induction instruction.
3154     if (II.getInductionBinOp() && isa<FPMathOperator>(II.getInductionBinOp()))
3155       B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags());
3156 
3157     Value *Step =
3158         CreateStepValue(II.getStep(), *PSE.getSE(), &*B.GetInsertPoint());
3159     EndValue =
3160         emitTransformedIndex(B, VectorTripCount, II.getStartValue(), Step, II);
3161     EndValue->setName("ind.end");
3162 
3163     // Compute the end value for the additional bypass (if applicable).
3164     if (AdditionalBypass.first) {
3165       B.SetInsertPoint(&(*AdditionalBypass.first->getFirstInsertionPt()));
3166       Value *Step =
3167           CreateStepValue(II.getStep(), *PSE.getSE(), &*B.GetInsertPoint());
3168       EndValueFromAdditionalBypass = emitTransformedIndex(
3169           B, AdditionalBypass.second, II.getStartValue(), Step, II);
3170       EndValueFromAdditionalBypass->setName("ind.end");
3171     }
3172   }
3173 
3174   // Create phi nodes to merge from the backedge-taken check block.
3175   PHINode *BCResumeVal = PHINode::Create(OrigPhi->getType(), 3, "bc.resume.val",
3176                                          LoopScalarPreHeader->getTerminator());
3177   // Copy original phi DL over to the new one.
3178   BCResumeVal->setDebugLoc(OrigPhi->getDebugLoc());
3179 
3180   // The new PHI merges the original incoming value, in case of a bypass,
3181   // or the value at the end of the vectorized loop.
3182   BCResumeVal->addIncoming(EndValue, LoopMiddleBlock);
3183 
3184   // Fix the scalar body counter (PHI node).
3185   // The old induction's phi node in the scalar body needs the truncated
3186   // value.
3187   for (BasicBlock *BB : BypassBlocks)
3188     BCResumeVal->addIncoming(II.getStartValue(), BB);
3189 
3190   if (AdditionalBypass.first)
3191     BCResumeVal->setIncomingValueForBlock(AdditionalBypass.first,
3192                                           EndValueFromAdditionalBypass);
3193   return BCResumeVal;
3194 }
3195 
3196 void InnerLoopVectorizer::createInductionResumeValues(
3197     std::pair<BasicBlock *, Value *> AdditionalBypass) {
3198   assert(((AdditionalBypass.first && AdditionalBypass.second) ||
3199           (!AdditionalBypass.first && !AdditionalBypass.second)) &&
3200          "Inconsistent information about additional bypass.");
3201   // We are going to resume the execution of the scalar loop.
3202   // Go over all of the induction variables that we found and fix the
3203   // PHIs that are left in the scalar version of the loop.
3204   // The starting values of PHI nodes depend on the counter of the last
3205   // iteration in the vectorized loop.
3206   // If we come from a bypass edge then we need to start from the original
3207   // start value.
3208   for (const auto &InductionEntry : Legal->getInductionVars()) {
3209     PHINode *OrigPhi = InductionEntry.first;
3210     const InductionDescriptor &II = InductionEntry.second;
3211     PHINode *BCResumeVal = createInductionResumeValue(
3212         OrigPhi, II, LoopBypassBlocks, AdditionalBypass);
3213     OrigPhi->setIncomingValueForBlock(LoopScalarPreHeader, BCResumeVal);
3214   }
3215 }
3216 
3217 BasicBlock *InnerLoopVectorizer::completeLoopSkeleton() {
3218   // The trip counts should be cached by now.
3219   Value *Count = getOrCreateTripCount(LoopVectorPreHeader);
3220   Value *VectorTripCount = getOrCreateVectorTripCount(LoopVectorPreHeader);
3221 
3222   auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator();
3223 
3224   // Add a check in the middle block to see if we have completed
3225   // all of the iterations in the first vector loop.  Three cases:
3226   // 1) If we require a scalar epilogue, there is no conditional branch as
3227   //    we unconditionally branch to the scalar preheader.  Do nothing.
3228   // 2) If (N - N%VF) == N, then we *don't* need to run the remainder.
3229   //    Thus if tail is to be folded, we know we don't need to run the
3230   //    remainder and we can use the previous value for the condition (true).
3231   // 3) Otherwise, construct a runtime check.
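  // For example, with a trip count of 16 and VF * UF = 8 the vector trip
  // count is also 16, so cmp.n is true and the middle block branches straight
  // to the exit; with a trip count of 13 the vector trip count is 8 and the
  // scalar loop runs the remaining 5 iterations.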
3232   if (!Cost->requiresScalarEpilogue(VF) && !Cost->foldTailByMasking()) {
3233     Instruction *CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ,
3234                                         Count, VectorTripCount, "cmp.n",
3235                                         LoopMiddleBlock->getTerminator());
3236 
3237     // Here we use the same DebugLoc as the scalar loop latch terminator instead
3238     // of the corresponding compare because they may have ended up with
3239     // different line numbers and we want to avoid awkward line stepping while
3240   // debugging. E.g., if the compare has a line number inside the loop.
3241     CmpN->setDebugLoc(ScalarLatchTerm->getDebugLoc());
3242     cast<BranchInst>(LoopMiddleBlock->getTerminator())->setCondition(CmpN);
3243   }
3244 
3245 #ifdef EXPENSIVE_CHECKS
3246   assert(DT->verify(DominatorTree::VerificationLevel::Fast));
3247 #endif
3248 
3249   return LoopVectorPreHeader;
3250 }
3251 
3252 std::pair<BasicBlock *, Value *>
3253 InnerLoopVectorizer::createVectorizedLoopSkeleton() {
3254   /*
3255    In this function we generate a new loop. The new loop will contain
3256    the vectorized instructions while the old loop will continue to run the
3257    scalar remainder.
3258 
3259        [ ] <-- loop iteration number check.
3260     /   |
3261    /    v
3262   |    [ ] <-- vector loop bypass (may consist of multiple blocks).
3263   |  /  |
3264   | /   v
3265   ||   [ ]     <-- vector pre header.
3266   |/    |
3267   |     v
3268   |    [  ] \
3269   |    [  ]_|   <-- vector loop (created during VPlan execution).
3270   |     |
3271   |     v
3272   \   -[ ]   <--- middle-block.
3273    \/   |
3274    /\   v
3275    | ->[ ]     <--- new preheader.
3276    |    |
3277  (opt)  v      <-- edge from middle to exit iff epilogue is not required.
3278    |   [ ] \
3279    |   [ ]_|   <-- old scalar loop to handle remainder (scalar epilogue).
3280     \   |
3281      \  v
3282       >[ ]     <-- exit block(s).
3283    ...
3284    */
3285 
3286   // Create an empty vector loop, and prepare basic blocks for the runtime
3287   // checks.
3288   createVectorLoopSkeleton("");
3289 
3290   // Now, compare the new count to zero. If it is zero skip the vector loop and
3291   // jump to the scalar loop. This check also covers the case where the
3292   // backedge-taken count is uint##_max: adding one to it will overflow leading
3293   // to an incorrect trip count of zero. In this (rare) case we will also jump
3294   // to the scalar loop.
3295   emitIterationCountCheck(LoopScalarPreHeader);
3296 
3297   // Generate the code to check any assumptions that we've made for SCEV
3298   // expressions.
3299   emitSCEVChecks(LoopScalarPreHeader);
3300 
3301   // Generate the code that checks at runtime if arrays overlap. We put the
3302   // checks into a separate block to make the more common case of few elements
3303   // faster.
3304   emitMemRuntimeChecks(LoopScalarPreHeader);
3305 
3306   // Emit phis for the new starting index of the scalar loop.
3307   createInductionResumeValues();
3308 
3309   return {completeLoopSkeleton(), nullptr};
3310 }
3311 
3312 // Fix up external users of the induction variable. At this point, we are
3313 // in LCSSA form, with all external PHIs that use the IV having one input value,
3314 // coming from the remainder loop. We need those PHIs to also have a correct
3315 // value for the IV when arriving directly from the middle block.
3316 void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi,
3317                                        const InductionDescriptor &II,
3318                                        Value *VectorTripCount, Value *EndValue,
3319                                        BasicBlock *MiddleBlock,
3320                                        BasicBlock *VectorHeader, VPlan &Plan) {
3321   // There are two kinds of external IV usages - those that use the value
3322   // computed in the last iteration (the PHI) and those that use the penultimate
3323   // value (the value that feeds into the phi from the loop latch).
3324   // We allow both, but they, obviously, have different values.
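  // For example (hypothetical LCSSA IR):
  //   loop:
  //     %iv = phi i64 [ 0, %ph ], [ %iv.next, %loop ]
  //     %iv.next = add i64 %iv, 1
  //     ...
  //   exit:
  //     %last = phi i64 [ %iv.next, %loop ]        ; needs EndValue
  //     %penultimate = phi i64 [ %iv, %loop ]      ; needs EndValue - Step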
3325 
3326   assert(OrigLoop->getUniqueExitBlock() && "Expected a single exit block");
3327 
3328   DenseMap<Value *, Value *> MissingVals;
3329 
3330   // An external user of the last iteration's value should see the value that
3331   // the remainder loop uses to initialize its own IV.
3332   Value *PostInc = OrigPhi->getIncomingValueForBlock(OrigLoop->getLoopLatch());
3333   for (User *U : PostInc->users()) {
3334     Instruction *UI = cast<Instruction>(U);
3335     if (!OrigLoop->contains(UI)) {
3336       assert(isa<PHINode>(UI) && "Expected LCSSA form");
3337       MissingVals[UI] = EndValue;
3338     }
3339   }
3340 
3341   // An external user of the penultimate value needs to see EndValue - Step.
3342   // The simplest way to get this is to recompute it from the constituent SCEVs,
3343   // that is Start + (Step * (CRD - 1)).
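  // For example, with StartValue = 0, Step = 2 and a vector trip count of 8,
  // the escape value below is 0 + 2 * (8 - 1) = 14.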
3344   for (User *U : OrigPhi->users()) {
3345     auto *UI = cast<Instruction>(U);
3346     if (!OrigLoop->contains(UI)) {
3347       assert(isa<PHINode>(UI) && "Expected LCSSA form");
3348 
3349       IRBuilder<> B(MiddleBlock->getTerminator());
3350 
3351       // Fast-math-flags propagate from the original induction instruction.
3352       if (II.getInductionBinOp() && isa<FPMathOperator>(II.getInductionBinOp()))
3353         B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags());
3354 
3355       Value *CountMinusOne = B.CreateSub(
3356           VectorTripCount, ConstantInt::get(VectorTripCount->getType(), 1));
3357       CountMinusOne->setName("cmo");
3358       Value *Step = CreateStepValue(II.getStep(), *PSE.getSE(),
3359                                     VectorHeader->getTerminator());
3360       Value *Escape =
3361           emitTransformedIndex(B, CountMinusOne, II.getStartValue(), Step, II);
3362       Escape->setName("ind.escape");
3363       MissingVals[UI] = Escape;
3364     }
3365   }
3366 
3367   for (auto &I : MissingVals) {
3368     PHINode *PHI = cast<PHINode>(I.first);
3369     // One corner case we have to handle is two IVs "chasing" each other,
3370     // that is %IV2 = phi [...], [ %IV1, %latch ]
3371     // In this case, if IV1 has an external use, we need to avoid adding both
3372     // "last value of IV1" and "penultimate value of IV2". So, verify that we
3373     // don't already have an incoming value for the middle block.
3374     if (PHI->getBasicBlockIndex(MiddleBlock) == -1) {
3375       PHI->addIncoming(I.second, MiddleBlock);
3376       Plan.removeLiveOut(PHI);
3377     }
3378   }
3379 }
3380 
3381 namespace {
3382 
3383 struct CSEDenseMapInfo {
3384   static bool canHandle(const Instruction *I) {
3385     return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) ||
3386            isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I);
3387   }
3388 
3389   static inline Instruction *getEmptyKey() {
3390     return DenseMapInfo<Instruction *>::getEmptyKey();
3391   }
3392 
3393   static inline Instruction *getTombstoneKey() {
3394     return DenseMapInfo<Instruction *>::getTombstoneKey();
3395   }
3396 
3397   static unsigned getHashValue(const Instruction *I) {
3398     assert(canHandle(I) && "Unknown instruction!");
3399     return hash_combine(I->getOpcode(), hash_combine_range(I->value_op_begin(),
3400                                                            I->value_op_end()));
3401   }
3402 
3403   static bool isEqual(const Instruction *LHS, const Instruction *RHS) {
3404     if (LHS == getEmptyKey() || RHS == getEmptyKey() ||
3405         LHS == getTombstoneKey() || RHS == getTombstoneKey())
3406       return LHS == RHS;
3407     return LHS->isIdenticalTo(RHS);
3408   }
3409 };
3410 
3411 } // end anonymous namespace
3412 
3413 /// Perform CSE of induction variable instructions.
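/// For example, two identical instructions in the block such as
///   %a = extractelement <4 x i64> %vec, i32 0
///   %b = extractelement <4 x i64> %vec, i32 0
/// are folded: the second is replaced by the first and erased.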
3414 static void cse(BasicBlock *BB) {
3415   // Perform simple cse.
3416   SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap;
3417   for (Instruction &In : llvm::make_early_inc_range(*BB)) {
3418     if (!CSEDenseMapInfo::canHandle(&In))
3419       continue;
3420 
3421     // Check if we can replace this instruction with any of the
3422     // visited instructions.
3423     if (Instruction *V = CSEMap.lookup(&In)) {
3424       In.replaceAllUsesWith(V);
3425       In.eraseFromParent();
3426       continue;
3427     }
3428 
3429     CSEMap[&In] = &In;
3430   }
3431 }
3432 
3433 InstructionCost
3434 LoopVectorizationCostModel::getVectorCallCost(CallInst *CI, ElementCount VF,
3435                                               bool &NeedToScalarize) const {
3436   Function *F = CI->getCalledFunction();
3437   Type *ScalarRetTy = CI->getType();
3438   SmallVector<Type *, 4> Tys, ScalarTys;
3439   for (auto &ArgOp : CI->args())
3440     ScalarTys.push_back(ArgOp->getType());
3441 
3442   // Estimate cost of scalarized vector call. The source operands are assumed
3443   // to be vectors, so we need to extract individual elements from there,
3444   // execute VF scalar calls, and then gather the result into the vector return
3445   // value.
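  // Roughly (an illustrative sketch, not the exact TTI accounting):
  //   ScalarizedCost = VF * ScalarCallCost + ScalarizationOverhead
  // e.g. with VF = 4, a scalar call cost of 10 and an overhead of 8, the
  // estimate computed below is 4 * 10 + 8 = 48.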
3446   TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
3447   InstructionCost ScalarCallCost =
3448       TTI.getCallInstrCost(F, ScalarRetTy, ScalarTys, CostKind);
3449   if (VF.isScalar())
3450     return ScalarCallCost;
3451 
3452   // Compute corresponding vector type for return value and arguments.
3453   Type *RetTy = ToVectorTy(ScalarRetTy, VF);
3454   for (Type *ScalarTy : ScalarTys)
3455     Tys.push_back(ToVectorTy(ScalarTy, VF));
3456 
3457   // Compute costs of unpacking argument values for the scalar calls and
3458   // packing the return values to a vector.
3459   InstructionCost ScalarizationCost =
3460       getScalarizationOverhead(CI, VF, CostKind);
3461 
3462   InstructionCost Cost =
3463       ScalarCallCost * VF.getKnownMinValue() + ScalarizationCost;
3464 
3465   // If we can't emit a vector call for this function, then the currently found
3466   // cost is the cost we need to return.
3467   NeedToScalarize = true;
3468   VFShape Shape = VFShape::get(*CI, VF, false /*HasGlobalPred*/);
3469   Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);
3470 
3471   if (!TLI || CI->isNoBuiltin() || !VecFunc)
3472     return Cost;
3473 
3474   // If the corresponding vector cost is cheaper, return its cost.
3475   InstructionCost VectorCallCost =
3476       TTI.getCallInstrCost(nullptr, RetTy, Tys, CostKind);
3477   if (VectorCallCost < Cost) {
3478     NeedToScalarize = false;
3479     Cost = VectorCallCost;
3480   }
3481   return Cost;
3482 }
3483 
3484 static Type *MaybeVectorizeType(Type *Elt, ElementCount VF) {
3485   if (VF.isScalar() || (!Elt->isIntOrPtrTy() && !Elt->isFloatingPointTy()))
3486     return Elt;
3487   return VectorType::get(Elt, VF);
3488 }
3489 
3490 InstructionCost
3491 LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI,
3492                                                    ElementCount VF) const {
3493   Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
3494   assert(ID && "Expected intrinsic call!");
3495   Type *RetTy = MaybeVectorizeType(CI->getType(), VF);
3496   FastMathFlags FMF;
3497   if (auto *FPMO = dyn_cast<FPMathOperator>(CI))
3498     FMF = FPMO->getFastMathFlags();
3499 
3500   SmallVector<const Value *> Arguments(CI->args());
3501   FunctionType *FTy = CI->getCalledFunction()->getFunctionType();
3502   SmallVector<Type *> ParamTys;
3503   std::transform(FTy->param_begin(), FTy->param_end(),
3504                  std::back_inserter(ParamTys),
3505                  [&](Type *Ty) { return MaybeVectorizeType(Ty, VF); });
3506 
3507   IntrinsicCostAttributes CostAttrs(ID, RetTy, Arguments, ParamTys, FMF,
3508                                     dyn_cast<IntrinsicInst>(CI));
3509   return TTI.getIntrinsicInstrCost(CostAttrs,
3510                                    TargetTransformInfo::TCK_RecipThroughput);
3511 }
3512 
3513 static Type *smallestIntegerVectorType(Type *T1, Type *T2) {
3514   auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType());
3515   auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType());
3516   return I1->getBitWidth() < I2->getBitWidth() ? T1 : T2;
3517 }
3518 
3519 static Type *largestIntegerVectorType(Type *T1, Type *T2) {
3520   auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType());
3521   auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType());
3522   return I1->getBitWidth() > I2->getBitWidth() ? T1 : T2;
3523 }
3524 
3525 void InnerLoopVectorizer::truncateToMinimalBitwidths(VPTransformState &State) {
3526   // For every instruction `I` in MinBWs, truncate the operands, create a
3527   // truncated version of `I` and reextend its result. InstCombine runs
3528   // later and will remove any ext/trunc pairs.
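  // For example (illustrative shorthand, VF = 4, minimal bitwidth = 8):
  //   %a = add <4 x i32> %x, %y
  // becomes
  //   %x.tr = trunc <4 x i32> %x to <4 x i8>
  //   %y.tr = trunc <4 x i32> %y to <4 x i8>
  //   %a.tr = add <4 x i8> %x.tr, %y.tr
  //   %a    = zext <4 x i8> %a.tr to <4 x i32>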
3529   SmallPtrSet<Value *, 4> Erased;
3530   for (const auto &KV : Cost->getMinimalBitwidths()) {
3531     // If the value wasn't vectorized, we must maintain the original scalar
3532     // type. The absence of the value from State indicates that it
3533     // wasn't vectorized.
3534     // FIXME: Should not rely on getVPValue at this point.
3535     VPValue *Def = State.Plan->getVPValue(KV.first, true);
3536     if (!State.hasAnyVectorValue(Def))
3537       continue;
3538     for (unsigned Part = 0; Part < UF; ++Part) {
3539       Value *I = State.get(Def, Part);
3540       if (Erased.count(I) || I->use_empty() || !isa<Instruction>(I))
3541         continue;
3542       Type *OriginalTy = I->getType();
3543       Type *ScalarTruncatedTy =
3544           IntegerType::get(OriginalTy->getContext(), KV.second);
3545       auto *TruncatedTy = VectorType::get(
3546           ScalarTruncatedTy, cast<VectorType>(OriginalTy)->getElementCount());
3547       if (TruncatedTy == OriginalTy)
3548         continue;
3549 
3550       IRBuilder<> B(cast<Instruction>(I));
3551       auto ShrinkOperand = [&](Value *V) -> Value * {
3552         if (auto *ZI = dyn_cast<ZExtInst>(V))
3553           if (ZI->getSrcTy() == TruncatedTy)
3554             return ZI->getOperand(0);
3555         return B.CreateZExtOrTrunc(V, TruncatedTy);
3556       };
3557 
3558       // The actual instruction modification depends on the instruction type,
3559       // unfortunately.
3560       Value *NewI = nullptr;
3561       if (auto *BO = dyn_cast<BinaryOperator>(I)) {
3562         NewI = B.CreateBinOp(BO->getOpcode(), ShrinkOperand(BO->getOperand(0)),
3563                              ShrinkOperand(BO->getOperand(1)));
3564 
3565         // Any wrapping introduced by shrinking this operation shouldn't be
3566         // considered undefined behavior. So, we can't unconditionally copy
3567         // arithmetic wrapping flags to NewI.
3568         cast<BinaryOperator>(NewI)->copyIRFlags(I, /*IncludeWrapFlags=*/false);
3569       } else if (auto *CI = dyn_cast<ICmpInst>(I)) {
3570         NewI =
3571             B.CreateICmp(CI->getPredicate(), ShrinkOperand(CI->getOperand(0)),
3572                          ShrinkOperand(CI->getOperand(1)));
3573       } else if (auto *SI = dyn_cast<SelectInst>(I)) {
3574         NewI = B.CreateSelect(SI->getCondition(),
3575                               ShrinkOperand(SI->getTrueValue()),
3576                               ShrinkOperand(SI->getFalseValue()));
3577       } else if (auto *CI = dyn_cast<CastInst>(I)) {
3578         switch (CI->getOpcode()) {
3579         default:
3580           llvm_unreachable("Unhandled cast!");
3581         case Instruction::Trunc:
3582           NewI = ShrinkOperand(CI->getOperand(0));
3583           break;
3584         case Instruction::SExt:
3585           NewI = B.CreateSExtOrTrunc(
3586               CI->getOperand(0),
3587               smallestIntegerVectorType(OriginalTy, TruncatedTy));
3588           break;
3589         case Instruction::ZExt:
3590           NewI = B.CreateZExtOrTrunc(
3591               CI->getOperand(0),
3592               smallestIntegerVectorType(OriginalTy, TruncatedTy));
3593           break;
3594         }
3595       } else if (auto *SI = dyn_cast<ShuffleVectorInst>(I)) {
3596         auto Elements0 =
3597             cast<VectorType>(SI->getOperand(0)->getType())->getElementCount();
3598         auto *O0 = B.CreateZExtOrTrunc(
3599             SI->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements0));
3600         auto Elements1 =
3601             cast<VectorType>(SI->getOperand(1)->getType())->getElementCount();
3602         auto *O1 = B.CreateZExtOrTrunc(
3603             SI->getOperand(1), VectorType::get(ScalarTruncatedTy, Elements1));
3604 
3605         NewI = B.CreateShuffleVector(O0, O1, SI->getShuffleMask());
3606       } else if (isa<LoadInst>(I) || isa<PHINode>(I)) {
3607         // Don't do anything with the operands, just extend the result.
3608         continue;
3609       } else if (auto *IE = dyn_cast<InsertElementInst>(I)) {
3610         auto Elements =
3611             cast<VectorType>(IE->getOperand(0)->getType())->getElementCount();
3612         auto *O0 = B.CreateZExtOrTrunc(
3613             IE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements));
3614         auto *O1 = B.CreateZExtOrTrunc(IE->getOperand(1), ScalarTruncatedTy);
3615         NewI = B.CreateInsertElement(O0, O1, IE->getOperand(2));
3616       } else if (auto *EE = dyn_cast<ExtractElementInst>(I)) {
3617         auto Elements =
3618             cast<VectorType>(EE->getOperand(0)->getType())->getElementCount();
3619         auto *O0 = B.CreateZExtOrTrunc(
3620             EE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements));
3621         NewI = B.CreateExtractElement(O0, EE->getOperand(2));
3622       } else {
3623         // If we don't know what to do, be conservative and don't do anything.
3624         continue;
3625       }
3626 
3627       // Lastly, extend the result.
3628       NewI->takeName(cast<Instruction>(I));
3629       Value *Res = B.CreateZExtOrTrunc(NewI, OriginalTy);
3630       I->replaceAllUsesWith(Res);
3631       cast<Instruction>(I)->eraseFromParent();
3632       Erased.insert(I);
3633       State.reset(Def, Res, Part);
3634     }
3635   }
3636 
3637   // We'll have created a bunch of ZExts that are now parentless. Clean up.
3638   for (const auto &KV : Cost->getMinimalBitwidths()) {
3639     // If the value wasn't vectorized, we must maintain the original scalar
3640     // type. The absence of the value from State indicates that it
3641     // wasn't vectorized.
3642     // FIXME: Should not rely on getVPValue at this point.
3643     VPValue *Def = State.Plan->getVPValue(KV.first, true);
3644     if (!State.hasAnyVectorValue(Def))
3645       continue;
3646     for (unsigned Part = 0; Part < UF; ++Part) {
3647       Value *I = State.get(Def, Part);
3648       ZExtInst *Inst = dyn_cast<ZExtInst>(I);
3649       if (Inst && Inst->use_empty()) {
3650         Value *NewI = Inst->getOperand(0);
3651         Inst->eraseFromParent();
3652         State.reset(Def, NewI, Part);
3653       }
3654     }
3655   }
3656 }
3657 
3658 void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State,
3659                                             VPlan &Plan) {
3660   // Insert truncates and extends for any truncated instructions as hints to
3661   // InstCombine.
3662   if (VF.isVector())
3663     truncateToMinimalBitwidths(State);
3664 
3665   // Fix widened non-induction PHIs by setting up the PHI operands.
3666   if (EnableVPlanNativePath)
3667     fixNonInductionPHIs(Plan, State);
3668 
3669   // At this point every instruction in the original loop is widened to a
3670   // vector form. Now we need to fix the recurrences in the loop. These PHI
3671   // nodes are currently empty because we did not want to introduce cycles.
3672   // This is the second stage of vectorizing recurrences.
3673   fixCrossIterationPHIs(State);
3674 
3675   // Forget the original basic block.
3676   PSE.getSE()->forgetLoop(OrigLoop);
3677 
3678   VPBasicBlock *LatchVPBB = Plan.getVectorLoopRegion()->getExitingBasicBlock();
3679   Loop *VectorLoop = LI->getLoopFor(State.CFG.VPBB2IRBB[LatchVPBB]);
3680   if (Cost->requiresScalarEpilogue(VF)) {
3681     // No edge from the middle block to the unique exit block has been inserted
3682     // and there is nothing to fix from vector loop; phis should have incoming
3683     // from scalar loop only.
3684     Plan.clearLiveOuts();
3685   } else {
3686     // If we inserted an edge from the middle block to the unique exit block,
3687     // update uses outside the loop (phis) to account for the newly inserted
3688     // edge.
3689 
3690     // Fix-up external users of the induction variables.
3691     for (const auto &Entry : Legal->getInductionVars())
3692       fixupIVUsers(Entry.first, Entry.second,
3693                    getOrCreateVectorTripCount(VectorLoop->getLoopPreheader()),
3694                    IVEndValues[Entry.first], LoopMiddleBlock,
3695                    VectorLoop->getHeader(), Plan);
3696   }
3697 
3698   // Fix LCSSA phis not already fixed earlier. Extracts may need to be generated
3699   // in the exit block, so update the builder.
3700   State.Builder.SetInsertPoint(State.CFG.ExitBB->getFirstNonPHI());
3701   for (const auto &KV : Plan.getLiveOuts())
3702     KV.second->fixPhi(Plan, State);
3703 
3704   for (Instruction *PI : PredicatedInstructions)
3705     sinkScalarOperands(&*PI);
3706 
3707   // Remove redundant induction instructions.
3708   cse(VectorLoop->getHeader());
3709 
3710   // Set/update profile weights for the vector and remainder loops as original
3711   // loop iterations are now distributed among them. Note that original loop
3712   // represented by LoopScalarBody becomes remainder loop after vectorization.
3713   //
3714   // For cases like foldTailByMasking() and requiresScalarEpilogue() we may
3715   // end up with a slightly less accurate result, but that should be OK since
3716   // the profile is not inherently precise anyway. Note also that a possible
3717   // bypass of the vector code caused by legality checks is ignored,
3718   // optimistically assigning all the weight to the vector loop.
3719   //
3720   // For scalable vectorization we can't know at compile time how many
3721   // iterations of the loop are handled in one vector iteration, so instead
3722   // assume a pessimistic vscale of '1'.
3723   setProfileInfoAfterUnrolling(LI->getLoopFor(LoopScalarBody), VectorLoop,
3724                                LI->getLoopFor(LoopScalarBody),
3725                                VF.getKnownMinValue() * UF);
3726 }
3727 
3728 void InnerLoopVectorizer::fixCrossIterationPHIs(VPTransformState &State) {
3729   // In order to support recurrences we need to be able to vectorize Phi nodes.
3730   // Phi nodes have cycles, so we need to vectorize them in two stages. This is
3731   // stage #2: We now need to fix the recurrences by adding incoming edges to
3732   // the currently empty PHI nodes. At this point every instruction in the
3733   // original loop is widened to a vector form so we can use them to construct
3734   // the incoming edges.
3735   VPBasicBlock *Header =
3736       State.Plan->getVectorLoopRegion()->getEntryBasicBlock();
3737   for (VPRecipeBase &R : Header->phis()) {
3738     if (auto *ReductionPhi = dyn_cast<VPReductionPHIRecipe>(&R))
3739       fixReduction(ReductionPhi, State);
3740     else if (auto *FOR = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&R))
3741       fixFixedOrderRecurrence(FOR, State);
3742   }
3743 }
3744 
3745 void InnerLoopVectorizer::fixFixedOrderRecurrence(
3746     VPFirstOrderRecurrencePHIRecipe *PhiR, VPTransformState &State) {
3747   // This is the second phase of vectorizing first-order recurrences. An
3748   // overview of the transformation is described below. Suppose we have the
3749   // following loop.
3750   //
3751   //   for (int i = 0; i < n; ++i)
3752   //     b[i] = a[i] - a[i - 1];
3753   //
3754   // There is a first-order recurrence on "a". For this loop, the shorthand
3755   // scalar IR looks like:
3756   //
3757   //   scalar.ph:
3758   //     s_init = a[-1]
3759   //     br scalar.body
3760   //
3761   //   scalar.body:
3762   //     i = phi [0, scalar.ph], [i+1, scalar.body]
3763   //     s1 = phi [s_init, scalar.ph], [s2, scalar.body]
3764   //     s2 = a[i]
3765   //     b[i] = s2 - s1
3766   //     br cond, scalar.body, ...
3767   //
3768   // In this example, s1 is a recurrence because its value depends on the
3769   // previous iteration. In the first phase of vectorization, we created a
3770   // vector phi v1 for s1. We now complete the vectorization and produce the
3771   // shorthand vector IR shown below (for VF = 4, UF = 1).
3772   //
3773   //   vector.ph:
3774   //     v_init = vector(..., ..., ..., a[-1])
3775   //     br vector.body
3776   //
3777   //   vector.body
3778   //     i = phi [0, vector.ph], [i+4, vector.body]
3779   //     v1 = phi [v_init, vector.ph], [v2, vector.body]
3780   //     v2 = a[i, i+1, i+2, i+3];
3781   //     v3 = vector(v1(3), v2(0, 1, 2))
3782   //     b[i, i+1, i+2, i+3] = v2 - v3
3783   //     br cond, vector.body, middle.block
3784   //
3785   //   middle.block:
3786   //     x = v2(3)
3787   //     br scalar.ph
3788   //
3789   //   scalar.ph:
3790   //     s_init = phi [x, middle.block], [a[-1], otherwise]
3791   //     br scalar.body
3792   //
3793   // After execution completes the vector loop, we extract the next value of
3794   // the recurrence (x) to use as the initial value in the scalar loop.
3795 
3796   // Extract the last vector element in the middle block. This will be the
3797   // initial value for the recurrence when jumping to the scalar loop.
3798   VPValue *PreviousDef = PhiR->getBackedgeValue();
3799   Value *Incoming = State.get(PreviousDef, UF - 1);
3800   auto *ExtractForScalar = Incoming;
3801   auto *IdxTy = Builder.getInt32Ty();
3802   if (VF.isVector()) {
3803     auto *One = ConstantInt::get(IdxTy, 1);
3804     Builder.SetInsertPoint(LoopMiddleBlock->getTerminator());
3805     auto *RuntimeVF = getRuntimeVF(Builder, IdxTy, VF);
3806     auto *LastIdx = Builder.CreateSub(RuntimeVF, One);
3807     ExtractForScalar = Builder.CreateExtractElement(ExtractForScalar, LastIdx,
3808                                                     "vector.recur.extract");
3809   }
3810   // Extract the second-to-last element in the middle block if the
3811   // Phi is used outside the loop. We need to extract the phi itself
3812   // and not the last element (the phi update in the current iteration). This
3813   // will be the value when jumping to the exit block from the LoopMiddleBlock,
3814   // when the scalar loop is not run at all.
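  // For example (illustrative, VF = 4): if the last vector value of the
  // recurrence is <s4, s5, s6, s7>, the scalar loop resumes with s7 (the last
  // lane), while an LCSSA user of the phi itself receives s6 (the
  // second-to-last lane) when the scalar loop is skipped.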
3815   Value *ExtractForPhiUsedOutsideLoop = nullptr;
3816   if (VF.isVector()) {
3817     auto *RuntimeVF = getRuntimeVF(Builder, IdxTy, VF);
3818     auto *Idx = Builder.CreateSub(RuntimeVF, ConstantInt::get(IdxTy, 2));
3819     ExtractForPhiUsedOutsideLoop = Builder.CreateExtractElement(
3820         Incoming, Idx, "vector.recur.extract.for.phi");
3821   } else if (UF > 1)
3822     // When the loop is unrolled without vectorizing, initialize
3823     // ExtractForPhiUsedOutsideLoop with the value just prior to the unrolled
3824     // value of `Incoming`. This is analogous to the vectorized case above:
3825     // extracting the second-to-last element when VF > 1.
3826     ExtractForPhiUsedOutsideLoop = State.get(PreviousDef, UF - 2);
3827 
3828   // Fix the initial value of the original recurrence in the scalar loop.
3829   Builder.SetInsertPoint(&*LoopScalarPreHeader->begin());
3830   PHINode *Phi = cast<PHINode>(PhiR->getUnderlyingValue());
3831   auto *Start = Builder.CreatePHI(Phi->getType(), 2, "scalar.recur.init");
3832   auto *ScalarInit = PhiR->getStartValue()->getLiveInIRValue();
3833   for (auto *BB : predecessors(LoopScalarPreHeader)) {
3834     auto *Incoming = BB == LoopMiddleBlock ? ExtractForScalar : ScalarInit;
3835     Start->addIncoming(Incoming, BB);
3836   }
3837 
3838   Phi->setIncomingValueForBlock(LoopScalarPreHeader, Start);
3839   Phi->setName("scalar.recur");
3840 
3841   // Finally, fix users of the recurrence outside the loop. The users will need
3842   // either the last value of the scalar recurrence or the last value of the
3843   // vector recurrence we extracted in the middle block. Since the loop is in
3844   // LCSSA form, we just need to find all the phi nodes for the original scalar
3845   // recurrence in the exit block, and then add an edge for the middle block.
3846   // Note that LCSSA does not imply single entry when the original scalar loop
3847   // had multiple exiting edges (as we always run the last iteration in the
3848   // scalar epilogue); in that case, there is no edge from middle to exit
3849   // and thus no phis which need to be updated.
3850   if (!Cost->requiresScalarEpilogue(VF))
3851     for (PHINode &LCSSAPhi : LoopExitBlock->phis())
3852       if (llvm::is_contained(LCSSAPhi.incoming_values(), Phi)) {
3853         LCSSAPhi.addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock);
3854         State.Plan->removeLiveOut(&LCSSAPhi);
3855       }
3856 }
3857 
3858 void InnerLoopVectorizer::fixReduction(VPReductionPHIRecipe *PhiR,
3859                                        VPTransformState &State) {
3860   PHINode *OrigPhi = cast<PHINode>(PhiR->getUnderlyingValue());
3861   // Get its reduction variable descriptor.
3862   assert(Legal->isReductionVariable(OrigPhi) &&
3863          "Unable to find the reduction variable");
3864   const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor();
3865 
3866   RecurKind RK = RdxDesc.getRecurrenceKind();
3867   TrackingVH<Value> ReductionStartValue = RdxDesc.getRecurrenceStartValue();
3868   Instruction *LoopExitInst = RdxDesc.getLoopExitInstr();
3869   State.setDebugLocFromInst(ReductionStartValue);
3870 
3871   VPValue *LoopExitInstDef = PhiR->getBackedgeValue();
3872   // This is the vector-clone of the value that leaves the loop.
3873   Type *VecTy = State.get(LoopExitInstDef, 0)->getType();
3874 
3875   // Wrap flags are in general invalid after vectorization, clear them.
3876   clearReductionWrapFlags(PhiR, State);
3877 
3878   // Before each round, move the insertion point right between
3879   // the PHIs and the values we are going to write.
3880   // This allows us to write both PHINodes and the extractelement
3881   // instructions.
3882   Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());
3883 
3884   State.setDebugLocFromInst(LoopExitInst);
3885 
3886   Type *PhiTy = OrigPhi->getType();
3887 
3888   VPBasicBlock *LatchVPBB =
3889       PhiR->getParent()->getEnclosingLoopRegion()->getExitingBasicBlock();
3890   BasicBlock *VectorLoopLatch = State.CFG.VPBB2IRBB[LatchVPBB];
3891   // If tail is folded by masking, the vector value to leave the loop should be
3892   // a Select choosing between the vectorized LoopExitInst and vectorized Phi,
3893   // rather than the LoopExitInst alone. For an in-loop reduction the reduction
3894   // will already be predicated, and does not need to be handled here.
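  // Illustrative shorthand of the expected pattern (names are made up):
  //   %sel = select <4 x i1> %mask, <4 x i32> %rdx.next, <4 x i32> %rdx.phi
  // where %rdx.next is the vectorized LoopExitInst and %rdx.phi the vectorized
  // reduction phi.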
3895   if (Cost->foldTailByMasking() && !PhiR->isInLoop()) {
3896     for (unsigned Part = 0; Part < UF; ++Part) {
3897       Value *VecLoopExitInst = State.get(LoopExitInstDef, Part);
3898       SelectInst *Sel = nullptr;
3899       for (User *U : VecLoopExitInst->users()) {
3900         if (isa<SelectInst>(U)) {
3901           assert(!Sel && "Reduction exit feeding two selects");
3902           Sel = cast<SelectInst>(U);
3903         } else
3904           assert(isa<PHINode>(U) && "Reduction exit must feed Phi's or select");
3905       }
3906       assert(Sel && "Reduction exit feeds no select");
3907       State.reset(LoopExitInstDef, Sel, Part);
3908 
3909       if (isa<FPMathOperator>(Sel))
3910         Sel->setFastMathFlags(RdxDesc.getFastMathFlags());
3911 
3912       // If the target can create a predicated operator for the reduction at no
3913       // extra cost in the loop (for example a predicated vadd), it can be
3914       // cheaper for the select to remain in the loop than be sunk out of it,
3915       // and so use the select value for the phi instead of the old
3916       // LoopExitValue.
3917       if (PreferPredicatedReductionSelect ||
3918           TTI->preferPredicatedReductionSelect(
3919               RdxDesc.getOpcode(), PhiTy,
3920               TargetTransformInfo::ReductionFlags())) {
3921         auto *VecRdxPhi = cast<PHINode>(State.get(PhiR, Part));
3923         VecRdxPhi->setIncomingValueForBlock(VectorLoopLatch, Sel);
3924       }
3925     }
3926   }
3927 
3928   // If the vector reduction can be performed in a smaller type, we truncate
3929   // then extend the loop exit value to enable InstCombine to evaluate the
3930   // entire expression in the smaller type.
3931   if (VF.isVector() && PhiTy != RdxDesc.getRecurrenceType()) {
3932     assert(!PhiR->isInLoop() && "Unexpected truncated inloop reduction!");
3933     Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), VF);
3934     Builder.SetInsertPoint(VectorLoopLatch->getTerminator());
3935     VectorParts RdxParts(UF);
3936     for (unsigned Part = 0; Part < UF; ++Part) {
3937       RdxParts[Part] = State.get(LoopExitInstDef, Part);
3938       Value *Trunc = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
3939       Value *Extnd = RdxDesc.isSigned() ? Builder.CreateSExt(Trunc, VecTy)
3940                                         : Builder.CreateZExt(Trunc, VecTy);
3941       for (User *U : llvm::make_early_inc_range(RdxParts[Part]->users()))
3942         if (U != Trunc) {
3943           U->replaceUsesOfWith(RdxParts[Part], Extnd);
3944           RdxParts[Part] = Extnd;
3945         }
3946     }
3947     Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());
3948     for (unsigned Part = 0; Part < UF; ++Part) {
3949       RdxParts[Part] = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
3950       State.reset(LoopExitInstDef, RdxParts[Part], Part);
3951     }
3952   }
3953 
3954   // Reduce all of the unrolled parts into a single vector.
3955   Value *ReducedPartRdx = State.get(LoopExitInstDef, 0);
3956   unsigned Op = RecurrenceDescriptor::getOpcode(RK);
3957 
3958   // The middle block terminator has already been assigned a DebugLoc here (the
3959   // OrigLoop's single latch terminator). We want the whole middle block to
3960   // appear to execute on this line because: (a) it is all compiler generated,
3961   // (b) these instructions are always executed after evaluating the latch
3962   // conditional branch, and (c) other passes may add new predecessors which
3963   // terminate on this line. This is the easiest way to ensure we don't
3964   // accidentally cause an extra step back into the loop while debugging.
3965   State.setDebugLocFromInst(LoopMiddleBlock->getTerminator());
3966   if (PhiR->isOrdered())
3967     ReducedPartRdx = State.get(LoopExitInstDef, UF - 1);
3968   else {
3969     // Floating-point operations should have some FMF to enable the reduction.
3970     IRBuilderBase::FastMathFlagGuard FMFG(Builder);
3971     Builder.setFastMathFlags(RdxDesc.getFastMathFlags());
3972     for (unsigned Part = 1; Part < UF; ++Part) {
3973       Value *RdxPart = State.get(LoopExitInstDef, Part);
3974       if (Op != Instruction::ICmp && Op != Instruction::FCmp) {
3975         ReducedPartRdx = Builder.CreateBinOp(
3976             (Instruction::BinaryOps)Op, RdxPart, ReducedPartRdx, "bin.rdx");
3977       } else if (RecurrenceDescriptor::isSelectCmpRecurrenceKind(RK))
3978         ReducedPartRdx = createSelectCmpOp(Builder, ReductionStartValue, RK,
3979                                            ReducedPartRdx, RdxPart);
3980       else
3981         ReducedPartRdx = createMinMaxOp(Builder, RK, ReducedPartRdx, RdxPart);
3982     }
3983   }
3984 
3985   // Create the reduction after the loop. Note that inloop reductions create the
3986   // target reduction in the loop using a Reduction recipe.
3987   if (VF.isVector() && !PhiR->isInLoop()) {
3988     ReducedPartRdx =
3989         createTargetReduction(Builder, TTI, RdxDesc, ReducedPartRdx, OrigPhi);
3990     // If the reduction can be performed in a smaller type, we need to extend
3991     // the reduction to the wider type before we branch to the original loop.
3992     if (PhiTy != RdxDesc.getRecurrenceType())
3993       ReducedPartRdx = RdxDesc.isSigned()
3994                            ? Builder.CreateSExt(ReducedPartRdx, PhiTy)
3995                            : Builder.CreateZExt(ReducedPartRdx, PhiTy);
3996   }
3997 
3998   PHINode *ResumePhi =
3999       dyn_cast<PHINode>(PhiR->getStartValue()->getUnderlyingValue());
4000 
4001   // Create a phi node that merges control-flow from the backedge-taken check
4002   // block and the middle block.
4003   PHINode *BCBlockPhi = PHINode::Create(PhiTy, 2, "bc.merge.rdx",
4004                                         LoopScalarPreHeader->getTerminator());
4005 
4006   // If we are fixing reductions in the epilogue loop then we should already
4007   // have created a bc.merge.rdx Phi after the main vector body. Ensure that
4008   // we carry over the incoming values correctly.
4009   for (auto *Incoming : predecessors(LoopScalarPreHeader)) {
4010     if (Incoming == LoopMiddleBlock)
4011       BCBlockPhi->addIncoming(ReducedPartRdx, Incoming);
4012     else if (ResumePhi && llvm::is_contained(ResumePhi->blocks(), Incoming))
4013       BCBlockPhi->addIncoming(ResumePhi->getIncomingValueForBlock(Incoming),
4014                               Incoming);
4015     else
4016       BCBlockPhi->addIncoming(ReductionStartValue, Incoming);
4017   }
4018 
4019   // Set the resume value for this reduction
4020   ReductionResumeValues.insert({&RdxDesc, BCBlockPhi});
4021 
4022   // If there were stores of the reduction value to a uniform memory address
4023   // inside the loop, create the final store here.
4024   if (StoreInst *SI = RdxDesc.IntermediateStore) {
4025     StoreInst *NewSI =
4026         Builder.CreateStore(ReducedPartRdx, SI->getPointerOperand());
4027     propagateMetadata(NewSI, SI);
4028 
4029     // If the reduction value is used in other places,
4030     // then let the code below create PHI's for that.
4031   }
4032 
4033   // Now, we need to fix the users of the reduction variable
4034   // inside and outside of the scalar remainder loop.
4035 
4036   // We know that the loop is in LCSSA form. We need to update the PHI nodes
4037   // in the exit blocks.  See comment on analogous loop in
4038   // fixFixedOrderRecurrence for a more complete explanation of the logic.
4039   if (!Cost->requiresScalarEpilogue(VF))
4040     for (PHINode &LCSSAPhi : LoopExitBlock->phis())
4041       if (llvm::is_contained(LCSSAPhi.incoming_values(), LoopExitInst)) {
4042         LCSSAPhi.addIncoming(ReducedPartRdx, LoopMiddleBlock);
4043         State.Plan->removeLiveOut(&LCSSAPhi);
4044       }
4045 
4046   // Fix the scalar loop reduction variable with the incoming reduction sum
4047   // from the vector body and from the backedge value.
4048   int IncomingEdgeBlockIdx =
4049       OrigPhi->getBasicBlockIndex(OrigLoop->getLoopLatch());
4050   assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index");
4051   // Pick the other block.
4052   int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1);
4053   OrigPhi->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi);
4054   OrigPhi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst);
4055 }
4056 
4057 void InnerLoopVectorizer::clearReductionWrapFlags(VPReductionPHIRecipe *PhiR,
4058                                                   VPTransformState &State) {
4059   const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor();
4060   RecurKind RK = RdxDesc.getRecurrenceKind();
4061   if (RK != RecurKind::Add && RK != RecurKind::Mul)
4062     return;
4063 
4064   SmallVector<VPValue *, 8> Worklist;
4065   SmallPtrSet<VPValue *, 8> Visited;
4066   Worklist.push_back(PhiR);
4067   Visited.insert(PhiR);
4068 
4069   while (!Worklist.empty()) {
4070     VPValue *Cur = Worklist.pop_back_val();
4071     for (unsigned Part = 0; Part < UF; ++Part) {
4072       Value *V = State.get(Cur, Part);
4073       if (!isa<OverflowingBinaryOperator>(V))
4074         break;
4075       cast<Instruction>(V)->dropPoisonGeneratingFlags();
4076     }
4077 
4078     for (VPUser *U : Cur->users()) {
4079       auto *UserRecipe = dyn_cast<VPRecipeBase>(U);
4080       if (!UserRecipe)
4081         continue;
4082       for (VPValue *V : UserRecipe->definedValues())
4083         if (Visited.insert(V).second)
4084           Worklist.push_back(V);
4085     }
4086   }
4087 }
4088 
4089 void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) {
4090   // The basic block and loop containing the predicated instruction.
4091   auto *PredBB = PredInst->getParent();
4092   auto *VectorLoop = LI->getLoopFor(PredBB);
4093 
4094   // Initialize a worklist with the operands of the predicated instruction.
4095   SetVector<Value *> Worklist(PredInst->op_begin(), PredInst->op_end());
4096 
4097   // Holds instructions that we need to analyze again. An instruction may be
4098   // reanalyzed if we don't yet know if we can sink it or not.
4099   SmallVector<Instruction *, 8> InstsToReanalyze;
4100 
4101   // Returns true if a given use occurs in the predicated block. Phi nodes use
4102   // their operands in their corresponding predecessor blocks.
4103   auto isBlockOfUsePredicated = [&](Use &U) -> bool {
4104     auto *I = cast<Instruction>(U.getUser());
4105     BasicBlock *BB = I->getParent();
4106     if (auto *Phi = dyn_cast<PHINode>(I))
4107       BB = Phi->getIncomingBlock(
4108           PHINode::getIncomingValueNumForOperand(U.getOperandNo()));
4109     return BB == PredBB;
4110   };
4111 
4112   // Iteratively sink the scalarized operands of the predicated instruction
4113   // into the block we created for it. When an instruction is sunk, its
4114   // operands are then added to the worklist. The algorithm ends after one pass
4115   // through the worklist doesn't sink a single instruction.
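  // For example (illustrative): if a predicated store's address computation
  // (say, a getelementptr) is used only inside the predicated block, it is
  // moved there, and its own operands are then reconsidered on the next pass.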
4116   bool Changed;
4117   do {
4118     // Add the instructions that need to be reanalyzed to the worklist, and
4119     // reset the changed indicator.
4120     Worklist.insert(InstsToReanalyze.begin(), InstsToReanalyze.end());
4121     InstsToReanalyze.clear();
4122     Changed = false;
4123 
4124     while (!Worklist.empty()) {
4125       auto *I = dyn_cast<Instruction>(Worklist.pop_back_val());
4126 
4127       // We can't sink an instruction if it is a phi node, is not in the loop,
4128       // or may have side effects.
4129       if (!I || isa<PHINode>(I) || !VectorLoop->contains(I) ||
4130           I->mayHaveSideEffects())
4131         continue;
4132 
4133       // If the instruction is already in PredBB, check if we can sink its
4134       // operands. In that case, VPlan's sinkScalarOperands() succeeded in
4135       // sinking the scalar instruction I, hence it appears in PredBB; but it
4136       // may have failed to sink I's operands (recursively), which we try
4137       // (again) here.
4138       if (I->getParent() == PredBB) {
4139         Worklist.insert(I->op_begin(), I->op_end());
4140         continue;
4141       }
4142 
4143       // It's legal to sink the instruction if all its uses occur in the
4144       // predicated block. Otherwise, there's nothing to do yet, and we may
4145       // need to reanalyze the instruction.
4146       if (!llvm::all_of(I->uses(), isBlockOfUsePredicated)) {
4147         InstsToReanalyze.push_back(I);
4148         continue;
4149       }
4150 
4151       // Move the instruction to the beginning of the predicated block, and add
4152       // its operands to the worklist.
4153       I->moveBefore(&*PredBB->getFirstInsertionPt());
4154       Worklist.insert(I->op_begin(), I->op_end());
4155 
4156       // The sinking may have enabled other instructions to be sunk, so we will
4157       // need to iterate.
4158       Changed = true;
4159     }
4160   } while (Changed);
4161 }
4162 
4163 void InnerLoopVectorizer::fixNonInductionPHIs(VPlan &Plan,
4164                                               VPTransformState &State) {
4165   auto Iter = vp_depth_first_deep(Plan.getEntry());
4166   for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) {
4167     for (VPRecipeBase &P : VPBB->phis()) {
4168       VPWidenPHIRecipe *VPPhi = dyn_cast<VPWidenPHIRecipe>(&P);
4169       if (!VPPhi)
4170         continue;
4171       PHINode *NewPhi = cast<PHINode>(State.get(VPPhi, 0));
4172       // Make sure the builder has a valid insert point.
4173       Builder.SetInsertPoint(NewPhi);
4174       for (unsigned i = 0; i < VPPhi->getNumOperands(); ++i) {
4175         VPValue *Inc = VPPhi->getIncomingValue(i);
4176         VPBasicBlock *VPBB = VPPhi->getIncomingBlock(i);
4177         NewPhi->addIncoming(State.get(Inc, 0), State.CFG.VPBB2IRBB[VPBB]);
4178       }
4179     }
4180   }
4181 }
4182 
4183 bool InnerLoopVectorizer::useOrderedReductions(
4184     const RecurrenceDescriptor &RdxDesc) {
4185   return Cost->useOrderedReductions(RdxDesc);
4186 }
4187 
4188 void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) {
4189   // We should not collect Scalars more than once per VF. Right now, this
4190   // function is called from collectUniformsAndScalars(), which already does
4191   // this check. Collecting Scalars for VF=1 does not make any sense.
4192   assert(VF.isVector() && Scalars.find(VF) == Scalars.end() &&
4193          "This function should not be visited twice for the same VF");
4194 
4195   // This avoids any chances of creating a REPLICATE recipe during planning
4196   // since that would result in generation of scalarized code during execution,
4197   // which is not supported for scalable vectors.
4198   if (VF.isScalable()) {
4199     Scalars[VF].insert(Uniforms[VF].begin(), Uniforms[VF].end());
4200     return;
4201   }
4202 
4203   SmallSetVector<Instruction *, 8> Worklist;
4204 
4205   // These sets are used to seed the analysis with pointers used by memory
4206   // accesses that will remain scalar.
4207   SmallSetVector<Instruction *, 8> ScalarPtrs;
4208   SmallPtrSet<Instruction *, 8> PossibleNonScalarPtrs;
4209   auto *Latch = TheLoop->getLoopLatch();
4210 
4211   // A helper that returns true if the use of Ptr by MemAccess will be scalar.
4212   // The pointer operands of loads and stores will be scalar as long as the
4213   // memory access is not a gather or scatter operation. The value operand of a
4214   // store will remain scalar if the store is scalarized.
4215   auto isScalarUse = [&](Instruction *MemAccess, Value *Ptr) {
4216     InstWidening WideningDecision = getWideningDecision(MemAccess, VF);
4217     assert(WideningDecision != CM_Unknown &&
4218            "Widening decision should be ready at this moment");
4219     if (auto *Store = dyn_cast<StoreInst>(MemAccess))
4220       if (Ptr == Store->getValueOperand())
4221         return WideningDecision == CM_Scalarize;
4222     assert(Ptr == getLoadStorePointerOperand(MemAccess) &&
4223            "Ptr is neither a value nor a pointer operand");
4224     return WideningDecision != CM_GatherScatter;
4225   };
4226 
4227   // A helper that returns true if the given value is a bitcast or
4228   // getelementptr instruction contained in the loop.
4229   auto isLoopVaryingBitCastOrGEP = [&](Value *V) {
4230     return ((isa<BitCastInst>(V) && V->getType()->isPointerTy()) ||
4231             isa<GetElementPtrInst>(V)) &&
4232            !TheLoop->isLoopInvariant(V);
4233   };
4234 
4235   // A helper that evaluates a memory access's use of a pointer. If the use will
4236   // be a scalar use and the pointer is only used by memory accesses, we place
4237   // the pointer in ScalarPtrs. Otherwise, the pointer is placed in
4238   // PossibleNonScalarPtrs.
4239   auto evaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) {
4240     // We only care about bitcast and getelementptr instructions contained in
4241     // the loop.
4242     if (!isLoopVaryingBitCastOrGEP(Ptr))
4243       return;
4244 
4245     // If the pointer has already been identified as scalar (e.g., if it was
4246     // also identified as uniform), there's nothing to do.
4247     auto *I = cast<Instruction>(Ptr);
4248     if (Worklist.count(I))
4249       return;
4250 
4251     // If the use of the pointer will be a scalar use, and all users of the
4252     // pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise,
4253     // place the pointer in PossibleNonScalarPtrs.
4254     if (isScalarUse(MemAccess, Ptr) && llvm::all_of(I->users(), [&](User *U) {
4255           return isa<LoadInst>(U) || isa<StoreInst>(U);
4256         }))
4257       ScalarPtrs.insert(I);
4258     else
4259       PossibleNonScalarPtrs.insert(I);
4260   };
4261 
4262   // We seed the scalars analysis with two classes of instructions: (1)
4263   // instructions marked uniform-after-vectorization and (2) bitcast,
4264   // getelementptr and (pointer) phi instructions used by memory accesses
4265   // requiring a scalar use.
4266   //
4267   // (1) Add to the worklist all instructions that have been identified as
4268   // uniform-after-vectorization.
4269   Worklist.insert(Uniforms[VF].begin(), Uniforms[VF].end());
4270 
4271   // (2) Add to the worklist all bitcast and getelementptr instructions used by
4272   // memory accesses requiring a scalar use. The pointer operands of loads and
4273   // stores will be scalar as long as the memory access is not a gather or
4274   // scatter operation. The value operand of a store will remain scalar if the
4275   // store is scalarized.
4276   for (auto *BB : TheLoop->blocks())
4277     for (auto &I : *BB) {
4278       if (auto *Load = dyn_cast<LoadInst>(&I)) {
4279         evaluatePtrUse(Load, Load->getPointerOperand());
4280       } else if (auto *Store = dyn_cast<StoreInst>(&I)) {
4281         evaluatePtrUse(Store, Store->getPointerOperand());
4282         evaluatePtrUse(Store, Store->getValueOperand());
4283       }
4284     }
4285   for (auto *I : ScalarPtrs)
4286     if (!PossibleNonScalarPtrs.count(I)) {
4287       LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n");
4288       Worklist.insert(I);
4289     }
4290 
4291   // Insert the forced scalars.
4292   // FIXME: Currently VPWidenPHIRecipe() often creates a dead vector
4293   // induction variable when the PHI user is scalarized.
4294   auto ForcedScalar = ForcedScalars.find(VF);
4295   if (ForcedScalar != ForcedScalars.end())
4296     for (auto *I : ForcedScalar->second) {
4297       LLVM_DEBUG(dbgs() << "LV: Found (forced) scalar instruction: " << *I << "\n");
4298       Worklist.insert(I);
4299     }
4300 
4301   // Expand the worklist by looking through any bitcasts and getelementptr
4302   // instructions we've already identified as scalar. This is similar to the
4303   // expansion step in collectLoopUniforms(); however, here we're only
4304   // expanding to include additional bitcasts and getelementptr instructions.
4305   unsigned Idx = 0;
4306   while (Idx != Worklist.size()) {
4307     Instruction *Dst = Worklist[Idx++];
4308     if (!isLoopVaryingBitCastOrGEP(Dst->getOperand(0)))
4309       continue;
4310     auto *Src = cast<Instruction>(Dst->getOperand(0));
4311     if (llvm::all_of(Src->users(), [&](User *U) -> bool {
4312           auto *J = cast<Instruction>(U);
4313           return !TheLoop->contains(J) || Worklist.count(J) ||
4314                  ((isa<LoadInst>(J) || isa<StoreInst>(J)) &&
4315                   isScalarUse(J, Src));
4316         })) {
4317       Worklist.insert(Src);
4318       LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n");
4319     }
4320   }
4321 
4322   // An induction variable will remain scalar if all users of the induction
4323   // variable and induction variable update remain scalar.
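  // For example (illustrative): an induction used only by scalarized memory
  // accesses and by its own increment stays scalar, whereas one whose value
  // must be stored to memory each iteration is widened instead.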
4324   for (const auto &Induction : Legal->getInductionVars()) {
4325     auto *Ind = Induction.first;
4326     auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
4327 
4328     // If tail-folding is applied, the primary induction variable will be used
4329     // to feed a vector compare.
4330     if (Ind == Legal->getPrimaryInduction() && foldTailByMasking())
4331       continue;
4332 
4333     // Returns true if \p Indvar is a pointer induction that is used directly by
4334     // load/store instruction \p I.
4335     auto IsDirectLoadStoreFromPtrIndvar = [&](Instruction *Indvar,
4336                                               Instruction *I) {
4337       return Induction.second.getKind() ==
4338                  InductionDescriptor::IK_PtrInduction &&
4339              (isa<LoadInst>(I) || isa<StoreInst>(I)) &&
4340              Indvar == getLoadStorePointerOperand(I) && isScalarUse(I, Indvar);
4341     };
4342 
4343     // Determine if all users of the induction variable are scalar after
4344     // vectorization.
4345     auto ScalarInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
4346       auto *I = cast<Instruction>(U);
4347       return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) ||
4348              IsDirectLoadStoreFromPtrIndvar(Ind, I);
4349     });
4350     if (!ScalarInd)
4351       continue;
4352 
4353     // Determine if all users of the induction variable update instruction are
4354     // scalar after vectorization.
4355     auto ScalarIndUpdate =
4356         llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
4357           auto *I = cast<Instruction>(U);
4358           return I == Ind || !TheLoop->contains(I) || Worklist.count(I) ||
4359                  IsDirectLoadStoreFromPtrIndvar(IndUpdate, I);
4360         });
4361     if (!ScalarIndUpdate)
4362       continue;
4363 
4364     // The induction variable and its update instruction will remain scalar.
4365     Worklist.insert(Ind);
4366     Worklist.insert(IndUpdate);
4367     LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n");
4368     LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate
4369                       << "\n");
4370   }
4371 
4372   Scalars[VF].insert(Worklist.begin(), Worklist.end());
4373 }
4374 
4375 bool LoopVectorizationCostModel::isScalarWithPredication(
4376     Instruction *I, ElementCount VF) const {
4377   if (!isPredicatedInst(I))
4378     return false;
4379 
4380   // Do we have a non-scalar lowering for this predicated
4381   // instruction? No - it is scalar with predication.
4382   switch(I->getOpcode()) {
4383   default:
4384     return true;
4385   case Instruction::Load:
4386   case Instruction::Store: {
4387     auto *Ptr = getLoadStorePointerOperand(I);
4388     auto *Ty = getLoadStoreType(I);
4389     Type *VTy = Ty;
4390     if (VF.isVector())
4391       VTy = VectorType::get(Ty, VF);
4392     const Align Alignment = getLoadStoreAlignment(I);
4393     return isa<LoadInst>(I) ? !(isLegalMaskedLoad(Ty, Ptr, Alignment) ||
4394                                 TTI.isLegalMaskedGather(VTy, Alignment))
4395                             : !(isLegalMaskedStore(Ty, Ptr, Alignment) ||
4396                                 TTI.isLegalMaskedScatter(VTy, Alignment));
4397   }
4398   case Instruction::UDiv:
4399   case Instruction::SDiv:
4400   case Instruction::SRem:
4401   case Instruction::URem: {
4402     // We have the option to use the safe-divisor idiom to avoid predication.
4403     // The cost based decision here will always select safe-divisor for
4404     // scalable vectors as scalarization isn't legal.
4405     const auto [ScalarCost, SafeDivisorCost] = getDivRemSpeculationCost(I, VF);
4406     return isDivRemScalarWithPredication(ScalarCost, SafeDivisorCost);
4407   }
4408   }
4409 }
4410 
4411 bool LoopVectorizationCostModel::isPredicatedInst(Instruction *I) const {
4412   if (!blockNeedsPredicationForAnyReason(I->getParent()))
4413     return false;
4414 
4415   // Can we prove this instruction is safe to unconditionally execute?
4416   // If not, we must use some form of predication.
4417   switch(I->getOpcode()) {
4418   default:
4419     return false;
4420   case Instruction::Load:
4421   case Instruction::Store: {
4422     if (!Legal->isMaskRequired(I))
4423       return false;
4424     // When we know the load's address is loop invariant and the instruction
4425     // in the original scalar loop was unconditionally executed then we
4426     // don't need to mark it as a predicated instruction. Tail folding may
4427     // introduce additional predication, but we're guaranteed to always have
4428     // at least one active lane.  We call Legal->blockNeedsPredication here
4429     // because it doesn't query tail-folding.  For stores, we must prove both
4430     // speculation safety (which follows from the same argument as loads) and
4431     // that the value being stored is correct.  The easiest form of the latter
4432     // is to require that all values stored are the same.
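    // For example (illustrative): an unconditional "*p = x;" in the loop body,
    // with loop-invariant p and x, stays unpredicated even when the tail is
    // folded by masking, whereas "*p = i;" must be predicated because the
    // stored value differs per iteration.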
4433     if (Legal->isUniformMemOp(*I) &&
4434         (isa<LoadInst>(I) ||
4435          (isa<StoreInst>(I) &&
4436           TheLoop->isLoopInvariant(cast<StoreInst>(I)->getValueOperand()))) &&
4437         !Legal->blockNeedsPredication(I->getParent()))
4438       return false;
4439     return true;
4440   }
4441   case Instruction::UDiv:
4442   case Instruction::SDiv:
4443   case Instruction::SRem:
4444   case Instruction::URem:
4445     // TODO: We can use the loop preheader as the context point here and get
4446     // context-sensitive reasoning.
4447     return !isSafeToSpeculativelyExecute(I);
4448   }
4449 }
4450 
4451 std::pair<InstructionCost, InstructionCost>
4452 LoopVectorizationCostModel::getDivRemSpeculationCost(Instruction *I,
4453                                                     ElementCount VF) const {
4454   assert(I->getOpcode() == Instruction::UDiv ||
4455          I->getOpcode() == Instruction::SDiv ||
4456          I->getOpcode() == Instruction::SRem ||
4457          I->getOpcode() == Instruction::URem);
4458   assert(!isSafeToSpeculativelyExecute(I));
4459 
4460   const TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
4461 
4462   // Scalarization isn't legal for scalable vector types
4463   InstructionCost ScalarizationCost = InstructionCost::getInvalid();
4464   if (!VF.isScalable()) {
4465     // Get the scalarization cost and scale this amount by the probability of
4466     // executing the predicated block. If the instruction is not predicated,
4467     // we fall through to the next case.
4468     ScalarizationCost = 0;
4469 
4470     // These instructions have a non-void type, so account for the phi nodes
4471     // that we will create. This cost is likely to be zero. The phi node
4472     // cost, if any, should be scaled by the block probability because it
4473     // models a copy at the end of each predicated block.
4474     ScalarizationCost += VF.getKnownMinValue() *
4475       TTI.getCFInstrCost(Instruction::PHI, CostKind);
4476 
4477     // The cost of the non-predicated instruction.
4478     ScalarizationCost += VF.getKnownMinValue() *
4479       TTI.getArithmeticInstrCost(I->getOpcode(), I->getType(), CostKind);
4480 
4481     // The cost of insertelement and extractelement instructions needed for
4482     // scalarization.
4483     ScalarizationCost += getScalarizationOverhead(I, VF, CostKind);
4484 
4485     // Scale the cost by the probability of executing the predicated blocks.
4486     // This assumes the predicated block for each vector lane is equally
4487     // likely.
4488     ScalarizationCost = ScalarizationCost / getReciprocalPredBlockProb();
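    // For example (illustrative numbers only): with VF = 4, a phi cost of 0, a
    // scalar divide cost of 20, a scalarization overhead of 8 and a reciprocal
    // block probability of 2, the estimate is (4 * (0 + 20) + 8) / 2 = 44.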
4489   }
4490   InstructionCost SafeDivisorCost = 0;
4491 
4492   auto *VecTy = ToVectorTy(I->getType(), VF);
4493 
4494   // The cost of the select guard to ensure all lanes are well defined
4495   // after we speculate above any internal control flow.
4496   SafeDivisorCost += TTI.getCmpSelInstrCost(
4497     Instruction::Select, VecTy,
4498     ToVectorTy(Type::getInt1Ty(I->getContext()), VF),
4499     CmpInst::BAD_ICMP_PREDICATE, CostKind);
4500 
4501   // Certain instructions can be cheaper to vectorize if they have a constant
4502   // second vector operand. One example of this is shifts on x86.
4503   Value *Op2 = I->getOperand(1);
4504   auto Op2Info = TTI.getOperandInfo(Op2);
4505   if (Op2Info.Kind == TargetTransformInfo::OK_AnyValue && Legal->isUniform(Op2))
4506     Op2Info.Kind = TargetTransformInfo::OK_UniformValue;
4507 
4508   SmallVector<const Value *, 4> Operands(I->operand_values());
4509   SafeDivisorCost += TTI.getArithmeticInstrCost(
4510     I->getOpcode(), VecTy, CostKind,
4511     {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
4512     Op2Info, Operands, I);
4513   return {ScalarizationCost, SafeDivisorCost};
4514 }
4515 
4516 bool LoopVectorizationCostModel::interleavedAccessCanBeWidened(
4517     Instruction *I, ElementCount VF) {
4518   assert(isAccessInterleaved(I) && "Expecting interleaved access.");
4519   assert(getWideningDecision(I, VF) == CM_Unknown &&
4520          "Decision should not be set yet.");
4521   auto *Group = getInterleavedAccessGroup(I);
4522   assert(Group && "Must have a group.");
4523 
4524   // If the instruction's allocated size doesn't equal its type size, it
4525   // requires padding and will be scalarized.
4526   auto &DL = I->getModule()->getDataLayout();
4527   auto *ScalarTy = getLoadStoreType(I);
4528   if (hasIrregularType(ScalarTy, DL))
4529     return false;
4530 
4531   // If the group involves a non-integral pointer, we may not be able to
4532   // losslessly cast all values to a common type.
4533   unsigned InterleaveFactor = Group->getFactor();
4534   bool ScalarNI = DL.isNonIntegralPointerType(ScalarTy);
4535   for (unsigned i = 0; i < InterleaveFactor; i++) {
4536     Instruction *Member = Group->getMember(i);
4537     if (!Member)
4538       continue;
4539     auto *MemberTy = getLoadStoreType(Member);
4540     bool MemberNI = DL.isNonIntegralPointerType(MemberTy);
4541     // Don't coerce non-integral pointers to integers or vice versa.
4542     if (MemberNI != ScalarNI) {
4543       // TODO: Consider adding special nullptr value case here
4544       return false;
4545     } else if (MemberNI && ScalarNI &&
4546                ScalarTy->getPointerAddressSpace() !=
4547                MemberTy->getPointerAddressSpace()) {
4548       return false;
4549     }
4550   }
4551 
4552   // Check if masking is required.
4553   // A Group may need masking for one of two reasons: it resides in a block that
4554   // needs predication, or it was decided to use masking to deal with gaps
4555   // (either a gap at the end of a load-access that may result in a speculative
4556   // load, or any gaps in a store-access).
4557   bool PredicatedAccessRequiresMasking =
4558       blockNeedsPredicationForAnyReason(I->getParent()) &&
4559       Legal->isMaskRequired(I);
4560   bool LoadAccessWithGapsRequiresEpilogMasking =
4561       isa<LoadInst>(I) && Group->requiresScalarEpilogue() &&
4562       !isScalarEpilogueAllowed();
4563   bool StoreAccessWithGapsRequiresMasking =
4564       isa<StoreInst>(I) && (Group->getNumMembers() < Group->getFactor());
4565   if (!PredicatedAccessRequiresMasking &&
4566       !LoadAccessWithGapsRequiresEpilogMasking &&
4567       !StoreAccessWithGapsRequiresMasking)
4568     return true;
4569 
4570   // If masked interleaving is required, we expect that the user/target had
4571   // enabled it, because otherwise it either wouldn't have been created or
4572   // it should have been invalidated by the CostModel.
4573   assert(useMaskedInterleavedAccesses(TTI) &&
4574          "Masked interleave-groups for predicated accesses are not enabled.");
4575 
4576   if (Group->isReverse())
4577     return false;
4578 
4579   auto *Ty = getLoadStoreType(I);
4580   const Align Alignment = getLoadStoreAlignment(I);
4581   return isa<LoadInst>(I) ? TTI.isLegalMaskedLoad(Ty, Alignment)
4582                           : TTI.isLegalMaskedStore(Ty, Alignment);
4583 }
4584 
4585 bool LoopVectorizationCostModel::memoryInstructionCanBeWidened(
4586     Instruction *I, ElementCount VF) {
4587   // Get and ensure we have a valid memory instruction.
4588   assert((isa<LoadInst, StoreInst>(I)) && "Invalid memory instruction");
4589 
4590   auto *Ptr = getLoadStorePointerOperand(I);
4591   auto *ScalarTy = getLoadStoreType(I);
4592 
4593   // In order to be widened, the pointer should be consecutive, first of all.
4594   if (!Legal->isConsecutivePtr(ScalarTy, Ptr))
4595     return false;
4596 
4597   // If the instruction is a store located in a predicated block, it will be
4598   // scalarized.
4599   if (isScalarWithPredication(I, VF))
4600     return false;
4601 
4602   // If the instruction's allocated size doesn't equal its type size, it
4603   // requires padding and will be scalarized.
4604   auto &DL = I->getModule()->getDataLayout();
4605   if (hasIrregularType(ScalarTy, DL))
4606     return false;
4607 
4608   return true;
4609 }
4610 
4611 void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) {
4612   // We should not collect Uniforms more than once per VF. Right now,
4613   // this function is called from collectUniformsAndScalars(), which
4614   // already does this check. Collecting Uniforms for VF=1 does not make any
4615   // sense.
4616 
4617   assert(VF.isVector() && Uniforms.find(VF) == Uniforms.end() &&
4618          "This function should not be visited twice for the same VF");
4619 
4620   // Initialize the entry for this VF. Even if no uniform value is found, we
4621   // will not analyze it again; Uniforms.count(VF) will return 1.
4622   Uniforms[VF].clear();
4623 
4624   // We now know that the loop is vectorizable!
4625   // Collect instructions inside the loop that will remain uniform after
4626   // vectorization.
4627 
4628   // Global values, params and instructions outside of the current loop are
4629   // out of scope.
4630   auto isOutOfScope = [&](Value *V) -> bool {
4631     Instruction *I = dyn_cast<Instruction>(V);
4632     return (!I || !TheLoop->contains(I));
4633   };
4634 
4635   // Worklist containing uniform instructions demanding lane 0.
4636   SetVector<Instruction *> Worklist;
4637   BasicBlock *Latch = TheLoop->getLoopLatch();
4638 
4639   // Add uniform instructions demanding lane 0 to the worklist. Instructions
4640   // that are scalar with predication must not be considered uniform after
4641   // vectorization, because that would create an erroneous replicating region
4642   // where only a single instance out of VF should be formed.
4643   // TODO: optimize such seldom cases if found important, see PR40816.
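       // For example, a conditionally executed udiv is scalarized and predicated
       // per lane; treating it as uniform would emit a single instance where VF
       // replicated instances are needed.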
4644   auto addToWorklistIfAllowed = [&](Instruction *I) -> void {
4645     if (isOutOfScope(I)) {
4646       LLVM_DEBUG(dbgs() << "LV: Found not uniform due to scope: "
4647                         << *I << "\n");
4648       return;
4649     }
4650     if (isScalarWithPredication(I, VF)) {
4651       LLVM_DEBUG(dbgs() << "LV: Found not uniform being ScalarWithPredication: "
4652                         << *I << "\n");
4653       return;
4654     }
4655     LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *I << "\n");
4656     Worklist.insert(I);
4657   };
4658 
4659   // Start with the conditional branch. If the branch condition is an
4660   // instruction contained in the loop that is only used by the branch, it is
4661   // uniform.
4662   auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0));
4663   if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse())
4664     addToWorklistIfAllowed(Cmp);
4665 
4666   // Return true if all lanes perform the same memory operation, and we can
4667   // thus choose to execute only one.
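       // E.g. an invariant store such as '*p = c;' with loop-invariant p and c
       // writes the same value to the same address on every iteration, so only
       // lane 0 needs to perform it.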
4668   auto isUniformMemOpUse = [&](Instruction *I) {
4669     if (!Legal->isUniformMemOp(*I))
4670       return false;
4671     if (isa<LoadInst>(I))
4672       // Loading the same address always produces the same result - at least
4673       // assuming aliasing and ordering which have already been checked.
4674       return true;
4675     // Storing the same value on every iteration.
4676     return TheLoop->isLoopInvariant(cast<StoreInst>(I)->getValueOperand());
4677   };
4678 
4679   auto isUniformDecision = [&](Instruction *I, ElementCount VF) {
4680     InstWidening WideningDecision = getWideningDecision(I, VF);
4681     assert(WideningDecision != CM_Unknown &&
4682            "Widening decision should be ready at this moment");
4683 
4684     if (isUniformMemOpUse(I))
4685       return true;
4686 
4687     return (WideningDecision == CM_Widen ||
4688             WideningDecision == CM_Widen_Reverse ||
4689             WideningDecision == CM_Interleave);
4690   };
4691 
4693   // Returns true if Ptr is the pointer operand of a memory access instruction
4694   // I, and I is known to not require scalarization.
4695   auto isVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool {
4696     return getLoadStorePointerOperand(I) == Ptr && isUniformDecision(I, VF);
4697   };
4698 
4699   // Holds a list of values which are known to have at least one uniform use.
4700   // Note that there may be other uses which aren't uniform.  A "uniform use"
4701   // here is something which only demands lane 0 of the unrolled iterations;
4702   // it does not imply that all lanes produce the same value (e.g. this is not
4703   // the usual meaning of uniform)
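       // For example, a pointer feeding only consecutive widened loads has a
       // uniform use: only its lane-0 value is needed to form the vector
       // pointer, even though each lane conceptually addresses a different
       // element.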
4704   SetVector<Value *> HasUniformUse;
4705 
4706   // Scan the loop for instructions which are either a) known to have only
4707   // lane 0 demanded or b) are uses which demand only lane 0 of their operand.
4708   for (auto *BB : TheLoop->blocks())
4709     for (auto &I : *BB) {
4710       if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(&I)) {
4711         switch (II->getIntrinsicID()) {
4712         case Intrinsic::sideeffect:
4713         case Intrinsic::experimental_noalias_scope_decl:
4714         case Intrinsic::assume:
4715         case Intrinsic::lifetime_start:
4716         case Intrinsic::lifetime_end:
4717           if (TheLoop->hasLoopInvariantOperands(&I))
4718             addToWorklistIfAllowed(&I);
4719           break;
4720         default:
4721           break;
4722         }
4723       }
4724 
4725       // ExtractValue instructions must be uniform, because the operands are
4726       // known to be loop-invariant.
4727       if (auto *EVI = dyn_cast<ExtractValueInst>(&I)) {
4728         assert(isOutOfScope(EVI->getAggregateOperand()) &&
4729                "Expected aggregate value to be loop invariant");
4730         addToWorklistIfAllowed(EVI);
4731         continue;
4732       }
4733 
4734       // If there's no pointer operand, there's nothing to do.
4735       auto *Ptr = getLoadStorePointerOperand(&I);
4736       if (!Ptr)
4737         continue;
4738 
4739       if (isUniformMemOpUse(&I))
4740         addToWorklistIfAllowed(&I);
4741 
4742       if (isUniformDecision(&I, VF)) {
4743         assert(isVectorizedMemAccessUse(&I, Ptr) && "consistency check");
4744         HasUniformUse.insert(Ptr);
4745       }
4746     }
4747 
4748   // Add to the worklist any operands which have *only* uniform (e.g. lane 0
4749   // demanding) users.  Since loops are assumed to be in LCSSA form, this
4750   // disallows uses outside the loop as well.
4751   for (auto *V : HasUniformUse) {
4752     if (isOutOfScope(V))
4753       continue;
4754     auto *I = cast<Instruction>(V);
4755     auto UsersAreMemAccesses =
4756       llvm::all_of(I->users(), [&](User *U) -> bool {
4757         return isVectorizedMemAccessUse(cast<Instruction>(U), V);
4758       });
4759     if (UsersAreMemAccesses)
4760       addToWorklistIfAllowed(I);
4761   }
4762 
4763   // Expand Worklist in topological order: whenever a new instruction
4764   // is added, its users should already be inside Worklist. This ensures
4765   // a uniform instruction will only be used by uniform instructions.
4766   unsigned idx = 0;
4767   while (idx != Worklist.size()) {
4768     Instruction *I = Worklist[idx++];
4769 
4770     for (auto *OV : I->operand_values()) {
4771       // isOutOfScope operands cannot be uniform instructions.
4772       if (isOutOfScope(OV))
4773         continue;
4774       // Fixed-order recurrence PHIs should typically be considered
4775       // non-uniform.
4776       auto *OP = dyn_cast<PHINode>(OV);
4777       if (OP && Legal->isFixedOrderRecurrence(OP))
4778         continue;
4779       // If all the users of the operand are uniform, then add the
4780       // operand into the uniform worklist.
4781       auto *OI = cast<Instruction>(OV);
4782       if (llvm::all_of(OI->users(), [&](User *U) -> bool {
4783             auto *J = cast<Instruction>(U);
4784             return Worklist.count(J) || isVectorizedMemAccessUse(J, OI);
4785           }))
4786         addToWorklistIfAllowed(OI);
4787     }
4788   }
4789 
4790   // For an instruction to be added into Worklist above, all its users inside
4791   // the loop should also be in Worklist. However, this condition cannot be
4792   // true for phi nodes that form a cyclic dependence. We must process phi
4793   // nodes separately. An induction variable will remain uniform if all users
4794   // of the induction variable and induction variable update remain uniform.
4795   // The code below handles both pointer and non-pointer induction variables.
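       // For example, an induction variable used only to compute the addresses
       // of consecutive widened loads and stores remains uniform, because only
       // its lane-0 value is needed to form the vector pointers.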
4796   for (const auto &Induction : Legal->getInductionVars()) {
4797     auto *Ind = Induction.first;
4798     auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
4799 
4800     // Determine if all users of the induction variable are uniform after
4801     // vectorization.
4802     auto UniformInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
4803       auto *I = cast<Instruction>(U);
4804       return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) ||
4805              isVectorizedMemAccessUse(I, Ind);
4806     });
4807     if (!UniformInd)
4808       continue;
4809 
4810     // Determine if all users of the induction variable update instruction are
4811     // uniform after vectorization.
4812     auto UniformIndUpdate =
4813         llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
4814           auto *I = cast<Instruction>(U);
4815           return I == Ind || !TheLoop->contains(I) || Worklist.count(I) ||
4816                  isVectorizedMemAccessUse(I, IndUpdate);
4817         });
4818     if (!UniformIndUpdate)
4819       continue;
4820 
4821     // The induction variable and its update instruction will remain uniform.
4822     addToWorklistIfAllowed(Ind);
4823     addToWorklistIfAllowed(IndUpdate);
4824   }
4825 
4826   Uniforms[VF].insert(Worklist.begin(), Worklist.end());
4827 }
4828 
4829 bool LoopVectorizationCostModel::runtimeChecksRequired() {
4830   LLVM_DEBUG(dbgs() << "LV: Performing code size checks.\n");
4831 
4832   if (Legal->getRuntimePointerChecking()->Need) {
4833     reportVectorizationFailure("Runtime ptr check is required with -Os/-Oz",
4834         "runtime pointer checks needed. Enable vectorization of this "
4835         "loop with '#pragma clang loop vectorize(enable)' when "
4836         "compiling with -Os/-Oz",
4837         "CantVersionLoopWithOptForSize", ORE, TheLoop);
4838     return true;
4839   }
4840 
4841   if (!PSE.getPredicate().isAlwaysTrue()) {
4842     reportVectorizationFailure("Runtime SCEV check is required with -Os/-Oz",
4843         "runtime SCEV checks needed. Enable vectorization of this "
4844         "loop with '#pragma clang loop vectorize(enable)' when "
4845         "compiling with -Os/-Oz",
4846         "CantVersionLoopWithOptForSize", ORE, TheLoop);
4847     return true;
4848   }
4849 
4850   // FIXME: Avoid specializing for stride==1 instead of bailing out.
4851   if (!Legal->getLAI()->getSymbolicStrides().empty()) {
4852     reportVectorizationFailure("Runtime stride check for small trip count",
4853         "runtime stride == 1 checks needed. Enable vectorization of "
4854         "this loop without such check by compiling with -Os/-Oz",
4855         "CantVersionLoopWithOptForSize", ORE, TheLoop);
4856     return true;
4857   }
4858 
4859   return false;
4860 }
4861 
4862 ElementCount
4863 LoopVectorizationCostModel::getMaxLegalScalableVF(unsigned MaxSafeElements) {
4864   if (!TTI.supportsScalableVectors() && !ForceTargetSupportsScalableVectors)
4865     return ElementCount::getScalable(0);
4866 
4867   if (Hints->isScalableVectorizationDisabled()) {
4868     reportVectorizationInfo("Scalable vectorization is explicitly disabled",
4869                             "ScalableVectorizationDisabled", ORE, TheLoop);
4870     return ElementCount::getScalable(0);
4871   }
4872 
4873   LLVM_DEBUG(dbgs() << "LV: Scalable vectorization is available\n");
4874 
4875   auto MaxScalableVF = ElementCount::getScalable(
4876       std::numeric_limits<ElementCount::ScalarTy>::max());
4877 
4878   // Test that the loop-vectorizer can legalize all operations for this MaxVF.
4879   // FIXME: While for scalable vectors this is currently sufficient, this should
4880   // be replaced by a more detailed mechanism that filters out specific VFs,
4881   // instead of invalidating vectorization for a whole set of VFs based on the
4882   // MaxVF.
4883 
4884   // Disable scalable vectorization if the loop contains unsupported reductions.
4885   if (!canVectorizeReductions(MaxScalableVF)) {
4886     reportVectorizationInfo(
4887         "Scalable vectorization not supported for the reduction "
4888         "operations found in this loop.",
4889         "ScalableVFUnfeasible", ORE, TheLoop);
4890     return ElementCount::getScalable(0);
4891   }
4892 
4893   // Disable scalable vectorization if the loop contains any instructions
4894   // with element types not supported for scalable vectors.
4895   if (any_of(ElementTypesInLoop, [&](Type *Ty) {
4896         return !Ty->isVoidTy() &&
4897                !this->TTI.isElementTypeLegalForScalableVector(Ty);
4898       })) {
4899     reportVectorizationInfo("Scalable vectorization is not supported "
4900                             "for all element types found in this loop.",
4901                             "ScalableVFUnfeasible", ORE, TheLoop);
4902     return ElementCount::getScalable(0);
4903   }
4904 
4905   if (Legal->isSafeForAnyVectorWidth())
4906     return MaxScalableVF;
4907 
4908   // Limit MaxScalableVF by the maximum safe dependence distance.
4909   std::optional<unsigned> MaxVScale = TTI.getMaxVScale();
4910   if (!MaxVScale && TheFunction->hasFnAttribute(Attribute::VScaleRange))
4911     MaxVScale =
4912         TheFunction->getFnAttribute(Attribute::VScaleRange).getVScaleRangeMax();
4913   MaxScalableVF =
4914       ElementCount::getScalable(MaxVScale ? (MaxSafeElements / *MaxVScale) : 0);
4915   if (!MaxScalableVF)
4916     reportVectorizationInfo(
4917         "Max legal vector width too small, scalable vectorization "
4918         "unfeasible.",
4919         "ScalableVFUnfeasible", ORE, TheLoop);
4920 
4921   return MaxScalableVF;
4922 }
4923 
4924 FixedScalableVFPair LoopVectorizationCostModel::computeFeasibleMaxVF(
4925     unsigned ConstTripCount, ElementCount UserVF, bool FoldTailByMasking) {
4926   MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI);
4927   unsigned SmallestType, WidestType;
4928   std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes();
4929 
4930   // Get the maximum safe dependence distance in bits computed by LAA.
4931   // It is computed by MaxVF * sizeOf(type) * 8, where type is taken from
4932   // the memory access that is most restrictive (involved in the smallest
4933   // dependence distance).
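       // For example, a maximum safe width of 256 bits with a widest type of
       // i32 gives MaxSafeElements = PowerOf2Floor(256 / 32) = 8.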
4934   unsigned MaxSafeElements =
4935       PowerOf2Floor(Legal->getMaxSafeVectorWidthInBits() / WidestType);
4936 
4937   auto MaxSafeFixedVF = ElementCount::getFixed(MaxSafeElements);
4938   auto MaxSafeScalableVF = getMaxLegalScalableVF(MaxSafeElements);
4939 
4940   LLVM_DEBUG(dbgs() << "LV: The max safe fixed VF is: " << MaxSafeFixedVF
4941                     << ".\n");
4942   LLVM_DEBUG(dbgs() << "LV: The max safe scalable VF is: " << MaxSafeScalableVF
4943                     << ".\n");
4944 
4945   // First analyze the UserVF, fall back if the UserVF should be ignored.
4946   if (UserVF) {
4947     auto MaxSafeUserVF =
4948         UserVF.isScalable() ? MaxSafeScalableVF : MaxSafeFixedVF;
4949 
4950     if (ElementCount::isKnownLE(UserVF, MaxSafeUserVF)) {
4951       // If `VF=vscale x N` is safe, then so is `VF=N`
4952       if (UserVF.isScalable())
4953         return FixedScalableVFPair(
4954             ElementCount::getFixed(UserVF.getKnownMinValue()), UserVF);
4955       else
4956         return UserVF;
4957     }
4958 
4959     assert(ElementCount::isKnownGT(UserVF, MaxSafeUserVF));
4960 
4961     // Only clamp if the UserVF is not scalable. If the UserVF is scalable, it
4962     // is better to ignore the hint and let the compiler choose a suitable VF.
4963     if (!UserVF.isScalable()) {
4964       LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
4965                         << " is unsafe, clamping to max safe VF="
4966                         << MaxSafeFixedVF << ".\n");
4967       ORE->emit([&]() {
4968         return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
4969                                           TheLoop->getStartLoc(),
4970                                           TheLoop->getHeader())
4971                << "User-specified vectorization factor "
4972                << ore::NV("UserVectorizationFactor", UserVF)
4973                << " is unsafe, clamping to maximum safe vectorization factor "
4974                << ore::NV("VectorizationFactor", MaxSafeFixedVF);
4975       });
4976       return MaxSafeFixedVF;
4977     }
4978 
4979     if (!TTI.supportsScalableVectors() && !ForceTargetSupportsScalableVectors) {
4980       LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
4981                         << " is ignored because scalable vectors are not "
4982                            "available.\n");
4983       ORE->emit([&]() {
4984         return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
4985                                           TheLoop->getStartLoc(),
4986                                           TheLoop->getHeader())
4987                << "User-specified vectorization factor "
4988                << ore::NV("UserVectorizationFactor", UserVF)
4989                << " is ignored because the target does not support scalable "
4990                   "vectors. The compiler will pick a more suitable value.";
4991       });
4992     } else {
4993       LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
4994                         << " is unsafe. Ignoring scalable UserVF.\n");
4995       ORE->emit([&]() {
4996         return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
4997                                           TheLoop->getStartLoc(),
4998                                           TheLoop->getHeader())
4999                << "User-specified vectorization factor "
5000                << ore::NV("UserVectorizationFactor", UserVF)
5001                << " is unsafe. Ignoring the hint to let the compiler pick a "
5002                   "more suitable value.";
5003       });
5004     }
5005   }
5006 
5007   LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType
5008                     << " / " << WidestType << " bits.\n");
5009 
5010   FixedScalableVFPair Result(ElementCount::getFixed(1),
5011                              ElementCount::getScalable(0));
5012   if (auto MaxVF =
5013           getMaximizedVFForTarget(ConstTripCount, SmallestType, WidestType,
5014                                   MaxSafeFixedVF, FoldTailByMasking))
5015     Result.FixedVF = MaxVF;
5016 
5017   if (auto MaxVF =
5018           getMaximizedVFForTarget(ConstTripCount, SmallestType, WidestType,
5019                                   MaxSafeScalableVF, FoldTailByMasking))
5020     if (MaxVF.isScalable()) {
5021       Result.ScalableVF = MaxVF;
5022       LLVM_DEBUG(dbgs() << "LV: Found feasible scalable VF = " << MaxVF
5023                         << "\n");
5024     }
5025 
5026   return Result;
5027 }
5028 
5029 FixedScalableVFPair
5030 LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
5031   if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) {
5032     // TODO: It may be useful to do this, since the check is still likely to
5033     // be dynamically uniform if the target can skip it.
5034     reportVectorizationFailure(
5035         "Not inserting runtime ptr check for divergent target",
5036         "runtime pointer checks needed. Not enabled for divergent target",
5037         "CantVersionLoopWithDivergentTarget", ORE, TheLoop);
5038     return FixedScalableVFPair::getNone();
5039   }
5040 
5041   unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
5042   LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n');
5043   if (TC == 1) {
5044     reportVectorizationFailure("Single iteration (non) loop",
5045         "loop trip count is one, irrelevant for vectorization",
5046         "SingleIterationLoop", ORE, TheLoop);
5047     return FixedScalableVFPair::getNone();
5048   }
5049 
5050   switch (ScalarEpilogueStatus) {
5051   case CM_ScalarEpilogueAllowed:
5052     return computeFeasibleMaxVF(TC, UserVF, false);
5053   case CM_ScalarEpilogueNotAllowedUsePredicate:
5054     [[fallthrough]];
5055   case CM_ScalarEpilogueNotNeededUsePredicate:
5056     LLVM_DEBUG(
5057         dbgs() << "LV: vector predicate hint/switch found.\n"
5058                << "LV: Not allowing scalar epilogue, creating predicated "
5059                << "vector loop.\n");
5060     break;
5061   case CM_ScalarEpilogueNotAllowedLowTripLoop:
5062     // fallthrough as a special case of OptForSize
5063   case CM_ScalarEpilogueNotAllowedOptSize:
5064     if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedOptSize)
5065       LLVM_DEBUG(
5066           dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n");
5067     else
5068       LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to low trip "
5069                         << "count.\n");
5070 
5071     // Bail if runtime checks are required, which are not good when optimising
5072     // for size.
5073     if (runtimeChecksRequired())
5074       return FixedScalableVFPair::getNone();
5075 
5076     break;
5077   }
5078 
5079   // The only loops we can vectorize without a scalar epilogue are loops with
5080   // a bottom-test and a single exiting block. We'd have to handle the fact
5081   // that not every instruction executes on the last iteration.  This will
5082   // require a lane mask which varies through the vector loop body.  (TODO)
5083   if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) {
5084     // If there was a tail-folding hint/switch, but we can't fold the tail by
5085     // masking, fallback to a vectorization with a scalar epilogue.
5086     if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) {
5087       LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a "
5088                            "scalar epilogue instead.\n");
5089       ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
5090       return computeFeasibleMaxVF(TC, UserVF, false);
5091     }
5092     return FixedScalableVFPair::getNone();
5093   }
5094 
5095   // Now try to fold the tail by masking.
5096 
5097   // Invalidate interleave groups that require an epilogue if we can't mask
5098   // the interleave-group.
5099   if (!useMaskedInterleavedAccesses(TTI)) {
5100     assert(WideningDecisions.empty() && Uniforms.empty() && Scalars.empty() &&
5101            "No decisions should have been taken at this point");
5102     // Note: There is no need to invalidate any cost modeling decisions here,
5103     // as none were taken so far.
5104     InterleaveInfo.invalidateGroupsRequiringScalarEpilogue();
5105   }
5106 
5107   FixedScalableVFPair MaxFactors = computeFeasibleMaxVF(TC, UserVF, true);
5108   // Avoid tail folding if the trip count is known to be a multiple of any VF
5109   // we chose.
5110   // FIXME: The condition below pessimises the case for fixed-width vectors,
5111   // when scalable VFs are also candidates for vectorization.
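       // For example, a known trip count of 64 with MaxFixedVF = 8 (and no user
       // interleave count) leaves no remainder, so tail folding is unnecessary.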
5112   if (MaxFactors.FixedVF.isVector() && !MaxFactors.ScalableVF) {
5113     ElementCount MaxFixedVF = MaxFactors.FixedVF;
5114     assert((UserVF.isNonZero() || isPowerOf2_32(MaxFixedVF.getFixedValue())) &&
5115            "MaxFixedVF must be a power of 2");
5116     unsigned MaxVFtimesIC = UserIC ? MaxFixedVF.getFixedValue() * UserIC
5117                                    : MaxFixedVF.getFixedValue();
5118     ScalarEvolution *SE = PSE.getSE();
5119     const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount();
5120     const SCEV *ExitCount = SE->getAddExpr(
5121         BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType()));
5122     const SCEV *Rem = SE->getURemExpr(
5123         SE->applyLoopGuards(ExitCount, TheLoop),
5124         SE->getConstant(BackedgeTakenCount->getType(), MaxVFtimesIC));
5125     if (Rem->isZero()) {
5126       // Accept MaxFixedVF if we do not have a tail.
5127       LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n");
5128       return MaxFactors;
5129     }
5130   }
5131 
5132   // If we don't know the precise trip count, or if the trip count that we
5133   // found modulo the vectorization factor is not zero, try to fold the tail
5134   // by masking.
5135   // FIXME: look for a smaller MaxVF that does divide TC rather than masking.
5136   if (Legal->prepareToFoldTailByMasking()) {
5137     FoldTailByMasking = true;
5138     return MaxFactors;
5139   }
5140 
5141   // If there was a tail-folding hint/switch, but we can't fold the tail by
5142   // masking, fallback to a vectorization with a scalar epilogue.
5143   if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) {
5144     LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a "
5145                          "scalar epilogue instead.\n");
5146     ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
5147     return MaxFactors;
5148   }
5149 
5150   if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedUsePredicate) {
5151     LLVM_DEBUG(dbgs() << "LV: Can't fold tail by masking: don't vectorize\n");
5152     return FixedScalableVFPair::getNone();
5153   }
5154 
5155   if (TC == 0) {
5156     reportVectorizationFailure(
5157         "Unable to calculate the loop count due to complex control flow",
5158         "unable to calculate the loop count due to complex control flow",
5159         "UnknownLoopCountComplexCFG", ORE, TheLoop);
5160     return FixedScalableVFPair::getNone();
5161   }
5162 
5163   reportVectorizationFailure(
5164       "Cannot optimize for size and vectorize at the same time.",
5165       "cannot optimize for size and vectorize at the same time. "
5166       "Enable vectorization of this loop with '#pragma clang loop "
5167       "vectorize(enable)' when compiling with -Os/-Oz",
5168       "NoTailLoopWithOptForSize", ORE, TheLoop);
5169   return FixedScalableVFPair::getNone();
5170 }
5171 
5172 ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget(
5173     unsigned ConstTripCount, unsigned SmallestType, unsigned WidestType,
5174     ElementCount MaxSafeVF, bool FoldTailByMasking) {
5175   bool ComputeScalableMaxVF = MaxSafeVF.isScalable();
5176   const TypeSize WidestRegister = TTI.getRegisterBitWidth(
5177       ComputeScalableMaxVF ? TargetTransformInfo::RGK_ScalableVector
5178                            : TargetTransformInfo::RGK_FixedWidthVector);
5179 
5180   // Convenience function to return the minimum of two ElementCounts.
5181   auto MinVF = [](const ElementCount &LHS, const ElementCount &RHS) {
5182     assert((LHS.isScalable() == RHS.isScalable()) &&
5183            "Scalable flags must match");
5184     return ElementCount::isKnownLT(LHS, RHS) ? LHS : RHS;
5185   };
5186 
5187   // Ensure MaxVF is a power of 2; the dependence distance bound may not be.
5188   // Note that both WidestRegister and WidestType may not be powers of 2.
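       // E.g. a hypothetical 96-bit widest register with 32-bit elements yields
       // PowerOf2Floor(96 / 32) = 2 lanes.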
5189   auto MaxVectorElementCount = ElementCount::get(
5190       PowerOf2Floor(WidestRegister.getKnownMinValue() / WidestType),
5191       ComputeScalableMaxVF);
5192   MaxVectorElementCount = MinVF(MaxVectorElementCount, MaxSafeVF);
5193   LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: "
5194                     << (MaxVectorElementCount * WidestType) << " bits.\n");
5195 
5196   if (!MaxVectorElementCount) {
5197     LLVM_DEBUG(dbgs() << "LV: The target has no "
5198                       << (ComputeScalableMaxVF ? "scalable" : "fixed")
5199                       << " vector registers.\n");
5200     return ElementCount::getFixed(1);
5201   }
5202 
5203   unsigned WidestRegisterMinEC = MaxVectorElementCount.getKnownMinValue();
5204   if (MaxVectorElementCount.isScalable() &&
5205       TheFunction->hasFnAttribute(Attribute::VScaleRange)) {
5206     auto Attr = TheFunction->getFnAttribute(Attribute::VScaleRange);
5207     auto Min = Attr.getVScaleRangeMin();
5208     WidestRegisterMinEC *= Min;
5209   }
5210   if (ConstTripCount && ConstTripCount <= WidestRegisterMinEC &&
5211       (!FoldTailByMasking || isPowerOf2_32(ConstTripCount))) {
5212     // If loop trip count (TC) is known at compile time there is no point in
5213     // choosing VF greater than TC (as done in the loop below). Select maximum
5214     // power of two which doesn't exceed TC.
5215     // If MaxVectorElementCount is scalable, we only fall back on a fixed VF
5216     // when the TC is less than or equal to the known number of lanes.
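         // E.g. a constant trip count of 5 clamps the VF to PowerOf2Floor(5) = 4.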
5217     auto ClampedConstTripCount = PowerOf2Floor(ConstTripCount);
5218     LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to maximum power of two not "
5219                          "exceeding the constant trip count: "
5220                       << ClampedConstTripCount << "\n");
5221     return ElementCount::getFixed(ClampedConstTripCount);
5222   }
5223 
5224   TargetTransformInfo::RegisterKind RegKind =
5225       ComputeScalableMaxVF ? TargetTransformInfo::RGK_ScalableVector
5226                            : TargetTransformInfo::RGK_FixedWidthVector;
5227   ElementCount MaxVF = MaxVectorElementCount;
5228   if (MaximizeBandwidth || (MaximizeBandwidth.getNumOccurrences() == 0 &&
5229                             TTI.shouldMaximizeVectorBandwidth(RegKind))) {
5230     auto MaxVectorElementCountMaxBW = ElementCount::get(
5231         PowerOf2Floor(WidestRegister.getKnownMinValue() / SmallestType),
5232         ComputeScalableMaxVF);
5233     MaxVectorElementCountMaxBW = MinVF(MaxVectorElementCountMaxBW, MaxSafeVF);
5234 
5235     // Collect all viable vectorization factors larger than the default MaxVF
5236     // (i.e. MaxVectorElementCount).
5237     SmallVector<ElementCount, 8> VFs;
5238     for (ElementCount VS = MaxVectorElementCount * 2;
5239          ElementCount::isKnownLE(VS, MaxVectorElementCountMaxBW); VS *= 2)
5240       VFs.push_back(VS);
5241 
5242     // For each VF calculate its register usage.
5243     auto RUs = calculateRegisterUsage(VFs);
5244 
5245     // Select the largest VF which doesn't require more registers than existing
5246     // ones.
5247     for (int i = RUs.size() - 1; i >= 0; --i) {
5248       bool Selected = true;
5249       for (auto &pair : RUs[i].MaxLocalUsers) {
5250         unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first);
5251         if (pair.second > TargetNumRegisters)
5252           Selected = false;
5253       }
5254       if (Selected) {
5255         MaxVF = VFs[i];
5256         break;
5257       }
5258     }
5259     if (ElementCount MinVF =
5260             TTI.getMinimumVF(SmallestType, ComputeScalableMaxVF)) {
5261       if (ElementCount::isKnownLT(MaxVF, MinVF)) {
5262         LLVM_DEBUG(dbgs() << "LV: Overriding calculated MaxVF(" << MaxVF
5263                           << ") with target's minimum: " << MinVF << '\n');
5264         MaxVF = MinVF;
5265       }
5266     }
5267 
5268     // Invalidate any widening decisions we might have made, in case the loop
5269     // requires prediction (decided later), but we have already made some
5270     // load/store widening decisions.
5271     invalidateCostModelingDecisions();
5272   }
5273   return MaxVF;
5274 }
5275 
5276 std::optional<unsigned> LoopVectorizationCostModel::getVScaleForTuning() const {
5277   if (TheFunction->hasFnAttribute(Attribute::VScaleRange)) {
5278     auto Attr = TheFunction->getFnAttribute(Attribute::VScaleRange);
5279     auto Min = Attr.getVScaleRangeMin();
5280     auto Max = Attr.getVScaleRangeMax();
5281     if (Max && Min == Max)
5282       return Max;
5283   }
5284 
5285   return TTI.getVScaleForTuning();
5286 }
5287 
5288 bool LoopVectorizationCostModel::isMoreProfitable(
5289     const VectorizationFactor &A, const VectorizationFactor &B) const {
5290   InstructionCost CostA = A.Cost;
5291   InstructionCost CostB = B.Cost;
5292 
5293   unsigned MaxTripCount = PSE.getSE()->getSmallConstantMaxTripCount(TheLoop);
5294 
5295   if (!A.Width.isScalable() && !B.Width.isScalable() && FoldTailByMasking &&
5296       MaxTripCount) {
5297     // If we are folding the tail and the trip count is a known (possibly small)
5298     // constant, the trip count will be rounded up to an integer number of
5299     // iterations. The total cost will be PerIterationCost*ceil(TripCount/VF),
5300     // which we compare directly. When not folding the tail, the total cost will
5301     // be PerIterationCost*floor(TC/VF) + Scalar remainder cost, and so is
5302     // approximated with the per-lane cost below instead of using the tripcount
5303     // as here.
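         // For example, with MaxTripCount = 10, a VF=4 plan of cost 12 gives
         // 12 * ceil(10/4) = 36, while a VF=8 plan of cost 20 gives
         // 20 * ceil(10/8) = 40, so the VF=4 plan is preferred here.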
5304     auto RTCostA = CostA * divideCeil(MaxTripCount, A.Width.getFixedValue());
5305     auto RTCostB = CostB * divideCeil(MaxTripCount, B.Width.getFixedValue());
5306     return RTCostA < RTCostB;
5307   }
5308 
5309   // Improve estimate for the vector width if it is scalable.
5310   unsigned EstimatedWidthA = A.Width.getKnownMinValue();
5311   unsigned EstimatedWidthB = B.Width.getKnownMinValue();
5312   if (std::optional<unsigned> VScale = getVScaleForTuning()) {
5313     if (A.Width.isScalable())
5314       EstimatedWidthA *= *VScale;
5315     if (B.Width.isScalable())
5316       EstimatedWidthB *= *VScale;
5317   }
5318 
5319   // Assume vscale may be larger than 1 (or the value being tuned for),
5320   // so that scalable vectorization is slightly favorable over fixed-width
5321   // vectorization.
5322   if (A.Width.isScalable() && !B.Width.isScalable())
5323     return (CostA * B.Width.getFixedValue()) <= (CostB * EstimatedWidthA);
5324 
5325   // To avoid the need for FP division:
5326   //      (CostA / A.Width) < (CostB / B.Width)
5327   // <=>  (CostA * B.Width) < (CostB * A.Width)
5328   return (CostA * EstimatedWidthB) < (CostB * EstimatedWidthA);
5329 }
5330 
5331 VectorizationFactor LoopVectorizationCostModel::selectVectorizationFactor(
5332     const ElementCountSet &VFCandidates) {
5333   InstructionCost ExpectedCost = expectedCost(ElementCount::getFixed(1)).first;
5334   LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << ExpectedCost << ".\n");
5335   assert(ExpectedCost.isValid() && "Unexpected invalid cost for scalar loop");
5336   assert(VFCandidates.count(ElementCount::getFixed(1)) &&
5337          "Expected Scalar VF to be a candidate");
5338 
5339   const VectorizationFactor ScalarCost(ElementCount::getFixed(1), ExpectedCost,
5340                                        ExpectedCost);
5341   VectorizationFactor ChosenFactor = ScalarCost;
5342 
5343   bool ForceVectorization = Hints->getForce() == LoopVectorizeHints::FK_Enabled;
5344   if (ForceVectorization && VFCandidates.size() > 1) {
5345     // Ignore scalar width, because the user explicitly wants vectorization.
5346     // Initialize cost to max so that VF = 2 is, at least, chosen during cost
5347     // evaluation.
5348     ChosenFactor.Cost = InstructionCost::getMax();
5349   }
5350 
5351   SmallVector<InstructionVFPair> InvalidCosts;
5352   for (const auto &i : VFCandidates) {
5353     // The cost for scalar VF=1 is already calculated, so ignore it.
5354     if (i.isScalar())
5355       continue;
5356 
5357     VectorizationCostTy C = expectedCost(i, &InvalidCosts);
5358     VectorizationFactor Candidate(i, C.first, ScalarCost.ScalarCost);
5359 
5360 #ifndef NDEBUG
5361     unsigned AssumedMinimumVscale = 1;
5362     if (std::optional<unsigned> VScale = getVScaleForTuning())
5363       AssumedMinimumVscale = *VScale;
5364     unsigned Width =
5365         Candidate.Width.isScalable()
5366             ? Candidate.Width.getKnownMinValue() * AssumedMinimumVscale
5367             : Candidate.Width.getFixedValue();
5368     LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << i
5369                       << " costs: " << (Candidate.Cost / Width));
5370     if (i.isScalable())
5371       LLVM_DEBUG(dbgs() << " (assuming a minimum vscale of "
5372                         << AssumedMinimumVscale << ")");
5373     LLVM_DEBUG(dbgs() << ".\n");
5374 #endif
5375 
5376     if (!C.second && !ForceVectorization) {
5377       LLVM_DEBUG(
5378           dbgs() << "LV: Not considering vector loop of width " << i
5379                  << " because it will not generate any vector instructions.\n");
5380       continue;
5381     }
5382 
5383     // If profitable add it to ProfitableVF list.
5384     if (isMoreProfitable(Candidate, ScalarCost))
5385       ProfitableVFs.push_back(Candidate);
5386 
5387     if (isMoreProfitable(Candidate, ChosenFactor))
5388       ChosenFactor = Candidate;
5389   }
5390 
5391   // Emit a report of VFs with invalid costs in the loop.
5392   if (!InvalidCosts.empty()) {
5393     // Group the remarks per instruction, keeping the instruction order from
5394     // InvalidCosts.
5395     std::map<Instruction *, unsigned> Numbering;
5396     unsigned I = 0;
5397     for (auto &Pair : InvalidCosts)
5398       if (!Numbering.count(Pair.first))
5399         Numbering[Pair.first] = I++;
5400 
5401     // Sort the list, first on instruction(number) then on VF.
5402     llvm::sort(InvalidCosts,
5403                [&Numbering](InstructionVFPair &A, InstructionVFPair &B) {
5404                  if (Numbering[A.first] != Numbering[B.first])
5405                    return Numbering[A.first] < Numbering[B.first];
5406                  ElementCountComparator ECC;
5407                  return ECC(A.second, B.second);
5408                });
5409 
5410     // For a list of ordered instruction-vf pairs:
5411     //   [(load, vf1), (load, vf2), (store, vf1)]
5412     // Group the instructions together to emit separate remarks for:
5413     //   load  (vf1, vf2)
5414     //   store (vf1)
5415     auto Tail = ArrayRef<InstructionVFPair>(InvalidCosts);
5416     auto Subset = ArrayRef<InstructionVFPair>();
5417     do {
5418       if (Subset.empty())
5419         Subset = Tail.take_front(1);
5420 
5421       Instruction *I = Subset.front().first;
5422 
5423       // If the next instruction is different, or if there are no other pairs,
5424       // emit a remark for the collated subset. e.g.
5425       //   [(load, vf1), (load, vf2)]
5426       // to emit:
5427       //  remark: invalid costs for 'load' at VF=(vf1, vf2)
5428       if (Subset == Tail || Tail[Subset.size()].first != I) {
5429         std::string OutString;
5430         raw_string_ostream OS(OutString);
5431         assert(!Subset.empty() && "Unexpected empty range");
5432         OS << "Instruction with invalid costs prevented vectorization at VF=(";
5433         for (const auto &Pair : Subset)
5434           OS << (Pair.second == Subset.front().second ? "" : ", ")
5435              << Pair.second;
5436         OS << "):";
5437         if (auto *CI = dyn_cast<CallInst>(I))
5438           OS << " call to " << CI->getCalledFunction()->getName();
5439         else
5440           OS << " " << I->getOpcodeName();
5441         OS.flush();
5442         reportVectorizationInfo(OutString, "InvalidCost", ORE, TheLoop, I);
5443         Tail = Tail.drop_front(Subset.size());
5444         Subset = {};
5445       } else
5446         // Grow the subset by one element
5447         Subset = Tail.take_front(Subset.size() + 1);
5448     } while (!Tail.empty());
5449   }
5450 
5451   if (!EnableCondStoresVectorization && NumPredStores) {
5452     reportVectorizationFailure("There are conditional stores.",
5453         "store that is conditionally executed prevents vectorization",
5454         "ConditionalStore", ORE, TheLoop);
5455     ChosenFactor = ScalarCost;
5456   }
5457 
5458   LLVM_DEBUG(if (ForceVectorization && !ChosenFactor.Width.isScalar() &&
5459                  !isMoreProfitable(ChosenFactor, ScalarCost)) dbgs()
5460              << "LV: Vectorization seems to be not beneficial, "
5461              << "but was forced by a user.\n");
5462   LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << ChosenFactor.Width << ".\n");
5463   return ChosenFactor;
5464 }
5465 
5466 bool LoopVectorizationCostModel::isCandidateForEpilogueVectorization(
5467     const Loop &L, ElementCount VF) const {
5468   // Cross iteration phis such as reductions need special handling and are
5469   // currently unsupported.
5470   if (any_of(L.getHeader()->phis(),
5471              [&](PHINode &Phi) { return Legal->isFixedOrderRecurrence(&Phi); }))
5472     return false;
5473 
5474   // Phis with uses outside of the loop require special handling and are
5475   // currently unsupported.
5476   for (const auto &Entry : Legal->getInductionVars()) {
5477     // Look for uses of the value of the induction at the last iteration.
5478     Value *PostInc = Entry.first->getIncomingValueForBlock(L.getLoopLatch());
5479     for (User *U : PostInc->users())
5480       if (!L.contains(cast<Instruction>(U)))
5481         return false;
5482     // Look for uses of penultimate value of the induction.
5483     for (User *U : Entry.first->users())
5484       if (!L.contains(cast<Instruction>(U)))
5485         return false;
5486   }
5487 
5488   // Epilogue vectorization code has not been audited to ensure it handles
5489   // non-latch exits properly. It may be fine, but it needs to be audited and
5490   // tested.
5491   if (L.getExitingBlock() != L.getLoopLatch())
5492     return false;
5493 
5494   return true;
5495 }
5496 
5497 bool LoopVectorizationCostModel::isEpilogueVectorizationProfitable(
5498     const ElementCount VF) const {
5499   // FIXME: We need a much better cost-model to take different parameters such
5500   // as register pressure, code size increase and cost of extra branches into
5501   // account. For now we apply a very crude heuristic and only consider loops
5502   // with vectorization factors larger than a certain value.
5503 
5504   // Allow the target to opt out entirely.
5505   if (!TTI.preferEpilogueVectorization())
5506     return false;
5507 
5508   // We also consider epilogue vectorization unprofitable for targets that don't
5509   // consider interleaving beneficial (e.g. MVE).
5510   if (TTI.getMaxInterleaveFactor(VF.getKnownMinValue()) <= 1)
5511     return false;
5512   // FIXME: We should consider changing the threshold for scalable
5513   // vectors to take VScaleForTuning into account.
5514   if (VF.getKnownMinValue() >= EpilogueVectorizationMinVF)
5515     return true;
5516   return false;
5517 }
5518 
5519 VectorizationFactor
5520 LoopVectorizationCostModel::selectEpilogueVectorizationFactor(
5521     const ElementCount MainLoopVF, const LoopVectorizationPlanner &LVP) {
5522   VectorizationFactor Result = VectorizationFactor::Disabled();
5523   if (!EnableEpilogueVectorization) {
5524     LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is disabled.\n";);
5525     return Result;
5526   }
5527 
5528   if (!isScalarEpilogueAllowed()) {
5529     LLVM_DEBUG(
5530         dbgs() << "LEV: Unable to vectorize epilogue because no epilogue is "
5531                   "allowed.\n";);
5532     return Result;
5533   }
5534 
5535   // Not really a cost consideration, but check for unsupported cases here to
5536   // simplify the logic.
5537   if (!isCandidateForEpilogueVectorization(*TheLoop, MainLoopVF)) {
5538     LLVM_DEBUG(
5539         dbgs() << "LEV: Unable to vectorize epilogue because the loop is "
5540                   "not a supported candidate.\n";);
5541     return Result;
5542   }
5543 
5544   if (EpilogueVectorizationForceVF > 1) {
5545     LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization factor is forced.\n";);
5546     ElementCount ForcedEC = ElementCount::getFixed(EpilogueVectorizationForceVF);
5547     if (LVP.hasPlanWithVF(ForcedEC))
5548       return {ForcedEC, 0, 0};
5549     else {
5550       LLVM_DEBUG(
5551           dbgs()
5552               << "LEV: Epilogue vectorization forced factor is not viable.\n";);
5553       return Result;
5554     }
5555   }
5556 
5557   if (TheLoop->getHeader()->getParent()->hasOptSize() ||
5558       TheLoop->getHeader()->getParent()->hasMinSize()) {
5559     LLVM_DEBUG(
5560         dbgs()
5561             << "LEV: Epilogue vectorization skipped due to opt for size.\n";);
5562     return Result;
5563   }
5564 
5565   if (!isEpilogueVectorizationProfitable(MainLoopVF)) {
5566     LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is not profitable for "
5567                          "this loop\n");
5568     return Result;
5569   }
5570 
5571   // If MainLoopVF = vscale x 2, and vscale is expected to be 4, then we know
5572   // the main loop handles 8 lanes per iteration. We could still benefit from
5573   // vectorizing the epilogue loop with VF=4.
5574   ElementCount EstimatedRuntimeVF = MainLoopVF;
5575   if (MainLoopVF.isScalable()) {
5576     EstimatedRuntimeVF = ElementCount::getFixed(MainLoopVF.getKnownMinValue());
5577     if (std::optional<unsigned> VScale = getVScaleForTuning())
5578       EstimatedRuntimeVF *= *VScale;
5579   }
5580 
5581   for (auto &NextVF : ProfitableVFs)
5582     if (((!NextVF.Width.isScalable() && MainLoopVF.isScalable() &&
5583           ElementCount::isKnownLT(NextVF.Width, EstimatedRuntimeVF)) ||
5584          ElementCount::isKnownLT(NextVF.Width, MainLoopVF)) &&
5585         (Result.Width.isScalar() || isMoreProfitable(NextVF, Result)) &&
5586         LVP.hasPlanWithVF(NextVF.Width))
5587       Result = NextVF;
5588 
5589   if (Result != VectorizationFactor::Disabled())
5590     LLVM_DEBUG(dbgs() << "LEV: Vectorizing epilogue loop with VF = "
5591                       << Result.Width << "\n";);
5592   return Result;
5593 }
5594 
5595 std::pair<unsigned, unsigned>
5596 LoopVectorizationCostModel::getSmallestAndWidestTypes() {
5597   unsigned MinWidth = -1U;
5598   unsigned MaxWidth = 8;
5599   const DataLayout &DL = TheFunction->getParent()->getDataLayout();
5600   // For in-loop reductions, no element types are added to ElementTypesInLoop
5601   // if there are no loads/stores in the loop. In this case, check through the
5602   // reduction variables to determine the maximum width.
5603   if (ElementTypesInLoop.empty() && !Legal->getReductionVars().empty()) {
5604     // Reset MaxWidth so that we can find the smallest type used by recurrences
5605     // in the loop.
5606     MaxWidth = -1U;
5607     for (const auto &PhiDescriptorPair : Legal->getReductionVars()) {
5608       const RecurrenceDescriptor &RdxDesc = PhiDescriptorPair.second;
5609       // When finding the min width used by the recurrence we need to account
5610       // for casts on the input operands of the recurrence.
5611       MaxWidth = std::min<unsigned>(
5612           MaxWidth, std::min<unsigned>(
5613                         RdxDesc.getMinWidthCastToRecurrenceTypeInBits(),
5614                         RdxDesc.getRecurrenceType()->getScalarSizeInBits()));
5615     }
5616   } else {
5617     for (Type *T : ElementTypesInLoop) {
5618       MinWidth = std::min<unsigned>(
5619           MinWidth, DL.getTypeSizeInBits(T->getScalarType()).getFixedValue());
5620       MaxWidth = std::max<unsigned>(
5621           MaxWidth, DL.getTypeSizeInBits(T->getScalarType()).getFixedValue());
5622     }
5623   }
5624   return {MinWidth, MaxWidth};
5625 }
5626 
5627 void LoopVectorizationCostModel::collectElementTypesForWidening() {
5628   ElementTypesInLoop.clear();
5629   // For each block.
5630   for (BasicBlock *BB : TheLoop->blocks()) {
5631     // For each instruction in the loop.
5632     for (Instruction &I : BB->instructionsWithoutDebug()) {
5633       Type *T = I.getType();
5634 
5635       // Skip ignored values.
5636       if (ValuesToIgnore.count(&I))
5637         continue;
5638 
5639       // Only examine Loads, Stores and PHINodes.
5640       if (!isa<LoadInst>(I) && !isa<StoreInst>(I) && !isa<PHINode>(I))
5641         continue;
5642 
5643       // Examine PHI nodes that are reduction variables. Update the type to
5644       // account for the recurrence type.
5645       if (auto *PN = dyn_cast<PHINode>(&I)) {
5646         if (!Legal->isReductionVariable(PN))
5647           continue;
5648         const RecurrenceDescriptor &RdxDesc =
5649             Legal->getReductionVars().find(PN)->second;
5650         if (PreferInLoopReductions || useOrderedReductions(RdxDesc) ||
5651             TTI.preferInLoopReduction(RdxDesc.getOpcode(),
5652                                       RdxDesc.getRecurrenceType(),
5653                                       TargetTransformInfo::ReductionFlags()))
5654           continue;
5655         T = RdxDesc.getRecurrenceType();
5656       }
5657 
5658       // Examine the stored values.
5659       if (auto *ST = dyn_cast<StoreInst>(&I))
5660         T = ST->getValueOperand()->getType();
5661 
5662       assert(T->isSized() &&
5663              "Expected the load/store/recurrence type to be sized");
5664 
5665       ElementTypesInLoop.insert(T);
5666     }
5667   }
5668 }
5669 
5670 unsigned
5671 LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF,
5672                                                   InstructionCost LoopCost) {
5673   // -- The interleave heuristics --
5674   // We interleave the loop in order to expose ILP and reduce the loop overhead.
5675   // There are many micro-architectural considerations that we can't predict
5676   // at this level. For example, frontend pressure (on decode or fetch) due to
5677   // code size, or the number and capabilities of the execution ports.
5678   //
5679   // We use the following heuristics to select the interleave count:
5680   // 1. If the code has reductions, then we interleave to break the cross
5681   // iteration dependency.
5682   // 2. If the loop is really small, then we interleave to reduce the loop
5683   // overhead.
5684   // 3. We don't interleave if we think that we will spill registers to memory
5685   // due to the increased register pressure.
5686 
5687   if (!isScalarEpilogueAllowed())
5688     return 1;
5689 
5690   // The max safe dependence distance already limits the width; don't interleave.
5691   if (Legal->getMaxSafeDepDistBytes() != -1U)
5692     return 1;
5693 
5694   auto BestKnownTC = getSmallBestKnownTC(*PSE.getSE(), TheLoop);
5695   const bool HasReductions = !Legal->getReductionVars().empty();
5696   // Do not interleave loops with a relatively small known or estimated trip
5697   // count. But we will interleave when InterleaveSmallLoopScalarReduction is
5698   // enabled, and the code has scalar reductions (HasReductions && VF == 1),
5699   // because with the above conditions interleaving can expose ILP and break
5700   // cross-iteration dependences for reductions.
5701   if (BestKnownTC && (*BestKnownTC < TinyTripCountInterleaveThreshold) &&
5702       !(InterleaveSmallLoopScalarReduction && HasReductions && VF.isScalar()))
5703     return 1;
5704 
5705   // If we did not calculate the cost for VF (because the user selected the VF)
5706   // then we calculate the cost of VF here.
5707   if (LoopCost == 0) {
5708     LoopCost = expectedCost(VF).first;
5709     assert(LoopCost.isValid() && "Expected to have chosen a VF with valid cost");
5710 
5711     // Loop body is free and there is no need for interleaving.
5712     if (LoopCost == 0)
5713       return 1;
5714   }
5715 
5716   RegisterUsage R = calculateRegisterUsage({VF})[0];
5717   // We divide by these register counts below, so clamp each class to at least
5718   // one user to avoid dividing by zero.
5719   for (auto& pair : R.MaxLocalUsers) {
5720     pair.second = std::max(pair.second, 1U);
5721   }
5722 
5723   // We calculate the interleave count using the following formula.
5724   // Subtract the number of loop invariants from the number of available
5725   // registers. These registers are used by all of the interleaved instances.
5726   // Next, divide the remaining registers by the number of registers that is
5727   // required by the loop, in order to estimate how many parallel instances
5728   // fit without causing spills. All of this is rounded down if necessary to be
5729   // a power of two. We want power of two interleave count to simplify any
5730   // addressing operations or alignment considerations.
5731   // We also want power of two interleave counts to ensure that the induction
5732   // variable of the vector loop wraps to zero, when tail is folded by masking;
5733   // this currently happens when OptForSize, in which case IC is set to 1 above.
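       // For example, with 32 available registers, 2 loop-invariant values and a
       // maximum of 6 registers live in the loop body, this gives
       // PowerOf2Floor((32 - 2) / 6) = 4 interleaved instances (before the
       // induction-variable adjustment below).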
5734   unsigned IC = UINT_MAX;
5735 
5736   for (auto& pair : R.MaxLocalUsers) {
5737     unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first);
5738     LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters
5739                       << " registers of "
5740                       << TTI.getRegisterClassName(pair.first) << " register class\n");
5741     if (VF.isScalar()) {
5742       if (ForceTargetNumScalarRegs.getNumOccurrences() > 0)
5743         TargetNumRegisters = ForceTargetNumScalarRegs;
5744     } else {
5745       if (ForceTargetNumVectorRegs.getNumOccurrences() > 0)
5746         TargetNumRegisters = ForceTargetNumVectorRegs;
5747     }
5748     unsigned MaxLocalUsers = pair.second;
5749     unsigned LoopInvariantRegs = 0;
5750     if (R.LoopInvariantRegs.find(pair.first) != R.LoopInvariantRegs.end())
5751       LoopInvariantRegs = R.LoopInvariantRegs[pair.first];
5752 
5753     unsigned TmpIC = PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs) / MaxLocalUsers);
5754     // Don't count the induction variable as interleaved.
5755     if (EnableIndVarRegisterHeur) {
5756       TmpIC =
5757           PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs - 1) /
5758                         std::max(1U, (MaxLocalUsers - 1)));
5759     }
5760 
5761     IC = std::min(IC, TmpIC);
5762   }
5763 
5764   // Clamp the interleave ranges to reasonable counts.
5765   unsigned MaxInterleaveCount =
5766       TTI.getMaxInterleaveFactor(VF.getKnownMinValue());
5767 
5768   // Check if the user has overridden the max.
5769   if (VF.isScalar()) {
5770     if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0)
5771       MaxInterleaveCount = ForceTargetMaxScalarInterleaveFactor;
5772   } else {
5773     if (ForceTargetMaxVectorInterleaveFactor.getNumOccurrences() > 0)
5774       MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor;
5775   }
5776 
5777   // If trip count is known or estimated compile time constant, limit the
5778   // interleave count to be less than the trip count divided by VF, provided it
5779   // is at least 1.
5780   //
5781   // For scalable vectors we can't know if interleaving is beneficial. It may
5782   // not be beneficial for small loops if none of the lanes in the second vector
5783   // iteration is enabled. However, for larger loops, there is likely to be a
5784   // similar benefit as for fixed-width vectors. For now, we choose to leave
5785   // the InterleaveCount as if vscale is '1', although if some information about
5786   // the vector is known (e.g. min vector size), we can make a better decision.
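  // As an illustrative example of the clamp below (hypothetical numbers): with
  // an estimated trip count of 48 and a VF whose known minimum is 8, the
  // interleave count is limited to min(48 / 8, MaxInterleaveCount).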
5787   if (BestKnownTC) {
5788     MaxInterleaveCount =
5789         std::min(*BestKnownTC / VF.getKnownMinValue(), MaxInterleaveCount);
5790     // Make sure MaxInterleaveCount is greater than 0.
5791     MaxInterleaveCount = std::max(1u, MaxInterleaveCount);
5792   }
5793 
5794   assert(MaxInterleaveCount > 0 &&
5795          "Maximum interleave count must be greater than 0");
5796 
5797   // Clamp the calculated IC to be between 1 and the max interleave count
5798   // that the target and trip count allow.
5799   if (IC > MaxInterleaveCount)
5800     IC = MaxInterleaveCount;
5801   else
5802     // Make sure IC is greater than 0.
5803     IC = std::max(1u, IC);
5804 
5805   assert(IC > 0 && "Interleave count must be greater than 0.");
5806 
5807   // Interleave if we vectorized this loop and there is a reduction that could
5808   // benefit from interleaving.
5809   if (VF.isVector() && HasReductions) {
5810     LLVM_DEBUG(dbgs() << "LV: Interleaving because of reductions.\n");
5811     return IC;
5812   }
5813 
5814   // For any scalar loop that either requires runtime checks or predication we
5815   // are better off leaving this to the unroller. Note that if we've already
5816   // vectorized the loop we will have done the runtime check and so interleaving
5817   // won't require further checks.
5818   bool ScalarInterleavingRequiresPredication =
5819       (VF.isScalar() && any_of(TheLoop->blocks(), [this](BasicBlock *BB) {
5820          return Legal->blockNeedsPredication(BB);
5821        }));
5822   bool ScalarInterleavingRequiresRuntimePointerCheck =
5823       (VF.isScalar() && Legal->getRuntimePointerChecking()->Need);
5824 
5825   // We want to interleave small loops in order to reduce the loop overhead and
5826   // potentially expose ILP opportunities.
5827   LLVM_DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n'
5828                     << "LV: IC is " << IC << '\n'
5829                     << "LV: VF is " << VF << '\n');
5830   const bool AggressivelyInterleaveReductions =
5831       TTI.enableAggressiveInterleaving(HasReductions);
5832   if (!ScalarInterleavingRequiresRuntimePointerCheck &&
5833       !ScalarInterleavingRequiresPredication && LoopCost < SmallLoopCost) {
5834     // We assume that the cost overhead is 1 and we use the cost model
5835     // to estimate the cost of the loop and interleave until the cost of the
5836     // loop overhead is about 5% of the cost of the loop.
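    // For instance (illustrative numbers only): if SmallLoopCost were 20 and
    // the computed loop cost 6, SmallIC would become
    // min(IC, PowerOf2Floor(20 / 6)) = min(IC, 2).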
5837     unsigned SmallIC = std::min(
5838         IC, (unsigned)PowerOf2Floor(SmallLoopCost / *LoopCost.getValue()));
5839 
5840     // Interleave until store/load ports (estimated by max interleave count) are
5841     // saturated.
5842     unsigned NumStores = Legal->getNumStores();
5843     unsigned NumLoads = Legal->getNumLoads();
5844     unsigned StoresIC = IC / (NumStores ? NumStores : 1);
5845     unsigned LoadsIC = IC / (NumLoads ? NumLoads : 1);
5846 
5847     // There is little point in interleaving for reductions containing selects
5848     // and compares when VF=1 since it may just create more overhead than it's
5849     // worth for loops with small trip counts. This is because we still have to
5850     // do the final reduction after the loop.
5851     bool HasSelectCmpReductions =
5852         HasReductions &&
5853         any_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool {
5854           const RecurrenceDescriptor &RdxDesc = Reduction.second;
5855           return RecurrenceDescriptor::isSelectCmpRecurrenceKind(
5856               RdxDesc.getRecurrenceKind());
5857         });
5858     if (HasSelectCmpReductions) {
5859       LLVM_DEBUG(dbgs() << "LV: Not interleaving select-cmp reductions.\n");
5860       return 1;
5861     }
5862 
5863     // If we have a scalar reduction (vector reductions are already dealt with
5864     // by this point), we can increase the critical path length if the loop
5865     // we're interleaving is inside another loop. For tree-wise reductions
5866     // set the limit to 2, and for ordered reductions it's best to disable
5867     // interleaving entirely.
5868     if (HasReductions && TheLoop->getLoopDepth() > 1) {
5869       bool HasOrderedReductions =
5870           any_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool {
5871             const RecurrenceDescriptor &RdxDesc = Reduction.second;
5872             return RdxDesc.isOrdered();
5873           });
5874       if (HasOrderedReductions) {
5875         LLVM_DEBUG(
5876             dbgs() << "LV: Not interleaving scalar ordered reductions.\n");
5877         return 1;
5878       }
5879 
5880       unsigned F = static_cast<unsigned>(MaxNestedScalarReductionIC);
5881       SmallIC = std::min(SmallIC, F);
5882       StoresIC = std::min(StoresIC, F);
5883       LoadsIC = std::min(LoadsIC, F);
5884     }
5885 
5886     if (EnableLoadStoreRuntimeInterleave &&
5887         std::max(StoresIC, LoadsIC) > SmallIC) {
5888       LLVM_DEBUG(
5889           dbgs() << "LV: Interleaving to saturate store or load ports.\n");
5890       return std::max(StoresIC, LoadsIC);
5891     }
5892 
5893     // If there are scalar reductions and TTI has enabled aggressive
5894     // interleaving for reductions, we will interleave to expose ILP.
5895     if (InterleaveSmallLoopScalarReduction && VF.isScalar() &&
5896         AggressivelyInterleaveReductions) {
5897       LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
5898       // Interleave no less than SmallIC but not as aggressive as the normal IC
5899       // to satisfy the rare situation when resources are too limited.
5900       return std::max(IC / 2, SmallIC);
5901     } else {
5902       LLVM_DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n");
5903       return SmallIC;
5904     }
5905   }
5906 
5907   // Interleave if this is a large loop (small loops are already dealt with by
5908   // this point) that could benefit from interleaving.
5909   if (AggressivelyInterleaveReductions) {
5910     LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
5911     return IC;
5912   }
5913 
5914   LLVM_DEBUG(dbgs() << "LV: Not Interleaving.\n");
5915   return 1;
5916 }
5917 
5918 SmallVector<LoopVectorizationCostModel::RegisterUsage, 8>
5919 LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<ElementCount> VFs) {
5920   // This function calculates the register usage by measuring the highest number
5921   // of values that are alive at a single location. Obviously, this is a very
5922   // rough estimation. We scan the loop in a topological order in order to
5923   // assign a number to each instruction. We use RPO to ensure that defs are
5924   // met before their users. We assume that each instruction that has in-loop
5925   // users starts an interval. We record every time that an in-loop value is
5926   // used, so we have a list of the first and last occurrences of each
5927   // instruction. Next, we transpose this data structure into a multi map that
5928   // holds the list of intervals that *end* at a specific location. This multi
5929   // map allows us to perform a linear search. We scan the instructions linearly
5930   // and record each time that a new interval starts, by placing it in a set.
5931   // If we find this value in the multi-map then we remove it from the set.
5932   // The max register usage is the maximum size of the set.
5933   // We also search for instructions that are defined outside the loop, but are
5934   // used inside the loop. We need this number separately from the max-interval
5935   // usage number because when we unroll, loop-invariant values do not take
5936   // more registers.
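  // Informally (a rough sketch of the bookkeeping below, not its exact
  // mechanics): each in-loop value is considered live from its definition to
  // its last in-loop use, the reported usage for a register class is the
  // largest number of such live ranges that overlap at any point, and
  // loop-invariant inputs are accounted for separately in LoopInvariantRegs.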
5937   LoopBlocksDFS DFS(TheLoop);
5938   DFS.perform(LI);
5939 
5940   RegisterUsage RU;
5941 
5942   // Each 'key' in the map opens a new interval. The values
5943   // of the map are the index of the 'last seen' usage of the
5944   // instruction that is the key.
5945   using IntervalMap = DenseMap<Instruction *, unsigned>;
5946 
5947   // Maps instruction to its index.
5948   SmallVector<Instruction *, 64> IdxToInstr;
5949   // Marks the end of each interval.
5950   IntervalMap EndPoint;
5951   // Saves the list of instructions that are used in the loop.
5952   SmallPtrSet<Instruction *, 8> Ends;
5953   // Saves the list of values that are used in the loop but are defined outside
5954   // the loop (not including non-instruction values such as arguments and
5955   // constants).
5956   SmallPtrSet<Instruction *, 8> LoopInvariants;
5957 
5958   for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
5959     for (Instruction &I : BB->instructionsWithoutDebug()) {
5960       IdxToInstr.push_back(&I);
5961 
5962       // Save the end location of each USE.
5963       for (Value *U : I.operands()) {
5964         auto *Instr = dyn_cast<Instruction>(U);
5965 
5966         // Ignore non-instruction values such as arguments, constants, etc.
5967         // FIXME: Might need some motivation why these values are ignored. If
5968         // for example an argument is used inside the loop it will increase the
5969         // register pressure (so shouldn't we add it to LoopInvariants).
5970         if (!Instr)
5971           continue;
5972 
5973         // If this instruction is outside the loop then record it and continue.
5974         if (!TheLoop->contains(Instr)) {
5975           LoopInvariants.insert(Instr);
5976           continue;
5977         }
5978 
5979         // Overwrite previous end points.
5980         EndPoint[Instr] = IdxToInstr.size();
5981         Ends.insert(Instr);
5982       }
5983     }
5984   }
5985 
5986   // Saves the list of intervals that end with the index in 'key'.
5987   using InstrList = SmallVector<Instruction *, 2>;
5988   DenseMap<unsigned, InstrList> TransposeEnds;
5989 
5990   // Transpose the EndPoints to a list of values that end at each index.
5991   for (auto &Interval : EndPoint)
5992     TransposeEnds[Interval.second].push_back(Interval.first);
5993 
5994   SmallPtrSet<Instruction *, 8> OpenIntervals;
5995   SmallVector<RegisterUsage, 8> RUs(VFs.size());
5996   SmallVector<SmallMapVector<unsigned, unsigned, 4>, 8> MaxUsages(VFs.size());
5997 
5998   LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n");
5999 
6000   const auto &TTICapture = TTI;
6001   auto GetRegUsage = [&TTICapture](Type *Ty, ElementCount VF) -> unsigned {
6002     if (Ty->isTokenTy() || !VectorType::isValidElementType(Ty))
6003       return 0;
6004     return TTICapture.getRegUsageForType(VectorType::get(Ty, VF));
6005   };
6006 
6007   for (unsigned int i = 0, s = IdxToInstr.size(); i < s; ++i) {
6008     Instruction *I = IdxToInstr[i];
6009 
6010     // Remove all of the instructions that end at this location.
6011     InstrList &List = TransposeEnds[i];
6012     for (Instruction *ToRemove : List)
6013       OpenIntervals.erase(ToRemove);
6014 
6015     // Ignore instructions that are never used within the loop.
6016     if (!Ends.count(I))
6017       continue;
6018 
6019     // Skip ignored values.
6020     if (ValuesToIgnore.count(I))
6021       continue;
6022 
6023     // For each VF find the maximum usage of registers.
6024     for (unsigned j = 0, e = VFs.size(); j < e; ++j) {
6025       // Count the number of registers used, per register class, given all open
6026       // intervals.
6027       // Note that elements in this SmallMapVector will be default constructed
6028       // as 0. So we can use "RegUsage[ClassID] += n" in the code below even if
6029       // there is no previous entry for ClassID.
6030       SmallMapVector<unsigned, unsigned, 4> RegUsage;
6031 
6032       if (VFs[j].isScalar()) {
6033         for (auto *Inst : OpenIntervals) {
6034           unsigned ClassID =
6035               TTI.getRegisterClassForType(false, Inst->getType());
6036           // FIXME: The target might use more than one register for the type
6037           // even in the scalar case.
6038           RegUsage[ClassID] += 1;
6039         }
6040       } else {
6041         collectUniformsAndScalars(VFs[j]);
6042         for (auto *Inst : OpenIntervals) {
6043           // Skip ignored values for VF > 1.
6044           if (VecValuesToIgnore.count(Inst))
6045             continue;
6046           if (isScalarAfterVectorization(Inst, VFs[j])) {
6047             unsigned ClassID =
6048                 TTI.getRegisterClassForType(false, Inst->getType());
6049             // FIXME: The target might use more than one register for the type
6050             // even in the scalar case.
6051             RegUsage[ClassID] += 1;
6052           } else {
6053             unsigned ClassID =
6054                 TTI.getRegisterClassForType(true, Inst->getType());
6055             RegUsage[ClassID] += GetRegUsage(Inst->getType(), VFs[j]);
6056           }
6057         }
6058       }
6059 
6060       for (auto& pair : RegUsage) {
6061         auto &Entry = MaxUsages[j][pair.first];
6062         Entry = std::max(Entry, pair.second);
6063       }
6064     }
6065 
6066     LLVM_DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # "
6067                       << OpenIntervals.size() << '\n');
6068 
6069     // Add the current instruction to the list of open intervals.
6070     OpenIntervals.insert(I);
6071   }
6072 
6073   for (unsigned i = 0, e = VFs.size(); i < e; ++i) {
6074     // Note that elements in this SmallMapVector will be default constructed
6075     // as 0. So we can use "Invariant[ClassID] += n" in the code below even if
6076     // there is no previous entry for ClassID.
6077     SmallMapVector<unsigned, unsigned, 4> Invariant;
6078 
6079     for (auto *Inst : LoopInvariants) {
6080       // FIXME: The target might use more than one register for the type
6081       // even in the scalar case.
6082       bool IsScalar = all_of(Inst->users(), [&](User *U) {
6083         auto *I = cast<Instruction>(U);
6084         return TheLoop != LI->getLoopFor(I->getParent()) ||
6085                isScalarAfterVectorization(I, VFs[i]);
6086       });
6087 
6088       ElementCount VF = IsScalar ? ElementCount::getFixed(1) : VFs[i];
6089       unsigned ClassID =
6090           TTI.getRegisterClassForType(VF.isVector(), Inst->getType());
6091       Invariant[ClassID] += GetRegUsage(Inst->getType(), VF);
6092     }
6093 
6094     LLVM_DEBUG({
6095       dbgs() << "LV(REG): VF = " << VFs[i] << '\n';
6096       dbgs() << "LV(REG): Found max usage: " << MaxUsages[i].size()
6097              << " item\n";
6098       for (const auto &pair : MaxUsages[i]) {
6099         dbgs() << "LV(REG): RegisterClass: "
6100                << TTI.getRegisterClassName(pair.first) << ", " << pair.second
6101                << " registers\n";
6102       }
6103       dbgs() << "LV(REG): Found invariant usage: " << Invariant.size()
6104              << " item\n";
6105       for (const auto &pair : Invariant) {
6106         dbgs() << "LV(REG): RegisterClass: "
6107                << TTI.getRegisterClassName(pair.first) << ", " << pair.second
6108                << " registers\n";
6109       }
6110     });
6111 
6112     RU.LoopInvariantRegs = Invariant;
6113     RU.MaxLocalUsers = MaxUsages[i];
6114     RUs[i] = RU;
6115   }
6116 
6117   return RUs;
6118 }
6119 
6120 bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I,
6121                                                            ElementCount VF) {
6122   // TODO: Cost model for emulated masked load/store is completely
6123   // broken. This hack guides the cost model to use an artificially
6124   // high enough value to practically disable vectorization with such
6125   // operations, except where previously deployed legality hack allowed
6126   // using very low cost values. This is to avoid regressions coming simply
6127   // from moving "masked load/store" check from legality to cost model.
6128   // Masked Load/Gather emulation was previously never allowed.
6129   // Limited number of Masked Store/Scatter emulation was allowed.
6130   assert((isPredicatedInst(I)) &&
6131          "Expecting a scalar emulated instruction");
6132   return isa<LoadInst>(I) ||
6133          (isa<StoreInst>(I) &&
6134           NumPredStores > NumberOfStoresToPredicate);
6135 }
6136 
6137 void LoopVectorizationCostModel::collectInstsToScalarize(ElementCount VF) {
6138   // If we aren't vectorizing the loop, or if we've already collected the
6139   // instructions to scalarize, there's nothing to do. Collection may already
6140   // have occurred if we have a user-selected VF and are now computing the
6141   // expected cost for interleaving.
6142   if (VF.isScalar() || VF.isZero() ||
6143       InstsToScalarize.find(VF) != InstsToScalarize.end())
6144     return;
6145 
6146   // Initialize a mapping for VF in InstsToScalarize. If we find that it's
6147   // not profitable to scalarize any instructions, the presence of VF in the
6148   // map will indicate that we've analyzed it already.
6149   ScalarCostsTy &ScalarCostsVF = InstsToScalarize[VF];
6150 
6151   PredicatedBBsAfterVectorization[VF].clear();
6152 
6153   // Find all the instructions that are scalar with predication in the loop and
6154   // determine if it would be better to not if-convert the blocks they are in.
6155   // If so, we also record the instructions to scalarize.
6156   for (BasicBlock *BB : TheLoop->blocks()) {
6157     if (!blockNeedsPredicationForAnyReason(BB))
6158       continue;
6159     for (Instruction &I : *BB)
6160       if (isScalarWithPredication(&I, VF)) {
6161         ScalarCostsTy ScalarCosts;
6162         // Do not apply discount if scalable, because that would lead to
6163         // invalid scalarization costs.
6164         // Do not apply discount logic if hacked cost is needed
6165         // for emulated masked memrefs.
6166         if (!VF.isScalable() && !useEmulatedMaskMemRefHack(&I, VF) &&
6167             computePredInstDiscount(&I, ScalarCosts, VF) >= 0)
6168           ScalarCostsVF.insert(ScalarCosts.begin(), ScalarCosts.end());
6169         // Remember that BB will remain after vectorization.
6170         PredicatedBBsAfterVectorization[VF].insert(BB);
6171       }
6172   }
6173 }
6174 
6175 InstructionCost LoopVectorizationCostModel::computePredInstDiscount(
6176     Instruction *PredInst, ScalarCostsTy &ScalarCosts, ElementCount VF) {
6177   assert(!isUniformAfterVectorization(PredInst, VF) &&
6178          "Instruction marked uniform-after-vectorization will be predicated");
6179 
6180   // Initialize the discount to zero, meaning that the scalar version and the
6181   // vector version cost the same.
6182   InstructionCost Discount = 0;
6183 
6184   // Holds instructions to analyze. The instructions we visit are mapped in
6185   // ScalarCosts. Those instructions are the ones that would be scalarized if
6186   // we find that the scalar version costs less.
6187   SmallVector<Instruction *, 8> Worklist;
6188 
6189   // Returns true if the given instruction can be scalarized.
6190   auto canBeScalarized = [&](Instruction *I) -> bool {
6191     // We only attempt to scalarize instructions forming a single-use chain
6192     // from the original predicated block that would otherwise be vectorized.
6193     // Although not strictly necessary, we give up on instructions we know will
6194     // already be scalar to avoid traversing chains that are unlikely to be
6195     // beneficial.
6196     if (!I->hasOneUse() || PredInst->getParent() != I->getParent() ||
6197         isScalarAfterVectorization(I, VF))
6198       return false;
6199 
6200     // If the instruction is scalar with predication, it will be analyzed
6201     // separately. We ignore it within the context of PredInst.
6202     if (isScalarWithPredication(I, VF))
6203       return false;
6204 
6205     // If any of the instruction's operands are uniform after vectorization,
6206     // the instruction cannot be scalarized. This prevents, for example, a
6207     // masked load from being scalarized.
6208     //
6209     // We assume we will only emit a value for lane zero of an instruction
6210     // marked uniform after vectorization, rather than VF identical values.
6211     // Thus, if we scalarize an instruction that uses a uniform, we would
6212     // create uses of values corresponding to the lanes we aren't emitting code
6213     // for. This behavior can be changed by allowing getScalarValue to clone
6214     // the lane zero values for uniforms rather than asserting.
6215     for (Use &U : I->operands())
6216       if (auto *J = dyn_cast<Instruction>(U.get()))
6217         if (isUniformAfterVectorization(J, VF))
6218           return false;
6219 
6220     // Otherwise, we can scalarize the instruction.
6221     return true;
6222   };
6223 
6224   // Compute the expected cost discount from scalarizing the entire expression
6225   // feeding the predicated instruction. We currently only consider expressions
6226   // that are single-use instruction chains.
6227   Worklist.push_back(PredInst);
6228   while (!Worklist.empty()) {
6229     Instruction *I = Worklist.pop_back_val();
6230 
6231     // If we've already analyzed the instruction, there's nothing to do.
6232     if (ScalarCosts.find(I) != ScalarCosts.end())
6233       continue;
6234 
6235     // Compute the cost of the vector instruction. Note that this cost already
6236     // includes the scalarization overhead of the predicated instruction.
6237     InstructionCost VectorCost = getInstructionCost(I, VF).first;
6238 
6239     // Compute the cost of the scalarized instruction. This cost is the cost of
6240     // the instruction as if it wasn't if-converted and instead remained in the
6241     // predicated block. We will scale this cost by block probability after
6242     // computing the scalarization overhead.
6243     InstructionCost ScalarCost =
6244         VF.getFixedValue() *
6245         getInstructionCost(I, ElementCount::getFixed(1)).first;
6246 
6247     // Compute the scalarization overhead of needed insertelement instructions
6248     // and phi nodes.
6249     TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
6250     if (isScalarWithPredication(I, VF) && !I->getType()->isVoidTy()) {
6251       ScalarCost += TTI.getScalarizationOverhead(
6252           cast<VectorType>(ToVectorTy(I->getType(), VF)),
6253           APInt::getAllOnes(VF.getFixedValue()), /*Insert*/ true,
6254           /*Extract*/ false, CostKind);
6255       ScalarCost +=
6256           VF.getFixedValue() * TTI.getCFInstrCost(Instruction::PHI, CostKind);
6257     }
6258 
6259     // Compute the scalarization overhead of needed extractelement
6260     // instructions. For each of the instruction's operands, if the operand can
6261     // be scalarized, add it to the worklist; otherwise, account for the
6262     // overhead.
6263     for (Use &U : I->operands())
6264       if (auto *J = dyn_cast<Instruction>(U.get())) {
6265         assert(VectorType::isValidElementType(J->getType()) &&
6266                "Instruction has non-scalar type");
6267         if (canBeScalarized(J))
6268           Worklist.push_back(J);
6269         else if (needsExtract(J, VF)) {
6270           ScalarCost += TTI.getScalarizationOverhead(
6271               cast<VectorType>(ToVectorTy(J->getType(), VF)),
6272               APInt::getAllOnes(VF.getFixedValue()), /*Insert*/ false,
6273               /*Extract*/ true, CostKind);
6274         }
6275       }
6276 
6277     // Scale the total scalar cost by block probability.
6278     ScalarCost /= getReciprocalPredBlockProb();
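    // For example (illustrative numbers): with VF = 4, a per-lane scalar cost
    // of 1 and a reciprocal block probability of 2 (i.e. the predicated block
    // is assumed to execute half the time), the scaled scalar cost would be
    // (4 * 1) / 2 = 2, which is then compared against the vector cost below.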
6279 
6280     // Compute the discount. A non-negative discount means the vector version
6281     // of the instruction costs more, and scalarizing would be beneficial.
6282     Discount += VectorCost - ScalarCost;
6283     ScalarCosts[I] = ScalarCost;
6284   }
6285 
6286   return Discount;
6287 }
6288 
6289 LoopVectorizationCostModel::VectorizationCostTy
6290 LoopVectorizationCostModel::expectedCost(
6291     ElementCount VF, SmallVectorImpl<InstructionVFPair> *Invalid) {
6292   VectorizationCostTy Cost;
6293 
6294   // For each block.
6295   for (BasicBlock *BB : TheLoop->blocks()) {
6296     VectorizationCostTy BlockCost;
6297 
6298     // For each instruction in the old loop.
6299     for (Instruction &I : BB->instructionsWithoutDebug()) {
6300       // Skip ignored values.
6301       if (ValuesToIgnore.count(&I) ||
6302           (VF.isVector() && VecValuesToIgnore.count(&I)))
6303         continue;
6304 
6305       VectorizationCostTy C = getInstructionCost(&I, VF);
6306 
6307       // Check if we should override the cost.
6308       if (C.first.isValid() &&
6309           ForceTargetInstructionCost.getNumOccurrences() > 0)
6310         C.first = InstructionCost(ForceTargetInstructionCost);
6311 
6312       // Keep a list of instructions with invalid costs.
6313       if (Invalid && !C.first.isValid())
6314         Invalid->emplace_back(&I, VF);
6315 
6316       BlockCost.first += C.first;
6317       BlockCost.second |= C.second;
6318       LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C.first
6319                         << " for VF " << VF << " For instruction: " << I
6320                         << '\n');
6321     }
6322 
6323     // If we are vectorizing a predicated block, it will have been
6324     // if-converted. This means that the block's instructions (aside from
6325     // stores and instructions that may divide by zero) will now be
6326     // unconditionally executed. For the scalar case, we may not always execute
6327     // the predicated block, if it is an if-else block. Thus, scale the block's
6328     // cost by the probability of executing it. blockNeedsPredication from
6329     // Legal is used so as to not include all blocks in tail folded loops.
6330     if (VF.isScalar() && Legal->blockNeedsPredication(BB))
6331       BlockCost.first /= getReciprocalPredBlockProb();
6332 
6333     Cost.first += BlockCost.first;
6334     Cost.second |= BlockCost.second;
6335   }
6336 
6337   return Cost;
6338 }
6339 
6340 /// Gets Address Access SCEV after verifying that the access pattern
6341 /// is loop invariant except the induction variable dependence.
6342 ///
6343 /// This SCEV can be sent to the Target in order to estimate the address
6344 /// calculation cost.
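///
/// For example (illustrative IR only), a pointer such as
///   %p = getelementptr inbounds [64 x i32], ptr %base, i64 %inv, i64 %iv
/// qualifies when %inv is loop invariant and %iv is an induction variable,
/// whereas a GEP with some other loop-varying index does not.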
6345 static const SCEV *getAddressAccessSCEV(
6346               Value *Ptr,
6347               LoopVectorizationLegality *Legal,
6348               PredicatedScalarEvolution &PSE,
6349               const Loop *TheLoop) {
6350 
6351   auto *Gep = dyn_cast<GetElementPtrInst>(Ptr);
6352   if (!Gep)
6353     return nullptr;
6354 
6355   // We are looking for a gep with all loop invariant indices except for one
6356   // which should be an induction variable.
6357   auto SE = PSE.getSE();
6358   unsigned NumOperands = Gep->getNumOperands();
6359   for (unsigned i = 1; i < NumOperands; ++i) {
6360     Value *Opd = Gep->getOperand(i);
6361     if (!SE->isLoopInvariant(SE->getSCEV(Opd), TheLoop) &&
6362         !Legal->isInductionVariable(Opd))
6363       return nullptr;
6364   }
6365 
6366   // Now we know we have a GEP ptr, %inv, %ind, %inv. Return the Ptr SCEV.
6367   return PSE.getSCEV(Ptr);
6368 }
6369 
6370 static bool isStrideMul(Instruction *I, LoopVectorizationLegality *Legal) {
6371   return Legal->hasStride(I->getOperand(0)) ||
6372          Legal->hasStride(I->getOperand(1));
6373 }
6374 
6375 InstructionCost
6376 LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I,
6377                                                         ElementCount VF) {
6378   assert(VF.isVector() &&
6379          "Scalarization cost of instruction implies vectorization.");
6380   if (VF.isScalable())
6381     return InstructionCost::getInvalid();
6382 
6383   Type *ValTy = getLoadStoreType(I);
6384   auto SE = PSE.getSE();
6385 
6386   unsigned AS = getLoadStoreAddressSpace(I);
6387   Value *Ptr = getLoadStorePointerOperand(I);
6388   Type *PtrTy = ToVectorTy(Ptr->getType(), VF);
6389   // NOTE: PtrTy is a vector to signal `TTI::getAddressComputationCost`
6390   //       that it is being called from this specific place.
6391 
6392   // Figure out whether the access is strided and get the stride value
6393   // if it's known at compile time.
6394   const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, Legal, PSE, TheLoop);
6395 
6396   // Get the cost of the scalar memory instruction and address computation.
6397   InstructionCost Cost =
6398       VF.getKnownMinValue() * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV);
6399 
6400   // Don't pass *I here, since it is scalar but will actually be part of a
6401   // vectorized loop where the user of it is a vectorized instruction.
6402   TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
6403   const Align Alignment = getLoadStoreAlignment(I);
6404   Cost += VF.getKnownMinValue() * TTI.getMemoryOpCost(I->getOpcode(),
6405                                                       ValTy->getScalarType(),
6406                                                       Alignment, AS, CostKind);
6407 
6408   // Get the overhead of the extractelement and insertelement instructions
6409   // we might create due to scalarization.
6410   Cost += getScalarizationOverhead(I, VF, CostKind);
6411 
6412   // If we have a predicated load/store, it will need extra i1 extracts and
6413   // conditional branches, but may not be executed for each vector lane. Scale
6414   // the cost by the probability of executing the predicated block.
6415   if (isPredicatedInst(I)) {
6416     Cost /= getReciprocalPredBlockProb();
6417 
6418     // Add the cost of an i1 extract and a branch
6419     auto *Vec_i1Ty =
6420         VectorType::get(IntegerType::getInt1Ty(ValTy->getContext()), VF);
6421     Cost += TTI.getScalarizationOverhead(
6422         Vec_i1Ty, APInt::getAllOnes(VF.getKnownMinValue()),
6423         /*Insert=*/false, /*Extract=*/true, CostKind);
6424     Cost += TTI.getCFInstrCost(Instruction::Br, CostKind);
6425 
6426     if (useEmulatedMaskMemRefHack(I, VF))
6427       // Artificially setting to a high enough value to practically disable
6428       // vectorization with such operations.
6429       Cost = 3000000;
6430   }
6431 
6432   return Cost;
6433 }
6434 
6435 InstructionCost
6436 LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I,
6437                                                     ElementCount VF) {
6438   Type *ValTy = getLoadStoreType(I);
6439   auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
6440   Value *Ptr = getLoadStorePointerOperand(I);
6441   unsigned AS = getLoadStoreAddressSpace(I);
6442   int ConsecutiveStride = Legal->isConsecutivePtr(ValTy, Ptr);
6443   enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
6444 
6445   assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
6446          "Stride should be 1 or -1 for consecutive memory access");
6447   const Align Alignment = getLoadStoreAlignment(I);
6448   InstructionCost Cost = 0;
6449   if (Legal->isMaskRequired(I)) {
6450     Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS,
6451                                       CostKind);
6452   } else {
6453     TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(I->getOperand(0));
6454     Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS,
6455                                 CostKind, OpInfo, I);
6456   }
6457 
6458   bool Reverse = ConsecutiveStride < 0;
6459   if (Reverse)
6460     Cost += TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy,
6461                                std::nullopt, CostKind, 0);
6462   return Cost;
6463 }
6464 
6465 InstructionCost
6466 LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I,
6467                                                 ElementCount VF) {
6468   assert(Legal->isUniformMemOp(*I));
6469 
6470   Type *ValTy = getLoadStoreType(I);
6471   auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
6472   const Align Alignment = getLoadStoreAlignment(I);
6473   unsigned AS = getLoadStoreAddressSpace(I);
6474   enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
6475   if (isa<LoadInst>(I)) {
6476     return TTI.getAddressComputationCost(ValTy) +
6477            TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS,
6478                                CostKind) +
6479            TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VectorTy);
6480   }
6481   StoreInst *SI = cast<StoreInst>(I);
6482 
6483   bool isLoopInvariantStoreValue = Legal->isUniform(SI->getValueOperand());
6484   return TTI.getAddressComputationCost(ValTy) +
6485          TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS,
6486                              CostKind) +
6487          (isLoopInvariantStoreValue
6488               ? 0
6489               : TTI.getVectorInstrCost(Instruction::ExtractElement, VectorTy,
6490                                        CostKind, VF.getKnownMinValue() - 1));
6491 }
6492 
6493 InstructionCost
6494 LoopVectorizationCostModel::getGatherScatterCost(Instruction *I,
6495                                                  ElementCount VF) {
6496   Type *ValTy = getLoadStoreType(I);
6497   auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
6498   const Align Alignment = getLoadStoreAlignment(I);
6499   const Value *Ptr = getLoadStorePointerOperand(I);
6500 
6501   return TTI.getAddressComputationCost(VectorTy) +
6502          TTI.getGatherScatterOpCost(
6503              I->getOpcode(), VectorTy, Ptr, Legal->isMaskRequired(I), Alignment,
6504              TargetTransformInfo::TCK_RecipThroughput, I);
6505 }
6506 
6507 InstructionCost
6508 LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I,
6509                                                    ElementCount VF) {
6510   // TODO: Once we have support for interleaving with scalable vectors
6511   // we can calculate the cost properly here.
6512   if (VF.isScalable())
6513     return InstructionCost::getInvalid();
6514 
6515   Type *ValTy = getLoadStoreType(I);
6516   auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
6517   unsigned AS = getLoadStoreAddressSpace(I);
6518   enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
6519 
6520   auto Group = getInterleavedAccessGroup(I);
6521   assert(Group && "Fail to get an interleaved access group.");
6522 
6523   unsigned InterleaveFactor = Group->getFactor();
6524   auto *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor);
6525 
6526   // Holds the indices of existing members in the interleaved group.
6527   SmallVector<unsigned, 4> Indices;
6528   for (unsigned IF = 0; IF < InterleaveFactor; IF++)
6529     if (Group->getMember(IF))
6530       Indices.push_back(IF);
6531 
6532   // Calculate the cost of the whole interleaved group.
6533   bool UseMaskForGaps =
6534       (Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed()) ||
6535       (isa<StoreInst>(I) && (Group->getNumMembers() < Group->getFactor()));
6536   InstructionCost Cost = TTI.getInterleavedMemoryOpCost(
6537       I->getOpcode(), WideVecTy, Group->getFactor(), Indices, Group->getAlign(),
6538       AS, CostKind, Legal->isMaskRequired(I), UseMaskForGaps);
6539 
6540   if (Group->isReverse()) {
6541     // TODO: Add support for reversed masked interleaved access.
6542     assert(!Legal->isMaskRequired(I) &&
6543            "Reverse masked interleaved access not supported.");
6544     Cost += Group->getNumMembers() *
6545             TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy,
6546                                std::nullopt, CostKind, 0);
6547   }
6548   return Cost;
6549 }
6550 
6551 std::optional<InstructionCost>
6552 LoopVectorizationCostModel::getReductionPatternCost(
6553     Instruction *I, ElementCount VF, Type *Ty, TTI::TargetCostKind CostKind) {
6554   using namespace llvm::PatternMatch;
6555   // Early exit for no in-loop reductions.
6556   if (InLoopReductionChains.empty() || VF.isScalar() || !isa<VectorType>(Ty))
6557     return std::nullopt;
6558   auto *VectorTy = cast<VectorType>(Ty);
6559 
6560   // We are looking for a pattern of, and finding the minimal acceptable cost:
6561   //  reduce(mul(ext(A), ext(B))) or
6562   //  reduce(mul(A, B)) or
6563   //  reduce(ext(A)) or
6564   //  reduce(A).
6565   // The basic idea is that we walk down the tree to do that, finding the root
6566   // reduction instruction in InLoopReductionImmediateChains. From there we find
6567   // the pattern of mul/ext and test the cost of the entire pattern vs the cost
6568   // of the components. If the reduction cost is lower, we return it for the
6569   // reduction instruction and 0 for the other instructions in the pattern. If
6570   // it is not, we return an invalid cost specifying that the original cost
6571   // method should be used.
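  // For example (an informal illustration): for reduce.add(ext(mul(ext(A),
  // ext(B)))), the cost of a single multiply-accumulate reduction (queried via
  // TTI.getMulAccReductionCost) is compared against the summed costs of the
  // two inner extends, the mul, the outer extend and the base add reduction,
  // and the cheaper alternative wins.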
6572   Instruction *RetI = I;
6573   if (match(RetI, m_ZExtOrSExt(m_Value()))) {
6574     if (!RetI->hasOneUser())
6575       return std::nullopt;
6576     RetI = RetI->user_back();
6577   }
6578 
6579   if (match(RetI, m_OneUse(m_Mul(m_Value(), m_Value()))) &&
6580       RetI->user_back()->getOpcode() == Instruction::Add) {
6581     RetI = RetI->user_back();
6582   }
6583 
6584   // Test if the found instruction is a reduction, and if not, return an invalid
6585   // cost so that the parent uses the original cost modelling.
6586   if (!InLoopReductionImmediateChains.count(RetI))
6587     return std::nullopt;
6588 
6589   // Find the reduction this chain is a part of and calculate the basic cost of
6590   // the reduction on its own.
6591   Instruction *LastChain = InLoopReductionImmediateChains[RetI];
6592   Instruction *ReductionPhi = LastChain;
6593   while (!isa<PHINode>(ReductionPhi))
6594     ReductionPhi = InLoopReductionImmediateChains[ReductionPhi];
6595 
6596   const RecurrenceDescriptor &RdxDesc =
6597       Legal->getReductionVars().find(cast<PHINode>(ReductionPhi))->second;
6598 
6599   InstructionCost BaseCost = TTI.getArithmeticReductionCost(
6600       RdxDesc.getOpcode(), VectorTy, RdxDesc.getFastMathFlags(), CostKind);
6601 
6602   // For a call to the llvm.fmuladd intrinsic we need to add the cost of a
6603   // normal fmul instruction to the cost of the fadd reduction.
6604   if (RdxDesc.getRecurrenceKind() == RecurKind::FMulAdd)
6605     BaseCost +=
6606         TTI.getArithmeticInstrCost(Instruction::FMul, VectorTy, CostKind);
6607 
6608   // If we're using ordered reductions then we can just return the base cost
6609   // here, since getArithmeticReductionCost calculates the full ordered
6610   // reduction cost when FP reassociation is not allowed.
6611   if (useOrderedReductions(RdxDesc))
6612     return BaseCost;
6613 
6614   // Get the operand that was not the reduction chain and match it to one of the
6615   // patterns, returning the better cost if it is found.
6616   Instruction *RedOp = RetI->getOperand(1) == LastChain
6617                            ? dyn_cast<Instruction>(RetI->getOperand(0))
6618                            : dyn_cast<Instruction>(RetI->getOperand(1));
6619 
6620   VectorTy = VectorType::get(I->getOperand(0)->getType(), VectorTy);
6621 
6622   Instruction *Op0, *Op1;
6623   if (RedOp && RdxDesc.getOpcode() == Instruction::Add &&
6624       match(RedOp,
6625             m_ZExtOrSExt(m_Mul(m_Instruction(Op0), m_Instruction(Op1)))) &&
6626       match(Op0, m_ZExtOrSExt(m_Value())) &&
6627       Op0->getOpcode() == Op1->getOpcode() &&
6628       Op0->getOperand(0)->getType() == Op1->getOperand(0)->getType() &&
6629       !TheLoop->isLoopInvariant(Op0) && !TheLoop->isLoopInvariant(Op1) &&
6630       (Op0->getOpcode() == RedOp->getOpcode() || Op0 == Op1)) {
6631 
6632     // Matched reduce.add(ext(mul(ext(A), ext(B)))
6633     // Note that the extend opcodes need to all match, or if A==B they will have
6634     // been converted to zext(mul(sext(A), sext(A))) as it is known positive,
6635     // which is equally fine.
6636     bool IsUnsigned = isa<ZExtInst>(Op0);
6637     auto *ExtType = VectorType::get(Op0->getOperand(0)->getType(), VectorTy);
6638     auto *MulType = VectorType::get(Op0->getType(), VectorTy);
6639 
6640     InstructionCost ExtCost =
6641         TTI.getCastInstrCost(Op0->getOpcode(), MulType, ExtType,
6642                              TTI::CastContextHint::None, CostKind, Op0);
6643     InstructionCost MulCost =
6644         TTI.getArithmeticInstrCost(Instruction::Mul, MulType, CostKind);
6645     InstructionCost Ext2Cost =
6646         TTI.getCastInstrCost(RedOp->getOpcode(), VectorTy, MulType,
6647                              TTI::CastContextHint::None, CostKind, RedOp);
6648 
6649     InstructionCost RedCost = TTI.getMulAccReductionCost(
6650         IsUnsigned, RdxDesc.getRecurrenceType(), ExtType, CostKind);
6651 
6652     if (RedCost.isValid() &&
6653         RedCost < ExtCost * 2 + MulCost + Ext2Cost + BaseCost)
6654       return I == RetI ? RedCost : 0;
6655   } else if (RedOp && match(RedOp, m_ZExtOrSExt(m_Value())) &&
6656              !TheLoop->isLoopInvariant(RedOp)) {
6657     // Matched reduce(ext(A))
6658     bool IsUnsigned = isa<ZExtInst>(RedOp);
6659     auto *ExtType = VectorType::get(RedOp->getOperand(0)->getType(), VectorTy);
6660     InstructionCost RedCost = TTI.getExtendedReductionCost(
6661         RdxDesc.getOpcode(), IsUnsigned, RdxDesc.getRecurrenceType(), ExtType,
6662         RdxDesc.getFastMathFlags(), CostKind);
6663 
6664     InstructionCost ExtCost =
6665         TTI.getCastInstrCost(RedOp->getOpcode(), VectorTy, ExtType,
6666                              TTI::CastContextHint::None, CostKind, RedOp);
6667     if (RedCost.isValid() && RedCost < BaseCost + ExtCost)
6668       return I == RetI ? RedCost : 0;
6669   } else if (RedOp && RdxDesc.getOpcode() == Instruction::Add &&
6670              match(RedOp, m_Mul(m_Instruction(Op0), m_Instruction(Op1)))) {
6671     if (match(Op0, m_ZExtOrSExt(m_Value())) &&
6672         Op0->getOpcode() == Op1->getOpcode() &&
6673         !TheLoop->isLoopInvariant(Op0) && !TheLoop->isLoopInvariant(Op1)) {
6674       bool IsUnsigned = isa<ZExtInst>(Op0);
6675       Type *Op0Ty = Op0->getOperand(0)->getType();
6676       Type *Op1Ty = Op1->getOperand(0)->getType();
6677       Type *LargestOpTy =
6678           Op0Ty->getIntegerBitWidth() < Op1Ty->getIntegerBitWidth() ? Op1Ty
6679                                                                     : Op0Ty;
6680       auto *ExtType = VectorType::get(LargestOpTy, VectorTy);
6681 
6682       // Matched reduce.add(mul(ext(A), ext(B))), where the two ext may be of
6683       // different sizes. We take the largest type as the ext to reduce, and add
6684       // the remaining cost as, for example reduce(mul(ext(ext(A)), ext(B))).
6685       InstructionCost ExtCost0 = TTI.getCastInstrCost(
6686           Op0->getOpcode(), VectorTy, VectorType::get(Op0Ty, VectorTy),
6687           TTI::CastContextHint::None, CostKind, Op0);
6688       InstructionCost ExtCost1 = TTI.getCastInstrCost(
6689           Op1->getOpcode(), VectorTy, VectorType::get(Op1Ty, VectorTy),
6690           TTI::CastContextHint::None, CostKind, Op1);
6691       InstructionCost MulCost =
6692           TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind);
6693 
6694       InstructionCost RedCost = TTI.getMulAccReductionCost(
6695           IsUnsigned, RdxDesc.getRecurrenceType(), ExtType, CostKind);
6696       InstructionCost ExtraExtCost = 0;
6697       if (Op0Ty != LargestOpTy || Op1Ty != LargestOpTy) {
6698         Instruction *ExtraExtOp = (Op0Ty != LargestOpTy) ? Op0 : Op1;
6699         ExtraExtCost = TTI.getCastInstrCost(
6700             ExtraExtOp->getOpcode(), ExtType,
6701             VectorType::get(ExtraExtOp->getOperand(0)->getType(), VectorTy),
6702             TTI::CastContextHint::None, CostKind, ExtraExtOp);
6703       }
6704 
6705       if (RedCost.isValid() &&
6706           (RedCost + ExtraExtCost) < (ExtCost0 + ExtCost1 + MulCost + BaseCost))
6707         return I == RetI ? RedCost : 0;
6708     } else if (!match(I, m_ZExtOrSExt(m_Value()))) {
6709       // Matched reduce.add(mul())
6710       InstructionCost MulCost =
6711           TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind);
6712 
6713       InstructionCost RedCost = TTI.getMulAccReductionCost(
6714           true, RdxDesc.getRecurrenceType(), VectorTy, CostKind);
6715 
6716       if (RedCost.isValid() && RedCost < MulCost + BaseCost)
6717         return I == RetI ? RedCost : 0;
6718     }
6719   }
6720 
6721   return I == RetI ? std::optional<InstructionCost>(BaseCost) : std::nullopt;
6722 }
6723 
6724 InstructionCost
6725 LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I,
6726                                                      ElementCount VF) {
6727   // Calculate scalar cost only. Vectorization cost should be ready at this
6728   // moment.
6729   if (VF.isScalar()) {
6730     Type *ValTy = getLoadStoreType(I);
6731     const Align Alignment = getLoadStoreAlignment(I);
6732     unsigned AS = getLoadStoreAddressSpace(I);
6733 
6734     TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(I->getOperand(0));
6735     return TTI.getAddressComputationCost(ValTy) +
6736            TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS,
6737                                TTI::TCK_RecipThroughput, OpInfo, I);
6738   }
6739   return getWideningCost(I, VF);
6740 }
6741 
6742 LoopVectorizationCostModel::VectorizationCostTy
6743 LoopVectorizationCostModel::getInstructionCost(Instruction *I,
6744                                                ElementCount VF) {
6745   // If we know that this instruction will remain uniform, check the cost of
6746   // the scalar version.
6747   if (isUniformAfterVectorization(I, VF))
6748     VF = ElementCount::getFixed(1);
6749 
6750   if (VF.isVector() && isProfitableToScalarize(I, VF))
6751     return VectorizationCostTy(InstsToScalarize[VF][I], false);
6752 
6753   // Forced scalars do not have any scalarization overhead.
6754   auto ForcedScalar = ForcedScalars.find(VF);
6755   if (VF.isVector() && ForcedScalar != ForcedScalars.end()) {
6756     auto InstSet = ForcedScalar->second;
6757     if (InstSet.count(I))
6758       return VectorizationCostTy(
6759           (getInstructionCost(I, ElementCount::getFixed(1)).first *
6760            VF.getKnownMinValue()),
6761           false);
6762   }
6763 
6764   Type *VectorTy;
6765   InstructionCost C = getInstructionCost(I, VF, VectorTy);
6766 
6767   bool TypeNotScalarized = false;
6768   if (VF.isVector() && VectorTy->isVectorTy()) {
6769     if (unsigned NumParts = TTI.getNumberOfParts(VectorTy)) {
6770       if (VF.isScalable())
6771         // <vscale x 1 x iN> is assumed to be profitable over iN because
6772         // scalable registers are a distinct register class from scalar ones.
6773         // If we ever find a target which wants to lower scalable vectors
6774         // back to scalars, we'll need to update this code to explicitly
6775         // ask TTI about the register class uses for each part.
6776         TypeNotScalarized = NumParts <= VF.getKnownMinValue();
6777       else
6778         TypeNotScalarized = NumParts < VF.getKnownMinValue();
6779     } else
6780       C = InstructionCost::getInvalid();
6781   }
6782   return VectorizationCostTy(C, TypeNotScalarized);
6783 }
6784 
6785 InstructionCost LoopVectorizationCostModel::getScalarizationOverhead(
6786     Instruction *I, ElementCount VF, TTI::TargetCostKind CostKind) const {
6787 
6788   // There is no mechanism yet to create a scalable scalarization loop,
6789   // so this is currently Invalid.
6790   if (VF.isScalable())
6791     return InstructionCost::getInvalid();
6792 
6793   if (VF.isScalar())
6794     return 0;
6795 
6796   InstructionCost Cost = 0;
6797   Type *RetTy = ToVectorTy(I->getType(), VF);
6798   if (!RetTy->isVoidTy() &&
6799       (!isa<LoadInst>(I) || !TTI.supportsEfficientVectorElementLoadStore()))
6800     Cost += TTI.getScalarizationOverhead(
6801         cast<VectorType>(RetTy), APInt::getAllOnes(VF.getKnownMinValue()),
6802         /*Insert*/ true,
6803         /*Extract*/ false, CostKind);
6804 
6805   // Some targets keep addresses scalar.
6806   if (isa<LoadInst>(I) && !TTI.prefersVectorizedAddressing())
6807     return Cost;
6808 
6809   // Some targets support efficient element stores.
6810   if (isa<StoreInst>(I) && TTI.supportsEfficientVectorElementLoadStore())
6811     return Cost;
6812 
6813   // Collect operands to consider.
6814   CallInst *CI = dyn_cast<CallInst>(I);
6815   Instruction::op_range Ops = CI ? CI->args() : I->operands();
6816 
6817   // Skip operands that do not require extraction/scalarization and do not incur
6818   // any overhead.
6819   SmallVector<Type *> Tys;
6820   for (auto *V : filterExtractingOperands(Ops, VF))
6821     Tys.push_back(MaybeVectorizeType(V->getType(), VF));
6822   return Cost + TTI.getOperandsScalarizationOverhead(
6823                     filterExtractingOperands(Ops, VF), Tys, CostKind);
6824 }
6825 
6826 void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) {
6827   if (VF.isScalar())
6828     return;
6829   NumPredStores = 0;
6830   for (BasicBlock *BB : TheLoop->blocks()) {
6831     // For each instruction in the old loop.
6832     for (Instruction &I : *BB) {
6833       Value *Ptr =  getLoadStorePointerOperand(&I);
6834       if (!Ptr)
6835         continue;
6836 
6837       // TODO: We should generate better code and update the cost model for
6838       // predicated uniform stores. Today they are treated as any other
6839       // predicated store (see added test cases in
6840       // invariant-store-vectorization.ll).
6841       if (isa<StoreInst>(&I) && isScalarWithPredication(&I, VF))
6842         NumPredStores++;
6843 
6844       if (Legal->isUniformMemOp(I)) {
6845         auto isLegalToScalarize = [&]() {
6846           if (!VF.isScalable())
6847             // Scalarization of fixed length vectors "just works".
6848             return true;
6849 
6850           // We have dedicated lowering for unpredicated uniform loads and
6851           // stores.  Note that even with tail folding we know that at least
6852           // one lane is active (i.e. generalized predication is not possible
6853           // here), and the logic below depends on this fact.
6854           if (!foldTailByMasking())
6855             return true;
6856 
6857           // For scalable vectors, a uniform memop load is always
6858           // uniform-by-parts  and we know how to scalarize that.
6859           if (isa<LoadInst>(I))
6860             return true;
6861 
6862           // A uniform store isn't necessarily uniform-by-parts
6863           // and we can't assume scalarization.
6864           auto &SI = cast<StoreInst>(I);
6865           return TheLoop->isLoopInvariant(SI.getValueOperand());
6866         };
6867 
6868         const InstructionCost GatherScatterCost =
6869           isLegalGatherOrScatter(&I, VF) ?
6870           getGatherScatterCost(&I, VF) : InstructionCost::getInvalid();
6871 
6872         // Load: Scalar load + broadcast
6873         // Store: Scalar store + isLoopInvariantStoreValue ? 0 : extract
6874         // FIXME: This cost is a significant under-estimate for tail folded
6875         // memory ops.
6876         const InstructionCost ScalarizationCost = isLegalToScalarize() ?
6877           getUniformMemOpCost(&I, VF) : InstructionCost::getInvalid();
6878 
6879         // Choose the better solution for the current VF. Note that Invalid
6880         // costs compare as maximally large. If both are invalid, we get a
6881         // scalable Invalid, which signals a failure and a vectorization abort.
6882         if (GatherScatterCost < ScalarizationCost)
6883           setWideningDecision(&I, VF, CM_GatherScatter, GatherScatterCost);
6884         else
6885           setWideningDecision(&I, VF, CM_Scalarize, ScalarizationCost);
6886         continue;
6887       }
6888 
6889       // We assume that widening is the best solution when possible.
6890       if (memoryInstructionCanBeWidened(&I, VF)) {
6891         InstructionCost Cost = getConsecutiveMemOpCost(&I, VF);
6892         int ConsecutiveStride = Legal->isConsecutivePtr(
6893             getLoadStoreType(&I), getLoadStorePointerOperand(&I));
6894         assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
6895                "Expected consecutive stride.");
6896         InstWidening Decision =
6897             ConsecutiveStride == 1 ? CM_Widen : CM_Widen_Reverse;
6898         setWideningDecision(&I, VF, Decision, Cost);
6899         continue;
6900       }
6901 
6902       // Choose between Interleaving, Gather/Scatter or Scalarization.
6903       InstructionCost InterleaveCost = InstructionCost::getInvalid();
6904       unsigned NumAccesses = 1;
6905       if (isAccessInterleaved(&I)) {
6906         auto Group = getInterleavedAccessGroup(&I);
6907         assert(Group && "Fail to get an interleaved access group.");
6908 
6909         // Make one decision for the whole group.
6910         if (getWideningDecision(&I, VF) != CM_Unknown)
6911           continue;
6912 
6913         NumAccesses = Group->getNumMembers();
6914         if (interleavedAccessCanBeWidened(&I, VF))
6915           InterleaveCost = getInterleaveGroupCost(&I, VF);
6916       }
6917 
6918       InstructionCost GatherScatterCost =
6919           isLegalGatherOrScatter(&I, VF)
6920               ? getGatherScatterCost(&I, VF) * NumAccesses
6921               : InstructionCost::getInvalid();
6922 
6923       InstructionCost ScalarizationCost =
6924           getMemInstScalarizationCost(&I, VF) * NumAccesses;
6925 
6926       // Choose the better solution for the current VF, record this decision,
6927       // and use it during vectorization.
6928       InstructionCost Cost;
6929       InstWidening Decision;
6930       if (InterleaveCost <= GatherScatterCost &&
6931           InterleaveCost < ScalarizationCost) {
6932         Decision = CM_Interleave;
6933         Cost = InterleaveCost;
6934       } else if (GatherScatterCost < ScalarizationCost) {
6935         Decision = CM_GatherScatter;
6936         Cost = GatherScatterCost;
6937       } else {
6938         Decision = CM_Scalarize;
6939         Cost = ScalarizationCost;
6940       }
6941       // If the instruction belongs to an interleave group, the whole group
6942       // receives the same decision. The whole group receives the cost, but
6943       // the cost will actually be assigned to one instruction.
6944       if (auto Group = getInterleavedAccessGroup(&I))
6945         setWideningDecision(Group, VF, Decision, Cost);
6946       else
6947         setWideningDecision(&I, VF, Decision, Cost);
6948     }
6949   }
6950 
6951   // Make sure that any load of address and any other address computation
6952   // remains scalar unless there is gather/scatter support. This avoids
6953   // inevitable extracts into address registers, and also has the benefit of
6954   // activating LSR more, since that pass can't optimize vectorized
6955   // addresses.
6956   if (TTI.prefersVectorizedAddressing())
6957     return;
6958 
6959   // Start with all scalar pointer uses.
6960   SmallPtrSet<Instruction *, 8> AddrDefs;
6961   for (BasicBlock *BB : TheLoop->blocks())
6962     for (Instruction &I : *BB) {
6963       Instruction *PtrDef =
6964         dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I));
6965       if (PtrDef && TheLoop->contains(PtrDef) &&
6966           getWideningDecision(&I, VF) != CM_GatherScatter)
6967         AddrDefs.insert(PtrDef);
6968     }
6969 
6970   // Add all instructions used to generate the addresses.
6971   SmallVector<Instruction *, 4> Worklist;
6972   append_range(Worklist, AddrDefs);
6973   while (!Worklist.empty()) {
6974     Instruction *I = Worklist.pop_back_val();
6975     for (auto &Op : I->operands())
6976       if (auto *InstOp = dyn_cast<Instruction>(Op))
6977         if ((InstOp->getParent() == I->getParent()) && !isa<PHINode>(InstOp) &&
6978             AddrDefs.insert(InstOp).second)
6979           Worklist.push_back(InstOp);
6980   }
6981 
6982   for (auto *I : AddrDefs) {
6983     if (isa<LoadInst>(I)) {
6984       // Setting the desired widening decision should ideally be handled by
6985       // the cost functions, but since this involves the task of finding out
6986       // if the loaded register is involved in an address computation, it is
6987       // instead changed here when we know this is the case.
6988       InstWidening Decision = getWideningDecision(I, VF);
6989       if (Decision == CM_Widen || Decision == CM_Widen_Reverse)
6990         // Scalarize a widened load of address.
6991         setWideningDecision(
6992             I, VF, CM_Scalarize,
6993             (VF.getKnownMinValue() *
6994              getMemoryInstructionCost(I, ElementCount::getFixed(1))));
6995       else if (auto Group = getInterleavedAccessGroup(I)) {
6996         // Scalarize an interleave group of address loads.
6997         for (unsigned I = 0; I < Group->getFactor(); ++I) {
6998           if (Instruction *Member = Group->getMember(I))
6999             setWideningDecision(
7000                 Member, VF, CM_Scalarize,
7001                 (VF.getKnownMinValue() *
7002                  getMemoryInstructionCost(Member, ElementCount::getFixed(1))));
7003         }
7004       }
7005     } else
7006       // Make sure I gets scalarized and a cost estimate without
7007       // scalarization overhead.
7008       ForcedScalars[VF].insert(I);
7009   }
7010 }
7011 
7012 InstructionCost
7013 LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF,
7014                                                Type *&VectorTy) {
7015   Type *RetTy = I->getType();
7016   if (canTruncateToMinimalBitwidth(I, VF))
7017     RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]);
7018   auto SE = PSE.getSE();
7019   TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
7020 
7021   auto hasSingleCopyAfterVectorization = [this](Instruction *I,
7022                                                 ElementCount VF) -> bool {
7023     if (VF.isScalar())
7024       return true;
7025 
7026     auto Scalarized = InstsToScalarize.find(VF);
7027     assert(Scalarized != InstsToScalarize.end() &&
7028            "VF not yet analyzed for scalarization profitability");
7029     return !Scalarized->second.count(I) &&
7030            llvm::all_of(I->users(), [&](User *U) {
7031              auto *UI = cast<Instruction>(U);
7032              return !Scalarized->second.count(UI);
7033            });
7034   };
7035   (void) hasSingleCopyAfterVectorization;
7036 
7037   if (isScalarAfterVectorization(I, VF)) {
7038     // With the exception of GEPs and PHIs, after scalarization there should
7039     // only be one copy of the instruction generated in the loop. This is
7040     // because the VF is either 1, or any instructions that need scalarizing
7041     // have already been dealt with by the time we get here. As a result,
7042     // it means we don't have to multiply the instruction cost by VF.
7043     assert(I->getOpcode() == Instruction::GetElementPtr ||
7044            I->getOpcode() == Instruction::PHI ||
7045            (I->getOpcode() == Instruction::BitCast &&
7046             I->getType()->isPointerTy()) ||
7047            hasSingleCopyAfterVectorization(I, VF));
7048     VectorTy = RetTy;
7049   } else
7050     VectorTy = ToVectorTy(RetTy, VF);
7051 
7052   // TODO: We need to estimate the cost of intrinsic calls.
7053   switch (I->getOpcode()) {
7054   case Instruction::GetElementPtr:
7055     // We mark this instruction as zero-cost because the cost of GEPs in
7056     // vectorized code depends on whether the corresponding memory instruction
7057     // is scalarized or not. Therefore, we handle GEPs with the memory
7058     // instruction cost.
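    // For example (illustrative): a GEP feeding a CM_Widen load is folded
    // into the cost of the widened load, whereas a GEP feeding a scalarized
    // access is re-costed per lane as part of the scalarization cost.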
7059     return 0;
7060   case Instruction::Br: {
7061     // In cases of scalarized and predicated instructions, there will be VF
7062     // predicated blocks in the vectorized loop. Each branch around these
7063     // blocks also requires an extract of its vector compare i1 element.
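    // Rough example: with a fixed VF of 4, this accounts for 4 conditional
    // branches plus 4 extracts of the i1 lane from the vector compare, which
    // is what the ScalarPredicatedBB computation below adds up.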
7064     bool ScalarPredicatedBB = false;
7065     BranchInst *BI = cast<BranchInst>(I);
7066     if (VF.isVector() && BI->isConditional() &&
7067         (PredicatedBBsAfterVectorization[VF].count(BI->getSuccessor(0)) ||
7068          PredicatedBBsAfterVectorization[VF].count(BI->getSuccessor(1))))
7069       ScalarPredicatedBB = true;
7070 
7071     if (ScalarPredicatedBB) {
7072       // Not possible to scalarize a scalable vector with predicated instructions.
7073       if (VF.isScalable())
7074         return InstructionCost::getInvalid();
7075       // Return cost for branches around scalarized and predicated blocks.
7076       auto *Vec_i1Ty =
7077           VectorType::get(IntegerType::getInt1Ty(RetTy->getContext()), VF);
7078       return (
7079           TTI.getScalarizationOverhead(
7080               Vec_i1Ty, APInt::getAllOnes(VF.getFixedValue()),
7081               /*Insert*/ false, /*Extract*/ true, CostKind) +
7082           (TTI.getCFInstrCost(Instruction::Br, CostKind) * VF.getFixedValue()));
7083     } else if (I->getParent() == TheLoop->getLoopLatch() || VF.isScalar())
7084       // The back-edge branch will remain, as will all scalar branches.
7085       return TTI.getCFInstrCost(Instruction::Br, CostKind);
7086     else
7087       // This branch will be eliminated by if-conversion.
7088       return 0;
7089     // Note: We currently assume zero cost for an unconditional branch inside
7090     // a predicated block since it will become a fall-through, although we
7091     // may decide in the future to call TTI for all branches.
7092   }
7093   case Instruction::PHI: {
7094     auto *Phi = cast<PHINode>(I);
7095 
7096     // First-order recurrences are replaced by vector shuffles inside the loop.
7097     if (VF.isVector() && Legal->isFixedOrderRecurrence(Phi)) {
7098       SmallVector<int> Mask(VF.getKnownMinValue());
7099       std::iota(Mask.begin(), Mask.end(), VF.getKnownMinValue() - 1);
7100       return TTI.getShuffleCost(TargetTransformInfo::SK_Splice,
7101                                 cast<VectorType>(VectorTy), Mask, CostKind,
7102                                 VF.getKnownMinValue() - 1);
7103     }
7104 
7105     // Phi nodes in non-header blocks (not inductions, reductions, etc.) are
7106     // converted into select instructions. We require N - 1 selects per phi
7107     // node, where N is the number of incoming values.
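    // Worked example: a phi with incoming values (%a, %b, %c) from three
    // predecessors is lowered to roughly
    //   %s1 = select <VF x i1> %m1, %a.vec, %b.vec
    //   %s2 = select <VF x i1> %m2, %c.vec, %s1
    // i.e. N - 1 = 2 selects.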
7108     if (VF.isVector() && Phi->getParent() != TheLoop->getHeader())
7109       return (Phi->getNumIncomingValues() - 1) *
7110              TTI.getCmpSelInstrCost(
7111                  Instruction::Select, ToVectorTy(Phi->getType(), VF),
7112                  ToVectorTy(Type::getInt1Ty(Phi->getContext()), VF),
7113                  CmpInst::BAD_ICMP_PREDICATE, CostKind);
7114 
7115     return TTI.getCFInstrCost(Instruction::PHI, CostKind);
7116   }
7117   case Instruction::UDiv:
7118   case Instruction::SDiv:
7119   case Instruction::URem:
7120   case Instruction::SRem:
7121     if (VF.isVector() && isPredicatedInst(I)) {
7122       const auto [ScalarCost, SafeDivisorCost] = getDivRemSpeculationCost(I, VF);
7123       return isDivRemScalarWithPredication(ScalarCost, SafeDivisorCost) ?
7124         ScalarCost : SafeDivisorCost;
7125     }
7126     // We've proven all lanes safe to speculate, fall through.
7127     [[fallthrough]];
7128   case Instruction::Add:
7129   case Instruction::FAdd:
7130   case Instruction::Sub:
7131   case Instruction::FSub:
7132   case Instruction::Mul:
7133   case Instruction::FMul:
7134   case Instruction::FDiv:
7135   case Instruction::FRem:
7136   case Instruction::Shl:
7137   case Instruction::LShr:
7138   case Instruction::AShr:
7139   case Instruction::And:
7140   case Instruction::Or:
7141   case Instruction::Xor: {
7142     // Since we will replace the stride by 1 the multiplication should go away.
7143     if (I->getOpcode() == Instruction::Mul && isStrideMul(I, Legal))
7144       return 0;
7145 
7146     // Detect reduction patterns
7147     if (auto RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind))
7148       return *RedCost;
7149 
7150     // Certain instructions can be cheaper to vectorize if they have a constant
7151     // second vector operand. One example of this is shifts on x86.
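    // E.g. (illustrative) 'shl <4 x i32> %x, <i32 3, i32 3, i32 3, i32 3>'
    // can map to a single immediate-shift instruction, whereas a variable
    // per-lane shift amount may be considerably more expensive.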
7152     Value *Op2 = I->getOperand(1);
7153     auto Op2Info = TTI.getOperandInfo(Op2);
7154     if (Op2Info.Kind == TargetTransformInfo::OK_AnyValue && Legal->isUniform(Op2))
7155       Op2Info.Kind = TargetTransformInfo::OK_UniformValue;
7156 
7157     SmallVector<const Value *, 4> Operands(I->operand_values());
7158     return TTI.getArithmeticInstrCost(
7159         I->getOpcode(), VectorTy, CostKind,
7160         {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
7161         Op2Info, Operands, I);
7162   }
7163   case Instruction::FNeg: {
7164     return TTI.getArithmeticInstrCost(
7165         I->getOpcode(), VectorTy, CostKind,
7166         {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
7167         {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
7168         I->getOperand(0), I);
7169   }
7170   case Instruction::Select: {
7171     SelectInst *SI = cast<SelectInst>(I);
7172     const SCEV *CondSCEV = SE->getSCEV(SI->getCondition());
7173     bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop));
7174 
7175     const Value *Op0, *Op1;
7176     using namespace llvm::PatternMatch;
7177     if (!ScalarCond && (match(I, m_LogicalAnd(m_Value(Op0), m_Value(Op1))) ||
7178                         match(I, m_LogicalOr(m_Value(Op0), m_Value(Op1))))) {
7179       // select x, y, false --> x & y
7180       // select x, true, y --> x | y
7181       const auto [Op1VK, Op1VP] = TTI::getOperandInfo(Op0);
7182       const auto [Op2VK, Op2VP] = TTI::getOperandInfo(Op1);
7183       assert(Op0->getType()->getScalarSizeInBits() == 1 &&
7184               Op1->getType()->getScalarSizeInBits() == 1);
7185 
7186       SmallVector<const Value *, 2> Operands{Op0, Op1};
7187       return TTI.getArithmeticInstrCost(
7188           match(I, m_LogicalOr()) ? Instruction::Or : Instruction::And, VectorTy,
7189           CostKind, {Op1VK, Op1VP}, {Op2VK, Op2VP}, Operands, I);
7190     }
7191 
7192     Type *CondTy = SI->getCondition()->getType();
7193     if (!ScalarCond)
7194       CondTy = VectorType::get(CondTy, VF);
7195 
7196     CmpInst::Predicate Pred = CmpInst::BAD_ICMP_PREDICATE;
7197     if (auto *Cmp = dyn_cast<CmpInst>(SI->getCondition()))
7198       Pred = Cmp->getPredicate();
7199     return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy, Pred,
7200                                   CostKind, I);
7201   }
7202   case Instruction::ICmp:
7203   case Instruction::FCmp: {
7204     Type *ValTy = I->getOperand(0)->getType();
7205     Instruction *Op0AsInstruction = dyn_cast<Instruction>(I->getOperand(0));
7206     if (canTruncateToMinimalBitwidth(Op0AsInstruction, VF))
7207       ValTy = IntegerType::get(ValTy->getContext(), MinBWs[Op0AsInstruction]);
7208     VectorTy = ToVectorTy(ValTy, VF);
7209     return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, nullptr,
7210                                   cast<CmpInst>(I)->getPredicate(), CostKind,
7211                                   I);
7212   }
7213   case Instruction::Store:
7214   case Instruction::Load: {
7215     ElementCount Width = VF;
7216     if (Width.isVector()) {
7217       InstWidening Decision = getWideningDecision(I, Width);
7218       assert(Decision != CM_Unknown &&
7219              "CM decision should be taken at this point");
7220       if (getWideningCost(I, VF) == InstructionCost::getInvalid())
7221         return InstructionCost::getInvalid();
7222       if (Decision == CM_Scalarize)
7223         Width = ElementCount::getFixed(1);
7224     }
7225     VectorTy = ToVectorTy(getLoadStoreType(I), Width);
7226     return getMemoryInstructionCost(I, VF);
7227   }
7228   case Instruction::BitCast:
7229     if (I->getType()->isPointerTy())
7230       return 0;
7231     [[fallthrough]];
7232   case Instruction::ZExt:
7233   case Instruction::SExt:
7234   case Instruction::FPToUI:
7235   case Instruction::FPToSI:
7236   case Instruction::FPExt:
7237   case Instruction::PtrToInt:
7238   case Instruction::IntToPtr:
7239   case Instruction::SIToFP:
7240   case Instruction::UIToFP:
7241   case Instruction::Trunc:
7242   case Instruction::FPTrunc: {
7243     // Computes the CastContextHint from a Load/Store instruction.
7244     auto ComputeCCH = [&](Instruction *I) -> TTI::CastContextHint {
7245       assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
7246              "Expected a load or a store!");
7247 
7248       if (VF.isScalar() || !TheLoop->contains(I))
7249         return TTI::CastContextHint::Normal;
7250 
7251       switch (getWideningDecision(I, VF)) {
7252       case LoopVectorizationCostModel::CM_GatherScatter:
7253         return TTI::CastContextHint::GatherScatter;
7254       case LoopVectorizationCostModel::CM_Interleave:
7255         return TTI::CastContextHint::Interleave;
7256       case LoopVectorizationCostModel::CM_Scalarize:
7257       case LoopVectorizationCostModel::CM_Widen:
7258         return Legal->isMaskRequired(I) ? TTI::CastContextHint::Masked
7259                                         : TTI::CastContextHint::Normal;
7260       case LoopVectorizationCostModel::CM_Widen_Reverse:
7261         return TTI::CastContextHint::Reversed;
7262       case LoopVectorizationCostModel::CM_Unknown:
7263         llvm_unreachable("Instr did not go through cost modelling?");
7264       }
7265 
7266       llvm_unreachable("Unhandled case!");
7267     };
7268 
7269     unsigned Opcode = I->getOpcode();
7270     TTI::CastContextHint CCH = TTI::CastContextHint::None;
7271     // For Trunc, the context is the only user, which must be a StoreInst.
7272     if (Opcode == Instruction::Trunc || Opcode == Instruction::FPTrunc) {
7273       if (I->hasOneUse())
7274         if (StoreInst *Store = dyn_cast<StoreInst>(*I->user_begin()))
7275           CCH = ComputeCCH(Store);
7276     }
7277     // For Z/Sext, the context is the operand, which must be a LoadInst.
7278     else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt ||
7279              Opcode == Instruction::FPExt) {
7280       if (LoadInst *Load = dyn_cast<LoadInst>(I->getOperand(0)))
7281         CCH = ComputeCCH(Load);
7282     }
7283 
7284     // We optimize the truncation of induction variables having constant
7285     // integer steps. The cost of these truncations is the same as the scalar
7286     // operation.
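    // E.g. (illustrative) 'trunc i64 %iv to i32' for an induction with a
    // constant step can be served by generating the induction directly in the
    // narrower type, so it is costed like the scalar truncate rather than a
    // vector truncate of the wide IV.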
7287     if (isOptimizableIVTruncate(I, VF)) {
7288       auto *Trunc = cast<TruncInst>(I);
7289       return TTI.getCastInstrCost(Instruction::Trunc, Trunc->getDestTy(),
7290                                   Trunc->getSrcTy(), CCH, CostKind, Trunc);
7291     }
7292 
7293     // Detect reduction patterns
7294     if (auto RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind))
7295       return *RedCost;
7296 
7297     Type *SrcScalarTy = I->getOperand(0)->getType();
7298     Type *SrcVecTy =
7299         VectorTy->isVectorTy() ? ToVectorTy(SrcScalarTy, VF) : SrcScalarTy;
7300     if (canTruncateToMinimalBitwidth(I, VF)) {
7301       // This cast is going to be shrunk. This may remove the cast or it might
7302       // turn it into slightly different cast. For example, if MinBW == 16,
7303       // "zext i8 %1 to i32" becomes "zext i8 %1 to i16".
7304       //
7305       // Calculate the modified src and dest types.
7306       Type *MinVecTy = VectorTy;
7307       if (Opcode == Instruction::Trunc) {
7308         SrcVecTy = smallestIntegerVectorType(SrcVecTy, MinVecTy);
7309         VectorTy =
7310             largestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy);
7311       } else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt) {
7312         SrcVecTy = largestIntegerVectorType(SrcVecTy, MinVecTy);
7313         VectorTy =
7314             smallestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy);
7315       }
7316     }
7317 
7318     return TTI.getCastInstrCost(Opcode, VectorTy, SrcVecTy, CCH, CostKind, I);
7319   }
7320   case Instruction::Call: {
7321     if (RecurrenceDescriptor::isFMulAddIntrinsic(I))
7322       if (auto RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind))
7323         return *RedCost;
7324     bool NeedToScalarize;
7325     CallInst *CI = cast<CallInst>(I);
7326     InstructionCost CallCost = getVectorCallCost(CI, VF, NeedToScalarize);
7327     if (getVectorIntrinsicIDForCall(CI, TLI)) {
7328       InstructionCost IntrinsicCost = getVectorIntrinsicCost(CI, VF);
7329       return std::min(CallCost, IntrinsicCost);
7330     }
7331     return CallCost;
7332   }
7333   case Instruction::ExtractValue:
7334     return TTI.getInstructionCost(I, TTI::TCK_RecipThroughput);
7335   case Instruction::Alloca:
7336     // We cannot easily widen alloca to a scalable alloca, as
7337     // the result would need to be a vector of pointers.
7338     if (VF.isScalable())
7339       return InstructionCost::getInvalid();
7340     [[fallthrough]];
7341   default:
7342     // This opcode is unknown. Assume that it is the same as 'mul'.
7343     return TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind);
7344   } // end of switch.
7345 }
7346 
7347 char LoopVectorize::ID = 0;
7348 
7349 static const char lv_name[] = "Loop Vectorization";
7350 
7351 INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false)
7352 INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
7353 INITIALIZE_PASS_DEPENDENCY(BasicAAWrapperPass)
7354 INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass)
7355 INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
7356 INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass)
7357 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
7358 INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
7359 INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
7360 INITIALIZE_PASS_DEPENDENCY(LoopAccessLegacyAnalysis)
7361 INITIALIZE_PASS_DEPENDENCY(DemandedBitsWrapperPass)
7362 INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)
7363 INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass)
7364 INITIALIZE_PASS_DEPENDENCY(InjectTLIMappingsLegacy)
7365 INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false)
7366 
7367 namespace llvm {
7368 
7369 Pass *createLoopVectorizePass() { return new LoopVectorize(); }
7370 
7371 Pass *createLoopVectorizePass(bool InterleaveOnlyWhenForced,
7372                               bool VectorizeOnlyWhenForced) {
7373   return new LoopVectorize(InterleaveOnlyWhenForced, VectorizeOnlyWhenForced);
7374 }
7375 
7376 } // end namespace llvm
7377 
7378 void LoopVectorizationCostModel::collectValuesToIgnore() {
7379   // Ignore ephemeral values.
7380   CodeMetrics::collectEphemeralValues(TheLoop, AC, ValuesToIgnore);
7381 
7382   // Find all stores to invariant variables. Since they are going to sink
7383   // outside the loop, we do not need to calculate their cost.
7384   for (BasicBlock *BB : TheLoop->blocks())
7385     for (Instruction &I : *BB) {
7386       StoreInst *SI;
7387       if ((SI = dyn_cast<StoreInst>(&I)) &&
7388           Legal->isInvariantAddressOfReduction(SI->getPointerOperand()))
7389         ValuesToIgnore.insert(&I);
7390     }
7391 
7392   // Ignore type-promoting instructions we identified during reduction
7393   // detection.
7394   for (const auto &Reduction : Legal->getReductionVars()) {
7395     const RecurrenceDescriptor &RedDes = Reduction.second;
7396     const SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts();
7397     VecValuesToIgnore.insert(Casts.begin(), Casts.end());
7398   }
7399   // Ignore type-casting instructions we identified during induction
7400   // detection.
7401   for (const auto &Induction : Legal->getInductionVars()) {
7402     const InductionDescriptor &IndDes = Induction.second;
7403     const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts();
7404     VecValuesToIgnore.insert(Casts.begin(), Casts.end());
7405   }
7406 }
7407 
7408 void LoopVectorizationCostModel::collectInLoopReductions() {
7409   for (const auto &Reduction : Legal->getReductionVars()) {
7410     PHINode *Phi = Reduction.first;
7411     const RecurrenceDescriptor &RdxDesc = Reduction.second;
7412 
7413     // We don't collect reductions that are type promoted (yet).
7414     if (RdxDesc.getRecurrenceType() != Phi->getType())
7415       continue;
7416 
7417     // If the target would prefer this reduction to happen "in-loop", then we
7418     // want to record it as such.
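    // Sketch of the difference: an in-loop reduction keeps a scalar
    // accumulator and reduces each vector of partial values inside the loop
    // (e.g. via a vector.reduce.add per iteration), instead of carrying a
    // wide vector accumulator that is reduced once after the loop.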
7419     unsigned Opcode = RdxDesc.getOpcode();
7420     if (!PreferInLoopReductions && !useOrderedReductions(RdxDesc) &&
7421         !TTI.preferInLoopReduction(Opcode, Phi->getType(),
7422                                    TargetTransformInfo::ReductionFlags()))
7423       continue;
7424 
7425     // Check that we can correctly put the reductions into the loop, by
7426     // finding the chain of operations that leads from the phi to the loop
7427     // exit value.
7428     SmallVector<Instruction *, 4> ReductionOperations =
7429         RdxDesc.getReductionOpChain(Phi, TheLoop);
7430     bool InLoop = !ReductionOperations.empty();
7431     if (InLoop) {
7432       InLoopReductionChains[Phi] = ReductionOperations;
7433       // Add the elements to InLoopReductionImmediateChains for cost modelling.
7434       Instruction *LastChain = Phi;
7435       for (auto *I : ReductionOperations) {
7436         InLoopReductionImmediateChains[I] = LastChain;
7437         LastChain = I;
7438       }
7439     }
7440     LLVM_DEBUG(dbgs() << "LV: Using " << (InLoop ? "inloop" : "out of loop")
7441                       << " reduction for phi: " << *Phi << "\n");
7442   }
7443 }
7444 
7445 // TODO: we could return a pair of values that specify the max VF and
7446 // min VF, to be used in `buildVPlans(MinVF, MaxVF)` instead of
7447 // `buildVPlans(VF, VF)`. We cannot do it because VPlan at the moment
7448 // doesn't have a cost model that can choose which plan to execute if
7449 // more than one is generated.
7450 static unsigned determineVPlanVF(const unsigned WidestVectorRegBits,
7451                                  LoopVectorizationCostModel &CM) {
7452   unsigned WidestType;
7453   std::tie(std::ignore, WidestType) = CM.getSmallestAndWidestTypes();
7454   return WidestVectorRegBits / WidestType;
7455 }
7456 
7457 VectorizationFactor
7458 LoopVectorizationPlanner::planInVPlanNativePath(ElementCount UserVF) {
7459   assert(!UserVF.isScalable() && "scalable vectors not yet supported");
7460   ElementCount VF = UserVF;
7461   // Outer loop handling: They may require CFG and instruction level
7462   // transformations before even evaluating whether vectorization is profitable.
7463   // Since we cannot modify the incoming IR, we need to build VPlan upfront in
7464   // the vectorization pipeline.
7465   if (!OrigLoop->isInnermost()) {
7466     // If the user doesn't provide a vectorization factor, determine a
7467     // reasonable one.
7468     if (UserVF.isZero()) {
7469       VF = ElementCount::getFixed(determineVPlanVF(
7470           TTI->getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector)
7471               .getFixedValue(),
7472           CM));
7473       LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n");
7474 
7475       // Make sure we have a VF > 1 for stress testing.
7476       if (VPlanBuildStressTest && (VF.isScalar() || VF.isZero())) {
7477         LLVM_DEBUG(dbgs() << "LV: VPlan stress testing: "
7478                           << "overriding computed VF.\n");
7479         VF = ElementCount::getFixed(4);
7480       }
7481     }
7482     assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
7483     assert(isPowerOf2_32(VF.getKnownMinValue()) &&
7484            "VF needs to be a power of two");
7485     LLVM_DEBUG(dbgs() << "LV: Using " << (!UserVF.isZero() ? "user " : "")
7486                       << "VF " << VF << " to build VPlans.\n");
7487     buildVPlans(VF, VF);
7488 
7489     // For VPlan build stress testing, we bail out after VPlan construction.
7490     if (VPlanBuildStressTest)
7491       return VectorizationFactor::Disabled();
7492 
7493     return {VF, 0 /*Cost*/, 0 /* ScalarCost */};
7494   }
7495 
7496   LLVM_DEBUG(
7497       dbgs() << "LV: Not vectorizing. Inner loops aren't supported in the "
7498                 "VPlan-native path.\n");
7499   return VectorizationFactor::Disabled();
7500 }
7501 
7502 std::optional<VectorizationFactor>
7503 LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
7504   assert(OrigLoop->isInnermost() && "Inner loop expected.");
7505   FixedScalableVFPair MaxFactors = CM.computeMaxVF(UserVF, UserIC);
7506   if (!MaxFactors) // Cases that should neither be vectorized nor interleaved.
7507     return std::nullopt;
7508 
7509   // Invalidate interleave groups if all blocks of loop will be predicated.
7510   if (CM.blockNeedsPredicationForAnyReason(OrigLoop->getHeader()) &&
7511       !useMaskedInterleavedAccesses(*TTI)) {
7512     LLVM_DEBUG(
7513         dbgs()
7514         << "LV: Invalidate all interleaved groups due to fold-tail by masking "
7515            "which requires masked-interleaved support.\n");
7516     if (CM.InterleaveInfo.invalidateGroups())
7517       // Invalidating interleave groups also requires invalidating all decisions
7518       // based on them, which includes widening decisions and uniform and scalar
7519       // values.
7520       CM.invalidateCostModelingDecisions();
7521   }
7522 
7523   ElementCount MaxUserVF =
7524       UserVF.isScalable() ? MaxFactors.ScalableVF : MaxFactors.FixedVF;
7525   bool UserVFIsLegal = ElementCount::isKnownLE(UserVF, MaxUserVF);
7526   if (!UserVF.isZero() && UserVFIsLegal) {
7527     assert(isPowerOf2_32(UserVF.getKnownMinValue()) &&
7528            "VF needs to be a power of two");
7529     // Collect the instructions (and their associated costs) that will be more
7530     // profitable to scalarize.
7531     if (CM.selectUserVectorizationFactor(UserVF)) {
7532       LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n");
7533       CM.collectInLoopReductions();
7534       buildVPlansWithVPRecipes(UserVF, UserVF);
7535       LLVM_DEBUG(printPlans(dbgs()));
7536       return {{UserVF, 0, 0}};
7537     } else
7538       reportVectorizationInfo("UserVF ignored because of invalid costs.",
7539                               "InvalidCost", ORE, OrigLoop);
7540   }
7541 
7542   // Populate the set of Vectorization Factor Candidates.
7543   ElementCountSet VFCandidates;
7544   for (auto VF = ElementCount::getFixed(1);
7545        ElementCount::isKnownLE(VF, MaxFactors.FixedVF); VF *= 2)
7546     VFCandidates.insert(VF);
7547   for (auto VF = ElementCount::getScalable(1);
7548        ElementCount::isKnownLE(VF, MaxFactors.ScalableVF); VF *= 2)
7549     VFCandidates.insert(VF);
7550 
7551   for (const auto &VF : VFCandidates) {
7552     // Collect Uniform and Scalar instructions after vectorization with VF.
7553     CM.collectUniformsAndScalars(VF);
7554 
7555     // Collect the instructions (and their associated costs) that will be more
7556     // profitable to scalarize.
7557     if (VF.isVector())
7558       CM.collectInstsToScalarize(VF);
7559   }
7560 
7561   CM.collectInLoopReductions();
7562   buildVPlansWithVPRecipes(ElementCount::getFixed(1), MaxFactors.FixedVF);
7563   buildVPlansWithVPRecipes(ElementCount::getScalable(1), MaxFactors.ScalableVF);
7564 
7565   LLVM_DEBUG(printPlans(dbgs()));
7566   if (!MaxFactors.hasVector())
7567     return VectorizationFactor::Disabled();
7568 
7569   // Select the optimal vectorization factor.
7570   VectorizationFactor VF = CM.selectVectorizationFactor(VFCandidates);
7571   assert((VF.Width.isScalar() || VF.ScalarCost > 0) && "when vectorizing, the scalar cost must be non-zero.");
7572   return VF;
7573 }
7574 
7575 VPlan &LoopVectorizationPlanner::getBestPlanFor(ElementCount VF) const {
7576   assert(count_if(VPlans,
7577                   [VF](const VPlanPtr &Plan) { return Plan->hasVF(VF); }) ==
7578              1 &&
7579          "Best VF has not a single VPlan.");
7580 
7581   for (const VPlanPtr &Plan : VPlans) {
7582     if (Plan->hasVF(VF))
7583       return *Plan.get();
7584   }
7585   llvm_unreachable("No plan found!");
7586 }
7587 
7588 static void AddRuntimeUnrollDisableMetaData(Loop *L) {
7589   SmallVector<Metadata *, 4> MDs;
7590   // Reserve first location for self reference to the LoopID metadata node.
7591   MDs.push_back(nullptr);
7592   bool IsUnrollMetadata = false;
7593   MDNode *LoopID = L->getLoopID();
7594   if (LoopID) {
7595     // First find existing loop unrolling disable metadata.
7596     for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) {
7597       auto *MD = dyn_cast<MDNode>(LoopID->getOperand(i));
7598       if (MD) {
7599         const auto *S = dyn_cast<MDString>(MD->getOperand(0));
7600         IsUnrollMetadata =
7601             S && S->getString().startswith("llvm.loop.unroll.disable");
7602       }
7603       MDs.push_back(LoopID->getOperand(i));
7604     }
7605   }
7606 
7607   if (!IsUnrollMetadata) {
7608     // Add runtime unroll disable metadata.
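    // The resulting loop metadata looks roughly like:
    //   !0 = distinct !{!0, ..., !1}
    //   !1 = !{!"llvm.loop.unroll.runtime.disable"}
    // with operand 0 made self-referential below.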
7609     LLVMContext &Context = L->getHeader()->getContext();
7610     SmallVector<Metadata *, 1> DisableOperands;
7611     DisableOperands.push_back(
7612         MDString::get(Context, "llvm.loop.unroll.runtime.disable"));
7613     MDNode *DisableNode = MDNode::get(Context, DisableOperands);
7614     MDs.push_back(DisableNode);
7615     MDNode *NewLoopID = MDNode::get(Context, MDs);
7616     // Set operand 0 to refer to the loop id itself.
7617     NewLoopID->replaceOperandWith(0, NewLoopID);
7618     L->setLoopID(NewLoopID);
7619   }
7620 }
7621 
7622 void LoopVectorizationPlanner::executePlan(ElementCount BestVF, unsigned BestUF,
7623                                            VPlan &BestVPlan,
7624                                            InnerLoopVectorizer &ILV,
7625                                            DominatorTree *DT,
7626                                            bool IsEpilogueVectorization) {
7627   assert(BestVPlan.hasVF(BestVF) &&
7628          "Trying to execute plan with unsupported VF");
7629   assert(BestVPlan.hasUF(BestUF) &&
7630          "Trying to execute plan with unsupported UF");
7631 
7632   LLVM_DEBUG(dbgs() << "Executing best plan with VF=" << BestVF << ", UF=" << BestUF
7633                     << '\n');
7634 
7635   // Workaround!  Compute the trip count of the original loop and cache it
7636   // before we start modifying the CFG.  This code has a systemic problem
7637   // wherein it tries to run analysis over partially constructed IR; this is
7638   // wrong, and not simply for SCEV.  The trip count of the original loop
7639   // simply happens to be prone to hitting this in practice.  In theory, we
7640   // can hit the same issue for any SCEV, or ValueTracking query done during
7641   // mutation.  See PR49900.
7642   ILV.getOrCreateTripCount(OrigLoop->getLoopPreheader());
7643 
7644   if (!IsEpilogueVectorization)
7645     VPlanTransforms::optimizeForVFAndUF(BestVPlan, BestVF, BestUF, PSE);
7646 
7647   // Perform the actual loop transformation.
7648 
7649   // 1. Set up the skeleton for vectorization, including vector pre-header and
7650   // middle block. The vector loop is created during VPlan execution.
7651   VPTransformState State{BestVF, BestUF, LI, DT, ILV.Builder, &ILV, &BestVPlan};
7652   Value *CanonicalIVStartValue;
7653   std::tie(State.CFG.PrevBB, CanonicalIVStartValue) =
7654       ILV.createVectorizedLoopSkeleton();
7655 
7656   // Only use noalias metadata when using memory checks guaranteeing no overlap
7657   // across all iterations.
7658   const LoopAccessInfo *LAI = ILV.Legal->getLAI();
7659   if (LAI && !LAI->getRuntimePointerChecking()->getChecks().empty() &&
7660       !LAI->getRuntimePointerChecking()->getDiffChecks()) {
7661 
7662     //  We currently don't use LoopVersioning for the actual loop cloning but we
7663     //  still use it to add the noalias metadata.
7664     //  TODO: Find a better way to re-use LoopVersioning functionality to add
7665     //        metadata.
7666     State.LVer = std::make_unique<LoopVersioning>(
7667         *LAI, LAI->getRuntimePointerChecking()->getChecks(), OrigLoop, LI, DT,
7668         PSE.getSE());
7669     State.LVer->prepareNoAliasMetadata();
7670   }
7671 
7672   ILV.collectPoisonGeneratingRecipes(State);
7673 
7674   ILV.printDebugTracesAtStart();
7675 
7676   //===------------------------------------------------===//
7677   //
7678   // Notice: any optimization or new instruction that goes
7679   // into the code below should also be implemented in
7680   // the cost-model.
7681   //
7682   //===------------------------------------------------===//
7683 
7684   // 2. Copy and widen instructions from the old loop into the new loop.
7685   BestVPlan.prepareToExecute(ILV.getOrCreateTripCount(nullptr),
7686                              ILV.getOrCreateVectorTripCount(nullptr),
7687                              CanonicalIVStartValue, State,
7688                              IsEpilogueVectorization);
7689 
7690   BestVPlan.execute(&State);
7691 
7692   // Keep all loop hints from the original loop on the vector loop (we'll
7693   // replace the vectorizer-specific hints below).
7694   MDNode *OrigLoopID = OrigLoop->getLoopID();
7695 
7696   std::optional<MDNode *> VectorizedLoopID =
7697       makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
7698                                       LLVMLoopVectorizeFollowupVectorized});
7699 
7700   VPBasicBlock *HeaderVPBB =
7701       BestVPlan.getVectorLoopRegion()->getEntryBasicBlock();
7702   Loop *L = LI->getLoopFor(State.CFG.VPBB2IRBB[HeaderVPBB]);
7703   if (VectorizedLoopID)
7704     L->setLoopID(*VectorizedLoopID);
7705   else {
7706     // Keep all loop hints from the original loop on the vector loop (we'll
7707     // replace the vectorizer-specific hints below).
7708     if (MDNode *LID = OrigLoop->getLoopID())
7709       L->setLoopID(LID);
7710 
7711     LoopVectorizeHints Hints(L, true, *ORE);
7712     Hints.setAlreadyVectorized();
7713   }
7714   AddRuntimeUnrollDisableMetaData(L);
7715 
7716   // 3. Fix the vectorized code: take care of header phi's, live-outs,
7717   //    predication, updating analyses.
7718   ILV.fixVectorizedLoop(State, BestVPlan);
7719 
7720   ILV.printDebugTracesAtEnd();
7721 }
7722 
7723 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
7724 void LoopVectorizationPlanner::printPlans(raw_ostream &O) {
7725   for (const auto &Plan : VPlans)
7726     if (PrintVPlansInDotFormat)
7727       Plan->printDOT(O);
7728     else
7729       Plan->print(O);
7730 }
7731 #endif
7732 
7733 Value *InnerLoopUnroller::getBroadcastInstrs(Value *V) { return V; }
7734 
7735 //===--------------------------------------------------------------------===//
7736 // EpilogueVectorizerMainLoop
7737 //===--------------------------------------------------------------------===//
7738 
7739 /// This function is partially responsible for generating the control flow
7740 /// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization.
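/// Roughly, the blocks emitted here (using the names set below) are laid out
/// as: iter.check (epilogue min-iteration check), optional SCEV and memory
/// runtime checks, vector.main.loop.iter.check (main-loop min-iteration
/// check), and finally the preheader of the main vector loop; all checks
/// bypass to the scalar preheader.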
7741 std::pair<BasicBlock *, Value *>
7742 EpilogueVectorizerMainLoop::createEpilogueVectorizedLoopSkeleton() {
7743   createVectorLoopSkeleton("");
7744 
7745   // Generate the code to check the minimum iteration count of the vector
7746   // epilogue (see below).
7747   EPI.EpilogueIterationCountCheck =
7748       emitIterationCountCheck(LoopScalarPreHeader, true);
7749   EPI.EpilogueIterationCountCheck->setName("iter.check");
7750 
7751   // Generate the code to check any assumptions that we've made for SCEV
7752   // expressions.
7753   EPI.SCEVSafetyCheck = emitSCEVChecks(LoopScalarPreHeader);
7754 
7755   // Generate the code that checks at runtime if arrays overlap. We put the
7756   // checks into a separate block to make the more common case of few elements
7757   // faster.
7758   EPI.MemSafetyCheck = emitMemRuntimeChecks(LoopScalarPreHeader);
7759 
7760   // Generate the iteration count check for the main loop, *after* the check
7761   // for the epilogue loop, so that the path-length is shorter for the case
7762   // that goes directly through the vector epilogue. The longer-path length for
7763   // the main loop is compensated for, by the gain from vectorizing the larger
7764   // trip count. Note: the branch will get updated later on when we vectorize
7765   // the epilogue.
7766   EPI.MainLoopIterationCountCheck =
7767       emitIterationCountCheck(LoopScalarPreHeader, false);
7768 
7769   // Generate the induction variable.
7770   EPI.VectorTripCount = getOrCreateVectorTripCount(LoopVectorPreHeader);
7771 
7772   // Skip induction resume value creation here because they will be created in
7773   // the second pass for the scalar loop. The induction resume values for the
7774   // inductions in the epilogue loop are created before executing the plan for
7775   // the epilogue loop.
7776 
7777   return {completeLoopSkeleton(), nullptr};
7778 }
7779 
7780 void EpilogueVectorizerMainLoop::printDebugTracesAtStart() {
7781   LLVM_DEBUG({
7782     dbgs() << "Create Skeleton for epilogue vectorized loop (first pass)\n"
7783            << "Main Loop VF:" << EPI.MainLoopVF
7784            << ", Main Loop UF:" << EPI.MainLoopUF
7785            << ", Epilogue Loop VF:" << EPI.EpilogueVF
7786            << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n";
7787   });
7788 }
7789 
7790 void EpilogueVectorizerMainLoop::printDebugTracesAtEnd() {
7791   DEBUG_WITH_TYPE(VerboseDebug, {
7792     dbgs() << "intermediate fn:\n"
7793            << *OrigLoop->getHeader()->getParent() << "\n";
7794   });
7795 }
7796 
7797 BasicBlock *
7798 EpilogueVectorizerMainLoop::emitIterationCountCheck(BasicBlock *Bypass,
7799                                                     bool ForEpilogue) {
7800   assert(Bypass && "Expected valid bypass basic block.");
7801   ElementCount VFactor = ForEpilogue ? EPI.EpilogueVF : VF;
7802   unsigned UFactor = ForEpilogue ? EPI.EpilogueUF : UF;
7803   Value *Count = getOrCreateTripCount(LoopVectorPreHeader);
7804   // Reuse existing vector loop preheader for TC checks.
7805   // Note that new preheader block is generated for vector loop.
7806   BasicBlock *const TCCheckBlock = LoopVectorPreHeader;
7807   IRBuilder<> Builder(TCCheckBlock->getTerminator());
7808 
7809   // Generate code to check if the loop's trip count is less than VF * UF of the
7810   // main vector loop.
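  // E.g. (illustrative) with VF=4 and UF=2 this emits 'icmp ult %count, 8'
  // (or 'ule' when a scalar epilogue is required, so at least one scalar
  // iteration is always left over).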
7811   auto P = Cost->requiresScalarEpilogue(ForEpilogue ? EPI.EpilogueVF : VF) ?
7812       ICmpInst::ICMP_ULE : ICmpInst::ICMP_ULT;
7813 
7814   Value *CheckMinIters = Builder.CreateICmp(
7815       P, Count, createStepForVF(Builder, Count->getType(), VFactor, UFactor),
7816       "min.iters.check");
7817 
7818   if (!ForEpilogue)
7819     TCCheckBlock->setName("vector.main.loop.iter.check");
7820 
7821   // Create new preheader for vector loop.
7822   LoopVectorPreHeader = SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(),
7823                                    DT, LI, nullptr, "vector.ph");
7824 
7825   if (ForEpilogue) {
7826     assert(DT->properlyDominates(DT->getNode(TCCheckBlock),
7827                                  DT->getNode(Bypass)->getIDom()) &&
7828            "TC check is expected to dominate Bypass");
7829 
7830     // Update dominator for Bypass & LoopExit.
7831     DT->changeImmediateDominator(Bypass, TCCheckBlock);
7832     if (!Cost->requiresScalarEpilogue(EPI.EpilogueVF))
7833       // For loops with multiple exits, there's no edge from the middle block
7834       // to exit blocks (as the epilogue must run) and thus no need to update
7835       // the immediate dominator of the exit blocks.
7836       DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock);
7837 
7838     LoopBypassBlocks.push_back(TCCheckBlock);
7839 
7840     // Save the trip count so we don't have to regenerate it in the
7841     // vec.epilog.iter.check. This is safe to do because the trip count
7842     // generated here dominates the vector epilog iter check.
7843     EPI.TripCount = Count;
7844   }
7845 
7846   ReplaceInstWithInst(
7847       TCCheckBlock->getTerminator(),
7848       BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters));
7849 
7850   return TCCheckBlock;
7851 }
7852 
7853 //===--------------------------------------------------------------------===//
7854 // EpilogueVectorizerEpilogueLoop
7855 //===--------------------------------------------------------------------===//
7856 
7857 /// This function is partially responsible for generating the control flow
7858 /// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization.
7859 std::pair<BasicBlock *, Value *>
7860 EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton() {
7861   createVectorLoopSkeleton("vec.epilog.");
7862 
7863   // Now, compare the remaining count and if there aren't enough iterations to
7864   // execute the vectorized epilogue, skip to the scalar part.
7865   BasicBlock *VecEpilogueIterationCountCheck = LoopVectorPreHeader;
7866   VecEpilogueIterationCountCheck->setName("vec.epilog.iter.check");
7867   LoopVectorPreHeader =
7868       SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
7869                  LI, nullptr, "vec.epilog.ph");
7870   emitMinimumVectorEpilogueIterCountCheck(LoopScalarPreHeader,
7871                                           VecEpilogueIterationCountCheck);
7872 
7873   // Adjust the control flow taking the state info from the main loop
7874   // vectorization into account.
7875   assert(EPI.MainLoopIterationCountCheck && EPI.EpilogueIterationCountCheck &&
7876          "expected this to be saved from the previous pass.");
7877   EPI.MainLoopIterationCountCheck->getTerminator()->replaceUsesOfWith(
7878       VecEpilogueIterationCountCheck, LoopVectorPreHeader);
7879 
7880   DT->changeImmediateDominator(LoopVectorPreHeader,
7881                                EPI.MainLoopIterationCountCheck);
7882 
7883   EPI.EpilogueIterationCountCheck->getTerminator()->replaceUsesOfWith(
7884       VecEpilogueIterationCountCheck, LoopScalarPreHeader);
7885 
7886   if (EPI.SCEVSafetyCheck)
7887     EPI.SCEVSafetyCheck->getTerminator()->replaceUsesOfWith(
7888         VecEpilogueIterationCountCheck, LoopScalarPreHeader);
7889   if (EPI.MemSafetyCheck)
7890     EPI.MemSafetyCheck->getTerminator()->replaceUsesOfWith(
7891         VecEpilogueIterationCountCheck, LoopScalarPreHeader);
7892 
7893   DT->changeImmediateDominator(
7894       VecEpilogueIterationCountCheck,
7895       VecEpilogueIterationCountCheck->getSinglePredecessor());
7896 
7897   DT->changeImmediateDominator(LoopScalarPreHeader,
7898                                EPI.EpilogueIterationCountCheck);
7899   if (!Cost->requiresScalarEpilogue(EPI.EpilogueVF))
7900     // If there is an epilogue which must run, there's no edge from the
7901     // middle block to exit blocks and thus no need to update the immediate
7902     // dominator of the exit blocks.
7903     DT->changeImmediateDominator(LoopExitBlock,
7904                                  EPI.EpilogueIterationCountCheck);
7905 
7906   // Keep track of bypass blocks, as they feed start values to the induction and
7907   // reduction phis in the scalar loop preheader.
7908   if (EPI.SCEVSafetyCheck)
7909     LoopBypassBlocks.push_back(EPI.SCEVSafetyCheck);
7910   if (EPI.MemSafetyCheck)
7911     LoopBypassBlocks.push_back(EPI.MemSafetyCheck);
7912   LoopBypassBlocks.push_back(EPI.EpilogueIterationCountCheck);
7913 
7914   // The vec.epilog.iter.check block may contain Phi nodes from inductions or
7915   // reductions which merge control-flow from the latch block and the middle
7916   // block. Update the incoming values here and move the Phi into the preheader.
7917   SmallVector<PHINode *, 4> PhisInBlock;
7918   for (PHINode &Phi : VecEpilogueIterationCountCheck->phis())
7919     PhisInBlock.push_back(&Phi);
7920 
7921   for (PHINode *Phi : PhisInBlock) {
7922     Phi->moveBefore(LoopVectorPreHeader->getFirstNonPHI());
7923     Phi->replaceIncomingBlockWith(
7924         VecEpilogueIterationCountCheck->getSinglePredecessor(),
7925         VecEpilogueIterationCountCheck);
7926 
7927     // If the phi doesn't have an incoming value from the
7928     // EpilogueIterationCountCheck, we are done. Otherwise remove the incoming
7929     // value and also those from other check blocks. This is needed for
7930     // reduction phis only.
7931     if (none_of(Phi->blocks(), [&](BasicBlock *IncB) {
7932           return EPI.EpilogueIterationCountCheck == IncB;
7933         }))
7934       continue;
7935     Phi->removeIncomingValue(EPI.EpilogueIterationCountCheck);
7936     if (EPI.SCEVSafetyCheck)
7937       Phi->removeIncomingValue(EPI.SCEVSafetyCheck);
7938     if (EPI.MemSafetyCheck)
7939       Phi->removeIncomingValue(EPI.MemSafetyCheck);
7940   }
7941 
7942   // Generate a resume induction for the vector epilogue and put it in the
7943   // vector epilogue preheader
7944   Type *IdxTy = Legal->getWidestInductionType();
7945   PHINode *EPResumeVal = PHINode::Create(IdxTy, 2, "vec.epilog.resume.val",
7946                                          LoopVectorPreHeader->getFirstNonPHI());
7947   EPResumeVal->addIncoming(EPI.VectorTripCount, VecEpilogueIterationCountCheck);
7948   EPResumeVal->addIncoming(ConstantInt::get(IdxTy, 0),
7949                            EPI.MainLoopIterationCountCheck);
7950 
7951   // Generate induction resume values. These variables save the new starting
7952   // indexes for the scalar loop. They are used to test if there are any tail
7953   // iterations left once the vector loop has completed.
7954   // Note that when the vectorized epilogue is skipped due to iteration count
7955   // check, then the resume value for the induction variable comes from
7956   // the trip count of the main vector loop, hence passing the AdditionalBypass
7957   // argument.
7958   createInductionResumeValues({VecEpilogueIterationCountCheck,
7959                                EPI.VectorTripCount} /* AdditionalBypass */);
7960 
7961   return {completeLoopSkeleton(), EPResumeVal};
7962 }
7963 
7964 BasicBlock *
7965 EpilogueVectorizerEpilogueLoop::emitMinimumVectorEpilogueIterCountCheck(
7966     BasicBlock *Bypass, BasicBlock *Insert) {
7967 
7968   assert(EPI.TripCount &&
7969          "Expected trip count to have been safed in the first pass.");
7970   assert(
7971       (!isa<Instruction>(EPI.TripCount) ||
7972        DT->dominates(cast<Instruction>(EPI.TripCount)->getParent(), Insert)) &&
7973       "saved trip count does not dominate insertion point.");
7974   Value *TC = EPI.TripCount;
7975   IRBuilder<> Builder(Insert->getTerminator());
7976   Value *Count = Builder.CreateSub(TC, EPI.VectorTripCount, "n.vec.remaining");
7977 
7978   // Generate code to check if the remaining iteration count is less than the
7979   // VF * UF of the vector epilogue loop.
7980   auto P = Cost->requiresScalarEpilogue(EPI.EpilogueVF) ?
7981       ICmpInst::ICMP_ULE : ICmpInst::ICMP_ULT;
7982 
7983   Value *CheckMinIters =
7984       Builder.CreateICmp(P, Count,
7985                          createStepForVF(Builder, Count->getType(),
7986                                          EPI.EpilogueVF, EPI.EpilogueUF),
7987                          "min.epilog.iters.check");
7988 
7989   ReplaceInstWithInst(
7990       Insert->getTerminator(),
7991       BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters));
7992 
7993   LoopBypassBlocks.push_back(Insert);
7994   return Insert;
7995 }
7996 
7997 void EpilogueVectorizerEpilogueLoop::printDebugTracesAtStart() {
7998   LLVM_DEBUG({
7999     dbgs() << "Create Skeleton for epilogue vectorized loop (second pass)\n"
8000            << "Epilogue Loop VF:" << EPI.EpilogueVF
8001            << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n";
8002   });
8003 }
8004 
8005 void EpilogueVectorizerEpilogueLoop::printDebugTracesAtEnd() {
8006   DEBUG_WITH_TYPE(VerboseDebug, {
8007     dbgs() << "final fn:\n" << *OrigLoop->getHeader()->getParent() << "\n";
8008   });
8009 }
8010 
8011 bool LoopVectorizationPlanner::getDecisionAndClampRange(
8012     const std::function<bool(ElementCount)> &Predicate, VFRange &Range) {
8013   assert(!Range.isEmpty() && "Trying to test an empty VF range.");
8014   bool PredicateAtRangeStart = Predicate(Range.Start);
8015 
8016   for (ElementCount TmpVF = Range.Start * 2;
8017        ElementCount::isKnownLT(TmpVF, Range.End); TmpVF *= 2)
8018     if (Predicate(TmpVF) != PredicateAtRangeStart) {
8019       Range.End = TmpVF;
8020       break;
8021     }
8022 
8023   return PredicateAtRangeStart;
8024 }
8025 
8026 /// Build VPlans for the full range of feasible VF's = {\p MinVF, 2 * \p MinVF,
8027 /// 4 * \p MinVF, ..., \p MaxVF} by repeatedly building a VPlan for a sub-range
8028 /// of VF's starting at a given VF and extending it as much as possible. Each
8029 /// vectorization decision can potentially shorten this sub-range during
8030 /// buildVPlan().
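/// For example (illustrative), with MinVF = 2 and MaxVF = 16 this might build
/// one VPlan covering {2, 4} and a second covering {8, 16}, if some recipe
/// decision changes between VF = 4 and VF = 8.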
8031 void LoopVectorizationPlanner::buildVPlans(ElementCount MinVF,
8032                                            ElementCount MaxVF) {
8033   auto MaxVFPlusOne = MaxVF.getWithIncrement(1);
8034   for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFPlusOne);) {
8035     VFRange SubRange = {VF, MaxVFPlusOne};
8036     VPlans.push_back(buildVPlan(SubRange));
8037     VF = SubRange.End;
8038   }
8039 }
8040 
8041 VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst,
8042                                          VPlanPtr &Plan) {
8043   assert(is_contained(predecessors(Dst), Src) && "Invalid edge");
8044 
8045   // Look for cached value.
8046   std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst);
8047   EdgeMaskCacheTy::iterator ECEntryIt = EdgeMaskCache.find(Edge);
8048   if (ECEntryIt != EdgeMaskCache.end())
8049     return ECEntryIt->second;
8050 
8051   VPValue *SrcMask = createBlockInMask(Src, Plan);
8052 
8053   // The terminator has to be a branch inst!
8054   BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator());
8055   assert(BI && "Unexpected terminator found");
8056 
8057   if (!BI->isConditional() || BI->getSuccessor(0) == BI->getSuccessor(1))
8058     return EdgeMaskCache[Edge] = SrcMask;
8059 
8060   // If source is an exiting block, we know the exit edge is dynamically dead
8061   // in the vector loop, and thus we don't need to restrict the mask.  Avoid
8062   // adding uses of an otherwise potentially dead instruction.
8063   if (OrigLoop->isLoopExiting(Src))
8064     return EdgeMaskCache[Edge] = SrcMask;
8065 
8066   VPValue *EdgeMask = Plan->getOrAddVPValue(BI->getCondition());
8067   assert(EdgeMask && "No Edge Mask found for condition");
8068 
8069   if (BI->getSuccessor(0) != Dst)
8070     EdgeMask = Builder.createNot(EdgeMask, BI->getDebugLoc());
8071 
8072   if (SrcMask) { // Otherwise block in-mask is all-one, no need to AND.
8073     // The condition is 'SrcMask && EdgeMask', which is equivalent to
8074     // 'select i1 SrcMask, i1 EdgeMask, i1 false'.
8075     // The select version does not introduce new UB if SrcMask is false and
8076     // EdgeMask is poison. Using 'and' here introduces undefined behavior.
8077     VPValue *False = Plan->getOrAddVPValue(
8078         ConstantInt::getFalse(BI->getCondition()->getType()));
8079     EdgeMask =
8080         Builder.createSelect(SrcMask, EdgeMask, False, BI->getDebugLoc());
8081   }
8082 
8083   return EdgeMaskCache[Edge] = EdgeMask;
8084 }
8085 
8086 VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) {
8087   assert(OrigLoop->contains(BB) && "Block is not a part of a loop");
8088 
8089   // Look for cached value.
8090   BlockMaskCacheTy::iterator BCEntryIt = BlockMaskCache.find(BB);
8091   if (BCEntryIt != BlockMaskCache.end())
8092     return BCEntryIt->second;
8093 
8094   // All-one mask is modelled as no-mask following the convention for masked
8095   // load/store/gather/scatter. Initialize BlockMask to no-mask.
8096   VPValue *BlockMask = nullptr;
8097 
8098   if (OrigLoop->getHeader() == BB) {
8099     if (!CM.blockNeedsPredicationForAnyReason(BB))
8100       return BlockMaskCache[BB] = BlockMask; // Loop incoming mask is all-one.
8101 
8102     assert(CM.foldTailByMasking() && "must fold the tail");
8103 
8104     // If we're using the active lane mask for control flow, then we get the
8105     // mask from the active lane mask PHI that is cached in the VPlan.
8106     PredicationStyle EmitGetActiveLaneMask = CM.TTI.emitGetActiveLaneMask();
8107     if (EmitGetActiveLaneMask == PredicationStyle::DataAndControlFlow)
8108       return BlockMaskCache[BB] = Plan->getActiveLaneMaskPhi();
8109 
8110     // Introduce the early-exit compare IV <= BTC to form header block mask.
8111     // This is used instead of IV < TC because TC may wrap, unlike BTC. Start by
8112     // constructing the desired canonical IV in the header block as its first
8113     // non-phi instructions.
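    // Illustrative example: with an i8 IV and 256 iterations, TC wraps to 0,
    // so 'IV < TC' would be false for every lane, while BTC == 255 and
    // 'IV <= BTC' holds for all 256 lane indices.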
8114 
8115     VPBasicBlock *HeaderVPBB =
8116         Plan->getVectorLoopRegion()->getEntryBasicBlock();
8117     auto NewInsertionPoint = HeaderVPBB->getFirstNonPhi();
8118     auto *IV = new VPWidenCanonicalIVRecipe(Plan->getCanonicalIV());
8119     HeaderVPBB->insert(IV, HeaderVPBB->getFirstNonPhi());
8120 
8121     VPBuilder::InsertPointGuard Guard(Builder);
8122     Builder.setInsertPoint(HeaderVPBB, NewInsertionPoint);
8123     if (EmitGetActiveLaneMask != PredicationStyle::None) {
8124       VPValue *TC = Plan->getOrCreateTripCount();
8125       BlockMask = Builder.createNaryOp(VPInstruction::ActiveLaneMask, {IV, TC},
8126                                        nullptr, "active.lane.mask");
8127     } else {
8128       VPValue *BTC = Plan->getOrCreateBackedgeTakenCount();
8129       BlockMask = Builder.createNaryOp(VPInstruction::ICmpULE, {IV, BTC});
8130     }
8131     return BlockMaskCache[BB] = BlockMask;
8132   }
8133 
8134   // This is the block mask. We OR all incoming edges.
8135   for (auto *Predecessor : predecessors(BB)) {
8136     VPValue *EdgeMask = createEdgeMask(Predecessor, BB, Plan);
8137     if (!EdgeMask) // Mask of predecessor is all-one so mask of block is too.
8138       return BlockMaskCache[BB] = EdgeMask;
8139 
8140     if (!BlockMask) { // BlockMask has its initialized nullptr value.
8141       BlockMask = EdgeMask;
8142       continue;
8143     }
8144 
8145     BlockMask = Builder.createOr(BlockMask, EdgeMask, {});
8146   }
8147 
8148   return BlockMaskCache[BB] = BlockMask;
8149 }
8150 
8151 VPRecipeBase *VPRecipeBuilder::tryToWidenMemory(Instruction *I,
8152                                                 ArrayRef<VPValue *> Operands,
8153                                                 VFRange &Range,
8154                                                 VPlanPtr &Plan) {
8155   assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
8156          "Must be called with either a load or store");
8157 
8158   auto willWiden = [&](ElementCount VF) -> bool {
8159     LoopVectorizationCostModel::InstWidening Decision =
8160         CM.getWideningDecision(I, VF);
8161     assert(Decision != LoopVectorizationCostModel::CM_Unknown &&
8162            "CM decision should be taken at this point.");
8163     if (Decision == LoopVectorizationCostModel::CM_Interleave)
8164       return true;
8165     if (CM.isScalarAfterVectorization(I, VF) ||
8166         CM.isProfitableToScalarize(I, VF))
8167       return false;
8168     return Decision != LoopVectorizationCostModel::CM_Scalarize;
8169   };
8170 
8171   if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range))
8172     return nullptr;
8173 
8174   VPValue *Mask = nullptr;
8175   if (Legal->isMaskRequired(I))
8176     Mask = createBlockInMask(I->getParent(), Plan);
8177 
8178   // Determine if the pointer operand of the access is either consecutive or
8179   // reverse consecutive.
8180   LoopVectorizationCostModel::InstWidening Decision =
8181       CM.getWideningDecision(I, Range.Start);
8182   bool Reverse = Decision == LoopVectorizationCostModel::CM_Widen_Reverse;
8183   bool Consecutive =
8184       Reverse || Decision == LoopVectorizationCostModel::CM_Widen;
8185 
8186   if (LoadInst *Load = dyn_cast<LoadInst>(I))
8187     return new VPWidenMemoryInstructionRecipe(*Load, Operands[0], Mask,
8188                                               Consecutive, Reverse);
8189 
8190   StoreInst *Store = cast<StoreInst>(I);
8191   return new VPWidenMemoryInstructionRecipe(*Store, Operands[1], Operands[0],
8192                                             Mask, Consecutive, Reverse);
8193 }
8194 
8195 /// Creates a VPWidenIntOrFpInductionRecipe for \p Phi. If needed, it will also
8196 /// insert a recipe to expand the step for the induction recipe.
8197 static VPWidenIntOrFpInductionRecipe *createWidenInductionRecipes(
8198     PHINode *Phi, Instruction *PhiOrTrunc, VPValue *Start,
8199     const InductionDescriptor &IndDesc, LoopVectorizationCostModel &CM,
8200     VPlan &Plan, ScalarEvolution &SE, Loop &OrigLoop, VFRange &Range) {
8201   // Returns true if an instruction \p I should be scalarized instead of
8202   // vectorized for the chosen vectorization factor.
8203   auto ShouldScalarizeInstruction = [&CM](Instruction *I, ElementCount VF) {
8204     return CM.isScalarAfterVectorization(I, VF) ||
8205            CM.isProfitableToScalarize(I, VF);
8206   };
8207 
8208   bool NeedsScalarIVOnly = LoopVectorizationPlanner::getDecisionAndClampRange(
8209       [&](ElementCount VF) {
8210         return ShouldScalarizeInstruction(PhiOrTrunc, VF);
8211       },
8212       Range);
8213   assert(IndDesc.getStartValue() ==
8214          Phi->getIncomingValueForBlock(OrigLoop.getLoopPreheader()));
8215   assert(SE.isLoopInvariant(IndDesc.getStep(), &OrigLoop) &&
8216          "step must be loop invariant");
8217 
8218   VPValue *Step =
8219       vputils::getOrCreateVPValueForSCEVExpr(Plan, IndDesc.getStep(), SE);
8220   if (auto *TruncI = dyn_cast<TruncInst>(PhiOrTrunc)) {
8221     return new VPWidenIntOrFpInductionRecipe(Phi, Start, Step, IndDesc, TruncI,
8222                                              !NeedsScalarIVOnly);
8223   }
8224   assert(isa<PHINode>(PhiOrTrunc) && "must be a phi node here");
8225   return new VPWidenIntOrFpInductionRecipe(Phi, Start, Step, IndDesc,
8226                                            !NeedsScalarIVOnly);
8227 }
8228 
8229 VPRecipeBase *VPRecipeBuilder::tryToOptimizeInductionPHI(
8230     PHINode *Phi, ArrayRef<VPValue *> Operands, VPlan &Plan, VFRange &Range) {
8231 
8232   // Check if this is an integer or fp induction. If so, build the recipe that
8233   // produces its scalar and vector values.
8234   if (auto *II = Legal->getIntOrFpInductionDescriptor(Phi))
8235     return createWidenInductionRecipes(Phi, Phi, Operands[0], *II, CM, Plan,
8236                                        *PSE.getSE(), *OrigLoop, Range);
8237 
8238   // Check if this is pointer induction. If so, build the recipe for it.
8239   if (auto *II = Legal->getPointerInductionDescriptor(Phi)) {
8240     VPValue *Step = vputils::getOrCreateVPValueForSCEVExpr(Plan, II->getStep(),
8241                                                            *PSE.getSE());
8242     assert(isa<SCEVConstant>(II->getStep()));
8243     return new VPWidenPointerInductionRecipe(
8244         Phi, Operands[0], Step, *II,
8245         LoopVectorizationPlanner::getDecisionAndClampRange(
8246             [&](ElementCount VF) {
8247               return CM.isScalarAfterVectorization(Phi, VF);
8248             },
8249             Range));
8250   }
8251   return nullptr;
8252 }
8253 
8254 VPWidenIntOrFpInductionRecipe *VPRecipeBuilder::tryToOptimizeInductionTruncate(
8255     TruncInst *I, ArrayRef<VPValue *> Operands, VFRange &Range, VPlan &Plan) {
8256   // Optimize the special case where the source is a constant integer
8257   // induction variable. Notice that we can only optimize the 'trunc' case
8258   // because (a) FP conversions lose precision, (b) sext/zext may wrap, and
8259   // (c) other casts depend on pointer size.
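  // For example (illustrative): for "%t = trunc i64 %iv to i32" where %iv is
  // an integer induction, %t can be widened directly as an i32 induction
  // instead of widening the i64 induction and truncating every lane.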
8260 
8261   // Determine whether \p K is a truncation based on an induction variable that
8262   // can be optimized.
8263   auto isOptimizableIVTruncate =
8264       [&](Instruction *K) -> std::function<bool(ElementCount)> {
8265     return [=](ElementCount VF) -> bool {
8266       return CM.isOptimizableIVTruncate(K, VF);
8267     };
8268   };
8269 
8270   if (LoopVectorizationPlanner::getDecisionAndClampRange(
8271           isOptimizableIVTruncate(I), Range)) {
8272 
8273     auto *Phi = cast<PHINode>(I->getOperand(0));
8274     const InductionDescriptor &II = *Legal->getIntOrFpInductionDescriptor(Phi);
8275     VPValue *Start = Plan.getOrAddVPValue(II.getStartValue());
8276     return createWidenInductionRecipes(Phi, I, Start, II, CM, Plan,
8277                                        *PSE.getSE(), *OrigLoop, Range);
8278   }
8279   return nullptr;
8280 }
8281 
8282 VPRecipeOrVPValueTy VPRecipeBuilder::tryToBlend(PHINode *Phi,
8283                                                 ArrayRef<VPValue *> Operands,
8284                                                 VPlanPtr &Plan) {
8285   // If all incoming values are equal, the incoming VPValue can be used directly
8286   // instead of creating a new VPBlendRecipe.
8287   if (llvm::all_equal(Operands))
8288     return Operands[0];
8289 
8290   unsigned NumIncoming = Phi->getNumIncomingValues();
8291   // For in-loop reductions, we do not need to create an additional select.
8292   VPValue *InLoopVal = nullptr;
8293   for (unsigned In = 0; In < NumIncoming; In++) {
8294     PHINode *PhiOp =
8295         dyn_cast_or_null<PHINode>(Operands[In]->getUnderlyingValue());
8296     if (PhiOp && CM.isInLoopReduction(PhiOp)) {
8297       assert(!InLoopVal && "Found more than one in-loop reduction!");
8298       InLoopVal = Operands[In];
8299     }
8300   }
8301 
8302   assert((!InLoopVal || NumIncoming == 2) &&
8303          "Found an in-loop reduction for PHI with unexpected number of "
8304          "incoming values");
8305   if (InLoopVal)
8306     return Operands[Operands[0] == InLoopVal ? 1 : 0];
8307 
8308   // We know that all PHIs in non-header blocks are converted into selects, so
8309   // we don't have to worry about the insertion order and we can just use the
8310   // builder. At this point we generate the predication tree. There may be
8311   // duplications since this is a simple recursive scan, but future
8312   // optimizations will clean it up.
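  // Sketch of the result (illustrative): for a phi with incoming values V0 and
  // V1, the blend recipe carries (V0, EdgeMask0, V1, EdgeMask1) and is later
  // lowered to a chain of selects on the edge masks.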
8313   SmallVector<VPValue *, 2> OperandsWithMask;
8314 
8315   for (unsigned In = 0; In < NumIncoming; In++) {
8316     VPValue *EdgeMask =
8317       createEdgeMask(Phi->getIncomingBlock(In), Phi->getParent(), Plan);
8318     assert((EdgeMask || NumIncoming == 1) &&
8319            "Multiple predecessors with one having a full mask");
8320     OperandsWithMask.push_back(Operands[In]);
8321     if (EdgeMask)
8322       OperandsWithMask.push_back(EdgeMask);
8323   }
8324   return toVPRecipeResult(new VPBlendRecipe(Phi, OperandsWithMask));
8325 }
8326 
8327 VPWidenCallRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI,
8328                                                    ArrayRef<VPValue *> Operands,
8329                                                    VFRange &Range) const {
8330 
8331   bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
8332       [this, CI](ElementCount VF) {
8333         return CM.isScalarWithPredication(CI, VF);
8334       },
8335       Range);
8336 
8337   if (IsPredicated)
8338     return nullptr;
8339 
8340   Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
8341   if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end ||
8342              ID == Intrinsic::lifetime_start || ID == Intrinsic::sideeffect ||
8343              ID == Intrinsic::pseudoprobe ||
8344              ID == Intrinsic::experimental_noalias_scope_decl))
8345     return nullptr;
8346 
8347   ArrayRef<VPValue *> Ops = Operands.take_front(CI->arg_size());
8348 
8349   // Is it beneficial to perform intrinsic call compared to lib call?
8350   bool ShouldUseVectorIntrinsic =
8351       ID && LoopVectorizationPlanner::getDecisionAndClampRange(
8352                 [&](ElementCount VF) -> bool {
8353                   bool NeedToScalarize = false;
8354                   // Is it beneficial to perform intrinsic call compared to lib
8355                   // call?
8356                   InstructionCost CallCost =
8357                       CM.getVectorCallCost(CI, VF, NeedToScalarize);
8358                   InstructionCost IntrinsicCost =
8359                       CM.getVectorIntrinsicCost(CI, VF);
8360                   return IntrinsicCost <= CallCost;
8361                 },
8362                 Range);
8363   if (ShouldUseVectorIntrinsic)
8364     return new VPWidenCallRecipe(*CI, make_range(Ops.begin(), Ops.end()), ID);
8365 
8366   // Is it better to call a vectorized version of the function than to
8367   // scalarize the call?
8368   auto ShouldUseVectorCall = LoopVectorizationPlanner::getDecisionAndClampRange(
8369       [&](ElementCount VF) -> bool {
8370         // The following case may be scalarized depending on the VF.
8371         // The flag shows whether we can use a usual Call for vectorized
8372         // version of the instruction.
8373         bool NeedToScalarize = false;
8374         CM.getVectorCallCost(CI, VF, NeedToScalarize);
8375         return !NeedToScalarize;
8376       },
8377       Range);
8378   if (ShouldUseVectorCall)
8379     return new VPWidenCallRecipe(*CI, make_range(Ops.begin(), Ops.end()),
8380                                  Intrinsic::not_intrinsic);
8381 
8382   return nullptr;
8383 }
8384 
8385 bool VPRecipeBuilder::shouldWiden(Instruction *I, VFRange &Range) const {
8386   assert(!isa<BranchInst>(I) && !isa<PHINode>(I) && !isa<LoadInst>(I) &&
8387          !isa<StoreInst>(I) && "Instruction should have been handled earlier");
8388   // Instruction should be widened, unless it is scalar after vectorization,
8389   // scalarization is profitable or it is predicated.
8390   auto WillScalarize = [this, I](ElementCount VF) -> bool {
8391     return CM.isScalarAfterVectorization(I, VF) ||
8392            CM.isProfitableToScalarize(I, VF) ||
8393            CM.isScalarWithPredication(I, VF);
8394   };
8395   return !LoopVectorizationPlanner::getDecisionAndClampRange(WillScalarize,
8396                                                              Range);
8397 }
8398 
8399 VPRecipeBase *VPRecipeBuilder::tryToWiden(Instruction *I,
8400                                           ArrayRef<VPValue *> Operands,
8401                                           VPBasicBlock *VPBB, VPlanPtr &Plan) {
8402   switch (I->getOpcode()) {
8403   default:
8404     return nullptr;
8405   case Instruction::SDiv:
8406   case Instruction::UDiv:
8407   case Instruction::SRem:
8408   case Instruction::URem: {
8409     // If not provably safe, use a select to form a safe divisor before widening the
8410     // div/rem operation itself.  Otherwise fall through to general handling below.
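    // For example (illustrative): a predicated "udiv %a, %b" becomes
    //   %safe.b = select <block-in-mask>, %b, 1
    //   %res    = udiv %a, %safe.b
    // so masked-off lanes divide by 1 instead of a potentially zero %b.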
8411     if (CM.isPredicatedInst(I)) {
8412       SmallVector<VPValue *> Ops(Operands.begin(), Operands.end());
8413       VPValue *Mask = createBlockInMask(I->getParent(), Plan);
8414       VPValue *One =
8415         Plan->getOrAddExternalDef(ConstantInt::get(I->getType(), 1u, false));
8416       auto *SafeRHS =
8417          new VPInstruction(Instruction::Select, {Mask, Ops[1], One},
8418                            I->getDebugLoc());
8419       VPBB->appendRecipe(SafeRHS);
8420       Ops[1] = SafeRHS;
8421       return new VPWidenRecipe(*I, make_range(Ops.begin(), Ops.end()));
8422     }
8423     LLVM_FALLTHROUGH;
8424   }
8425   case Instruction::Add:
8426   case Instruction::And:
8427   case Instruction::AShr:
8428   case Instruction::BitCast:
8429   case Instruction::FAdd:
8430   case Instruction::FCmp:
8431   case Instruction::FDiv:
8432   case Instruction::FMul:
8433   case Instruction::FNeg:
8434   case Instruction::FPExt:
8435   case Instruction::FPToSI:
8436   case Instruction::FPToUI:
8437   case Instruction::FPTrunc:
8438   case Instruction::FRem:
8439   case Instruction::FSub:
8440   case Instruction::ICmp:
8441   case Instruction::IntToPtr:
8442   case Instruction::LShr:
8443   case Instruction::Mul:
8444   case Instruction::Or:
8445   case Instruction::PtrToInt:
8446   case Instruction::Select:
8447   case Instruction::SExt:
8448   case Instruction::Shl:
8449   case Instruction::SIToFP:
8450   case Instruction::Sub:
8451   case Instruction::Trunc:
8452   case Instruction::UIToFP:
8453   case Instruction::Xor:
8454   case Instruction::ZExt:
8455   case Instruction::Freeze:
8456     return new VPWidenRecipe(*I, make_range(Operands.begin(), Operands.end()));
8457   };
8458 }
8459 
8460 void VPRecipeBuilder::fixHeaderPhis() {
8461   BasicBlock *OrigLatch = OrigLoop->getLoopLatch();
8462   for (VPHeaderPHIRecipe *R : PhisToFix) {
8463     auto *PN = cast<PHINode>(R->getUnderlyingValue());
8464     VPRecipeBase *IncR =
8465         getRecipe(cast<Instruction>(PN->getIncomingValueForBlock(OrigLatch)));
8466     R->addOperand(IncR->getVPSingleValue());
8467   }
8468 }
8469 
8470 VPBasicBlock *VPRecipeBuilder::handleReplication(
8471     Instruction *I, VFRange &Range, VPBasicBlock *VPBB,
8472     VPlanPtr &Plan) {
8473   bool IsUniform = LoopVectorizationPlanner::getDecisionAndClampRange(
8474       [&](ElementCount VF) { return CM.isUniformAfterVectorization(I, VF); },
8475       Range);
8476 
8477   bool IsPredicated = CM.isPredicatedInst(I);
8478 
8479   // Even if the instruction is not marked as uniform, there are certain
8480   // intrinsic calls that can be effectively treated as such, so we check for
8481   // them here. Conservatively, we only do this for scalable vectors, since
8482   // for fixed-width VFs we can always fall back on full scalarization.
8483   if (!IsUniform && Range.Start.isScalable() && isa<IntrinsicInst>(I)) {
8484     switch (cast<IntrinsicInst>(I)->getIntrinsicID()) {
8485     case Intrinsic::assume:
8486     case Intrinsic::lifetime_start:
8487     case Intrinsic::lifetime_end:
8488       // For scalable vectors, if one of the operands is variant then we still
8489       // want to mark the call as uniform, which will generate one instruction
8490       // for just the first lane of the vector. We can't scalarize the call in
8491       // the same way as for fixed-width vectors because we don't know how many
8492       // lanes there are.
8493       //
8494       // The reasons for doing it this way for scalable vectors are:
8495       //   1. For the assume intrinsic, generating the instruction for the first
8496       //      lane is still better than not generating any at all. For
8497       //      example, the input may be a splat across all lanes.
8498       //   2. For the lifetime start/end intrinsics the pointer operand only
8499       //      does anything useful when the input comes from a stack object,
8500       //      which suggests it should always be uniform. For non-stack objects
8501       //      the effect is to poison the object, which still allows us to
8502       //      remove the call.
8503       IsUniform = true;
8504       break;
8505     default:
8506       break;
8507     }
8508   }
8509 
8510   auto *Recipe = new VPReplicateRecipe(I, Plan->mapToVPValues(I->operands()),
8511                                        IsUniform, IsPredicated);
8512 
8513   // Find if I uses a predicated instruction. If so, it will use its scalar
8514   // value. Avoid hoisting the insert-element which packs the scalar value into
8515   // a vector value, as that happens iff all users use the vector value.
8516   for (VPValue *Op : Recipe->operands()) {
8517     auto *PredR =
8518         dyn_cast_or_null<VPPredInstPHIRecipe>(Op->getDefiningRecipe());
8519     if (!PredR)
8520       continue;
8521     auto *RepR = cast<VPReplicateRecipe>(
8522         PredR->getOperand(0)->getDefiningRecipe());
8523     assert(RepR->isPredicated() &&
8524            "expected Replicate recipe to be predicated");
8525     RepR->setAlsoPack(false);
8526   }
8527 
8528   // Finalize the recipe for Instr, handling the non-predicated case first.
8529   if (!IsPredicated) {
8530     LLVM_DEBUG(dbgs() << "LV: Scalarizing:" << *I << "\n");
8531     setRecipe(I, Recipe);
8532     Plan->addVPValue(I, Recipe);
8533     VPBB->appendRecipe(Recipe);
8534     return VPBB;
8535   }
8536   LLVM_DEBUG(dbgs() << "LV: Scalarizing and predicating:" << *I << "\n");
8537 
8538   VPBlockBase *SingleSucc = VPBB->getSingleSuccessor();
8539   assert(SingleSucc && "VPBB must have a single successor when handling "
8540                        "predicated replication.");
8541   VPBlockUtils::disconnectBlocks(VPBB, SingleSucc);
8542   // Record predicated instructions for above packing optimizations.
8543   VPBlockBase *Region = createReplicateRegion(Recipe, Plan);
8544   VPBlockUtils::insertBlockAfter(Region, VPBB);
8545   auto *RegSucc = new VPBasicBlock();
8546   VPBlockUtils::insertBlockAfter(RegSucc, Region);
8547   VPBlockUtils::connectBlocks(RegSucc, SingleSucc);
8548   return RegSucc;
8549 }
8550 
8551 VPRegionBlock *
8552 VPRecipeBuilder::createReplicateRegion(VPReplicateRecipe *PredRecipe,
8553                                        VPlanPtr &Plan) {
8554   Instruction *Instr = PredRecipe->getUnderlyingInstr();
8555   // Instructions marked for predication are replicated and placed under an
8556   // if-then construct to prevent side-effects.
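  // Resulting region shape (illustrative):
  //   pred.<opcode>.entry:    BRANCH-ON-MASK on the block-in mask
  //   pred.<opcode>.if:       the replicated, predicated recipe
  //   pred.<opcode>.continue: phi merging the predicated result, if any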
8557   // Generate recipes to compute the block mask for this region.
8558   VPValue *BlockInMask = createBlockInMask(Instr->getParent(), Plan);
8559 
8560   // Build the triangular if-then region.
8561   std::string RegionName = (Twine("pred.") + Instr->getOpcodeName()).str();
8562   assert(Instr->getParent() && "Predicated instruction not in any basic block");
8563   auto *BOMRecipe = new VPBranchOnMaskRecipe(BlockInMask);
8564   auto *Entry = new VPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe);
8565   auto *PHIRecipe = Instr->getType()->isVoidTy()
8566                         ? nullptr
8567                         : new VPPredInstPHIRecipe(PredRecipe);
8568   if (PHIRecipe) {
8569     setRecipe(Instr, PHIRecipe);
8570     Plan->addVPValue(Instr, PHIRecipe);
8571   } else {
8572     setRecipe(Instr, PredRecipe);
8573     Plan->addVPValue(Instr, PredRecipe);
8574   }
8575 
8576   auto *Exiting = new VPBasicBlock(Twine(RegionName) + ".continue", PHIRecipe);
8577   auto *Pred = new VPBasicBlock(Twine(RegionName) + ".if", PredRecipe);
8578   VPRegionBlock *Region = new VPRegionBlock(Entry, Exiting, RegionName, true);
8579 
8580   // Note: first set Entry as region entry and then connect successors starting
8581   // from it in order, to propagate the "parent" of each VPBasicBlock.
8582   VPBlockUtils::insertTwoBlocksAfter(Pred, Exiting, Entry);
8583   VPBlockUtils::connectBlocks(Pred, Exiting);
8584 
8585   return Region;
8586 }
8587 
8588 VPRecipeOrVPValueTy
8589 VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr,
8590                                         ArrayRef<VPValue *> Operands,
8591                                         VFRange &Range, VPBasicBlock *VPBB,
8592                                         VPlanPtr &Plan) {
8593   // First, check for specific widening recipes that deal with inductions, Phi
8594   // nodes, calls and memory operations.
8595   VPRecipeBase *Recipe;
8596   if (auto Phi = dyn_cast<PHINode>(Instr)) {
8597     if (Phi->getParent() != OrigLoop->getHeader())
8598       return tryToBlend(Phi, Operands, Plan);
8599 
8600     // Always record recipes for header phis. Later first-order recurrence phis
8601     // can have earlier phis as incoming values.
8602     recordRecipeOf(Phi);
8603 
8604     if ((Recipe = tryToOptimizeInductionPHI(Phi, Operands, *Plan, Range)))
8605       return toVPRecipeResult(Recipe);
8606 
8607     VPHeaderPHIRecipe *PhiRecipe = nullptr;
8608     assert((Legal->isReductionVariable(Phi) ||
8609             Legal->isFixedOrderRecurrence(Phi)) &&
8610            "can only widen reductions and fixed-order recurrences here");
8611     VPValue *StartV = Operands[0];
8612     if (Legal->isReductionVariable(Phi)) {
8613       const RecurrenceDescriptor &RdxDesc =
8614           Legal->getReductionVars().find(Phi)->second;
8615       assert(RdxDesc.getRecurrenceStartValue() ==
8616              Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader()));
8617       PhiRecipe = new VPReductionPHIRecipe(Phi, RdxDesc, *StartV,
8618                                            CM.isInLoopReduction(Phi),
8619                                            CM.useOrderedReductions(RdxDesc));
8620     } else {
8621       // TODO: Currently fixed-order recurrences are modeled as chains of
8622       // first-order recurrences. If there are no users of the intermediate
8623       // recurrences in the chain, the fixed order recurrence should be modeled
8624       // directly, enabling more efficient codegen.
8625       PhiRecipe = new VPFirstOrderRecurrencePHIRecipe(Phi, *StartV);
8626     }
8627 
8628     // Record the incoming value from the backedge, so we can add the incoming
8629     // value from the backedge after all recipes have been created.
8630     auto *Inc = cast<Instruction>(
8631         Phi->getIncomingValueForBlock(OrigLoop->getLoopLatch()));
8632     auto RecipeIter = Ingredient2Recipe.find(Inc);
8633     if (RecipeIter == Ingredient2Recipe.end())
8634       recordRecipeOf(Inc);
8635 
8636     PhisToFix.push_back(PhiRecipe);
8637     return toVPRecipeResult(PhiRecipe);
8638   }
8639 
8640   if (isa<TruncInst>(Instr) &&
8641       (Recipe = tryToOptimizeInductionTruncate(cast<TruncInst>(Instr), Operands,
8642                                                Range, *Plan)))
8643     return toVPRecipeResult(Recipe);
8644 
8645   // All widen recipes below deal only with VF > 1.
8646   if (LoopVectorizationPlanner::getDecisionAndClampRange(
8647           [&](ElementCount VF) { return VF.isScalar(); }, Range))
8648     return nullptr;
8649 
8650   if (auto *CI = dyn_cast<CallInst>(Instr))
8651     return toVPRecipeResult(tryToWidenCall(CI, Operands, Range));
8652 
8653   if (isa<LoadInst>(Instr) || isa<StoreInst>(Instr))
8654     return toVPRecipeResult(tryToWidenMemory(Instr, Operands, Range, Plan));
8655 
8656   if (!shouldWiden(Instr, Range))
8657     return nullptr;
8658 
8659   if (auto GEP = dyn_cast<GetElementPtrInst>(Instr))
8660     return toVPRecipeResult(new VPWidenGEPRecipe(
8661         GEP, make_range(Operands.begin(), Operands.end()), OrigLoop));
8662 
8663   if (auto *SI = dyn_cast<SelectInst>(Instr)) {
8664     bool InvariantCond =
8665         PSE.getSE()->isLoopInvariant(PSE.getSCEV(SI->getOperand(0)), OrigLoop);
8666     return toVPRecipeResult(new VPWidenSelectRecipe(
8667         *SI, make_range(Operands.begin(), Operands.end()), InvariantCond));
8668   }
8669 
8670   return toVPRecipeResult(tryToWiden(Instr, Operands, VPBB, Plan));
8671 }
8672 
8673 void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
8674                                                         ElementCount MaxVF) {
8675   assert(OrigLoop->isInnermost() && "Inner loop expected.");
8676 
8677   // Add assume instructions we need to drop to DeadInstructions, to prevent
8678   // them from being added to the VPlan.
8679   // TODO: We only need to drop assumes in blocks that get flattened. If the
8680   // control flow is preserved, we should keep them.
8681   SmallPtrSet<Instruction *, 4> DeadInstructions;
8682   auto &ConditionalAssumes = Legal->getConditionalAssumes();
8683   DeadInstructions.insert(ConditionalAssumes.begin(), ConditionalAssumes.end());
8684 
8685   MapVector<Instruction *, Instruction *> &SinkAfter = Legal->getSinkAfter();
8686   // Dead instructions do not need sinking. Remove them from SinkAfter.
8687   for (Instruction *I : DeadInstructions)
8688     SinkAfter.erase(I);
8689 
8690   // Cannot sink instructions after dead instructions (there won't be any
8691   // recipes for them). Instead, find the first non-dead previous instruction.
8692   for (auto &P : Legal->getSinkAfter()) {
8693     Instruction *SinkTarget = P.second;
8694     Instruction *FirstInst = &*SinkTarget->getParent()->begin();
8695     (void)FirstInst;
8696     while (DeadInstructions.contains(SinkTarget)) {
8697       assert(
8698           SinkTarget != FirstInst &&
8699           "Must find a live instruction (at least the one feeding the "
8700           "fixed-order recurrence PHI) before reaching beginning of the block");
8701       SinkTarget = SinkTarget->getPrevNode();
8702       assert(SinkTarget != P.first &&
8703              "sink source equals target, no sinking required");
8704     }
8705     P.second = SinkTarget;
8706   }
8707 
8708   auto MaxVFPlusOne = MaxVF.getWithIncrement(1);
8709   for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFPlusOne);) {
8710     VFRange SubRange = {VF, MaxVFPlusOne};
8711     VPlans.push_back(
8712         buildVPlanWithVPRecipes(SubRange, DeadInstructions, SinkAfter));
8713     VF = SubRange.End;
8714   }
8715 }
8716 
8717 // Add the necessary canonical IV and branch recipes required to control the
8718 // loop.
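// Sketch of the emitted control recipes in the default (BranchOnCount) case
// (illustrative): the header gets a canonical IV phi starting at 0, the latch
// increments it by VF * UF and branches on the count against the vector trip
// count.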
8719 static void addCanonicalIVRecipes(VPlan &Plan, Type *IdxTy, DebugLoc DL,
8720                                   bool HasNUW,
8721                                   bool UseLaneMaskForLoopControlFlow) {
8722   Value *StartIdx = ConstantInt::get(IdxTy, 0);
8723   auto *StartV = Plan.getOrAddVPValue(StartIdx);
8724 
8725   // Add a VPCanonicalIVPHIRecipe starting at 0 to the header.
8726   auto *CanonicalIVPHI = new VPCanonicalIVPHIRecipe(StartV, DL);
8727   VPRegionBlock *TopRegion = Plan.getVectorLoopRegion();
8728   VPBasicBlock *Header = TopRegion->getEntryBasicBlock();
8729   Header->insert(CanonicalIVPHI, Header->begin());
8730 
8731   // Add a CanonicalIVIncrement{NUW} VPInstruction to increment the scalar
8732   // IV by VF * UF.
8733   auto *CanonicalIVIncrement =
8734       new VPInstruction(HasNUW ? VPInstruction::CanonicalIVIncrementNUW
8735                                : VPInstruction::CanonicalIVIncrement,
8736                         {CanonicalIVPHI}, DL, "index.next");
8737   CanonicalIVPHI->addOperand(CanonicalIVIncrement);
8738 
8739   VPBasicBlock *EB = TopRegion->getExitingBasicBlock();
8740   EB->appendRecipe(CanonicalIVIncrement);
8741 
8742   if (UseLaneMaskForLoopControlFlow) {
8743     // Create the active lane mask instruction in the vplan preheader.
8744     VPBasicBlock *Preheader = Plan.getEntry()->getEntryBasicBlock();
8745 
8746     // We can't use StartV directly in the ActiveLaneMask VPInstruction, since
8747     // we have to take unrolling into account. Each part needs to start at
8748     //   Part * VF
8749     auto *CanonicalIVIncrementParts =
8750         new VPInstruction(HasNUW ? VPInstruction::CanonicalIVIncrementForPartNUW
8751                                  : VPInstruction::CanonicalIVIncrementForPart,
8752                           {StartV}, DL, "index.part.next");
8753     Preheader->appendRecipe(CanonicalIVIncrementParts);
8754 
8755     // Create the ActiveLaneMask instruction using the correct start values.
8756     VPValue *TC = Plan.getOrCreateTripCount();
8757     auto *EntryALM = new VPInstruction(VPInstruction::ActiveLaneMask,
8758                                        {CanonicalIVIncrementParts, TC}, DL,
8759                                        "active.lane.mask.entry");
8760     Preheader->appendRecipe(EntryALM);
8761 
8762     // Now create the ActiveLaneMaskPhi recipe in the main loop using the
8763     // preheader ActiveLaneMask instruction.
8764     auto *LaneMaskPhi = new VPActiveLaneMaskPHIRecipe(EntryALM, DebugLoc());
8765     Header->insert(LaneMaskPhi, Header->getFirstNonPhi());
8766 
8767     // Create the active lane mask for the next iteration of the loop.
8768     CanonicalIVIncrementParts =
8769         new VPInstruction(HasNUW ? VPInstruction::CanonicalIVIncrementForPartNUW
8770                                  : VPInstruction::CanonicalIVIncrementForPart,
8771                           {CanonicalIVIncrement}, DL);
8772     EB->appendRecipe(CanonicalIVIncrementParts);
8773 
8774     auto *ALM = new VPInstruction(VPInstruction::ActiveLaneMask,
8775                                   {CanonicalIVIncrementParts, TC}, DL,
8776                                   "active.lane.mask.next");
8777     EB->appendRecipe(ALM);
8778     LaneMaskPhi->addOperand(ALM);
8779 
8780     // We have to invert the mask here because a true condition means jumping
8781     // to the exit block.
8782     auto *NotMask = new VPInstruction(VPInstruction::Not, ALM, DL);
8783     EB->appendRecipe(NotMask);
8784 
8785     VPInstruction *BranchBack =
8786         new VPInstruction(VPInstruction::BranchOnCond, {NotMask}, DL);
8787     EB->appendRecipe(BranchBack);
8788   } else {
8789     // Add the BranchOnCount VPInstruction to the latch.
8790     VPInstruction *BranchBack = new VPInstruction(
8791         VPInstruction::BranchOnCount,
8792         {CanonicalIVIncrement, &Plan.getVectorTripCount()}, DL);
8793     EB->appendRecipe(BranchBack);
8794   }
8795 }
8796 
8797 // Add exit values to \p Plan. VPLiveOuts are added for each LCSSA phi in the
8798 // original exit block.
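// For example (illustrative): for an LCSSA phi
//   %res.lcssa = phi i32 [ %res, %loop.exiting ]
// a VPLiveOut is added that maps %res.lcssa to the VPValue modelling %res.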
8799 static void addUsersInExitBlock(VPBasicBlock *HeaderVPBB,
8800                                 VPBasicBlock *MiddleVPBB, Loop *OrigLoop,
8801                                 VPlan &Plan) {
8802   BasicBlock *ExitBB = OrigLoop->getUniqueExitBlock();
8803   BasicBlock *ExitingBB = OrigLoop->getExitingBlock();
8804   // Only handle single-exit loops with unique exit blocks for now.
8805   if (!ExitBB || !ExitBB->getSinglePredecessor() || !ExitingBB)
8806     return;
8807 
8808   // Introduce VPUsers modeling the exit values.
8809   for (PHINode &ExitPhi : ExitBB->phis()) {
8810     Value *IncomingValue =
8811         ExitPhi.getIncomingValueForBlock(ExitingBB);
8812     VPValue *V = Plan.getOrAddVPValue(IncomingValue, true);
8813     Plan.addLiveOut(&ExitPhi, V);
8814   }
8815 }
8816 
8817 VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes(
8818     VFRange &Range, SmallPtrSetImpl<Instruction *> &DeadInstructions,
8819     const MapVector<Instruction *, Instruction *> &SinkAfter) {
8820 
8821   SmallPtrSet<const InterleaveGroup<Instruction> *, 1> InterleaveGroups;
8822 
8823   VPRecipeBuilder RecipeBuilder(OrigLoop, TLI, Legal, CM, PSE, Builder);
8824 
8825   // ---------------------------------------------------------------------------
8826   // Pre-construction: record ingredients whose recipes we'll need to further
8827   // process after constructing the initial VPlan.
8828   // ---------------------------------------------------------------------------
8829 
8830   // Mark instructions we'll need to sink later and their targets as
8831   // ingredients whose recipe we'll need to record.
8832   for (const auto &Entry : SinkAfter) {
8833     RecipeBuilder.recordRecipeOf(Entry.first);
8834     RecipeBuilder.recordRecipeOf(Entry.second);
8835   }
8836   for (const auto &Reduction : CM.getInLoopReductionChains()) {
8837     PHINode *Phi = Reduction.first;
8838     RecurKind Kind =
8839         Legal->getReductionVars().find(Phi)->second.getRecurrenceKind();
8840     const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second;
8841 
8842     RecipeBuilder.recordRecipeOf(Phi);
8843     for (const auto &R : ReductionOperations) {
8844       RecipeBuilder.recordRecipeOf(R);
8845       // For min/max reductions, where we have a pair of icmp/select, we also
8846       // need to record the ICmp recipe, so it can be removed later.
8847       assert(!RecurrenceDescriptor::isSelectCmpRecurrenceKind(Kind) &&
8848              "Only min/max recurrences allowed for inloop reductions");
8849       if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind))
8850         RecipeBuilder.recordRecipeOf(cast<Instruction>(R->getOperand(0)));
8851     }
8852   }
8853 
8854   // For each interleave group which is relevant for this (possibly trimmed)
8855   // Range, add it to the set of groups to be later applied to the VPlan and add
8856   // placeholders for its members' Recipes which we'll be replacing with a
8857   // single VPInterleaveRecipe.
8858   for (InterleaveGroup<Instruction> *IG : IAI.getInterleaveGroups()) {
8859     auto applyIG = [IG, this](ElementCount VF) -> bool {
8860       return (VF.isVector() && // Query is illegal for VF == 1
8861               CM.getWideningDecision(IG->getInsertPos(), VF) ==
8862                   LoopVectorizationCostModel::CM_Interleave);
8863     };
8864     if (!getDecisionAndClampRange(applyIG, Range))
8865       continue;
8866     InterleaveGroups.insert(IG);
8867     for (unsigned i = 0; i < IG->getFactor(); i++)
8868       if (Instruction *Member = IG->getMember(i))
8869         RecipeBuilder.recordRecipeOf(Member);
8870   }
8871 
8872   // ---------------------------------------------------------------------------
8873   // Build initial VPlan: Scan the body of the loop in a topological order to
8874   // visit each basic block after having visited its predecessor basic blocks.
8875   // ---------------------------------------------------------------------------
8876 
8877   // Create initial VPlan skeleton, starting with a block for the pre-header,
8878   // followed by a region for the vector loop, followed by the middle block. The
8879   // skeleton vector loop region contains a header and latch block.
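  // Skeleton shape (illustrative):
  //   vector.ph -> [ vector loop: vector.body -> vector.latch ] -> middle.block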
8880   VPBasicBlock *Preheader = new VPBasicBlock("vector.ph");
8881   auto Plan = std::make_unique<VPlan>(Preheader);
8882 
8883   VPBasicBlock *HeaderVPBB = new VPBasicBlock("vector.body");
8884   VPBasicBlock *LatchVPBB = new VPBasicBlock("vector.latch");
8885   VPBlockUtils::insertBlockAfter(LatchVPBB, HeaderVPBB);
8886   auto *TopRegion = new VPRegionBlock(HeaderVPBB, LatchVPBB, "vector loop");
8887   VPBlockUtils::insertBlockAfter(TopRegion, Preheader);
8888   VPBasicBlock *MiddleVPBB = new VPBasicBlock("middle.block");
8889   VPBlockUtils::insertBlockAfter(MiddleVPBB, TopRegion);
8890 
8891   Instruction *DLInst =
8892       getDebugLocFromInstOrOperands(Legal->getPrimaryInduction());
8893   addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(),
8894                         DLInst ? DLInst->getDebugLoc() : DebugLoc(),
8895                         !CM.foldTailByMasking(),
8896                         CM.useActiveLaneMaskForControlFlow());
8897 
8898   // Scan the body of the loop in a topological order to visit each basic block
8899   // after having visited its predecessor basic blocks.
8900   LoopBlocksDFS DFS(OrigLoop);
8901   DFS.perform(LI);
8902 
8903   VPBasicBlock *VPBB = HeaderVPBB;
8904   SmallVector<VPWidenIntOrFpInductionRecipe *> InductionsToMove;
8905   for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
8906     // Relevant instructions from basic block BB will be grouped into VPRecipe
8907     // ingredients and fill a new VPBasicBlock.
8908     unsigned VPBBsForBB = 0;
8909     if (VPBB != HeaderVPBB)
8910       VPBB->setName(BB->getName());
8911     Builder.setInsertPoint(VPBB);
8912 
8913     // Introduce each ingredient into VPlan.
8914     // TODO: Model and preserve debug intrinsics in VPlan.
8915     for (Instruction &I : BB->instructionsWithoutDebug()) {
8916       Instruction *Instr = &I;
8917 
8918       // First filter out irrelevant instructions, to ensure no recipes are
8919       // built for them.
8920       if (isa<BranchInst>(Instr) || DeadInstructions.count(Instr))
8921         continue;
8922 
8923       SmallVector<VPValue *, 4> Operands;
8924       auto *Phi = dyn_cast<PHINode>(Instr);
8925       if (Phi && Phi->getParent() == OrigLoop->getHeader()) {
8926         Operands.push_back(Plan->getOrAddVPValue(
8927             Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader())));
8928       } else {
8929         auto OpRange = Plan->mapToVPValues(Instr->operands());
8930         Operands = {OpRange.begin(), OpRange.end()};
8931       }
8932 
8933       // Invariant stores inside the loop will be deleted and a single store
8934       // with the final reduction value will be added to the exit block.
8935       StoreInst *SI;
8936       if ((SI = dyn_cast<StoreInst>(&I)) &&
8937           Legal->isInvariantAddressOfReduction(SI->getPointerOperand()))
8938         continue;
8939 
8940       if (auto RecipeOrValue = RecipeBuilder.tryToCreateWidenRecipe(
8941               Instr, Operands, Range, VPBB, Plan)) {
8942         // If Instr can be simplified to an existing VPValue, use it.
8943         if (RecipeOrValue.is<VPValue *>()) {
8944           auto *VPV = RecipeOrValue.get<VPValue *>();
8945           Plan->addVPValue(Instr, VPV);
8946           // If the re-used value is a recipe, register the recipe for the
8947           // instruction, in case the recipe for Instr needs to be recorded.
8948           if (VPRecipeBase *R = VPV->getDefiningRecipe())
8949             RecipeBuilder.setRecipe(Instr, R);
8950           continue;
8951         }
8952         // Otherwise, add the new recipe.
8953         VPRecipeBase *Recipe = RecipeOrValue.get<VPRecipeBase *>();
8954         for (auto *Def : Recipe->definedValues()) {
8955           auto *UV = Def->getUnderlyingValue();
8956           Plan->addVPValue(UV, Def);
8957         }
8958 
8959         if (isa<VPWidenIntOrFpInductionRecipe>(Recipe) &&
8960             HeaderVPBB->getFirstNonPhi() != VPBB->end()) {
8961           // Keep track of VPWidenIntOrFpInductionRecipes not in the phi section
8962           // of the header block. That can happen for truncates of induction
8963           // variables. Those recipes are moved to the phi section of the header
8964           // block after applying SinkAfter, which relies on the original
8965           // position of the trunc.
8966           assert(isa<TruncInst>(Instr));
8967           InductionsToMove.push_back(
8968               cast<VPWidenIntOrFpInductionRecipe>(Recipe));
8969         }
8970         RecipeBuilder.setRecipe(Instr, Recipe);
8971         VPBB->appendRecipe(Recipe);
8972         continue;
8973       }
8974 
8975       // Otherwise, if all widening options failed, the instruction is to be
8976       // replicated. This may create a successor for VPBB.
8977       VPBasicBlock *NextVPBB =
8978           RecipeBuilder.handleReplication(Instr, Range, VPBB, Plan);
8979       if (NextVPBB != VPBB) {
8980         VPBB = NextVPBB;
8981         VPBB->setName(BB->hasName() ? BB->getName() + "." + Twine(VPBBsForBB++)
8982                                     : "");
8983       }
8984     }
8985 
8986     VPBlockUtils::insertBlockAfter(new VPBasicBlock(), VPBB);
8987     VPBB = cast<VPBasicBlock>(VPBB->getSingleSuccessor());
8988   }
8989 
8990   // After here, VPBB should not be used.
8991   VPBB = nullptr;
8992 
8993   addUsersInExitBlock(HeaderVPBB, MiddleVPBB, OrigLoop, *Plan);
8994 
8995   assert(isa<VPRegionBlock>(Plan->getVectorLoopRegion()) &&
8996          !Plan->getVectorLoopRegion()->getEntryBasicBlock()->empty() &&
8997          "entry block must be set to a VPRegionBlock having a non-empty entry "
8998          "VPBasicBlock");
8999   RecipeBuilder.fixHeaderPhis();
9000 
9001   // ---------------------------------------------------------------------------
9002   // Transform initial VPlan: Apply previously taken decisions, in order, to
9003   // bring the VPlan to its final state.
9004   // ---------------------------------------------------------------------------
9005 
9006   // Apply Sink-After legal constraints.
9007   auto GetReplicateRegion = [](VPRecipeBase *R) -> VPRegionBlock * {
9008     auto *Region = dyn_cast_or_null<VPRegionBlock>(R->getParent()->getParent());
9009     if (Region && Region->isReplicator()) {
9010       assert(Region->getNumSuccessors() == 1 &&
9011              Region->getNumPredecessors() == 1 && "Expected SESE region!");
9012       assert(R->getParent()->size() == 1 &&
9013              "A recipe in an original replicator region must be the only "
9014              "recipe in its block");
9015       return Region;
9016     }
9017     return nullptr;
9018   };
9019   for (const auto &Entry : SinkAfter) {
9020     VPRecipeBase *Sink = RecipeBuilder.getRecipe(Entry.first);
9021     VPRecipeBase *Target = RecipeBuilder.getRecipe(Entry.second);
9022 
9023     auto *TargetRegion = GetReplicateRegion(Target);
9024     auto *SinkRegion = GetReplicateRegion(Sink);
9025     if (!SinkRegion) {
9026       // If the sink source is not a replicate region, sink the recipe directly.
9027       if (TargetRegion) {
9028         // The target is in a replication region, make sure to move Sink to
9029         // the block after it, not into the replication region itself.
9030         VPBasicBlock *NextBlock =
9031             cast<VPBasicBlock>(TargetRegion->getSuccessors().front());
9032         Sink->moveBefore(*NextBlock, NextBlock->getFirstNonPhi());
9033       } else
9034         Sink->moveAfter(Target);
9035       continue;
9036     }
9037 
9038     // The sink source is in a replicate region. Unhook the region from the CFG.
9039     auto *SinkPred = SinkRegion->getSinglePredecessor();
9040     auto *SinkSucc = SinkRegion->getSingleSuccessor();
9041     VPBlockUtils::disconnectBlocks(SinkPred, SinkRegion);
9042     VPBlockUtils::disconnectBlocks(SinkRegion, SinkSucc);
9043     VPBlockUtils::connectBlocks(SinkPred, SinkSucc);
9044 
9045     if (TargetRegion) {
9046       // The target recipe is also in a replicate region, move the sink region
9047       // after the target region.
9048       auto *TargetSucc = TargetRegion->getSingleSuccessor();
9049       VPBlockUtils::disconnectBlocks(TargetRegion, TargetSucc);
9050       VPBlockUtils::connectBlocks(TargetRegion, SinkRegion);
9051       VPBlockUtils::connectBlocks(SinkRegion, TargetSucc);
9052     } else {
9053       // The sink source is in a replicate region; we need to move the whole
9054       // replicate region, which should only contain a single recipe in the
9055       // main block.
9056       auto *SplitBlock =
9057           Target->getParent()->splitAt(std::next(Target->getIterator()));
9058 
9059       auto *SplitPred = SplitBlock->getSinglePredecessor();
9060 
9061       VPBlockUtils::disconnectBlocks(SplitPred, SplitBlock);
9062       VPBlockUtils::connectBlocks(SplitPred, SinkRegion);
9063       VPBlockUtils::connectBlocks(SinkRegion, SplitBlock);
9064     }
9065   }
9066 
9067   VPlanTransforms::removeRedundantCanonicalIVs(*Plan);
9068   VPlanTransforms::removeRedundantInductionCasts(*Plan);
9069 
9070   // Now that sink-after is done, move induction recipes for optimized truncates
9071   // to the phi section of the header block.
9072   for (VPWidenIntOrFpInductionRecipe *Ind : InductionsToMove)
9073     Ind->moveBefore(*HeaderVPBB, HeaderVPBB->getFirstNonPhi());
9074 
9075   // Adjust the recipes for any inloop reductions.
9076   adjustRecipesForReductions(cast<VPBasicBlock>(TopRegion->getExiting()), Plan,
9077                              RecipeBuilder, Range.Start);
9078 
9079   // Introduce a recipe to combine the incoming and previous values of a
9080   // fixed-order recurrence.
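  // For example (illustrative): for a use of x[i-1] the splice
  //   first-order splice(recur.phi, recur.next)
  // concatenates the last element of the previous vector iteration with the
  // first VF-1 elements of the current one, and replaces all uses of the phi.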
9081   for (VPRecipeBase &R :
9082        Plan->getVectorLoopRegion()->getEntryBasicBlock()->phis()) {
9083     auto *RecurPhi = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&R);
9084     if (!RecurPhi)
9085       continue;
9086 
9087     VPRecipeBase *PrevRecipe = &RecurPhi->getBackedgeRecipe();
9088     // Fixed-order recurrences do not contain cycles, so this loop is guaranteed
9089     // to terminate.
9090     while (auto *PrevPhi =
9091                dyn_cast<VPFirstOrderRecurrencePHIRecipe>(PrevRecipe))
9092       PrevRecipe = &PrevPhi->getBackedgeRecipe();
9093     VPBasicBlock *InsertBlock = PrevRecipe->getParent();
9094     auto *Region = GetReplicateRegion(PrevRecipe);
9095     if (Region)
9096       InsertBlock = dyn_cast<VPBasicBlock>(Region->getSingleSuccessor());
9097     if (!InsertBlock) {
9098       InsertBlock = new VPBasicBlock(Region->getName() + ".succ");
9099       VPBlockUtils::insertBlockAfter(InsertBlock, Region);
9100     }
9101     if (Region || PrevRecipe->isPhi())
9102       Builder.setInsertPoint(InsertBlock, InsertBlock->getFirstNonPhi());
9103     else
9104       Builder.setInsertPoint(InsertBlock, std::next(PrevRecipe->getIterator()));
9105 
9106     auto *RecurSplice = cast<VPInstruction>(
9107         Builder.createNaryOp(VPInstruction::FirstOrderRecurrenceSplice,
9108                              {RecurPhi, RecurPhi->getBackedgeValue()}));
9109 
9110     RecurPhi->replaceAllUsesWith(RecurSplice);
9111     // Set the first operand of RecurSplice to RecurPhi again, after replacing
9112     // all users.
9113     RecurSplice->setOperand(0, RecurPhi);
9114   }
9115 
9116   // Interleave memory: for each Interleave Group we marked earlier as relevant
9117   // for this VPlan, replace the Recipes widening its memory instructions with a
9118   // single VPInterleaveRecipe at its insertion point.
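  // For example (illustrative): two strided loads A[2*i] and A[2*i+1] forming
  // an interleave group of factor 2 are replaced by one VPInterleaveRecipe at
  // the group's insert position, which later emits a single wide load plus
  // shuffles that de-interleave the even and odd elements.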
9119   for (const auto *IG : InterleaveGroups) {
9120     auto *Recipe = cast<VPWidenMemoryInstructionRecipe>(
9121         RecipeBuilder.getRecipe(IG->getInsertPos()));
9122     SmallVector<VPValue *, 4> StoredValues;
9123     for (unsigned i = 0; i < IG->getFactor(); ++i)
9124       if (auto *SI = dyn_cast_or_null<StoreInst>(IG->getMember(i))) {
9125         auto *StoreR =
9126             cast<VPWidenMemoryInstructionRecipe>(RecipeBuilder.getRecipe(SI));
9127         StoredValues.push_back(StoreR->getStoredValue());
9128       }
9129 
9130     auto *VPIG = new VPInterleaveRecipe(IG, Recipe->getAddr(), StoredValues,
9131                                         Recipe->getMask());
9132     VPIG->insertBefore(Recipe);
9133     unsigned J = 0;
9134     for (unsigned i = 0; i < IG->getFactor(); ++i)
9135       if (Instruction *Member = IG->getMember(i)) {
9136         if (!Member->getType()->isVoidTy()) {
9137           VPValue *OriginalV = Plan->getVPValue(Member);
9138           Plan->removeVPValueFor(Member);
9139           Plan->addVPValue(Member, VPIG->getVPValue(J));
9140           OriginalV->replaceAllUsesWith(VPIG->getVPValue(J));
9141           J++;
9142         }
9143         RecipeBuilder.getRecipe(Member)->eraseFromParent();
9144       }
9145   }
9146 
9147   for (ElementCount VF = Range.Start; ElementCount::isKnownLT(VF, Range.End);
9148        VF *= 2)
9149     Plan->addVF(VF);
9150   Plan->setName("Initial VPlan");
9151 
9152   // From this point onwards, VPlan-to-VPlan transformations may change the plan
9153   // in ways that make accessing values using original IR values incorrect.
9154   Plan->disableValue2VPValue();
9155 
9156   VPlanTransforms::optimizeInductions(*Plan, *PSE.getSE());
9157   VPlanTransforms::removeDeadRecipes(*Plan);
9158 
9159   bool ShouldSimplify = true;
9160   while (ShouldSimplify) {
9161     ShouldSimplify = VPlanTransforms::sinkScalarOperands(*Plan);
9162     ShouldSimplify |=
9163         VPlanTransforms::mergeReplicateRegionsIntoSuccessors(*Plan);
9164     ShouldSimplify |= VPlanTransforms::mergeBlocksIntoPredecessors(*Plan);
9165   }
9166 
9167   VPlanTransforms::removeRedundantExpandSCEVRecipes(*Plan);
9168   VPlanTransforms::mergeBlocksIntoPredecessors(*Plan);
9169 
9170   assert(VPlanVerifier::verifyPlanIsValid(*Plan) && "VPlan is invalid");
9171   return Plan;
9172 }
9173 
9174 VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) {
9175   // Outer loop handling: They may require CFG and instruction level
9176   // transformations before even evaluating whether vectorization is profitable.
9177   // Since we cannot modify the incoming IR, we need to build VPlan upfront in
9178   // the vectorization pipeline.
9179   assert(!OrigLoop->isInnermost());
9180   assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
9181 
9182   // Create new empty VPlan
9183   auto Plan = std::make_unique<VPlan>();
9184 
9185   // Build hierarchical CFG
9186   VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI, *Plan);
9187   HCFGBuilder.buildHierarchicalCFG();
9188 
9189   for (ElementCount VF = Range.Start; ElementCount::isKnownLT(VF, Range.End);
9190        VF *= 2)
9191     Plan->addVF(VF);
9192 
9193   SmallPtrSet<Instruction *, 1> DeadInstructions;
9194   VPlanTransforms::VPInstructionsToVPRecipes(
9195       OrigLoop, Plan,
9196       [this](PHINode *P) { return Legal->getIntOrFpInductionDescriptor(P); },
9197       DeadInstructions, *PSE.getSE(), *TLI);
9198 
9199   // Remove the existing terminator of the exiting block of the top-most region.
9200   // A BranchOnCount will be added instead when adding the canonical IV recipes.
9201   auto *Term =
9202       Plan->getVectorLoopRegion()->getExitingBasicBlock()->getTerminator();
9203   Term->eraseFromParent();
9204 
9205   addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(), DebugLoc(),
9206                         true, CM.useActiveLaneMaskForControlFlow());
9207   return Plan;
9208 }
9209 
9210 // Adjust the recipes for reductions. For in-loop reductions the chain of
9211 // instructions leading from the loop exit instr to the phi needs to be converted
9212 // to reductions, with one operand being vector and the other being the scalar
9213 // reduction chain. For other reductions, a select is introduced between the phi
9214 // and live-out recipes when folding the tail.
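// For example (illustrative): an in-loop integer add reduction
//   %red.next = add i32 %red.phi, %val
// has its widened add replaced by a VPReductionRecipe that keeps the scalar
// chain (%red.phi) as one operand, reduces the vector operand (%val), and is
// guarded by the block-in mask when the block needs predication.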
9215 void LoopVectorizationPlanner::adjustRecipesForReductions(
9216     VPBasicBlock *LatchVPBB, VPlanPtr &Plan, VPRecipeBuilder &RecipeBuilder,
9217     ElementCount MinVF) {
9218   for (const auto &Reduction : CM.getInLoopReductionChains()) {
9219     PHINode *Phi = Reduction.first;
9220     const RecurrenceDescriptor &RdxDesc =
9221         Legal->getReductionVars().find(Phi)->second;
9222     const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second;
9223 
9224     if (MinVF.isScalar() && !CM.useOrderedReductions(RdxDesc))
9225       continue;
9226 
9227     // ReductionOperations are ordered top-down from the phi's use to the
9228     // LoopExitValue. We keep track of the previous item (the Chain) to tell
9229     // which of the two operands will remain scalar and which will be reduced.
9230     // For minmax the chain will be the select instructions.
9231     Instruction *Chain = Phi;
9232     for (Instruction *R : ReductionOperations) {
9233       VPRecipeBase *WidenRecipe = RecipeBuilder.getRecipe(R);
9234       RecurKind Kind = RdxDesc.getRecurrenceKind();
9235 
9236       VPValue *ChainOp = Plan->getVPValue(Chain);
9237       unsigned FirstOpId;
9238       assert(!RecurrenceDescriptor::isSelectCmpRecurrenceKind(Kind) &&
9239              "Only min/max recurrences allowed for inloop reductions");
9240       // Recognize a call to the llvm.fmuladd intrinsic.
9241       bool IsFMulAdd = (Kind == RecurKind::FMulAdd);
9242       assert((!IsFMulAdd || RecurrenceDescriptor::isFMulAddIntrinsic(R)) &&
9243              "Expected instruction to be a call to the llvm.fmuladd intrinsic");
9244       if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) {
9245         assert(isa<VPWidenSelectRecipe>(WidenRecipe) &&
9246                "Expected to replace a VPWidenSelectSC");
9247         FirstOpId = 1;
9248       } else {
9249         assert((MinVF.isScalar() || isa<VPWidenRecipe>(WidenRecipe) ||
9250                 (IsFMulAdd && isa<VPWidenCallRecipe>(WidenRecipe))) &&
9251                "Expected to replace a VPWidenSC");
9252         FirstOpId = 0;
9253       }
9254       unsigned VecOpId =
9255           R->getOperand(FirstOpId) == Chain ? FirstOpId + 1 : FirstOpId;
9256       VPValue *VecOp = Plan->getVPValue(R->getOperand(VecOpId));
9257 
9258       VPValue *CondOp = nullptr;
9259       if (CM.blockNeedsPredicationForAnyReason(R->getParent())) {
9260         VPBuilder::InsertPointGuard Guard(Builder);
9261         Builder.setInsertPoint(WidenRecipe->getParent(),
9262                                WidenRecipe->getIterator());
9263         CondOp = RecipeBuilder.createBlockInMask(R->getParent(), Plan);
9264       }
9265 
9266       if (IsFMulAdd) {
9267         // If the instruction is a call to the llvm.fmuladd intrinsic then we
9268         // need to create an fmul recipe to use as the vector operand for the
9269         // fadd reduction.
9270         VPInstruction *FMulRecipe = new VPInstruction(
9271             Instruction::FMul, {VecOp, Plan->getVPValue(R->getOperand(1))});
9272         FMulRecipe->setFastMathFlags(R->getFastMathFlags());
9273         WidenRecipe->getParent()->insert(FMulRecipe,
9274                                          WidenRecipe->getIterator());
9275         VecOp = FMulRecipe;
9276       }
9277       VPReductionRecipe *RedRecipe =
9278           new VPReductionRecipe(&RdxDesc, R, ChainOp, VecOp, CondOp, TTI);
9279       WidenRecipe->getVPSingleValue()->replaceAllUsesWith(RedRecipe);
9280       Plan->removeVPValueFor(R);
9281       Plan->addVPValue(R, RedRecipe);
9282       // Append the recipe to the end of the VPBasicBlock because we need to
9283       // ensure that it comes after all of its inputs, including CondOp.
9284       WidenRecipe->getParent()->appendRecipe(RedRecipe);
9285       WidenRecipe->getVPSingleValue()->replaceAllUsesWith(RedRecipe);
9286       WidenRecipe->eraseFromParent();
9287 
9288       if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) {
9289         VPRecipeBase *CompareRecipe =
9290             RecipeBuilder.getRecipe(cast<Instruction>(R->getOperand(0)));
9291         assert(isa<VPWidenRecipe>(CompareRecipe) &&
9292                "Expected to replace a VPWidenSC");
9293         assert(cast<VPWidenRecipe>(CompareRecipe)->getNumUsers() == 0 &&
9294                "Expected no remaining users");
9295         CompareRecipe->eraseFromParent();
9296       }
9297       Chain = R;
9298     }
9299   }
9300 
9301   // If tail is folded by masking, introduce selects between the phi
9302   // and the live-out instruction of each reduction, at the beginning of the
9303   // dedicated latch block.
9304   if (CM.foldTailByMasking()) {
9305     Builder.setInsertPoint(LatchVPBB, LatchVPBB->begin());
9306     for (VPRecipeBase &R :
9307          Plan->getVectorLoopRegion()->getEntryBasicBlock()->phis()) {
9308       VPReductionPHIRecipe *PhiR = dyn_cast<VPReductionPHIRecipe>(&R);
9309       if (!PhiR || PhiR->isInLoop())
9310         continue;
9311       VPValue *Cond =
9312           RecipeBuilder.createBlockInMask(OrigLoop->getHeader(), Plan);
9313       VPValue *Red = PhiR->getBackedgeValue();
9314       assert(Red->getDefiningRecipe()->getParent() != LatchVPBB &&
9315              "reduction recipe must be defined before latch");
9316       Builder.createNaryOp(Instruction::Select, {Cond, Red, PhiR});
9317     }
9318   }
9319 }
9320 
9321 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
9322 void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent,
9323                                VPSlotTracker &SlotTracker) const {
9324   O << Indent << "INTERLEAVE-GROUP with factor " << IG->getFactor() << " at ";
9325   IG->getInsertPos()->printAsOperand(O, false);
9326   O << ", ";
9327   getAddr()->printAsOperand(O, SlotTracker);
9328   VPValue *Mask = getMask();
9329   if (Mask) {
9330     O << ", ";
9331     Mask->printAsOperand(O, SlotTracker);
9332   }
9333 
9334   unsigned OpIdx = 0;
9335   for (unsigned i = 0; i < IG->getFactor(); ++i) {
9336     if (!IG->getMember(i))
9337       continue;
9338     if (getNumStoreOperands() > 0) {
9339       O << "\n" << Indent << "  store ";
9340       getOperand(1 + OpIdx)->printAsOperand(O, SlotTracker);
9341       O << " to index " << i;
9342     } else {
9343       O << "\n" << Indent << "  ";
9344       getVPValue(OpIdx)->printAsOperand(O, SlotTracker);
9345       O << " = load from index " << i;
9346     }
9347     ++OpIdx;
9348   }
9349 }
9350 #endif
9351 
9352 void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) {
9353   assert(!State.Instance && "Int or FP induction being replicated.");
9354 
9355   Value *Start = getStartValue()->getLiveInIRValue();
9356   const InductionDescriptor &ID = getInductionDescriptor();
9357   TruncInst *Trunc = getTruncInst();
9358   IRBuilderBase &Builder = State.Builder;
9359   assert(IV->getType() == ID.getStartValue()->getType() && "Types must match");
9360   assert(State.VF.isVector() && "must have vector VF");
9361 
9362   // The value from the original loop to which we are mapping the new induction
9363   // variable.
9364   Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : IV;
9365 
9366   // Fast-math-flags propagate from the original induction instruction.
9367   IRBuilder<>::FastMathFlagGuard FMFG(Builder);
9368   if (ID.getInductionBinOp() && isa<FPMathOperator>(ID.getInductionBinOp()))
9369     Builder.setFastMathFlags(ID.getInductionBinOp()->getFastMathFlags());
9370 
9371   // Now do the actual transformations, and start with fetching the step value.
9372   Value *Step = State.get(getStepValue(), VPIteration(0, 0));
9373 
9374   assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) &&
9375          "Expected either an induction phi-node or a truncate of it!");
9376 
9377   // Construct the initial value of the vector IV in the vector loop preheader
9378   auto CurrIP = Builder.saveIP();
9379   BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);
9380   Builder.SetInsertPoint(VectorPH->getTerminator());
9381   if (isa<TruncInst>(EntryVal)) {
9382     assert(Start->getType()->isIntegerTy() &&
9383            "Truncation requires an integer type");
9384     auto *TruncType = cast<IntegerType>(EntryVal->getType());
9385     Step = Builder.CreateTrunc(Step, TruncType);
9386     Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType);
9387   }
9388 
9389   Value *Zero = getSignedIntOrFpConstant(Start->getType(), 0);
9390   Value *SplatStart = Builder.CreateVectorSplat(State.VF, Start);
9391   Value *SteppedStart = getStepVector(
9392       SplatStart, Zero, Step, ID.getInductionOpcode(), State.VF, State.Builder);
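  // SteppedStart holds the per-lane initial values of the vector IV, e.g.
  // <Start, Start + Step, ..., Start + (VF-1) * Step> for an add induction.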
9393 
9394   // We create vector phi nodes for both integer and floating-point induction
9395   // variables. Here, we determine the kind of arithmetic we will perform.
9396   Instruction::BinaryOps AddOp;
9397   Instruction::BinaryOps MulOp;
9398   if (Step->getType()->isIntegerTy()) {
9399     AddOp = Instruction::Add;
9400     MulOp = Instruction::Mul;
9401   } else {
9402     AddOp = ID.getInductionOpcode();
9403     MulOp = Instruction::FMul;
9404   }
9405 
9406   // Multiply the vectorization factor by the step using integer or
9407   // floating-point arithmetic as appropriate.
9408   Type *StepType = Step->getType();
9409   Value *RuntimeVF;
9410   if (Step->getType()->isFloatingPointTy())
9411     RuntimeVF = getRuntimeVFAsFloat(Builder, StepType, State.VF);
9412   else
9413     RuntimeVF = getRuntimeVF(Builder, StepType, State.VF);
9414   Value *Mul = Builder.CreateBinOp(MulOp, Step, RuntimeVF);
9415 
9416   // Create a vector splat to use in the induction update.
9417   //
9418   // FIXME: If the step is non-constant, we create the vector splat with
9419   //        IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't
9420   //        handle a constant vector splat.
9421   Value *SplatVF = isa<Constant>(Mul)
9422                        ? ConstantVector::getSplat(State.VF, cast<Constant>(Mul))
9423                        : Builder.CreateVectorSplat(State.VF, Mul);
9424   Builder.restoreIP(CurrIP);
9425 
9426   // We may need to add the step a number of times, depending on the unroll
9427   // factor. The last of those goes into the PHI.
9428   PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind",
9429                                     &*State.CFG.PrevBB->getFirstInsertionPt());
9430   VecInd->setDebugLoc(EntryVal->getDebugLoc());
9431   Instruction *LastInduction = VecInd;
9432   for (unsigned Part = 0; Part < State.UF; ++Part) {
9433     State.set(this, LastInduction, Part);
9434 
9435     if (isa<TruncInst>(EntryVal))
9436       State.addMetadata(LastInduction, EntryVal);
9437 
9438     LastInduction = cast<Instruction>(
9439         Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add"));
9440     LastInduction->setDebugLoc(EntryVal->getDebugLoc());
9441   }
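  // At this point LastInduction is the phi value advanced by UF * VF * Step; it
  // becomes the backedge value of the vector IV phi below.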
9442 
9443   LastInduction->setName("vec.ind.next");
9444   VecInd->addIncoming(SteppedStart, VectorPH);
9445   // Add induction update using an incorrect block temporarily. The phi node
9446   // will be fixed after VPlan execution. Note that at this point the latch
9447   // block cannot be used, as it does not exist yet.
9448   // TODO: Model increment value in VPlan, by turning the recipe into a
9449   // multi-def and a subclass of VPHeaderPHIRecipe.
9450   VecInd->addIncoming(LastInduction, VectorPH);
9451 }
9452 
9453 void VPWidenPointerInductionRecipe::execute(VPTransformState &State) {
9454   assert(IndDesc.getKind() == InductionDescriptor::IK_PtrInduction &&
9455          "Not a pointer induction according to InductionDescriptor!");
9456   assert(cast<PHINode>(getUnderlyingInstr())->getType()->isPointerTy() &&
9457          "Unexpected type.");
9458 
9459   auto *IVR = getParent()->getPlan()->getCanonicalIV();
9460   PHINode *CanonicalIV = cast<PHINode>(State.get(IVR, 0));
9461 
9462   if (onlyScalarsGenerated(State.VF)) {
9463     // This is the normalized GEP that starts counting at zero.
9464     Value *PtrInd = State.Builder.CreateSExtOrTrunc(
9465         CanonicalIV, IndDesc.getStep()->getType());
9466     // Determine the number of scalars we need to generate for each unroll
9467     // iteration. If the instruction is uniform, we only need to generate the
9468     // first lane. Otherwise, we generate all VF values.
9469     bool IsUniform = vputils::onlyFirstLaneUsed(this);
9470     assert((IsUniform || !State.VF.isScalable()) &&
9471            "Cannot scalarize a scalable VF");
9472     unsigned Lanes = IsUniform ? 1 : State.VF.getFixedValue();
9473 
9474     for (unsigned Part = 0; Part < State.UF; ++Part) {
9475       Value *PartStart =
9476           createStepForVF(State.Builder, PtrInd->getType(), State.VF, Part);
9477 
9478       for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
9479         Value *Idx = State.Builder.CreateAdd(
9480             PartStart, ConstantInt::get(PtrInd->getType(), Lane));
9481         Value *GlobalIdx = State.Builder.CreateAdd(PtrInd, Idx);
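        // GlobalIdx = PtrInd + PartStart + Lane, i.e. the scalar induction index
        // for this lane of this unrolled part (PartStart being Part times the
        // runtime VF).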
9482 
9483         Value *Step = State.get(getOperand(1), VPIteration(0, Part));
9484         Value *SclrGep = emitTransformedIndex(
9485             State.Builder, GlobalIdx, IndDesc.getStartValue(), Step, IndDesc);
9486         SclrGep->setName("next.gep");
9487         State.set(this, SclrGep, VPIteration(Part, Lane));
9488       }
9489     }
9490     return;
9491   }
9492 
9493   assert(isa<SCEVConstant>(IndDesc.getStep()) &&
9494          "Induction step not a SCEV constant!");
9495   Type *PhiType = IndDesc.getStep()->getType();
9496 
9497   // Build a pointer phi
9498   Value *ScalarStartValue = getStartValue()->getLiveInIRValue();
9499   Type *ScStValueType = ScalarStartValue->getType();
9500   PHINode *NewPointerPhi =
9501       PHINode::Create(ScStValueType, 2, "pointer.phi", CanonicalIV);
9502 
9503   BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);
9504   NewPointerPhi->addIncoming(ScalarStartValue, VectorPH);
9505 
9506   // A pointer induction, performed by using a gep
9507   Instruction *InductionLoc = &*State.Builder.GetInsertPoint();
9508 
9509   Value *ScalarStepValue = State.get(getOperand(1), VPIteration(0, 0));
9510   Value *RuntimeVF = getRuntimeVF(State.Builder, PhiType, State.VF);
9511   Value *NumUnrolledElems =
9512       State.Builder.CreateMul(RuntimeVF, ConstantInt::get(PhiType, State.UF));
9513   Value *InductionGEP = GetElementPtrInst::Create(
9514       IndDesc.getElementType(), NewPointerPhi,
9515       State.Builder.CreateMul(ScalarStepValue, NumUnrolledElems), "ptr.ind",
9516       InductionLoc);
9517   // Add induction update using an incorrect block temporarily. The phi node
9518   // will be fixed after VPlan execution. Note that at this point the latch
9519   // block cannot be used, as it does not exist yet.
9520   // TODO: Model increment value in VPlan, by turning the recipe into a
9521   // multi-def and a subclass of VPHeaderPHIRecipe.
9522   NewPointerPhi->addIncoming(InductionGEP, VectorPH);
9523 
9524   // Create UF many actual address geps that use the pointer
9525   // phi as base and a vectorized version of the step value
9526   // (<step*0, ..., step*N>) as offset.
9527   for (unsigned Part = 0; Part < State.UF; ++Part) {
9528     Type *VecPhiType = VectorType::get(PhiType, State.VF);
9529     Value *StartOffsetScalar =
9530         State.Builder.CreateMul(RuntimeVF, ConstantInt::get(PhiType, Part));
9531     Value *StartOffset =
9532         State.Builder.CreateVectorSplat(State.VF, StartOffsetScalar);
9533     // Create a vector of consecutive numbers from zero to VF.
9534     StartOffset = State.Builder.CreateAdd(
9535         StartOffset, State.Builder.CreateStepVector(VecPhiType));
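    // StartOffset is now <Part * RuntimeVF + 0, ..., Part * RuntimeVF + VF - 1>,
    // i.e. the element indices covered by this unrolled part.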
9536 
9537     assert(ScalarStepValue == State.get(getOperand(1), VPIteration(0, Part)) &&
9538            "scalar step must be the same across all parts");
9539     Value *GEP = State.Builder.CreateGEP(
9540         IndDesc.getElementType(), NewPointerPhi,
9541         State.Builder.CreateMul(
9542             StartOffset,
9543             State.Builder.CreateVectorSplat(State.VF, ScalarStepValue),
9544             "vector.gep"));
9545     State.set(this, GEP, Part);
9546   }
9547 }
9548 
9549 void VPDerivedIVRecipe::execute(VPTransformState &State) {
9550   assert(!State.Instance && "VPDerivedIVRecipe being replicated.");
9551 
9552   // Fast-math-flags propagate from the original induction instruction.
9553   IRBuilder<>::FastMathFlagGuard FMFG(State.Builder);
9554   if (IndDesc.getInductionBinOp() &&
9555       isa<FPMathOperator>(IndDesc.getInductionBinOp()))
9556     State.Builder.setFastMathFlags(
9557         IndDesc.getInductionBinOp()->getFastMathFlags());
9558 
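  // Compute the derived IV as StartValue plus CanonicalIV * Step, using the
  // induction's own arithmetic (add/fadd/fsub) for the transformation.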
9559   Value *Step = State.get(getStepValue(), VPIteration(0, 0));
9560   Value *CanonicalIV = State.get(getCanonicalIV(), VPIteration(0, 0));
9561   Value *DerivedIV =
9562       emitTransformedIndex(State.Builder, CanonicalIV,
9563                            getStartValue()->getLiveInIRValue(), Step, IndDesc);
9564   DerivedIV->setName("offset.idx");
9565   if (ResultTy != DerivedIV->getType()) {
9566     assert(Step->getType()->isIntegerTy() &&
9567            "Truncation requires an integer step");
9568     DerivedIV = State.Builder.CreateTrunc(DerivedIV, ResultTy);
9569   }
9570   assert(DerivedIV != CanonicalIV && "IV didn't need transforming?");
9571 
9572   State.set(this, DerivedIV, VPIteration(0, 0));
9573 }
9574 
9575 void VPScalarIVStepsRecipe::execute(VPTransformState &State) {
9576   // Fast-math-flags propagate from the original induction instruction.
9577   IRBuilder<>::FastMathFlagGuard FMFG(State.Builder);
9578   if (IndDesc.getInductionBinOp() &&
9579       isa<FPMathOperator>(IndDesc.getInductionBinOp()))
9580     State.Builder.setFastMathFlags(
9581         IndDesc.getInductionBinOp()->getFastMathFlags());
9582 
9583   Value *BaseIV = State.get(getOperand(0), VPIteration(0, 0));
9584   Value *Step = State.get(getStepValue(), VPIteration(0, 0));
9585 
9586   buildScalarSteps(BaseIV, Step, IndDesc, this, State);
9587 }
9588 
9589 void VPInterleaveRecipe::execute(VPTransformState &State) {
9590   assert(!State.Instance && "Interleave group being replicated.");
9591   State.ILV->vectorizeInterleaveGroup(IG, definedValues(), State, getAddr(),
9592                                       getStoredValues(), getMask());
9593 }
9594 
9595 void VPReductionRecipe::execute(VPTransformState &State) {
9596   assert(!State.Instance && "Reduction being replicated.");
9597   Value *PrevInChain = State.get(getChainOp(), 0);
9598   RecurKind Kind = RdxDesc->getRecurrenceKind();
9599   bool IsOrdered = State.ILV->useOrderedReductions(*RdxDesc);
9600   // Propagate the fast-math flags carried by the underlying instruction.
9601   IRBuilderBase::FastMathFlagGuard FMFGuard(State.Builder);
9602   State.Builder.setFastMathFlags(RdxDesc->getFastMathFlags());
9603   for (unsigned Part = 0; Part < State.UF; ++Part) {
9604     Value *NewVecOp = State.get(getVecOp(), Part);
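    // If the reduction is predicated, replace the masked-off lanes with the
    // reduction's identity value so they do not affect the reduced result.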
9605     if (VPValue *Cond = getCondOp()) {
9606       Value *NewCond = State.get(Cond, Part);
9607       VectorType *VecTy = cast<VectorType>(NewVecOp->getType());
9608       Value *Iden = RdxDesc->getRecurrenceIdentity(
9609           Kind, VecTy->getElementType(), RdxDesc->getFastMathFlags());
9610       Value *IdenVec =
9611           State.Builder.CreateVectorSplat(VecTy->getElementCount(), Iden);
9612       Value *Select = State.Builder.CreateSelect(NewCond, NewVecOp, IdenVec);
9613       NewVecOp = Select;
9614     }
9615     Value *NewRed;
9616     Value *NextInChain;
9617     if (IsOrdered) {
9618       if (State.VF.isVector())
9619         NewRed = createOrderedReduction(State.Builder, *RdxDesc, NewVecOp,
9620                                         PrevInChain);
9621       else
9622         NewRed = State.Builder.CreateBinOp(
9623             (Instruction::BinaryOps)RdxDesc->getOpcode(Kind), PrevInChain,
9624             NewVecOp);
9625       PrevInChain = NewRed;
9626     } else {
9627       PrevInChain = State.get(getChainOp(), Part);
9628       NewRed = createTargetReduction(State.Builder, TTI, *RdxDesc, NewVecOp);
9629     }
9630     if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) {
9631       NextInChain =
9632           createMinMaxOp(State.Builder, RdxDesc->getRecurrenceKind(),
9633                          NewRed, PrevInChain);
9634     } else if (IsOrdered)
9635       NextInChain = NewRed;
9636     else
9637       NextInChain = State.Builder.CreateBinOp(
9638           (Instruction::BinaryOps)RdxDesc->getOpcode(Kind), NewRed,
9639           PrevInChain);
9640     State.set(this, NextInChain, Part);
9641   }
9642 }
9643 
9644 void VPReplicateRecipe::execute(VPTransformState &State) {
9645   Instruction *UI = getUnderlyingInstr();
9646   if (State.Instance) { // Generate a single instance.
9647     assert(!State.VF.isScalable() && "Can't scalarize a scalable vector");
9648     State.ILV->scalarizeInstruction(UI, this, *State.Instance,
9649                                     IsPredicated, State);
9650     // Insert scalar instance, packing it into a vector.
9651     if (AlsoPack && State.VF.isVector()) {
9652       // If we're constructing lane 0, initialize to start from poison.
9653       if (State.Instance->Lane.isFirstLane()) {
9654         assert(!State.VF.isScalable() && "VF is assumed to be non scalable.");
9655         Value *Poison = PoisonValue::get(
9656             VectorType::get(UI->getType(), State.VF));
9657         State.set(this, Poison, State.Instance->Part);
9658       }
9659       State.ILV->packScalarIntoVectorValue(this, *State.Instance, State);
9660     }
9661     return;
9662   }
9663 
9664   if (IsUniform) {
9665     // If the recipe is uniform across all parts (instead of just per VF), only
9666     // generate a single instance.
9667     if ((isa<LoadInst>(UI) || isa<StoreInst>(UI)) &&
9668         all_of(operands(), [](VPValue *Op) {
9669           return Op->isDefinedOutsideVectorRegions();
9670         })) {
9671       State.ILV->scalarizeInstruction(UI, this, VPIteration(0, 0), IsPredicated,
9672                                       State);
9673       if (user_begin() != user_end()) {
9674         for (unsigned Part = 1; Part < State.UF; ++Part)
9675           State.set(this, State.get(this, VPIteration(0, 0)),
9676                     VPIteration(Part, 0));
9677       }
9678       return;
9679     }
9680 
9681     // Uniform within VL means we need to generate lane 0 only for each
9682     // unrolled copy.
9683     for (unsigned Part = 0; Part < State.UF; ++Part)
9684       State.ILV->scalarizeInstruction(UI, this, VPIteration(Part, 0),
9685                                       IsPredicated, State);
9686     return;
9687   }
9688 
9689   // A store of a loop-varying value to a loop-invariant address only
9690   // needs the last copy of the store.
9691   if (isa<StoreInst>(UI) && !getOperand(1)->hasDefiningRecipe()) {
9692     auto Lane = VPLane::getLastLaneForVF(State.VF);
9693     State.ILV->scalarizeInstruction(UI, this, VPIteration(State.UF - 1, Lane), IsPredicated,
9694                                     State);
9695     return;
9696   }
9697 
9698   // Generate scalar instances for all VF lanes of all UF parts.
9699   assert(!State.VF.isScalable() && "Can't scalarize a scalable vector");
9700   const unsigned EndLane = State.VF.getKnownMinValue();
9701   for (unsigned Part = 0; Part < State.UF; ++Part)
9702     for (unsigned Lane = 0; Lane < EndLane; ++Lane)
9703       State.ILV->scalarizeInstruction(UI, this, VPIteration(Part, Lane),
9704                                       IsPredicated, State);
9705 }
9706 
9707 void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) {
9708   VPValue *StoredValue = isStore() ? getStoredValue() : nullptr;
9709 
9710   // Attempt to issue a wide load.
9711   LoadInst *LI = dyn_cast<LoadInst>(&Ingredient);
9712   StoreInst *SI = dyn_cast<StoreInst>(&Ingredient);
9713 
9714   assert((LI || SI) && "Invalid Load/Store instruction");
9715   assert((!SI || StoredValue) && "No stored value provided for widened store");
9716   assert((!LI || !StoredValue) && "Stored value provided for widened load");
9717 
9718   Type *ScalarDataTy = getLoadStoreType(&Ingredient);
9719 
9720   auto *DataTy = VectorType::get(ScalarDataTy, State.VF);
9721   const Align Alignment = getLoadStoreAlignment(&Ingredient);
9722   bool CreateGatherScatter = !Consecutive;
9723 
9724   auto &Builder = State.Builder;
9725   InnerLoopVectorizer::VectorParts BlockInMaskParts(State.UF);
9726   bool isMaskRequired = getMask();
9727   if (isMaskRequired)
9728     for (unsigned Part = 0; Part < State.UF; ++Part)
9729       BlockInMaskParts[Part] = State.get(getMask(), Part);
9730 
9731   const auto CreateVecPtr = [&](unsigned Part, Value *Ptr) -> Value * {
9732     // Calculate the pointer for the specific unroll-part.
9733     GetElementPtrInst *PartPtr = nullptr;
9734 
9735     bool InBounds = false;
9736     if (auto *gep = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts()))
9737       InBounds = gep->isInBounds();
9738     if (Reverse) {
9739       // If the address is consecutive but reversed, then the
9740       // wide store needs to start at the last vector element.
9741       // RunTimeVF =  VScale * VF.getKnownMinValue()
9742       // For fixed-width VScale is 1, then RunTimeVF = VF.getKnownMinValue()
9743       Value *RunTimeVF = getRuntimeVF(Builder, Builder.getInt32Ty(), State.VF);
9744       // NumElt = -Part * RunTimeVF
9745       Value *NumElt = Builder.CreateMul(Builder.getInt32(-Part), RunTimeVF);
9746       // LastLane = 1 - RunTimeVF
9747       Value *LastLane = Builder.CreateSub(Builder.getInt32(1), RunTimeVF);
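      // For example, with a fixed VF of 4 and Part == 1: NumElt = -4 and
      // LastLane = -3, so the part pointer ends up at Ptr - 7 and the wide
      // access covers the elements at offsets -7..-4 before being reversed.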
9748       PartPtr =
9749           cast<GetElementPtrInst>(Builder.CreateGEP(ScalarDataTy, Ptr, NumElt));
9750       PartPtr->setIsInBounds(InBounds);
9751       PartPtr = cast<GetElementPtrInst>(
9752           Builder.CreateGEP(ScalarDataTy, PartPtr, LastLane));
9753       PartPtr->setIsInBounds(InBounds);
9754       if (isMaskRequired) // Reverse of a null all-one mask is a null mask.
9755         BlockInMaskParts[Part] =
9756             Builder.CreateVectorReverse(BlockInMaskParts[Part], "reverse");
9757     } else {
9758       Value *Increment =
9759           createStepForVF(Builder, Builder.getInt32Ty(), State.VF, Part);
9760       PartPtr = cast<GetElementPtrInst>(
9761           Builder.CreateGEP(ScalarDataTy, Ptr, Increment));
9762       PartPtr->setIsInBounds(InBounds);
9763     }
9764 
9765     unsigned AddressSpace = Ptr->getType()->getPointerAddressSpace();
9766     return Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace));
9767   };
9768 
9769   // Handle Stores:
9770   if (SI) {
9771     State.setDebugLocFromInst(SI);
9772 
9773     for (unsigned Part = 0; Part < State.UF; ++Part) {
9774       Instruction *NewSI = nullptr;
9775       Value *StoredVal = State.get(StoredValue, Part);
9776       if (CreateGatherScatter) {
9777         Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr;
9778         Value *VectorGep = State.get(getAddr(), Part);
9779         NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep, Alignment,
9780                                             MaskPart);
9781       } else {
9782         if (Reverse) {
9783           // If we store to reverse consecutive memory locations, then we need
9784           // to reverse the order of elements in the stored value.
9785           StoredVal = Builder.CreateVectorReverse(StoredVal, "reverse");
9786           // We don't want to update the value in the map as it might be used in
9787           // another expression. So don't call resetVectorValue(StoredVal).
9788         }
9789         auto *VecPtr =
9790             CreateVecPtr(Part, State.get(getAddr(), VPIteration(0, 0)));
9791         if (isMaskRequired)
9792           NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr, Alignment,
9793                                             BlockInMaskParts[Part]);
9794         else
9795           NewSI = Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment);
9796       }
9797       State.addMetadata(NewSI, SI);
9798     }
9799     return;
9800   }
9801 
9802   // Handle loads.
9803   assert(LI && "Must have a load instruction");
9804   State.setDebugLocFromInst(LI);
9805   for (unsigned Part = 0; Part < State.UF; ++Part) {
9806     Value *NewLI;
9807     if (CreateGatherScatter) {
9808       Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr;
9809       Value *VectorGep = State.get(getAddr(), Part);
9810       NewLI = Builder.CreateMaskedGather(DataTy, VectorGep, Alignment, MaskPart,
9811                                          nullptr, "wide.masked.gather");
9812       State.addMetadata(NewLI, LI);
9813     } else {
9814       auto *VecPtr =
9815           CreateVecPtr(Part, State.get(getAddr(), VPIteration(0, 0)));
9816       if (isMaskRequired)
9817         NewLI = Builder.CreateMaskedLoad(
9818             DataTy, VecPtr, Alignment, BlockInMaskParts[Part],
9819             PoisonValue::get(DataTy), "wide.masked.load");
9820       else
9821         NewLI =
9822             Builder.CreateAlignedLoad(DataTy, VecPtr, Alignment, "wide.load");
9823 
9824       // Add metadata to the load, but setVectorValue to the reverse shuffle.
9825       State.addMetadata(NewLI, LI);
9826       if (Reverse)
9827         NewLI = Builder.CreateVectorReverse(NewLI, "reverse");
9828     }
9829 
9830     State.set(getVPSingleValue(), NewLI, Part);
9831   }
9832 }
9833 
9834 // Determine how to lower the scalar epilogue, which depends on 1) optimising
9835 // for minimum code-size, 2) predicate compiler options, 3) loop hints forcing
9836 // predication, and 4) a TTI hook that analyses whether the loop is suitable
9837 // for predication.
9838 static ScalarEpilogueLowering getScalarEpilogueLowering(
9839     Function *F, Loop *L, LoopVectorizeHints &Hints, ProfileSummaryInfo *PSI,
9840     BlockFrequencyInfo *BFI, TargetTransformInfo *TTI, TargetLibraryInfo *TLI,
9841     AssumptionCache *AC, LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT,
9842     LoopVectorizationLegality &LVL, InterleavedAccessInfo *IAI) {
9843   // 1) OptSize takes precedence over all other options, i.e. if this is set,
9844   // don't look at hints or options, and don't request a scalar epilogue.
9845   // (For PGSO, as shouldOptimizeForSize isn't currently accessible from
9846   // LoopAccessInfo (due to code dependency and not being able to reliably get
9847   // PSI/BFI from a loop analysis under NPM), we cannot suppress the collection
9848   // of strides in LoopAccessInfo::analyzeLoop() and vectorize without
9849   // versioning when the vectorization is forced, unlike hasOptSize. So revert
9850   // back to the old way and vectorize with versioning when forced. See D81345.)
9851   if (F->hasOptSize() || (llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI,
9852                                                       PGSOQueryType::IRPass) &&
9853                           Hints.getForce() != LoopVectorizeHints::FK_Enabled))
9854     return CM_ScalarEpilogueNotAllowedOptSize;
9855 
9856   // 2) If set, obey the directives
9857   if (PreferPredicateOverEpilogue.getNumOccurrences()) {
9858     switch (PreferPredicateOverEpilogue) {
9859     case PreferPredicateTy::ScalarEpilogue:
9860       return CM_ScalarEpilogueAllowed;
9861     case PreferPredicateTy::PredicateElseScalarEpilogue:
9862       return CM_ScalarEpilogueNotNeededUsePredicate;
9863     case PreferPredicateTy::PredicateOrDontVectorize:
9864       return CM_ScalarEpilogueNotAllowedUsePredicate;
9865     };
9866   }
9867 
9868   // 3) If set, obey the hints
9869   switch (Hints.getPredicate()) {
9870   case LoopVectorizeHints::FK_Enabled:
9871     return CM_ScalarEpilogueNotNeededUsePredicate;
9872   case LoopVectorizeHints::FK_Disabled:
9873     return CM_ScalarEpilogueAllowed;
9874   };
9875 
9876   // 4) if the TTI hook indicates this is profitable, request predication.
9877   if (TTI->preferPredicateOverEpilogue(L, LI, *SE, *AC, TLI, DT, &LVL, IAI))
9878     return CM_ScalarEpilogueNotNeededUsePredicate;
9879 
9880   return CM_ScalarEpilogueAllowed;
9881 }
9882 
9883 Value *VPTransformState::get(VPValue *Def, unsigned Part) {
9884   // If Values have been set for this Def, return the one relevant for \p Part.
9885   if (hasVectorValue(Def, Part))
9886     return Data.PerPartOutput[Def][Part];
9887 
9888   if (!hasScalarValue(Def, {Part, 0})) {
9889     Value *IRV = Def->getLiveInIRValue();
9890     Value *B = ILV->getBroadcastInstrs(IRV);
9891     set(Def, B, Part);
9892     return B;
9893   }
9894 
9895   Value *ScalarValue = get(Def, {Part, 0});
9896   // If we aren't vectorizing, we can just copy the scalar map values over
9897   // to the vector map.
9898   if (VF.isScalar()) {
9899     set(Def, ScalarValue, Part);
9900     return ScalarValue;
9901   }
9902 
9903   bool IsUniform = vputils::isUniformAfterVectorization(Def);
9904 
9905   unsigned LastLane = IsUniform ? 0 : VF.getKnownMinValue() - 1;
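  // For a uniform value it is enough to look at lane 0; otherwise the last lane
  // marks the end of the scalarized definitions that need to be packed.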
9906   // Check if there is a scalar value for the selected lane.
9907   if (!hasScalarValue(Def, {Part, LastLane})) {
9908     // At the moment, VPWidenIntOrFpInductionRecipes and VPScalarIVStepsRecipes can also be uniform.
9909     assert((isa<VPWidenIntOrFpInductionRecipe>(Def->getDefiningRecipe()) ||
9910             isa<VPScalarIVStepsRecipe>(Def->getDefiningRecipe())) &&
9911            "unexpected recipe found to be invariant");
9912     IsUniform = true;
9913     LastLane = 0;
9914   }
9915 
9916   auto *LastInst = cast<Instruction>(get(Def, {Part, LastLane}));
9917   // Set the insert point after the last scalarized instruction or after the
9918   // last PHI, if LastInst is a PHI. This ensures the insertelement sequence
9919   // will directly follow the scalar definitions.
9920   auto OldIP = Builder.saveIP();
9921   auto NewIP =
9922       isa<PHINode>(LastInst)
9923           ? BasicBlock::iterator(LastInst->getParent()->getFirstNonPHI())
9924           : std::next(BasicBlock::iterator(LastInst));
9925   Builder.SetInsertPoint(&*NewIP);
9926 
9927   // However, if we are vectorizing, we need to construct the vector values.
9928   // If the value is known to be uniform after vectorization, we can just
9929   // broadcast the scalar value corresponding to lane zero for each unroll
9930   // iteration. Otherwise, we construct the vector values using
9931   // insertelement instructions. Since the resulting vectors are stored in
9932   // State, we will only generate the insertelements once.
9933   Value *VectorValue = nullptr;
9934   if (IsUniform) {
9935     VectorValue = ILV->getBroadcastInstrs(ScalarValue);
9936     set(Def, VectorValue, Part);
9937   } else {
9938     // Initialize packing with insertelements to start from undef.
9939     assert(!VF.isScalable() && "VF is assumed to be non scalable.");
9940     Value *Undef = PoisonValue::get(VectorType::get(LastInst->getType(), VF));
9941     set(Def, Undef, Part);
9942     for (unsigned Lane = 0; Lane < VF.getKnownMinValue(); ++Lane)
9943       ILV->packScalarIntoVectorValue(Def, {Part, Lane}, *this);
9944     VectorValue = get(Def, Part);
9945   }
9946   Builder.restoreIP(OldIP);
9947   return VectorValue;
9948 }
9949 
9950 // Process the loop in the VPlan-native vectorization path. This path builds
9951 // VPlan upfront in the vectorization pipeline, which allows to apply
9952 // VPlan-to-VPlan transformations from the very beginning without modifying the
9953 // input LLVM IR.
9954 static bool processLoopInVPlanNativePath(
9955     Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT,
9956     LoopVectorizationLegality *LVL, TargetTransformInfo *TTI,
9957     TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC,
9958     OptimizationRemarkEmitter *ORE, BlockFrequencyInfo *BFI,
9959     ProfileSummaryInfo *PSI, LoopVectorizeHints &Hints,
9960     LoopVectorizationRequirements &Requirements) {
9961 
9962   if (isa<SCEVCouldNotCompute>(PSE.getBackedgeTakenCount())) {
9963     LLVM_DEBUG(dbgs() << "LV: cannot compute the outer-loop trip count\n");
9964     return false;
9965   }
9966   assert(EnableVPlanNativePath && "VPlan-native path is disabled.");
9967   Function *F = L->getHeader()->getParent();
9968   InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI());
9969 
9970   ScalarEpilogueLowering SEL = getScalarEpilogueLowering(
9971       F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, *LVL, &IAI);
9972 
9973   LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F,
9974                                 &Hints, IAI);
9975   // Use the planner for outer loop vectorization.
9976   // TODO: CM is not used at this point inside the planner. Turn CM into an
9977   // optional argument if we don't need it in the future.
9978   LoopVectorizationPlanner LVP(L, LI, TLI, TTI, LVL, CM, IAI, PSE, Hints, ORE);
9979 
9980   // Get user vectorization factor.
9981   ElementCount UserVF = Hints.getWidth();
9982 
9983   CM.collectElementTypesForWidening();
9984 
9985   // Plan how to best vectorize, return the best VF and its cost.
9986   const VectorizationFactor VF = LVP.planInVPlanNativePath(UserVF);
9987 
9988   // If we are stress testing VPlan builds, do not attempt to generate vector
9989   // code. Masked vector code generation support will follow soon.
9990   // Also, do not attempt to vectorize if no vector code will be produced.
9991   if (VPlanBuildStressTest || VectorizationFactor::Disabled() == VF)
9992     return false;
9993 
9994   VPlan &BestPlan = LVP.getBestPlanFor(VF.Width);
9995 
9996   {
9997     GeneratedRTChecks Checks(*PSE.getSE(), DT, LI, TTI,
9998                              F->getParent()->getDataLayout());
9999     InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width,
10000                            VF.Width, 1, LVL, &CM, BFI, PSI, Checks);
10001     LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \""
10002                       << L->getHeader()->getParent()->getName() << "\"\n");
10003     LVP.executePlan(VF.Width, 1, BestPlan, LB, DT, false);
10004   }
10005 
10006   // Mark the loop as already vectorized to avoid vectorizing again.
10007   Hints.setAlreadyVectorized();
10008   assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs()));
10009   return true;
10010 }
10011 
10012 // Emit a remark if there are stores to floats that required a floating point
10013 // extension. If the vectorized loop was generated with floating point, there
10014 // will be a performance penalty from the conversion overhead and the change in
10015 // the vector width.
10016 static void checkMixedPrecision(Loop *L, OptimizationRemarkEmitter *ORE) {
10017   SmallVector<Instruction *, 4> Worklist;
10018   for (BasicBlock *BB : L->getBlocks()) {
10019     for (Instruction &Inst : *BB) {
10020       if (auto *S = dyn_cast<StoreInst>(&Inst)) {
10021         if (S->getValueOperand()->getType()->isFloatTy())
10022           Worklist.push_back(S);
10023       }
10024     }
10025   }
10026 
10027   // Traverse the floating point stores upwards, searching for floating point
10028   // conversions.
10029   SmallPtrSet<const Instruction *, 4> Visited;
10030   SmallPtrSet<const Instruction *, 4> EmittedRemark;
10031   while (!Worklist.empty()) {
10032     auto *I = Worklist.pop_back_val();
10033     if (!L->contains(I))
10034       continue;
10035     if (!Visited.insert(I).second)
10036       continue;
10037 
10038     // Emit a remark if the floating point store required a floating
10039     // point conversion.
10040     // TODO: More work could be done to identify the root cause such as a
10041     // constant or a function return type and point the user to it.
10042     if (isa<FPExtInst>(I) && EmittedRemark.insert(I).second)
10043       ORE->emit([&]() {
10044         return OptimizationRemarkAnalysis(LV_NAME, "VectorMixedPrecision",
10045                                           I->getDebugLoc(), L->getHeader())
10046                << "floating point conversion changes vector width. "
10047                << "Mixed floating point precision requires an up/down "
10048                << "cast that will negatively impact performance.";
10049       });
10050 
10051     for (Use &Op : I->operands())
10052       if (auto *OpI = dyn_cast<Instruction>(Op))
10053         Worklist.push_back(OpI);
10054   }
10055 }
10056 
10057 static bool areRuntimeChecksProfitable(GeneratedRTChecks &Checks,
10058                                        VectorizationFactor &VF,
10059                                        std::optional<unsigned> VScale, Loop *L,
10060                                        ScalarEvolution &SE) {
10061   InstructionCost CheckCost = Checks.getCost();
10062   if (!CheckCost.isValid())
10063     return false;
10064 
10065   // When interleaving only, the scalar and vector cost will be equal, which in
10066   // turn would lead to a divide by 0. Fall back to the hard threshold.
10067   if (VF.Width.isScalar()) {
10068     if (CheckCost > VectorizeMemoryCheckThreshold) {
10069       LLVM_DEBUG(
10070           dbgs()
10071           << "LV: Interleaving only is not profitable due to runtime checks\n");
10072       return false;
10073     }
10074     return true;
10075   }
10076 
10077   // The scalar cost should only be 0 when vectorizing with a user specified VF/IC. In those cases, runtime checks should always be generated.
10078   double ScalarC = *VF.ScalarCost.getValue();
10079   if (ScalarC == 0)
10080     return true;
10081 
10082   // First, compute the minimum iteration count required so that the vector
10083   // loop outperforms the scalar loop.
10084   //  The total cost of the scalar loop is
10085   //   ScalarC * TC
10086   //  where
10087   //  * TC is the actual trip count of the loop.
10088   //  * ScalarC is the cost of a single scalar iteration.
10089   //
10090   //  The total cost of the vector loop is
10091   //    RtC + VecC * (TC / VF) + EpiC
10092   //  where
10093   //  * RtC is the cost of the generated runtime checks
10094   //  * VecC is the cost of a single vector iteration.
10095   //  * TC is the actual trip count of the loop
10096   //  * VF is the vectorization factor
10097   //  * EpiCost is the cost of the generated epilogue, including the cost
10098   //    of the remaining scalar operations.
10099   //
10100   // Vectorization is profitable once the total vector cost is less than the
10101   // total scalar cost:
10102   //   RtC + VecC * (TC / VF) + EpiC <  ScalarC * TC
10103   //
10104   // Now we can compute the minimum required trip count TC as
10105   //   (RtC + EpiC) / (ScalarC - (VecC / VF)) < TC
10106   //
10107   // For now we assume the epilogue cost EpiC = 0 for simplicity. Note that
10108   // the computations are performed on doubles, not integers and the result
10109   // is rounded up, hence we get an upper estimate of the TC.
10110   unsigned IntVF = VF.Width.getKnownMinValue();
10111   if (VF.Width.isScalable()) {
10112     unsigned AssumedMinimumVscale = 1;
10113     if (VScale)
10114       AssumedMinimumVscale = *VScale;
10115     IntVF *= AssumedMinimumVscale;
10116   }
10117   double VecCOverVF = double(*VF.Cost.getValue()) / IntVF;
10118   double RtC = *CheckCost.getValue();
10119   double MinTC1 = RtC / (ScalarC - VecCOverVF);
10120 
10121   // Second, compute a minimum iteration count so that the cost of the
10122   // runtime checks is only a fraction of the total scalar loop cost. This
10123   // adds a loop-dependent bound on the overhead incurred if the runtime
10124   // checks fail. In case the runtime checks fail, the cost is RtC + ScalarC
10125   // * TC. To bound the runtime check to be a fraction 1/X of the scalar
10126   // cost, compute
10127   //   RtC < ScalarC * TC * (1 / X)  ==>  RtC * X / ScalarC < TC
10128   double MinTC2 = RtC * 10 / ScalarC;
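  // For example, with hypothetical costs ScalarC = 4, VecC = 8, IntVF = 4 and
  // RtC = 20: MinTC1 = 20 / (4 - 2) = 10 and MinTC2 = 20 * 10 / 4 = 50, so the
  // resulting bound is 50, aligned up to 52 (the next multiple of VF).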
10129 
10130   // Now pick the larger minimum. If it is not a multiple of VF, choose the
10131   // next closest multiple of VF. This should partly compensate for ignoring
10132   // the epilogue cost.
10133   uint64_t MinTC = std::ceil(std::max(MinTC1, MinTC2));
10134   VF.MinProfitableTripCount = ElementCount::getFixed(alignTo(MinTC, IntVF));
10135 
10136   LLVM_DEBUG(
10137       dbgs() << "LV: Minimum required TC for runtime checks to be profitable:"
10138              << VF.MinProfitableTripCount << "\n");
10139 
10140   // Skip vectorization if the expected trip count is less than the minimum
10141   // required trip count.
10142   if (auto ExpectedTC = getSmallBestKnownTC(SE, L)) {
10143     if (ElementCount::isKnownLT(ElementCount::getFixed(*ExpectedTC),
10144                                 VF.MinProfitableTripCount)) {
10145       LLVM_DEBUG(dbgs() << "LV: Vectorization is not beneficial: expected "
10146                            "trip count < minimum profitable VF ("
10147                         << *ExpectedTC << " < " << VF.MinProfitableTripCount
10148                         << ")\n");
10149 
10150       return false;
10151     }
10152   }
10153   return true;
10154 }
10155 
10156 LoopVectorizePass::LoopVectorizePass(LoopVectorizeOptions Opts)
10157     : InterleaveOnlyWhenForced(Opts.InterleaveOnlyWhenForced ||
10158                                !EnableLoopInterleaving),
10159       VectorizeOnlyWhenForced(Opts.VectorizeOnlyWhenForced ||
10160                               !EnableLoopVectorization) {}
10161 
10162 bool LoopVectorizePass::processLoop(Loop *L) {
10163   assert((EnableVPlanNativePath || L->isInnermost()) &&
10164          "VPlan-native path is not enabled. Only process inner loops.");
10165 
10166 #ifndef NDEBUG
10167   const std::string DebugLocStr = getDebugLocString(L);
10168 #endif /* NDEBUG */
10169 
10170   LLVM_DEBUG(dbgs() << "\nLV: Checking a loop in '"
10171                     << L->getHeader()->getParent()->getName() << "' from "
10172                     << DebugLocStr << "\n");
10173 
10174   LoopVectorizeHints Hints(L, InterleaveOnlyWhenForced, *ORE, TTI);
10175 
10176   LLVM_DEBUG(
10177       dbgs() << "LV: Loop hints:"
10178              << " force="
10179              << (Hints.getForce() == LoopVectorizeHints::FK_Disabled
10180                      ? "disabled"
10181                      : (Hints.getForce() == LoopVectorizeHints::FK_Enabled
10182                             ? "enabled"
10183                             : "?"))
10184              << " width=" << Hints.getWidth()
10185              << " interleave=" << Hints.getInterleave() << "\n");
10186 
10187   // Function containing loop
10188   Function *F = L->getHeader()->getParent();
10189 
10190   // Looking at the diagnostic output is the only way to determine if a loop
10191   // was vectorized (other than looking at the IR or machine code), so it
10192   // is important to generate an optimization remark for each loop. Most of
10193   // these messages are generated as OptimizationRemarkAnalysis. Remarks
10194   // generated as OptimizationRemark and OptimizationRemarkMissed are
10195 // less verbose, reporting vectorized loops and unvectorized loops that may
10196   // benefit from vectorization, respectively.
10197 
10198   if (!Hints.allowVectorization(F, L, VectorizeOnlyWhenForced)) {
10199     LLVM_DEBUG(dbgs() << "LV: Loop hints prevent vectorization.\n");
10200     return false;
10201   }
10202 
10203   PredicatedScalarEvolution PSE(*SE, *L);
10204 
10205   // Check if it is legal to vectorize the loop.
10206   LoopVectorizationRequirements Requirements;
10207   LoopVectorizationLegality LVL(L, PSE, DT, TTI, TLI, F, *LAIs, LI, ORE,
10208                                 &Requirements, &Hints, DB, AC, BFI, PSI);
10209   if (!LVL.canVectorize(EnableVPlanNativePath)) {
10210     LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n");
10211     Hints.emitRemarkWithHints();
10212     return false;
10213   }
10214 
10215   // Entrance to the VPlan-native vectorization path. Outer loops are processed
10216   // here. They may require CFG and instruction level transformations before
10217   // even evaluating whether vectorization is profitable. Since we cannot modify
10218   // the incoming IR, we need to build VPlan upfront in the vectorization
10219   // pipeline.
10220   if (!L->isInnermost())
10221     return processLoopInVPlanNativePath(L, PSE, LI, DT, &LVL, TTI, TLI, DB, AC,
10222                                         ORE, BFI, PSI, Hints, Requirements);
10223 
10224   assert(L->isInnermost() && "Inner loop expected.");
10225 
10226   InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL.getLAI());
10227   bool UseInterleaved = TTI->enableInterleavedAccessVectorization();
10228 
10229   // If an override option has been passed in for interleaved accesses, use it.
10230   if (EnableInterleavedMemAccesses.getNumOccurrences() > 0)
10231     UseInterleaved = EnableInterleavedMemAccesses;
10232 
10233   // Analyze interleaved memory accesses.
10234   if (UseInterleaved)
10235     IAI.analyzeInterleaving(useMaskedInterleavedAccesses(*TTI));
10236 
10237   // Check the function attributes and profiles to find out if this function
10238   // should be optimized for size.
10239   ScalarEpilogueLowering SEL = getScalarEpilogueLowering(
10240       F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, LVL, &IAI);
10241 
10242   // Check the loop for a trip count threshold: vectorize loops with a tiny trip
10243   // count by optimizing for size, to minimize overheads.
10244   auto ExpectedTC = getSmallBestKnownTC(*SE, L);
10245   if (ExpectedTC && *ExpectedTC < TinyTripCountVectorThreshold) {
10246     LLVM_DEBUG(dbgs() << "LV: Found a loop with a very small trip count. "
10247                       << "This loop is worth vectorizing only if no scalar "
10248                       << "iteration overheads are incurred.");
10249     if (Hints.getForce() == LoopVectorizeHints::FK_Enabled)
10250       LLVM_DEBUG(dbgs() << " But vectorizing was explicitly forced.\n");
10251     else {
10252       if (*ExpectedTC > TTI->getMinTripCountTailFoldingThreshold()) {
10253         LLVM_DEBUG(dbgs() << "\n");
10254         SEL = CM_ScalarEpilogueNotAllowedLowTripLoop;
10255       } else {
10256         LLVM_DEBUG(dbgs() << " But the target considers the trip count too "
10257                              "small to consider vectorizing.\n");
10258         reportVectorizationFailure(
10259             "The trip count is below the minimal threshold value.",
10260             "loop trip count is too low, avoiding vectorization",
10261             "LowTripCount", ORE, L);
10262         Hints.emitRemarkWithHints();
10263         return false;
10264       }
10265     }
10266   }
10267 
10268   // Check the function attributes to see if implicit floats or vectors are
10269   // allowed.
10270   if (F->hasFnAttribute(Attribute::NoImplicitFloat)) {
10271     reportVectorizationFailure(
10272         "Can't vectorize when the NoImplicitFloat attribute is used",
10273         "loop not vectorized due to NoImplicitFloat attribute",
10274         "NoImplicitFloat", ORE, L);
10275     Hints.emitRemarkWithHints();
10276     return false;
10277   }
10278 
10279   // Check if the target supports potentially unsafe FP vectorization.
10280   // FIXME: Add a check for the type of safety issue (denormal, signaling)
10281   // for the target we're vectorizing for, to make sure none of the
10282   // additional fp-math flags can help.
10283   if (Hints.isPotentiallyUnsafe() &&
10284       TTI->isFPVectorizationPotentiallyUnsafe()) {
10285     reportVectorizationFailure(
10286         "Potentially unsafe FP op prevents vectorization",
10287         "loop not vectorized due to unsafe FP support.",
10288         "UnsafeFP", ORE, L);
10289     Hints.emitRemarkWithHints();
10290     return false;
10291   }
10292 
10293   bool AllowOrderedReductions;
10294   // If the flag is set, use that instead and override the TTI behaviour.
10295   if (ForceOrderedReductions.getNumOccurrences() > 0)
10296     AllowOrderedReductions = ForceOrderedReductions;
10297   else
10298     AllowOrderedReductions = TTI->enableOrderedReductions();
10299   if (!LVL.canVectorizeFPMath(AllowOrderedReductions)) {
10300     ORE->emit([&]() {
10301       auto *ExactFPMathInst = Requirements.getExactFPInst();
10302       return OptimizationRemarkAnalysisFPCommute(DEBUG_TYPE, "CantReorderFPOps",
10303                                                  ExactFPMathInst->getDebugLoc(),
10304                                                  ExactFPMathInst->getParent())
10305              << "loop not vectorized: cannot prove it is safe to reorder "
10306                 "floating-point operations";
10307     });
10308     LLVM_DEBUG(dbgs() << "LV: loop not vectorized: cannot prove it is safe to "
10309                          "reorder floating-point operations\n");
10310     Hints.emitRemarkWithHints();
10311     return false;
10312   }
10313 
10314   // Use the cost model.
10315   LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE,
10316                                 F, &Hints, IAI);
10317   CM.collectValuesToIgnore();
10318   CM.collectElementTypesForWidening();
10319 
10320   // Use the planner for vectorization.
10321   LoopVectorizationPlanner LVP(L, LI, TLI, TTI, &LVL, CM, IAI, PSE, Hints, ORE);
10322 
10323   // Get user vectorization factor and interleave count.
10324   ElementCount UserVF = Hints.getWidth();
10325   unsigned UserIC = Hints.getInterleave();
10326 
10327   // Plan how to best vectorize, return the best VF and its cost.
10328   std::optional<VectorizationFactor> MaybeVF = LVP.plan(UserVF, UserIC);
10329 
10330   VectorizationFactor VF = VectorizationFactor::Disabled();
10331   unsigned IC = 1;
10332 
10333   GeneratedRTChecks Checks(*PSE.getSE(), DT, LI, TTI,
10334                            F->getParent()->getDataLayout());
10335   if (MaybeVF) {
10336     VF = *MaybeVF;
10337     // Select the interleave count.
10338     IC = CM.selectInterleaveCount(VF.Width, VF.Cost);
10339 
10340     unsigned SelectedIC = std::max(IC, UserIC);
10341     //  Optimistically generate runtime checks if they are needed. Drop them if
10342     //  they turn out to not be profitable.
10343     if (VF.Width.isVector() || SelectedIC > 1)
10344       Checks.Create(L, *LVL.getLAI(), PSE.getPredicate(), VF.Width, SelectedIC);
10345 
10346     // Check if it is profitable to vectorize with runtime checks.
10347     bool ForceVectorization =
10348         Hints.getForce() == LoopVectorizeHints::FK_Enabled;
10349     if (!ForceVectorization &&
10350         !areRuntimeChecksProfitable(Checks, VF, CM.getVScaleForTuning(), L,
10351                                     *PSE.getSE())) {
10352       ORE->emit([&]() {
10353         return OptimizationRemarkAnalysisAliasing(
10354                    DEBUG_TYPE, "CantReorderMemOps", L->getStartLoc(),
10355                    L->getHeader())
10356                << "loop not vectorized: cannot prove it is safe to reorder "
10357                   "memory operations";
10358       });
10359       LLVM_DEBUG(dbgs() << "LV: Too many memory checks needed.\n");
10360       Hints.emitRemarkWithHints();
10361       return false;
10362     }
10363   }
10364 
10365   // Identify the diagnostic messages that should be produced.
10366   std::pair<StringRef, std::string> VecDiagMsg, IntDiagMsg;
10367   bool VectorizeLoop = true, InterleaveLoop = true;
10368   if (VF.Width.isScalar()) {
10369     LLVM_DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n");
10370     VecDiagMsg = std::make_pair(
10371         "VectorizationNotBeneficial",
10372         "the cost-model indicates that vectorization is not beneficial");
10373     VectorizeLoop = false;
10374   }
10375 
10376   if (!MaybeVF && UserIC > 1) {
10377     // Tell the user interleaving was avoided up-front, despite being explicitly
10378     // requested.
10379     LLVM_DEBUG(dbgs() << "LV: Ignoring UserIC, because vectorization and "
10380                          "interleaving should be avoided up front\n");
10381     IntDiagMsg = std::make_pair(
10382         "InterleavingAvoided",
10383         "Ignoring UserIC, because interleaving was avoided up front");
10384     InterleaveLoop = false;
10385   } else if (IC == 1 && UserIC <= 1) {
10386     // Tell the user interleaving is not beneficial.
10387     LLVM_DEBUG(dbgs() << "LV: Interleaving is not beneficial.\n");
10388     IntDiagMsg = std::make_pair(
10389         "InterleavingNotBeneficial",
10390         "the cost-model indicates that interleaving is not beneficial");
10391     InterleaveLoop = false;
10392     if (UserIC == 1) {
10393       IntDiagMsg.first = "InterleavingNotBeneficialAndDisabled";
10394       IntDiagMsg.second +=
10395           " and is explicitly disabled or interleave count is set to 1";
10396     }
10397   } else if (IC > 1 && UserIC == 1) {
10398     // Tell the user interleaving is beneficial, but it is explicitly disabled.
10399     LLVM_DEBUG(
10400         dbgs() << "LV: Interleaving is beneficial but is explicitly disabled.");
10401     IntDiagMsg = std::make_pair(
10402         "InterleavingBeneficialButDisabled",
10403         "the cost-model indicates that interleaving is beneficial "
10404         "but is explicitly disabled or interleave count is set to 1");
10405     InterleaveLoop = false;
10406   }
10407 
10408   // Override IC if user provided an interleave count.
10409   IC = UserIC > 0 ? UserIC : IC;
10410 
10411   // Emit diagnostic messages, if any.
10412   const char *VAPassName = Hints.vectorizeAnalysisPassName();
10413   if (!VectorizeLoop && !InterleaveLoop) {
10414     // Do not vectorize or interleave the loop.
10415     ORE->emit([&]() {
10416       return OptimizationRemarkMissed(VAPassName, VecDiagMsg.first,
10417                                       L->getStartLoc(), L->getHeader())
10418              << VecDiagMsg.second;
10419     });
10420     ORE->emit([&]() {
10421       return OptimizationRemarkMissed(LV_NAME, IntDiagMsg.first,
10422                                       L->getStartLoc(), L->getHeader())
10423              << IntDiagMsg.second;
10424     });
10425     return false;
10426   } else if (!VectorizeLoop && InterleaveLoop) {
10427     LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
10428     ORE->emit([&]() {
10429       return OptimizationRemarkAnalysis(VAPassName, VecDiagMsg.first,
10430                                         L->getStartLoc(), L->getHeader())
10431              << VecDiagMsg.second;
10432     });
10433   } else if (VectorizeLoop && !InterleaveLoop) {
10434     LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
10435                       << ") in " << DebugLocStr << '\n');
10436     ORE->emit([&]() {
10437       return OptimizationRemarkAnalysis(LV_NAME, IntDiagMsg.first,
10438                                         L->getStartLoc(), L->getHeader())
10439              << IntDiagMsg.second;
10440     });
10441   } else if (VectorizeLoop && InterleaveLoop) {
10442     LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
10443                       << ") in " << DebugLocStr << '\n');
10444     LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
10445   }
10446 
10447   bool DisableRuntimeUnroll = false;
10448   MDNode *OrigLoopID = L->getLoopID();
10449   {
10450     using namespace ore;
10451     if (!VectorizeLoop) {
10452       assert(IC > 1 && "interleave count should not be 1 or 0");
10453       // If we decided that it is not legal to vectorize the loop, then
10454       // interleave it.
      InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL,
                                 &CM, BFI, PSI, Checks);

      VPlan &BestPlan = LVP.getBestPlanFor(VF.Width);
      LVP.executePlan(VF.Width, IC, BestPlan, Unroller, DT, false);

      ORE->emit([&]() {
        return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(),
                                  L->getHeader())
               << "interleaved loop (interleaved count: "
               << NV("InterleaveCount", IC) << ")";
      });
    } else {
      // If we decided that it is *legal* to vectorize the loop, then do it.

      // Consider vectorizing the epilogue too if it's profitable.
      VectorizationFactor EpilogueVF =
          CM.selectEpilogueVectorizationFactor(VF.Width, LVP);
      if (EpilogueVF.Width.isVector()) {

        // The first pass vectorizes the main loop and creates a scalar epilogue
        // to be vectorized by executing the plan (potentially with a different
        // factor) again shortly afterwards.
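        // EPI records the VF/UF chosen for the main loop together with the
        // epilogue VF; the epilogue itself is not interleaved (UF of 1).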
        EpilogueLoopVectorizationInfo EPI(VF.Width, IC, EpilogueVF.Width, 1);
        EpilogueVectorizerMainLoop MainILV(L, PSE, LI, DT, TLI, TTI, AC, ORE,
                                           EPI, &LVL, &CM, BFI, PSI, Checks);

        VPlan &BestMainPlan = LVP.getBestPlanFor(EPI.MainLoopVF);
        LVP.executePlan(EPI.MainLoopVF, EPI.MainLoopUF, BestMainPlan, MainILV,
                        DT, true);
        ++LoopsVectorized;

        // Second pass vectorizes the epilogue and adjusts the control flow
        // edges from the first pass.
        EPI.MainLoopVF = EPI.EpilogueVF;
        EPI.MainLoopUF = EPI.EpilogueUF;
        EpilogueVectorizerEpilogueLoop EpilogILV(L, PSE, LI, DT, TLI, TTI, AC,
                                                 ORE, EPI, &LVL, &CM, BFI, PSI,
                                                 Checks);

        VPlan &BestEpiPlan = LVP.getBestPlanFor(EPI.EpilogueVF);
        VPRegionBlock *VectorLoop = BestEpiPlan.getVectorLoopRegion();
        VPBasicBlock *Header = VectorLoop->getEntryBasicBlock();
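        // Give the epilogue's vector body a distinct name so it can be told
        // apart from the main vector loop in the generated IR.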
        Header->setName("vec.epilog.vector.body");

        // Ensure that the start values for any VPWidenIntOrFpInductionRecipe,
        // VPWidenPointerInductionRecipe and VPReductionPHIRecipes are updated
        // before vectorizing the epilogue loop.
        for (VPRecipeBase &R : Header->phis()) {
          if (isa<VPCanonicalIVPHIRecipe>(&R))
            continue;

          Value *ResumeV = nullptr;
          // TODO: Move setting of resume values to prepareToExecute.
          if (auto *ReductionPhi = dyn_cast<VPReductionPHIRecipe>(&R)) {
            ResumeV = MainILV.getReductionResumeValue(
                ReductionPhi->getRecurrenceDescriptor());
          } else {
            // Create induction resume values for both widened pointer and
            // integer/fp inductions and update the start value of the induction
            // recipes to use the resume value.
            PHINode *IndPhi = nullptr;
            const InductionDescriptor *ID;
            if (auto *Ind = dyn_cast<VPWidenPointerInductionRecipe>(&R)) {
              IndPhi = cast<PHINode>(Ind->getUnderlyingValue());
              ID = &Ind->getInductionDescriptor();
            } else {
              auto *WidenInd = cast<VPWidenIntOrFpInductionRecipe>(&R);
              IndPhi = WidenInd->getPHINode();
              ID = &WidenInd->getInductionDescriptor();
            }

            ResumeV = MainILV.createInductionResumeValue(
                IndPhi, *ID, {EPI.MainLoopIterationCountCheck});
          }
          assert(ResumeV && "Must have a resume value");
          VPValue *StartVal = BestEpiPlan.getOrAddExternalDef(ResumeV);
          cast<VPHeaderPHIRecipe>(&R)->setStartValue(StartVal);
        }

        LVP.executePlan(EPI.EpilogueVF, EPI.EpilogueUF, BestEpiPlan, EpilogILV,
                        DT, true);
        ++LoopsEpilogueVectorized;

        if (!MainILV.areSafetyChecksAdded())
          DisableRuntimeUnroll = true;
      } else {
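        // No profitable epilogue VF was found, so vectorize the loop with a
        // single vector body using the VF and IC selected above.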
        InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width,
                               VF.MinProfitableTripCount, IC, &LVL, &CM, BFI,
                               PSI, Checks);

        VPlan &BestPlan = LVP.getBestPlanFor(VF.Width);
        LVP.executePlan(VF.Width, IC, BestPlan, LB, DT, false);
        ++LoopsVectorized;

        // Add metadata to disable runtime unrolling of the scalar loop when
        // there are no runtime checks for strides and memory. A scalar loop
        // that is rarely executed is not worth unrolling.
        if (!LB.areSafetyChecksAdded())
          DisableRuntimeUnroll = true;
      }
      // Report the vectorization decision.
      ORE->emit([&]() {
        return OptimizationRemark(LV_NAME, "Vectorized", L->getStartLoc(),
                                  L->getHeader())
               << "vectorized loop (vectorization width: "
               << NV("VectorizationFactor", VF.Width)
               << ", interleaved count: " << NV("InterleaveCount", IC) << ")";
      });
    }

    if (ORE->allowExtraAnalysis(LV_NAME))
      checkMixedPrecision(L, ORE);
  }

  std::optional<MDNode *> RemainderLoopID =
      makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
                                      LLVMLoopVectorizeFollowupEpilogue});
  if (RemainderLoopID) {
    L->setLoopID(*RemainderLoopID);
  } else {
    if (DisableRuntimeUnroll)
      AddRuntimeUnrollDisableMetaData(L);

    // Mark the loop as already vectorized to avoid vectorizing again.
    Hints.setAlreadyVectorized();
  }

  assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs()));
  return true;
}

LoopVectorizeResult LoopVectorizePass::runImpl(
    Function &F, ScalarEvolution &SE_, LoopInfo &LI_, TargetTransformInfo &TTI_,
    DominatorTree &DT_, BlockFrequencyInfo &BFI_, TargetLibraryInfo *TLI_,
    DemandedBits &DB_, AssumptionCache &AC_, LoopAccessInfoManager &LAIs_,
    OptimizationRemarkEmitter &ORE_, ProfileSummaryInfo *PSI_) {
  SE = &SE_;
  LI = &LI_;
  TTI = &TTI_;
  DT = &DT_;
  BFI = &BFI_;
  TLI = TLI_;
  AC = &AC_;
  LAIs = &LAIs_;
  DB = &DB_;
  ORE = &ORE_;
  PSI = PSI_;

  // Don't attempt if
  // 1. the target claims to have no vector registers, and
  // 2. interleaving won't help ILP.
  //
  // The second condition is necessary because, even if the target has no
  // vector registers, loop vectorization may still enable scalar
  // interleaving.
  if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true)) &&
      TTI->getMaxInterleaveFactor(1) < 2)
    return LoopVectorizeResult(false, false);

  bool Changed = false, CFGChanged = false;

  // The vectorizer requires loops to be in simplified form.
  // Since simplification may add new inner loops, it has to run before the
  // legality and profitability checks. This means running the loop vectorizer
  // will simplify all loops, regardless of whether anything ends up being
  // vectorized.
  for (const auto &L : *LI)
    Changed |= CFGChanged |=
        simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */);

  // Build up a worklist of inner-loops to vectorize. This is necessary as
  // the act of vectorizing or partially unrolling a loop creates new loops
  // and can invalidate iterators across the loops.
  SmallVector<Loop *, 8> Worklist;

  for (Loop *L : *LI)
    collectSupportedLoops(*L, LI, ORE, Worklist);

  LoopsAnalyzed += Worklist.size();

  // Now walk the identified inner loops.
  while (!Worklist.empty()) {
    Loop *L = Worklist.pop_back_val();

    // For the inner loops we actually process, form LCSSA to simplify the
    // transform.
    Changed |= formLCSSARecursively(*L, *DT, LI, SE);

    Changed |= CFGChanged |= processLoop(L);

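    // Cached LoopAccessInfo may refer to loops that were just transformed, so
    // drop the cache once anything has changed.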
    if (Changed)
      LAIs->clear();
  }

  // Process each loop nest in the function.
  return LoopVectorizeResult(Changed, CFGChanged);
}

PreservedAnalyses LoopVectorizePass::run(Function &F,
                                         FunctionAnalysisManager &AM) {
  auto &LI = AM.getResult<LoopAnalysis>(F);
  // There are no loops in the function. Return before computing other
  // expensive analyses.
  if (LI.empty())
    return PreservedAnalyses::all();
  auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
  auto &TTI = AM.getResult<TargetIRAnalysis>(F);
  auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
  auto &BFI = AM.getResult<BlockFrequencyAnalysis>(F);
  auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
  auto &AC = AM.getResult<AssumptionAnalysis>(F);
  auto &DB = AM.getResult<DemandedBitsAnalysis>(F);
  auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F);

  LoopAccessInfoManager &LAIs = AM.getResult<LoopAccessAnalysis>(F);
  auto &MAMProxy = AM.getResult<ModuleAnalysisManagerFunctionProxy>(F);
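  // ProfileSummaryInfo is a module analysis, so only an already-computed
  // (cached) result is available here; PSI may be null.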
  ProfileSummaryInfo *PSI =
      MAMProxy.getCachedResult<ProfileSummaryAnalysis>(*F.getParent());
  LoopVectorizeResult Result =
      runImpl(F, SE, LI, TTI, DT, BFI, &TLI, DB, AC, LAIs, ORE, PSI);
  if (!Result.MadeAnyChange)
    return PreservedAnalyses::all();
  PreservedAnalyses PA;

  // We currently do not preserve loopinfo/dominator analyses with outer loop
  // vectorization. Until this is addressed, mark these analyses as preserved
  // only for non-VPlan-native path.
  // TODO: Preserve Loop and Dominator analyses for VPlan-native path.
  if (!EnableVPlanNativePath) {
    PA.preserve<LoopAnalysis>();
    PA.preserve<DominatorTreeAnalysis>();
  }

  if (Result.MadeCFGChange) {
    // Making CFG changes likely means a loop got vectorized. Indicate that
    // extra simplification passes should be run.
    // TODO: MadeCFGChange is not a perfect proxy. Extra passes should only
    // be run if runtime checks have been added.
    AM.getResult<ShouldRunExtraVectorPasses>(F);
    PA.preserve<ShouldRunExtraVectorPasses>();
  } else {
    PA.preserveSet<CFGAnalyses>();
  }
  return PA;
}

void LoopVectorizePass::printPipeline(
    raw_ostream &OS, function_ref<StringRef(StringRef)> MapClassName2PassName) {
  static_cast<PassInfoMixin<LoopVectorizePass> *>(this)->printPipeline(
      OS, MapClassName2PassName);

  OS << "<";
  OS << (InterleaveOnlyWhenForced ? "" : "no-") << "interleave-forced-only;";
  OS << (VectorizeOnlyWhenForced ? "" : "no-") << "vectorize-forced-only;";
  OS << ">";
}