1 //===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
10 // and generates target-independent LLVM-IR.
11 // The vectorizer uses the TargetTransformInfo analysis to estimate the costs
12 // of instructions in order to estimate the profitability of vectorization.
13 //
14 // The loop vectorizer combines consecutive loop iterations into a single
15 // 'wide' iteration. After this transformation the index is incremented
16 // by the SIMD vector width, and not by one.
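//
// For example (illustrative only), a scalar loop such as
//   for (i = 0; i < n; i++) C[i] = A[i] + B[i];
// is conceptually rewritten so that each vector iteration processes VF
// elements at once:
//   for (i = 0; i < n; i += VF) C[i..i+VF-1] = A[i..i+VF-1] + B[i..i+VF-1];
// with the remaining n % VF iterations handled by a scalar epilogue loop
// (or by tail folding, see below).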
17 //
18 // This pass has four parts:
19 // 1. The main loop pass that drives the different parts.
20 // 2. LoopVectorizationLegality - A unit that checks for the legality
21 //    of the vectorization.
22 // 3. InnerLoopVectorizer - A unit that performs the actual
23 //    widening of instructions.
24 // 4. LoopVectorizationCostModel - A unit that checks for the profitability
25 //    of vectorization. It decides on the optimal vector width, which
26 //    can be one, if vectorization is not profitable.
27 //
28 // There is a development effort going on to migrate the loop vectorizer to the
29 // VPlan infrastructure and to introduce outer loop vectorization support (see
30 // docs/VectorizationPlan.rst and
31 // http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html). For this
32 // purpose, we temporarily introduced the VPlan-native vectorization path: an
33 // alternative vectorization path that is natively implemented on top of the
34 // VPlan infrastructure. See EnableVPlanNativePath for enabling.
35 //
36 //===----------------------------------------------------------------------===//
37 //
38 // The reduction-variable vectorization is based on the paper:
39 //  D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
40 //
41 // Variable uniformity checks are inspired by:
42 //  Karrenberg, R. and Hack, S. Whole Function Vectorization.
43 //
44 // The interleaved access vectorization is based on the paper:
45 //  Dorit Nuzman, Ira Rosen and Ayal Zaks.  Auto-Vectorization of Interleaved
46 //  Data for SIMD
47 //
48 // Other ideas/concepts are from:
49 //  A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
50 //
51 //  S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua.  An Evaluation of
52 //  Vectorizing Compilers.
53 //
54 //===----------------------------------------------------------------------===//
55 
56 #include "llvm/Transforms/Vectorize/LoopVectorize.h"
57 #include "LoopVectorizationPlanner.h"
58 #include "VPRecipeBuilder.h"
59 #include "VPlan.h"
60 #include "VPlanAnalysis.h"
61 #include "VPlanHCFGBuilder.h"
62 #include "VPlanTransforms.h"
63 #include "llvm/ADT/APInt.h"
64 #include "llvm/ADT/ArrayRef.h"
65 #include "llvm/ADT/DenseMap.h"
66 #include "llvm/ADT/DenseMapInfo.h"
67 #include "llvm/ADT/Hashing.h"
68 #include "llvm/ADT/MapVector.h"
69 #include "llvm/ADT/STLExtras.h"
70 #include "llvm/ADT/SmallPtrSet.h"
71 #include "llvm/ADT/SmallSet.h"
72 #include "llvm/ADT/SmallVector.h"
73 #include "llvm/ADT/Statistic.h"
74 #include "llvm/ADT/StringRef.h"
75 #include "llvm/ADT/Twine.h"
76 #include "llvm/ADT/iterator_range.h"
77 #include "llvm/Analysis/AssumptionCache.h"
78 #include "llvm/Analysis/BasicAliasAnalysis.h"
79 #include "llvm/Analysis/BlockFrequencyInfo.h"
80 #include "llvm/Analysis/CFG.h"
81 #include "llvm/Analysis/CodeMetrics.h"
82 #include "llvm/Analysis/DemandedBits.h"
83 #include "llvm/Analysis/GlobalsModRef.h"
84 #include "llvm/Analysis/LoopAccessAnalysis.h"
85 #include "llvm/Analysis/LoopAnalysisManager.h"
86 #include "llvm/Analysis/LoopInfo.h"
87 #include "llvm/Analysis/LoopIterator.h"
88 #include "llvm/Analysis/OptimizationRemarkEmitter.h"
89 #include "llvm/Analysis/ProfileSummaryInfo.h"
90 #include "llvm/Analysis/ScalarEvolution.h"
91 #include "llvm/Analysis/ScalarEvolutionExpressions.h"
92 #include "llvm/Analysis/TargetLibraryInfo.h"
93 #include "llvm/Analysis/TargetTransformInfo.h"
94 #include "llvm/Analysis/ValueTracking.h"
95 #include "llvm/Analysis/VectorUtils.h"
96 #include "llvm/IR/Attributes.h"
97 #include "llvm/IR/BasicBlock.h"
98 #include "llvm/IR/CFG.h"
99 #include "llvm/IR/Constant.h"
100 #include "llvm/IR/Constants.h"
101 #include "llvm/IR/DataLayout.h"
102 #include "llvm/IR/DebugInfo.h"
103 #include "llvm/IR/DebugInfoMetadata.h"
104 #include "llvm/IR/DebugLoc.h"
105 #include "llvm/IR/DerivedTypes.h"
106 #include "llvm/IR/DiagnosticInfo.h"
107 #include "llvm/IR/Dominators.h"
108 #include "llvm/IR/Function.h"
109 #include "llvm/IR/IRBuilder.h"
110 #include "llvm/IR/InstrTypes.h"
111 #include "llvm/IR/Instruction.h"
112 #include "llvm/IR/Instructions.h"
113 #include "llvm/IR/IntrinsicInst.h"
114 #include "llvm/IR/Intrinsics.h"
115 #include "llvm/IR/MDBuilder.h"
116 #include "llvm/IR/Metadata.h"
117 #include "llvm/IR/Module.h"
118 #include "llvm/IR/Operator.h"
119 #include "llvm/IR/PatternMatch.h"
120 #include "llvm/IR/ProfDataUtils.h"
121 #include "llvm/IR/Type.h"
122 #include "llvm/IR/Use.h"
123 #include "llvm/IR/User.h"
124 #include "llvm/IR/Value.h"
125 #include "llvm/IR/ValueHandle.h"
126 #include "llvm/IR/Verifier.h"
127 #include "llvm/Support/Casting.h"
128 #include "llvm/Support/CommandLine.h"
129 #include "llvm/Support/Compiler.h"
130 #include "llvm/Support/Debug.h"
131 #include "llvm/Support/ErrorHandling.h"
132 #include "llvm/Support/InstructionCost.h"
133 #include "llvm/Support/MathExtras.h"
134 #include "llvm/Support/raw_ostream.h"
135 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
136 #include "llvm/Transforms/Utils/InjectTLIMappings.h"
137 #include "llvm/Transforms/Utils/LoopSimplify.h"
138 #include "llvm/Transforms/Utils/LoopUtils.h"
139 #include "llvm/Transforms/Utils/LoopVersioning.h"
140 #include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
141 #include "llvm/Transforms/Utils/SizeOpts.h"
142 #include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
143 #include <algorithm>
144 #include <cassert>
145 #include <cmath>
146 #include <cstdint>
147 #include <functional>
148 #include <iterator>
149 #include <limits>
150 #include <map>
151 #include <memory>
152 #include <string>
153 #include <tuple>
154 #include <utility>
155 
156 using namespace llvm;
157 
158 #define LV_NAME "loop-vectorize"
159 #define DEBUG_TYPE LV_NAME
160 
161 #ifndef NDEBUG
162 const char VerboseDebug[] = DEBUG_TYPE "-verbose";
163 #endif
164 
165 /// @{
166 /// Metadata attribute names
167 const char LLVMLoopVectorizeFollowupAll[] = "llvm.loop.vectorize.followup_all";
168 const char LLVMLoopVectorizeFollowupVectorized[] =
169     "llvm.loop.vectorize.followup_vectorized";
170 const char LLVMLoopVectorizeFollowupEpilogue[] =
171     "llvm.loop.vectorize.followup_epilogue";
172 /// @}
173 
174 STATISTIC(LoopsVectorized, "Number of loops vectorized");
175 STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");
176 STATISTIC(LoopsEpilogueVectorized, "Number of epilogues vectorized");
177 
178 static cl::opt<bool> EnableEpilogueVectorization(
179     "enable-epilogue-vectorization", cl::init(true), cl::Hidden,
180     cl::desc("Enable vectorization of epilogue loops."));
181 
182 static cl::opt<unsigned> EpilogueVectorizationForceVF(
183     "epilogue-vectorization-force-VF", cl::init(1), cl::Hidden,
184     cl::desc("When epilogue vectorization is enabled, and a value greater than "
185              "1 is specified, forces the given VF for all applicable epilogue "
186              "loops."));
187 
188 static cl::opt<unsigned> EpilogueVectorizationMinVF(
189     "epilogue-vectorization-minimum-VF", cl::init(16), cl::Hidden,
190     cl::desc("Only loops with vectorization factor equal to or larger than "
191              "the specified value are considered for epilogue vectorization."));
192 
193 /// Loops with a known constant trip count below this number are vectorized only
194 /// if no scalar iteration overheads are incurred.
195 static cl::opt<unsigned> TinyTripCountVectorThreshold(
196     "vectorizer-min-trip-count", cl::init(16), cl::Hidden,
197     cl::desc("Loops with a constant trip count that is smaller than this "
198              "value are vectorized only if no scalar iteration overheads "
199              "are incurred."));
200 
201 static cl::opt<unsigned> VectorizeMemoryCheckThreshold(
202     "vectorize-memory-check-threshold", cl::init(128), cl::Hidden,
203     cl::desc("The maximum allowed number of runtime memory checks"));
204 
205 // The option prefer-predicate-over-epilogue indicates that an epilogue is
206 // undesired and that predication is preferred. I.e., the vectorizer will try
207 // to fold the tail loop (epilogue) into the vector body and predicate the
208 // instructions accordingly. If tail-folding fails, there are different
209 // fallback strategies depending on these values:
210 namespace PreferPredicateTy {
211   enum Option {
212     ScalarEpilogue = 0,
213     PredicateElseScalarEpilogue,
214     PredicateOrDontVectorize
215   };
216 } // namespace PreferPredicateTy
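
// As an illustrative sketch of tail folding: a loop of N iterations runs in
// ceil(N / VF) vector iterations, and memory accesses are predicated with a
// lane mask (e.g. one produced by the llvm.get.active.lane.mask intrinsic),
// so no scalar remainder loop is needed.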
217 
218 static cl::opt<PreferPredicateTy::Option> PreferPredicateOverEpilogue(
219     "prefer-predicate-over-epilogue",
220     cl::init(PreferPredicateTy::ScalarEpilogue),
221     cl::Hidden,
222     cl::desc("Tail-folding and predication preferences over creating a scalar "
223              "epilogue loop."),
224     cl::values(clEnumValN(PreferPredicateTy::ScalarEpilogue,
225                          "scalar-epilogue",
226                          "Don't tail-predicate loops, create scalar epilogue"),
227               clEnumValN(PreferPredicateTy::PredicateElseScalarEpilogue,
228                          "predicate-else-scalar-epilogue",
229                          "prefer tail-folding, create scalar epilogue if tail "
230                          "folding fails."),
231               clEnumValN(PreferPredicateTy::PredicateOrDontVectorize,
232                          "predicate-dont-vectorize",
233                          "prefer tail-folding, don't attempt vectorization if "
234                          "tail-folding fails.")));
235 
236 static cl::opt<TailFoldingStyle> ForceTailFoldingStyle(
237     "force-tail-folding-style", cl::desc("Force the tail folding style"),
238     cl::init(TailFoldingStyle::None),
239     cl::values(
240         clEnumValN(TailFoldingStyle::None, "none", "Disable tail folding"),
241         clEnumValN(
242             TailFoldingStyle::Data, "data",
243             "Create lane mask for data only, using active.lane.mask intrinsic"),
244         clEnumValN(TailFoldingStyle::DataWithoutLaneMask,
245                    "data-without-lane-mask",
246                    "Create lane mask with compare/stepvector"),
247         clEnumValN(TailFoldingStyle::DataAndControlFlow, "data-and-control",
248                    "Create lane mask using active.lane.mask intrinsic, and use "
249                    "it for both data and control flow"),
250         clEnumValN(
251             TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck,
252             "data-and-control-without-rt-check",
253             "Similar to data-and-control, but remove the runtime check")));
254 
255 static cl::opt<bool> MaximizeBandwidth(
256     "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
257     cl::desc("Maximize bandwidth when selecting the vectorization factor, "
258              "which will be determined by the smallest type in the loop."));
259 
260 static cl::opt<bool> EnableInterleavedMemAccesses(
261     "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden,
262     cl::desc("Enable vectorization on interleaved memory accesses in a loop"));
263 
264 /// An interleave-group may need masking if it resides in a block that needs
265 /// predication, or in order to mask away gaps.
266 static cl::opt<bool> EnableMaskedInterleavedMemAccesses(
267     "enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden,
268     cl::desc("Enable vectorization on masked interleaved memory accesses in a loop"));
269 
270 static cl::opt<unsigned> TinyTripCountInterleaveThreshold(
271     "tiny-trip-count-interleave-threshold", cl::init(128), cl::Hidden,
272     cl::desc("We don't interleave loops with an estimated constant trip count "
273              "below this number"));
274 
275 static cl::opt<unsigned> ForceTargetNumScalarRegs(
276     "force-target-num-scalar-regs", cl::init(0), cl::Hidden,
277     cl::desc("A flag that overrides the target's number of scalar registers."));
278 
279 static cl::opt<unsigned> ForceTargetNumVectorRegs(
280     "force-target-num-vector-regs", cl::init(0), cl::Hidden,
281     cl::desc("A flag that overrides the target's number of vector registers."));
282 
283 static cl::opt<unsigned> ForceTargetMaxScalarInterleaveFactor(
284     "force-target-max-scalar-interleave", cl::init(0), cl::Hidden,
285     cl::desc("A flag that overrides the target's max interleave factor for "
286              "scalar loops."));
287 
288 static cl::opt<unsigned> ForceTargetMaxVectorInterleaveFactor(
289     "force-target-max-vector-interleave", cl::init(0), cl::Hidden,
290     cl::desc("A flag that overrides the target's max interleave factor for "
291              "vectorized loops."));
292 
293 static cl::opt<unsigned> ForceTargetInstructionCost(
294     "force-target-instruction-cost", cl::init(0), cl::Hidden,
295     cl::desc("A flag that overrides the target's expected cost for "
296              "an instruction to a single constant value. Mostly "
297              "useful for getting consistent testing."));
298 
299 static cl::opt<bool> ForceTargetSupportsScalableVectors(
300     "force-target-supports-scalable-vectors", cl::init(false), cl::Hidden,
301     cl::desc(
302         "Pretend that scalable vectors are supported, even if the target does "
303         "not support them. This flag should only be used for testing."));
304 
305 static cl::opt<unsigned> SmallLoopCost(
306     "small-loop-cost", cl::init(20), cl::Hidden,
307     cl::desc(
308         "The cost of a loop that is considered 'small' by the interleaver."));
309 
310 static cl::opt<bool> LoopVectorizeWithBlockFrequency(
311     "loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden,
312     cl::desc("Enable the use of the block frequency analysis to access PGO "
313              "heuristics minimizing code growth in cold regions and being more "
314              "aggressive in hot regions."));
315 
316 // Runtime interleave loops for load/store throughput.
317 static cl::opt<bool> EnableLoadStoreRuntimeInterleave(
318     "enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden,
319     cl::desc(
320         "Enable runtime interleaving until load/store ports are saturated"));
321 
322 /// Interleave small loops with scalar reductions.
323 static cl::opt<bool> InterleaveSmallLoopScalarReduction(
324     "interleave-small-loop-scalar-reduction", cl::init(false), cl::Hidden,
325     cl::desc("Enable interleaving for loops with small iteration counts that "
326              "contain scalar reductions to expose ILP."));
327 
328 /// The number of stores in a loop that are allowed to need predication.
329 static cl::opt<unsigned> NumberOfStoresToPredicate(
330     "vectorize-num-stores-pred", cl::init(1), cl::Hidden,
331     cl::desc("Max number of stores to be predicated behind an if."));
332 
333 static cl::opt<bool> EnableIndVarRegisterHeur(
334     "enable-ind-var-reg-heur", cl::init(true), cl::Hidden,
335     cl::desc("Count the induction variable only once when interleaving"));
336 
337 static cl::opt<bool> EnableCondStoresVectorization(
338     "enable-cond-stores-vec", cl::init(true), cl::Hidden,
339     cl::desc("Enable if-predication of stores during vectorization."));
340 
341 static cl::opt<unsigned> MaxNestedScalarReductionIC(
342     "max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden,
343     cl::desc("The maximum interleave count to use when interleaving a scalar "
344              "reduction in a nested loop."));
345 
346 static cl::opt<bool>
347     PreferInLoopReductions("prefer-inloop-reductions", cl::init(false),
348                            cl::Hidden,
349                            cl::desc("Prefer in-loop vector reductions, "
350                                     "overriding the target's preference."));
351 
352 static cl::opt<bool> ForceOrderedReductions(
353     "force-ordered-reductions", cl::init(false), cl::Hidden,
354     cl::desc("Enable the vectorization of loops with in-order (strict) "
355              "FP reductions"));
356 
357 static cl::opt<bool> PreferPredicatedReductionSelect(
358     "prefer-predicated-reduction-select", cl::init(false), cl::Hidden,
359     cl::desc(
360         "Prefer predicating a reduction operation over an after-loop select."));
361 
362 namespace llvm {
363 cl::opt<bool> EnableVPlanNativePath(
364     "enable-vplan-native-path", cl::Hidden,
365     cl::desc("Enable VPlan-native vectorization path with "
366              "support for outer loop vectorization."));
367 }
368 
369 // This flag enables the stress testing of the VPlan H-CFG construction in the
370 // VPlan-native vectorization path. It must be used in conjunction with
371 // -enable-vplan-native-path. -vplan-verify-hcfg can also be used to enable the
372 // verification of the H-CFGs built.
373 static cl::opt<bool> VPlanBuildStressTest(
374     "vplan-build-stress-test", cl::init(false), cl::Hidden,
375     cl::desc(
376         "Build VPlan for every supported loop nest in the function and bail "
377         "out right after the build (stress test the VPlan H-CFG construction "
378         "in the VPlan-native vectorization path)."));
379 
380 cl::opt<bool> llvm::EnableLoopInterleaving(
381     "interleave-loops", cl::init(true), cl::Hidden,
382     cl::desc("Enable loop interleaving in Loop vectorization passes"));
383 cl::opt<bool> llvm::EnableLoopVectorization(
384     "vectorize-loops", cl::init(true), cl::Hidden,
385     cl::desc("Run the Loop vectorization passes"));
386 
387 static cl::opt<bool> PrintVPlansInDotFormat(
388     "vplan-print-in-dot-format", cl::Hidden,
389     cl::desc("Use dot format instead of plain text when dumping VPlans"));
390 
391 static cl::opt<cl::boolOrDefault> ForceSafeDivisor(
392     "force-widen-divrem-via-safe-divisor", cl::Hidden,
393     cl::desc(
394         "Override cost based safe divisor widening for div/rem instructions"));
395 
396 static cl::opt<bool> UseWiderVFIfCallVariantsPresent(
397     "vectorizer-maximize-bandwidth-for-vector-calls", cl::init(true),
398     cl::Hidden,
399     cl::desc("Try wider VFs if they enable the use of vector variants"));
400 
401 // Likelihood of bypassing the vectorized loop because assumptions about SCEV
402 // variables not overflowing do not hold. See `emitSCEVChecks`.
403 static constexpr uint32_t SCEVCheckBypassWeights[] = {1, 127};
404 // Likelihood of bypassing the vectorized loop because pointers overlap. See
405 // `emitMemRuntimeChecks`.
406 static constexpr uint32_t MemCheckBypassWeights[] = {1, 127};
407 // Likelihood of bypassing the vectorized loop because there are zero trips left
408 // after prolog. See `emitIterationCountCheck`.
409 static constexpr uint32_t MinItersBypassWeights[] = {1, 127};
410 
411 /// A helper function that returns true if the given type is irregular. The
412 /// type is irregular if its allocated size doesn't equal the store size of an
413 /// element of the corresponding vector type.
414 static bool hasIrregularType(Type *Ty, const DataLayout &DL) {
415   // Determine if an array of N elements of type Ty is "bitcast compatible"
416   // with a <N x Ty> vector.
417   // This is only true if there is no padding between the array elements.
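  // For example (assuming a typical DataLayout), i1 has an allocation size of
  // 8 bits but a type size of 1 bit, so an array of N i1 values is not
  // bitcast-compatible with <N x i1> and i1 is treated as irregular here.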
418   return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty);
419 }
420 
421 /// A helper function that returns the reciprocal of the block probability of
422 /// predicated blocks. If we return X, we are assuming the predicated block
423 /// will execute once for every X iterations of the loop header.
424 ///
425 /// TODO: We should use actual block probability here, if available. Currently,
426 ///       we always assume predicated blocks have a 50% chance of executing.
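///
/// For example, with the current value of 2 the cost model assumes a
/// predicated block executes on every other header iteration, so callers
/// scale the block's cost by 1/2 (an illustrative description of the intended
/// use, not additional functionality).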
427 static unsigned getReciprocalPredBlockProb() { return 2; }
428 
429 /// Returns "best known" trip count for the specified loop \p L as defined by
430 /// the following procedure:
431 ///   1) Returns exact trip count if it is known.
432 ///   2) Returns expected trip count according to profile data if any.
433 ///   3) Returns upper bound estimate if it is known.
434 ///   4) Returns std::nullopt if all of the above failed.
435 static std::optional<unsigned> getSmallBestKnownTC(ScalarEvolution &SE,
436                                                    Loop *L) {
437   // Check if exact trip count is known.
438   if (unsigned ExpectedTC = SE.getSmallConstantTripCount(L))
439     return ExpectedTC;
440 
441   // Check if there is an expected trip count available from profile data.
442   if (LoopVectorizeWithBlockFrequency)
443     if (auto EstimatedTC = getLoopEstimatedTripCount(L))
444       return *EstimatedTC;
445 
446   // Check if upper bound estimate is known.
447   if (unsigned ExpectedTC = SE.getSmallConstantMaxTripCount(L))
448     return ExpectedTC;
449 
450   return std::nullopt;
451 }
452 
453 /// Return a vector containing interleaved elements from multiple
454 /// smaller input vectors.
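/// For example (fixed-width sketch), interleaving the two vectors
///   <A0, A1, A2, A3> and <B0, B1, B2, B3>
/// produces the wide vector
///   <A0, B0, A1, B1, A2, B2, A3, B3>.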
455 static Value *interleaveVectors(IRBuilderBase &Builder, ArrayRef<Value *> Vals,
456                                 const Twine &Name) {
457   unsigned Factor = Vals.size();
458   assert(Factor > 1 && "Tried to interleave invalid number of vectors");
459 
460   VectorType *VecTy = cast<VectorType>(Vals[0]->getType());
461 #ifndef NDEBUG
462   for (Value *Val : Vals)
463     assert(Val->getType() == VecTy && "Tried to interleave mismatched types");
464 #endif
465 
466   // Scalable vectors cannot use arbitrary shufflevectors (only splats), so
467   // must use intrinsics to interleave.
468   if (VecTy->isScalableTy()) {
469     VectorType *WideVecTy = VectorType::getDoubleElementsVectorType(VecTy);
470     return Builder.CreateIntrinsic(
471         WideVecTy, Intrinsic::experimental_vector_interleave2, Vals,
472         /*FMFSource=*/nullptr, Name);
473   }
474 
475   // Fixed length. Start by concatenating all vectors into a wide vector.
476   Value *WideVec = concatenateVectors(Builder, Vals);
477 
478   // Interleave the elements into the wide vector.
479   const unsigned NumElts = VecTy->getElementCount().getFixedValue();
480   return Builder.CreateShuffleVector(
481       WideVec, createInterleaveMask(NumElts, Factor), Name);
482 }
483 
484 namespace {
485 // Forward declare GeneratedRTChecks.
486 class GeneratedRTChecks;
487 
488 using SCEV2ValueTy = DenseMap<const SCEV *, Value *>;
489 } // namespace
490 
491 namespace llvm {
492 
493 AnalysisKey ShouldRunExtraVectorPasses::Key;
494 
495 /// InnerLoopVectorizer vectorizes loops which contain only one basic
496 /// block to a specified vectorization factor (VF).
497 /// This class performs the widening of scalars into vectors, or multiple
498 /// scalars. This class also implements the following features:
499 /// * It inserts an epilogue loop for handling loops that don't have iteration
500 ///   counts that are known to be a multiple of the vectorization factor.
501 /// * It handles the code generation for reduction variables.
502 /// * Scalarization (implementation using scalars) of un-vectorizable
503 ///   instructions.
504 /// InnerLoopVectorizer does not perform any vectorization-legality
505 /// checks, and relies on the caller to check for the different legality
506 /// aspects. The InnerLoopVectorizer relies on the
507 /// LoopVectorizationLegality class to provide information about the induction
508 /// and reduction variables that were found to a given vectorization factor.
509 class InnerLoopVectorizer {
510 public:
511   InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
512                       LoopInfo *LI, DominatorTree *DT,
513                       const TargetLibraryInfo *TLI,
514                       const TargetTransformInfo *TTI, AssumptionCache *AC,
515                       OptimizationRemarkEmitter *ORE, ElementCount VecWidth,
516                       ElementCount MinProfitableTripCount,
517                       unsigned UnrollFactor, LoopVectorizationLegality *LVL,
518                       LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
519                       ProfileSummaryInfo *PSI, GeneratedRTChecks &RTChecks)
520       : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
521         AC(AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor),
522         Builder(PSE.getSE()->getContext()), Legal(LVL), Cost(CM), BFI(BFI),
523         PSI(PSI), RTChecks(RTChecks) {
524     // Query this against the original loop and save it here because the profile
525     // of the original loop header may change as the transformation happens.
526     OptForSizeBasedOnProfile = llvm::shouldOptimizeForSize(
527         OrigLoop->getHeader(), PSI, BFI, PGSOQueryType::IRPass);
528 
529     if (MinProfitableTripCount.isZero())
530       this->MinProfitableTripCount = VecWidth;
531     else
532       this->MinProfitableTripCount = MinProfitableTripCount;
533   }
534 
535   virtual ~InnerLoopVectorizer() = default;
536 
537   /// Create a new empty loop that will contain vectorized instructions later
538   /// on, while the old loop will be used as the scalar remainder. Control flow
539   /// is generated around the vectorized (and scalar epilogue) loops consisting
540   /// of various checks and bypasses. Return the pre-header block of the new
541   /// loop and the start value for the canonical induction, if it is != 0. The
542   /// latter is the case when vectorizing the epilogue loop. In the case of
543   /// epilogue vectorization, this function is overridden to handle the more
544   /// complex control flow around the loops.  \p ExpandedSCEVs is used to
545   /// look up SCEV expansions for expressions needed during skeleton creation.
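  ///
  /// Roughly (simplified sketch): the iteration count check and the runtime
  /// SCEV/memory checks either bypass to the scalar preheader or fall through
  /// towards the vector preheader; after the vector loop, the middle block
  /// decides whether any remaining iterations run in the scalar epilogue or
  /// whether control proceeds directly to the exit block.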
546   virtual std::pair<BasicBlock *, Value *>
547   createVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs);
548 
549   /// Fix the vectorized code, taking care of header PHIs, live-outs, and more.
550   void fixVectorizedLoop(VPTransformState &State, VPlan &Plan);
551 
552   // Return true if any runtime check is added.
553   bool areSafetyChecksAdded() { return AddedSafetyChecks; }
554 
555   /// A type for vectorized values in the new loop. Each value from the
556   /// original loop, when vectorized, is represented by UF vector values in the
557   /// new unrolled loop, where UF is the unroll factor.
558   using VectorParts = SmallVector<Value *, 2>;
559 
560   /// A helper function to scalarize a single Instruction in the innermost loop.
561   /// Generates a sequence of scalar instances for each lane between \p MinLane
562   /// and \p MaxLane, times each part between \p MinPart and \p MaxPart,
563   /// inclusive. Uses the VPValue operands from \p RepRecipe instead of \p
564   /// Instr's operands.
565   void scalarizeInstruction(const Instruction *Instr,
566                             VPReplicateRecipe *RepRecipe,
567                             const VPIteration &Instance,
568                             VPTransformState &State);
569 
570   /// Try to vectorize interleaved access group \p Group with the base address
571   /// given in \p Addr, optionally masking the vector operations if \p
572   /// BlockInMask is non-null. Use \p State to translate given VPValues to IR
573   /// values in the vectorized loop.
574   void vectorizeInterleaveGroup(const InterleaveGroup<Instruction> *Group,
575                                 ArrayRef<VPValue *> VPDefs,
576                                 VPTransformState &State, VPValue *Addr,
577                                 ArrayRef<VPValue *> StoredValues,
578                                 VPValue *BlockInMask, bool NeedsMaskForGaps);
579 
580   /// Fix the non-induction PHIs in \p Plan.
581   void fixNonInductionPHIs(VPlan &Plan, VPTransformState &State);
582 
583   /// Returns true if the reordering of FP operations is not allowed, but we are
584   /// able to vectorize with strict in-order reductions for the given RdxDesc.
585   bool useOrderedReductions(const RecurrenceDescriptor &RdxDesc);
586 
587   /// Create a new phi node for the induction variable \p OrigPhi to resume
588   /// iteration count in the scalar epilogue, from where the vectorized loop
589   /// left off. \p Step is the SCEV-expanded induction step to use. In cases
590   /// where the loop skeleton is more complicated (i.e., epilogue vectorization)
591   /// and the resume values can come from an additional bypass block, the \p
592   /// AdditionalBypass pair provides information about the bypass block and the
593   /// end value on the edge from bypass to this loop.
594   PHINode *createInductionResumeValue(
595       PHINode *OrigPhi, const InductionDescriptor &ID, Value *Step,
596       ArrayRef<BasicBlock *> BypassBlocks,
597       std::pair<BasicBlock *, Value *> AdditionalBypass = {nullptr, nullptr});
598 
599   /// Returns the original loop trip count.
600   Value *getTripCount() const { return TripCount; }
601 
602   /// Used to set the trip count after ILV's construction and after the
603   /// preheader block has been executed. Note that this always holds the trip
604   /// count of the original loop for both main loop and epilogue vectorization.
605   void setTripCount(Value *TC) { TripCount = TC; }
606 
607 protected:
608   friend class LoopVectorizationPlanner;
609 
610   /// A small list of PHINodes.
611   using PhiVector = SmallVector<PHINode *, 4>;
612 
613   /// A type for scalarized values in the new loop. Each value from the
614   /// original loop, when scalarized, is represented by UF x VF scalar values
615   /// in the new unrolled loop, where UF is the unroll factor and VF is the
616   /// vectorization factor.
617   using ScalarParts = SmallVector<SmallVector<Value *, 4>, 2>;
618 
619   /// Set up the values of the IVs correctly when exiting the vector loop.
620   void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II,
621                     Value *VectorTripCount, Value *EndValue,
622                     BasicBlock *MiddleBlock, BasicBlock *VectorHeader,
623                     VPlan &Plan, VPTransformState &State);
624 
625   /// Create the exit value of first order recurrences in the middle block and
626   /// update their users.
627   void fixFixedOrderRecurrence(VPFirstOrderRecurrencePHIRecipe *PhiR,
628                                VPTransformState &State);
629 
630   /// Create code for the loop exit value of the reduction.
631   void fixReduction(VPReductionPHIRecipe *Phi, VPTransformState &State);
632 
633   /// Iteratively sink the scalarized operands of a predicated instruction into
634   /// the block that was created for it.
635   void sinkScalarOperands(Instruction *PredInst);
636 
637   /// Returns (and creates if needed) the trip count of the widened loop.
638   Value *getOrCreateVectorTripCount(BasicBlock *InsertBlock);
639 
640   /// Returns a bitcasted value to the requested vector type.
641   /// Also handles bitcasts of vector<float> <-> vector<pointer> types.
642   Value *createBitOrPointerCast(Value *V, VectorType *DstVTy,
643                                 const DataLayout &DL);
644 
645   /// Emit a bypass check to see if the vector trip count is zero, including if
646   /// it overflows.
647   void emitIterationCountCheck(BasicBlock *Bypass);
648 
649   /// Emit a bypass check to see if all of the SCEV assumptions we've
650   /// had to make are correct. Returns the block containing the checks or
651   /// nullptr if no checks have been added.
652   BasicBlock *emitSCEVChecks(BasicBlock *Bypass);
653 
654   /// Emit bypass checks to check any memory assumptions we may have made.
655   /// Returns the block containing the checks or nullptr if no checks have been
656   /// added.
657   BasicBlock *emitMemRuntimeChecks(BasicBlock *Bypass);
658 
659   /// Emit basic blocks (prefixed with \p Prefix) for the iteration check,
660   /// vector loop preheader, middle block and scalar preheader.
661   void createVectorLoopSkeleton(StringRef Prefix);
662 
663   /// Create new phi nodes for the induction variables to resume iteration count
664   /// in the scalar epilogue, from where the vectorized loop left off.
665   /// In cases where the loop skeleton is more complicated (e.g., epilogue
666   /// vectorization) and the resume values can come from an additional bypass
667   /// block, the \p AdditionalBypass pair provides information about the bypass
668   /// block and the end value on the edge from bypass to this loop.
669   void createInductionResumeValues(
670       const SCEV2ValueTy &ExpandedSCEVs,
671       std::pair<BasicBlock *, Value *> AdditionalBypass = {nullptr, nullptr});
672 
673   /// Complete the loop skeleton by adding debug MDs, creating appropriate
674   /// conditional branches in the middle block, preparing the builder and
675   /// running the verifier. Return the preheader of the completed vector loop.
676   BasicBlock *completeLoopSkeleton();
677 
678   /// Collect poison-generating recipes that may generate a poison value that is
679   /// used after vectorization, even when their operands are not poison. Those
680   /// recipes meet the following conditions:
681   ///  * Contribute to the address computation of a recipe generating a widen
682   ///    memory load/store (VPWidenMemoryInstructionRecipe or
683   ///    VPInterleaveRecipe).
684   ///  * Such a widen memory load/store has at least one underlying Instruction
685   ///    that is in a basic block that needs predication and after vectorization
686   ///    the generated instruction won't be predicated.
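  ///
  /// For example (illustrative), if a getelementptr feeding the address of a
  /// load that was conditional in the original loop carries the "inbounds"
  /// flag, and the widened load is emitted unconditionally, the flag is
  /// dropped so that speculatively computed addresses cannot introduce poison.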
687   void collectPoisonGeneratingRecipes(VPTransformState &State);
688 
689   /// Allow subclasses to override and print debug traces before/after vplan
690   /// execution, when trace information is requested.
691   virtual void printDebugTracesAtStart(){};
692   virtual void printDebugTracesAtEnd(){};
693 
694   /// The original loop.
695   Loop *OrigLoop;
696 
697   /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies
698   /// dynamic knowledge to simplify SCEV expressions and converts them to a
699   /// more usable form.
700   PredicatedScalarEvolution &PSE;
701 
702   /// Loop Info.
703   LoopInfo *LI;
704 
705   /// Dominator Tree.
706   DominatorTree *DT;
707 
708   /// Target Library Info.
709   const TargetLibraryInfo *TLI;
710 
711   /// Target Transform Info.
712   const TargetTransformInfo *TTI;
713 
714   /// Assumption Cache.
715   AssumptionCache *AC;
716 
717   /// Interface to emit optimization remarks.
718   OptimizationRemarkEmitter *ORE;
719 
720   /// The vectorization SIMD factor to use. Each vector will have this many
721   /// vector elements.
722   ElementCount VF;
723 
724   ElementCount MinProfitableTripCount;
725 
726   /// The vectorization unroll factor to use. Each scalar is vectorized to this
727   /// many different vector instructions.
728   unsigned UF;
729 
730   /// The builder that we use
731   IRBuilder<> Builder;
732 
733   // --- Vectorization state ---
734 
735   /// The vector-loop preheader.
736   BasicBlock *LoopVectorPreHeader;
737 
738   /// The scalar-loop preheader.
739   BasicBlock *LoopScalarPreHeader;
740 
741   /// Middle Block between the vector and the scalar.
742   BasicBlock *LoopMiddleBlock;
743 
744   /// The unique ExitBlock of the scalar loop if one exists.  Note that
745   /// there can be multiple exiting edges reaching this block.
746   BasicBlock *LoopExitBlock;
747 
748   /// The scalar loop body.
749   BasicBlock *LoopScalarBody;
750 
751   /// A list of all bypass blocks. The first block is the entry of the loop.
752   SmallVector<BasicBlock *, 4> LoopBypassBlocks;
753 
754   /// Store instructions that were predicated.
755   SmallVector<Instruction *, 4> PredicatedInstructions;
756 
757   /// Trip count of the original loop.
758   Value *TripCount = nullptr;
759 
760   /// Trip count of the widened loop (TripCount - TripCount % (VF*UF))
761   Value *VectorTripCount = nullptr;
762 
763   /// The legality analysis.
764   LoopVectorizationLegality *Legal;
765 
766   /// The profitability analysis.
767   LoopVectorizationCostModel *Cost;
768 
769   // Record whether runtime checks are added.
770   bool AddedSafetyChecks = false;
771 
772   // Holds the end values for each induction variable. We save the end values
773   // so we can later fix-up the external users of the induction variables.
774   DenseMap<PHINode *, Value *> IVEndValues;
775 
776   /// BFI and PSI are used to check for profile guided size optimizations.
777   BlockFrequencyInfo *BFI;
778   ProfileSummaryInfo *PSI;
779 
780   // Whether this loop should be optimized for size based on profile-guided
781   // size optimizations.
782   bool OptForSizeBasedOnProfile;
783 
784   /// Structure to hold information about generated runtime checks, responsible
785   /// for cleaning up the checks if vectorization turns out to be unprofitable.
786   GeneratedRTChecks &RTChecks;
787 
788   // Holds the resume values for reductions in the loops, used to set the
789   // correct start value of reduction PHIs when vectorizing the epilogue.
790   SmallMapVector<const RecurrenceDescriptor *, PHINode *, 4>
791       ReductionResumeValues;
792 };
793 
794 class InnerLoopUnroller : public InnerLoopVectorizer {
795 public:
796   InnerLoopUnroller(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
797                     LoopInfo *LI, DominatorTree *DT,
798                     const TargetLibraryInfo *TLI,
799                     const TargetTransformInfo *TTI, AssumptionCache *AC,
800                     OptimizationRemarkEmitter *ORE, unsigned UnrollFactor,
801                     LoopVectorizationLegality *LVL,
802                     LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
803                     ProfileSummaryInfo *PSI, GeneratedRTChecks &Check)
804       : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
805                             ElementCount::getFixed(1),
806                             ElementCount::getFixed(1), UnrollFactor, LVL, CM,
807                             BFI, PSI, Check) {}
808 };
809 
810 /// Encapsulate information regarding vectorization of a loop and its epilogue.
811 /// This information is meant to be updated and used across two stages of
812 /// epilogue vectorization.
813 struct EpilogueLoopVectorizationInfo {
814   ElementCount MainLoopVF = ElementCount::getFixed(0);
815   unsigned MainLoopUF = 0;
816   ElementCount EpilogueVF = ElementCount::getFixed(0);
817   unsigned EpilogueUF = 0;
818   BasicBlock *MainLoopIterationCountCheck = nullptr;
819   BasicBlock *EpilogueIterationCountCheck = nullptr;
820   BasicBlock *SCEVSafetyCheck = nullptr;
821   BasicBlock *MemSafetyCheck = nullptr;
822   Value *TripCount = nullptr;
823   Value *VectorTripCount = nullptr;
824 
825   EpilogueLoopVectorizationInfo(ElementCount MVF, unsigned MUF,
826                                 ElementCount EVF, unsigned EUF)
827       : MainLoopVF(MVF), MainLoopUF(MUF), EpilogueVF(EVF), EpilogueUF(EUF) {
828     assert(EUF == 1 &&
829            "A high UF for the epilogue loop is likely not beneficial.");
830   }
831 };
832 
833 /// An extension of the inner loop vectorizer that creates a skeleton for a
834 /// vectorized loop that has its epilogue (residual) also vectorized.
835 /// The idea is to run the VPlan on a given loop twice: first to set up the
836 /// skeleton and vectorize the main loop, and second to complete the skeleton
837 /// from the first step and vectorize the epilogue. This is achieved by
838 /// deriving two concrete strategy classes from this base class and invoking
839 /// them in succession from the loop vectorizer planner.
840 class InnerLoopAndEpilogueVectorizer : public InnerLoopVectorizer {
841 public:
842   InnerLoopAndEpilogueVectorizer(
843       Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
844       DominatorTree *DT, const TargetLibraryInfo *TLI,
845       const TargetTransformInfo *TTI, AssumptionCache *AC,
846       OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
847       LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
848       BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
849       GeneratedRTChecks &Checks)
850       : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
851                             EPI.MainLoopVF, EPI.MainLoopVF, EPI.MainLoopUF, LVL,
852                             CM, BFI, PSI, Checks),
853         EPI(EPI) {}
854 
855   // Override this function to handle the more complex control flow around the
856   // three loops.
857   std::pair<BasicBlock *, Value *> createVectorizedLoopSkeleton(
858       const SCEV2ValueTy &ExpandedSCEVs) final {
859     return createEpilogueVectorizedLoopSkeleton(ExpandedSCEVs);
860   }
861 
862   /// The interface for creating a vectorized skeleton using one of two
863   /// different strategies, each corresponding to one execution of the vplan
864   /// as described above.
865   virtual std::pair<BasicBlock *, Value *>
866   createEpilogueVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs) = 0;
867 
868   /// Holds and updates state information required to vectorize the main loop
869   /// and its epilogue in two separate passes. This setup helps us avoid
870   /// regenerating and recomputing runtime safety checks. It also helps us to
871   /// shorten the iteration-count-check path length for the cases where the
872   /// iteration count of the loop is so small that the main vector loop is
873   /// completely skipped.
874   EpilogueLoopVectorizationInfo &EPI;
875 };
876 
877 /// A specialized derived class of inner loop vectorizer that performs
878 /// vectorization of *main* loops in the process of vectorizing loops and their
879 /// epilogues.
880 class EpilogueVectorizerMainLoop : public InnerLoopAndEpilogueVectorizer {
881 public:
882   EpilogueVectorizerMainLoop(
883       Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
884       DominatorTree *DT, const TargetLibraryInfo *TLI,
885       const TargetTransformInfo *TTI, AssumptionCache *AC,
886       OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
887       LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
888       BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
889       GeneratedRTChecks &Check)
890       : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
891                                        EPI, LVL, CM, BFI, PSI, Check) {}
892   /// Implements the interface for creating a vectorized skeleton using the
893   /// *main loop* strategy (i.e., the first pass of VPlan execution).
894   std::pair<BasicBlock *, Value *>
895   createEpilogueVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs) final;
896 
897 protected:
898   /// Emits an iteration count bypass check once for the main loop (when \p
899   /// ForEpilogue is false) and once for the epilogue loop (when \p
900   /// ForEpilogue is true).
901   BasicBlock *emitIterationCountCheck(BasicBlock *Bypass, bool ForEpilogue);
902   void printDebugTracesAtStart() override;
903   void printDebugTracesAtEnd() override;
904 };
905 
906 // A specialized derived class of inner loop vectorizer that performs
907 // vectorization of *epilogue* loops in the process of vectorizing loops and
908 // their epilogues.
909 class EpilogueVectorizerEpilogueLoop : public InnerLoopAndEpilogueVectorizer {
910 public:
911   EpilogueVectorizerEpilogueLoop(
912       Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
913       DominatorTree *DT, const TargetLibraryInfo *TLI,
914       const TargetTransformInfo *TTI, AssumptionCache *AC,
915       OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
916       LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
917       BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
918       GeneratedRTChecks &Checks)
919       : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
920                                        EPI, LVL, CM, BFI, PSI, Checks) {
921     TripCount = EPI.TripCount;
922   }
923   /// Implements the interface for creating a vectorized skeleton using the
924   /// *epilogue loop* strategy (i.e., the second pass of VPlan execution).
925   std::pair<BasicBlock *, Value *>
926   createEpilogueVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs) final;
927 
928 protected:
929   /// Emits an iteration count bypass check after the main vector loop has
930   /// finished to see if there are any iterations left to execute by either
931   /// the vector epilogue or the scalar epilogue.
932   BasicBlock *emitMinimumVectorEpilogueIterCountCheck(
933                                                       BasicBlock *Bypass,
934                                                       BasicBlock *Insert);
935   void printDebugTracesAtStart() override;
936   void printDebugTracesAtEnd() override;
937 };
938 } // end namespace llvm
939 
940 /// Look for a meaningful debug location on the instruction or its
941 /// operands.
942 static DebugLoc getDebugLocFromInstOrOperands(Instruction *I) {
943   if (!I)
944     return DebugLoc();
945 
946   DebugLoc Empty;
947   if (I->getDebugLoc() != Empty)
948     return I->getDebugLoc();
949 
950   for (Use &Op : I->operands()) {
951     if (Instruction *OpInst = dyn_cast<Instruction>(Op))
952       if (OpInst->getDebugLoc() != Empty)
953         return OpInst->getDebugLoc();
954   }
955 
956   return I->getDebugLoc();
957 }
958 
959 /// Write a \p DebugMsg about vectorization to the debug output stream. If \p I
960 /// is passed, the message relates to that particular instruction.
961 #ifndef NDEBUG
962 static void debugVectorizationMessage(const StringRef Prefix,
963                                       const StringRef DebugMsg,
964                                       Instruction *I) {
965   dbgs() << "LV: " << Prefix << DebugMsg;
966   if (I != nullptr)
967     dbgs() << " " << *I;
968   else
969     dbgs() << '.';
970   dbgs() << '\n';
971 }
972 #endif
973 
974 /// Create an analysis remark that explains why vectorization failed
975 ///
976 /// \p PassName is the name of the pass (e.g. can be AlwaysPrint).  \p
977 /// RemarkName is the identifier for the remark.  If \p I is passed it is an
978 /// instruction that prevents vectorization.  Otherwise \p TheLoop is used for
979 /// the location of the remark.  \return the remark object that can be
980 /// streamed to.
981 static OptimizationRemarkAnalysis createLVAnalysis(const char *PassName,
982     StringRef RemarkName, Loop *TheLoop, Instruction *I) {
983   Value *CodeRegion = TheLoop->getHeader();
984   DebugLoc DL = TheLoop->getStartLoc();
985 
986   if (I) {
987     CodeRegion = I->getParent();
988     // If there is no debug location attached to the instruction, fall back to
989     // using the loop's.
990     if (I->getDebugLoc())
991       DL = I->getDebugLoc();
992   }
993 
994   return OptimizationRemarkAnalysis(PassName, RemarkName, DL, CodeRegion);
995 }
996 
997 namespace llvm {
998 
999 /// Return a value for Step multiplied by VF.
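/// For example (illustrative), for a fixed VF of 4 and Step = 2 this folds to
/// the constant 8, while for a scalable VF of "vscale x 4" it emits IR that
/// computes 8 * vscale.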
1000 Value *createStepForVF(IRBuilderBase &B, Type *Ty, ElementCount VF,
1001                        int64_t Step) {
1002   assert(Ty->isIntegerTy() && "Expected an integer step");
1003   return B.CreateElementCount(Ty, VF.multiplyCoefficientBy(Step));
1004 }
1005 
1006 /// Return the runtime value for VF.
1007 Value *getRuntimeVF(IRBuilderBase &B, Type *Ty, ElementCount VF) {
1008   return B.CreateElementCount(Ty, VF);
1009 }
1010 
1011 const SCEV *createTripCountSCEV(Type *IdxTy, PredicatedScalarEvolution &PSE,
1012                                 Loop *OrigLoop) {
1013   const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount();
1014   assert(!isa<SCEVCouldNotCompute>(BackedgeTakenCount) && "Invalid loop count");
1015 
1016   ScalarEvolution &SE = *PSE.getSE();
1017   return SE.getTripCountFromExitCount(BackedgeTakenCount, IdxTy, OrigLoop);
1018 }
1019 
1020 void reportVectorizationFailure(const StringRef DebugMsg,
1021                                 const StringRef OREMsg, const StringRef ORETag,
1022                                 OptimizationRemarkEmitter *ORE, Loop *TheLoop,
1023                                 Instruction *I) {
1024   LLVM_DEBUG(debugVectorizationMessage("Not vectorizing: ", DebugMsg, I));
1025   LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
1026   ORE->emit(
1027       createLVAnalysis(Hints.vectorizeAnalysisPassName(), ORETag, TheLoop, I)
1028       << "loop not vectorized: " << OREMsg);
1029 }
1030 
1031 void reportVectorizationInfo(const StringRef Msg, const StringRef ORETag,
1032                              OptimizationRemarkEmitter *ORE, Loop *TheLoop,
1033                              Instruction *I) {
1034   LLVM_DEBUG(debugVectorizationMessage("", Msg, I));
1035   LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
1036   ORE->emit(
1037       createLVAnalysis(Hints.vectorizeAnalysisPassName(), ORETag, TheLoop, I)
1038       << Msg);
1039 }
1040 
1041 /// Report successful vectorization of the loop. In case an outer loop is
1042 /// vectorized, prepend "outer" to the vectorization remark.
1043 static void reportVectorization(OptimizationRemarkEmitter *ORE, Loop *TheLoop,
1044                                 VectorizationFactor VF, unsigned IC) {
1045   LLVM_DEBUG(debugVectorizationMessage(
1046       "Vectorizing: ", TheLoop->isInnermost() ? "innermost loop" : "outer loop",
1047       nullptr));
1048   StringRef LoopType = TheLoop->isInnermost() ? "" : "outer ";
1049   ORE->emit([&]() {
1050     return OptimizationRemark(LV_NAME, "Vectorized", TheLoop->getStartLoc(),
1051                               TheLoop->getHeader())
1052            << "vectorized " << LoopType << "loop (vectorization width: "
1053            << ore::NV("VectorizationFactor", VF.Width)
1054            << ", interleaved count: " << ore::NV("InterleaveCount", IC) << ")";
1055   });
1056 }
1057 
1058 } // end namespace llvm
1059 
1060 #ifndef NDEBUG
1061 /// \return string containing a file name and a line # for the given loop.
1062 static std::string getDebugLocString(const Loop *L) {
1063   std::string Result;
1064   if (L) {
1065     raw_string_ostream OS(Result);
1066     if (const DebugLoc LoopDbgLoc = L->getStartLoc())
1067       LoopDbgLoc.print(OS);
1068     else
1069       // Just print the module name.
1070       OS << L->getHeader()->getParent()->getParent()->getModuleIdentifier();
1071     OS.flush();
1072   }
1073   return Result;
1074 }
1075 #endif
1076 
1077 void InnerLoopVectorizer::collectPoisonGeneratingRecipes(
1078     VPTransformState &State) {
1079 
1080   // Collect recipes in the backward slice of `Root` that may generate a poison
1081   // value that is used after vectorization.
1082   SmallPtrSet<VPRecipeBase *, 16> Visited;
1083   auto collectPoisonGeneratingInstrsInBackwardSlice([&](VPRecipeBase *Root) {
1084     SmallVector<VPRecipeBase *, 16> Worklist;
1085     Worklist.push_back(Root);
1086 
1087     // Traverse the backward slice of Root through its use-def chain.
1088     while (!Worklist.empty()) {
1089       VPRecipeBase *CurRec = Worklist.back();
1090       Worklist.pop_back();
1091 
1092       if (!Visited.insert(CurRec).second)
1093         continue;
1094 
1095       // Prune search if we find another recipe generating a widen memory
1096       // instruction. Widen memory instructions involved in address computation
1097       // will lead to gather/scatter instructions, which don't need to be
1098       // handled.
1099       if (isa<VPWidenMemoryInstructionRecipe>(CurRec) ||
1100           isa<VPInterleaveRecipe>(CurRec) ||
1101           isa<VPScalarIVStepsRecipe>(CurRec) ||
1102           isa<VPCanonicalIVPHIRecipe>(CurRec) ||
1103           isa<VPActiveLaneMaskPHIRecipe>(CurRec))
1104         continue;
1105 
1106       // This recipe contributes to the address computation of a widen
1107       // load/store. If the underlying instruction has poison-generating flags,
1108       // drop them directly.
1109       if (auto *RecWithFlags = dyn_cast<VPRecipeWithIRFlags>(CurRec)) {
1110         RecWithFlags->dropPoisonGeneratingFlags();
1111       } else {
1112         Instruction *Instr = dyn_cast_or_null<Instruction>(
1113             CurRec->getVPSingleValue()->getUnderlyingValue());
1114         (void)Instr;
1115         assert((!Instr || !Instr->hasPoisonGeneratingFlags()) &&
1116                "found instruction with poison generating flags not covered by "
1117                "VPRecipeWithIRFlags");
1118       }
1119 
1120       // Add new definitions to the worklist.
1121       for (VPValue *operand : CurRec->operands())
1122         if (VPRecipeBase *OpDef = operand->getDefiningRecipe())
1123           Worklist.push_back(OpDef);
1124     }
1125   });
1126 
1127   // Traverse all the recipes in the VPlan and collect the poison-generating
1128   // recipes in the backward slice starting at the address of a
1129   // VPWidenMemoryInstructionRecipe or VPInterleaveRecipe.
1130   auto Iter = vp_depth_first_deep(State.Plan->getEntry());
1131   for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) {
1132     for (VPRecipeBase &Recipe : *VPBB) {
1133       if (auto *WidenRec = dyn_cast<VPWidenMemoryInstructionRecipe>(&Recipe)) {
1134         Instruction &UnderlyingInstr = WidenRec->getIngredient();
1135         VPRecipeBase *AddrDef = WidenRec->getAddr()->getDefiningRecipe();
1136         if (AddrDef && WidenRec->isConsecutive() &&
1137             Legal->blockNeedsPredication(UnderlyingInstr.getParent()))
1138           collectPoisonGeneratingInstrsInBackwardSlice(AddrDef);
1139       } else if (auto *InterleaveRec = dyn_cast<VPInterleaveRecipe>(&Recipe)) {
1140         VPRecipeBase *AddrDef = InterleaveRec->getAddr()->getDefiningRecipe();
1141         if (AddrDef) {
1142           // Check if any member of the interleave group needs predication.
1143           const InterleaveGroup<Instruction> *InterGroup =
1144               InterleaveRec->getInterleaveGroup();
1145           bool NeedPredication = false;
1146           for (int I = 0, NumMembers = InterGroup->getNumMembers();
1147                I < NumMembers; ++I) {
1148             Instruction *Member = InterGroup->getMember(I);
1149             if (Member)
1150               NeedPredication |=
1151                   Legal->blockNeedsPredication(Member->getParent());
1152           }
1153 
1154           if (NeedPredication)
1155             collectPoisonGeneratingInstrsInBackwardSlice(AddrDef);
1156         }
1157       }
1158     }
1159   }
1160 }
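// Editor's illustrative sketch (the loop below is hypothetical): in
//
//   for (i = 0; i < n; ++i)
//     if (c[i])
//       x += a[i + 1];   // 'i + 1' may carry nuw/nsw; the GEP may be inbounds
//
// the flags on the address computation are only guaranteed under the guard
// 'c[i]'. Once the load is widened as a consecutive, predicated access, the
// address is computed for every lane, so the traversal above drops the
// poison-generating flags to keep the vector IR well-defined.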
1161 
1162 namespace llvm {
1163 
1164 // Loop vectorization cost-model hints how the scalar epilogue loop should be
1165 // lowered.
1166 enum ScalarEpilogueLowering {
1167 
1168   // The default: allowing scalar epilogues.
1169   CM_ScalarEpilogueAllowed,
1170 
1171   // Vectorization with OptForSize: don't allow epilogues.
1172   CM_ScalarEpilogueNotAllowedOptSize,
1173 
1174   // A special case of vectorization with OptForSize: loops with a very small
1175   // trip count are considered for vectorization under OptForSize, thereby
1176   // making sure the cost of their loop body is dominant, free of runtime
1177   // guards and scalar iteration overheads.
1178   CM_ScalarEpilogueNotAllowedLowTripLoop,
1179 
1180   // Loop hint predicate indicating an epilogue is undesired.
1181   CM_ScalarEpilogueNotNeededUsePredicate,
1182 
1183   // Directive indicating we must either tail fold or not vectorize.
1184   CM_ScalarEpilogueNotAllowedUsePredicate
1185 };
1186 
1187 using InstructionVFPair = std::pair<Instruction *, ElementCount>;
1188 
1189 /// LoopVectorizationCostModel - estimates the expected speedups due to
1190 /// vectorization.
1191 /// In many cases vectorization is not profitable. This can happen because of
1192 /// a number of reasons. In this class we mainly attempt to predict the
1193 /// expected speedup/slowdowns due to the supported instruction set. We use the
1194 /// TargetTransformInfo to query the different backends for the cost of
1195 /// different operations.
1196 class LoopVectorizationCostModel {
1197 public:
1198   LoopVectorizationCostModel(ScalarEpilogueLowering SEL, Loop *L,
1199                              PredicatedScalarEvolution &PSE, LoopInfo *LI,
1200                              LoopVectorizationLegality *Legal,
1201                              const TargetTransformInfo &TTI,
1202                              const TargetLibraryInfo *TLI, DemandedBits *DB,
1203                              AssumptionCache *AC,
1204                              OptimizationRemarkEmitter *ORE, const Function *F,
1205                              const LoopVectorizeHints *Hints,
1206                              InterleavedAccessInfo &IAI)
1207       : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal),
1208         TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F),
1209         Hints(Hints), InterleaveInfo(IAI) {}
1210 
1211   /// \return An upper bound for the vectorization factors (both fixed and
1212   /// scalable). If the factors are 0, vectorization and interleaving should be
1213   /// avoided up front.
1214   FixedScalableVFPair computeMaxVF(ElementCount UserVF, unsigned UserIC);
1215 
1216   /// \return True if runtime checks are required for vectorization, and false
1217   /// otherwise.
1218   bool runtimeChecksRequired();
1219 
1220   /// Setup cost-based decisions for user vectorization factor.
1221   /// \return true if the UserVF is a feasible VF to be chosen.
1222   bool selectUserVectorizationFactor(ElementCount UserVF) {
1223     collectUniformsAndScalars(UserVF);
1224     collectInstsToScalarize(UserVF);
1225     return expectedCost(UserVF).first.isValid();
1226   }
1227 
1228   /// \return The size (in bits) of the smallest and widest types in the code
1229   /// that needs to be vectorized. We ignore values that remain scalar such as
1230   /// 64 bit loop indices.
1231   std::pair<unsigned, unsigned> getSmallestAndWidestTypes();
1232 
1233   /// \return The desired interleave count.
1234   /// If interleave count has been specified by metadata it will be returned.
1235   /// Otherwise, the interleave count is computed and returned. VF and LoopCost
1236   /// are the selected vectorization factor and the cost of the selected VF.
1237   unsigned selectInterleaveCount(ElementCount VF, InstructionCost LoopCost);
1238 
1239   /// A memory access instruction may be vectorized in more than one way.
1240   /// The form of the instruction after vectorization depends on its cost.
1241   /// This function takes cost-based decisions for Load/Store instructions
1242   /// and collects them in a map. This decisions map is used for building
1243   /// the lists of loop-uniform and loop-scalar instructions.
1244   /// The calculated cost is saved with widening decision in order to
1245   /// avoid redundant calculations.
1246   void setCostBasedWideningDecision(ElementCount VF);
1247 
1248   /// A call may be vectorized in different ways depending on whether we have
1249   /// vectorized variants available and whether the target supports masking.
1250   /// This function analyzes all calls in the function at the supplied VF,
1251   /// makes a decision based on the costs of available options, and stores that
1252   /// decision in a map for use in planning and plan execution.
1253   void setVectorizedCallDecision(ElementCount VF);
1254 
1255   /// A struct that represents some properties of the register usage
1256   /// of a loop.
1257   struct RegisterUsage {
1258     /// Holds the number of loop invariant values that are used in the loop.
1259     /// The key is ClassID of target-provided register class.
1260     SmallMapVector<unsigned, unsigned, 4> LoopInvariantRegs;
1261     /// Holds the maximum number of concurrent live intervals in the loop.
1262     /// The key is ClassID of target-provided register class.
1263     SmallMapVector<unsigned, unsigned, 4> MaxLocalUsers;
1264   };
1265 
1266   /// \return Returns information about the register usages of the loop for the
1267   /// given vectorization factors.
1268   SmallVector<RegisterUsage, 8>
1269   calculateRegisterUsage(ArrayRef<ElementCount> VFs);
1270 
1271   /// Collect values we want to ignore in the cost model.
1272   void collectValuesToIgnore();
1273 
1274   /// Collect all element types in the loop for which widening is needed.
1275   void collectElementTypesForWidening();
1276 
1277   /// Split reductions into those that happen in the loop, and those that happen
1278   /// outside. In-loop reductions are collected into InLoopReductions.
1279   void collectInLoopReductions();
1280 
1281   /// Returns true if we should use strict in-order reductions for the given
1282   /// RdxDesc. This is true if the -enable-strict-reductions flag is passed,
1283   /// the IsOrdered flag of RdxDesc is set and we do not allow reordering
1284   /// of FP operations.
1285   bool useOrderedReductions(const RecurrenceDescriptor &RdxDesc) const {
1286     return !Hints->allowReordering() && RdxDesc.isOrdered();
1287   }
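  // Editor's illustrative example (a hedged sketch): a strict FP reduction
  // such as
  //
  //   float s = 0.0f;
  //   for (int i = 0; i < n; ++i) s += a[i];
  //
  // has RdxDesc.isOrdered() set unless reassociation is permitted (e.g. via
  // fast-math flags or hints). In the ordered case the vectorizer must emit an
  // in-order (strict) reduction instead of a reassociated wide one.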
1288 
1289   /// \returns The smallest bitwidth each instruction can be represented with.
1290   /// The vector equivalents of these instructions should be truncated to this
1291   /// type.
1292   const MapVector<Instruction *, uint64_t> &getMinimalBitwidths() const {
1293     return MinBWs;
1294   }
1295 
1296   /// \returns True if it is more profitable to scalarize instruction \p I for
1297   /// vectorization factor \p VF.
1298   bool isProfitableToScalarize(Instruction *I, ElementCount VF) const {
1299     assert(VF.isVector() &&
1300            "Profitable to scalarize relevant only for VF > 1.");
1301 
1302     // Cost model is not run in the VPlan-native path - return conservative
1303     // result until this changes.
1304     if (EnableVPlanNativePath)
1305       return false;
1306 
1307     auto Scalars = InstsToScalarize.find(VF);
1308     assert(Scalars != InstsToScalarize.end() &&
1309            "VF not yet analyzed for scalarization profitability");
1310     return Scalars->second.contains(I);
1311   }
1312 
1313   /// Returns true if \p I is known to be uniform after vectorization.
1314   bool isUniformAfterVectorization(Instruction *I, ElementCount VF) const {
1315     // Pseudo probe needs to be duplicated for each unrolled iteration and
1316     // vector lane so that profiled loop trip count can be accurately
1317     // accumulated instead of being under counted.
1318     if (isa<PseudoProbeInst>(I))
1319       return false;
1320 
1321     if (VF.isScalar())
1322       return true;
1323 
1324     // Cost model is not run in the VPlan-native path - return conservative
1325     // result until this changes.
1326     if (EnableVPlanNativePath)
1327       return false;
1328 
1329     auto UniformsPerVF = Uniforms.find(VF);
1330     assert(UniformsPerVF != Uniforms.end() &&
1331            "VF not yet analyzed for uniformity");
1332     return UniformsPerVF->second.count(I);
1333   }
1334 
1335   /// Returns true if \p I is known to be scalar after vectorization.
1336   bool isScalarAfterVectorization(Instruction *I, ElementCount VF) const {
1337     if (VF.isScalar())
1338       return true;
1339 
1340     // Cost model is not run in the VPlan-native path - return conservative
1341     // result until this changes.
1342     if (EnableVPlanNativePath)
1343       return false;
1344 
1345     auto ScalarsPerVF = Scalars.find(VF);
1346     assert(ScalarsPerVF != Scalars.end() &&
1347            "Scalar values are not calculated for VF");
1348     return ScalarsPerVF->second.count(I);
1349   }
1350 
1351   /// \returns True if instruction \p I can be truncated to a smaller bitwidth
1352   /// for vectorization factor \p VF.
1353   bool canTruncateToMinimalBitwidth(Instruction *I, ElementCount VF) const {
1354     return VF.isVector() && MinBWs.contains(I) &&
1355            !isProfitableToScalarize(I, VF) &&
1356            !isScalarAfterVectorization(I, VF);
1357   }
1358 
1359   /// Decision that was taken during cost calculation for memory instruction.
1360   enum InstWidening {
1361     CM_Unknown,
1362     CM_Widen,         // For consecutive accesses with stride +1.
1363     CM_Widen_Reverse, // For consecutive accesses with stride -1.
1364     CM_Interleave,
1365     CM_GatherScatter,
1366     CM_Scalarize,
1367     CM_VectorCall,
1368     CM_IntrinsicCall
1369   };
1370 
1371   /// Save vectorization decision \p W and \p Cost taken by the cost model for
1372   /// instruction \p I and vector width \p VF.
1373   void setWideningDecision(Instruction *I, ElementCount VF, InstWidening W,
1374                            InstructionCost Cost) {
1375     assert(VF.isVector() && "Expected VF >=2");
1376     WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
1377   }
1378 
1379   /// Save vectorization decision \p W and \p Cost taken by the cost model for
1380   /// interleaving group \p Grp and vector width \p VF.
1381   void setWideningDecision(const InterleaveGroup<Instruction> *Grp,
1382                            ElementCount VF, InstWidening W,
1383                            InstructionCost Cost) {
1384     assert(VF.isVector() && "Expected VF >=2");
1385     /// Broadcast this decision to all instructions inside the group.
1386     /// But the cost will be assigned to one instruction only.
1387     for (unsigned i = 0; i < Grp->getFactor(); ++i) {
1388       if (auto *I = Grp->getMember(i)) {
1389         if (Grp->getInsertPos() == I)
1390           WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
1391         else
1392           WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, 0);
1393       }
1394     }
1395   }
1396 
1397   /// Return the cost model decision for the given instruction \p I and vector
1398   /// width \p VF. Return CM_Unknown if this instruction did not pass
1399   /// through the cost modeling.
1400   InstWidening getWideningDecision(Instruction *I, ElementCount VF) const {
1401     assert(VF.isVector() && "Expected VF to be a vector VF");
1402     // Cost model is not run in the VPlan-native path - return conservative
1403     // result until this changes.
1404     if (EnableVPlanNativePath)
1405       return CM_GatherScatter;
1406 
1407     std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
1408     auto Itr = WideningDecisions.find(InstOnVF);
1409     if (Itr == WideningDecisions.end())
1410       return CM_Unknown;
1411     return Itr->second.first;
1412   }
1413 
1414   /// Return the vectorization cost for the given instruction \p I and vector
1415   /// width \p VF.
1416   InstructionCost getWideningCost(Instruction *I, ElementCount VF) {
1417     assert(VF.isVector() && "Expected VF >=2");
1418     std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
1419     assert(WideningDecisions.contains(InstOnVF) &&
1420            "The cost is not calculated");
1421     return WideningDecisions[InstOnVF].second;
1422   }
1423 
1424   struct CallWideningDecision {
1425     InstWidening Kind;
1426     Function *Variant;
1427     Intrinsic::ID IID;
1428     std::optional<unsigned> MaskPos;
1429     InstructionCost Cost;
1430   };
1431 
1432   void setCallWideningDecision(CallInst *CI, ElementCount VF, InstWidening Kind,
1433                                Function *Variant, Intrinsic::ID IID,
1434                                std::optional<unsigned> MaskPos,
1435                                InstructionCost Cost) {
1436     assert(!VF.isScalar() && "Expected vector VF");
1437     CallWideningDecisions[std::make_pair(CI, VF)] = {Kind, Variant, IID,
1438                                                      MaskPos, Cost};
1439   }
1440 
1441   CallWideningDecision getCallWideningDecision(CallInst *CI,
1442                                                ElementCount VF) const {
1443     assert(!VF.isScalar() && "Expected vector VF");
1444     return CallWideningDecisions.at(std::make_pair(CI, VF));
1445   }
1446 
1447   /// Return True if instruction \p I is an optimizable truncate whose operand
1448   /// is an induction variable. Such a truncate will be removed by adding a new
1449   /// induction variable with the destination type.
1450   bool isOptimizableIVTruncate(Instruction *I, ElementCount VF) {
1451     // If the instruction is not a truncate, return false.
1452     auto *Trunc = dyn_cast<TruncInst>(I);
1453     if (!Trunc)
1454       return false;
1455 
1456     // Get the source and destination types of the truncate.
1457     Type *SrcTy = ToVectorTy(cast<CastInst>(I)->getSrcTy(), VF);
1458     Type *DestTy = ToVectorTy(cast<CastInst>(I)->getDestTy(), VF);
1459 
1460     // If the truncate is free for the given types, return false. Replacing a
1461     // free truncate with an induction variable would add an induction variable
1462     // update instruction to each iteration of the loop. We exclude from this
1463     // check the primary induction variable since it will need an update
1464     // instruction regardless.
1465     Value *Op = Trunc->getOperand(0);
1466     if (Op != Legal->getPrimaryInduction() && TTI.isTruncateFree(SrcTy, DestTy))
1467       return false;
1468 
1469     // If the truncated value is not an induction variable, return false.
1470     return Legal->isInductionPhi(Op);
1471   }
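  // Editor's illustrative example (a hedged sketch): in
  //
  //   for (int64_t i = 0; i < n; ++i)
  //     a[i] = (int32_t)i;
  //
  // the trunc of the i64 induction to i32 can be removed by introducing a
  // separate i32 induction variable, provided the truncate is not already free
  // for the target (per the TTI.isTruncateFree check above).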
1472 
1473   /// Collects the instructions to scalarize for each predicated instruction in
1474   /// the loop.
1475   void collectInstsToScalarize(ElementCount VF);
1476 
1477   /// Collect Uniform and Scalar values for the given \p VF.
1478   /// The sets depend on CM decision for Load/Store instructions
1479   /// that may be vectorized as interleave, gather-scatter or scalarized.
1480   /// Also make a decision on what to do about call instructions in the loop
1481   /// at that VF -- scalarize, call a known vector routine, or call a
1482   /// vector intrinsic.
1483   void collectUniformsAndScalars(ElementCount VF) {
1484     // Do the analysis once.
1485     if (VF.isScalar() || Uniforms.contains(VF))
1486       return;
1487     setCostBasedWideningDecision(VF);
1488     setVectorizedCallDecision(VF);
1489     collectLoopUniforms(VF);
1490     collectLoopScalars(VF);
1491   }
1492 
1493   /// Returns true if the target machine supports masked store operation
1494   /// for the given \p DataType and kind of access to \p Ptr.
1495   bool isLegalMaskedStore(Type *DataType, Value *Ptr, Align Alignment) const {
1496     return Legal->isConsecutivePtr(DataType, Ptr) &&
1497            TTI.isLegalMaskedStore(DataType, Alignment);
1498   }
1499 
1500   /// Returns true if the target machine supports masked load operation
1501   /// for the given \p DataType and kind of access to \p Ptr.
1502   bool isLegalMaskedLoad(Type *DataType, Value *Ptr, Align Alignment) const {
1503     return Legal->isConsecutivePtr(DataType, Ptr) &&
1504            TTI.isLegalMaskedLoad(DataType, Alignment);
1505   }
1506 
1507   /// Returns true if the target machine can represent \p V as a masked gather
1508   /// or scatter operation.
1509   bool isLegalGatherOrScatter(Value *V, ElementCount VF) {
1510     bool LI = isa<LoadInst>(V);
1511     bool SI = isa<StoreInst>(V);
1512     if (!LI && !SI)
1513       return false;
1514     auto *Ty = getLoadStoreType(V);
1515     Align Align = getLoadStoreAlignment(V);
1516     if (VF.isVector())
1517       Ty = VectorType::get(Ty, VF);
1518     return (LI && TTI.isLegalMaskedGather(Ty, Align)) ||
1519            (SI && TTI.isLegalMaskedScatter(Ty, Align));
1520   }
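  // Editor's illustrative example (a hedged sketch): an indexed access such as
  //
  //   sum += a[idx[i]];
  //
  // has no consecutive pointer, so at VF > 1 it can only be widened if the
  // target reports masked gathers (or scatters, for stores) as legal for the
  // corresponding vector type; otherwise it is scalarized.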
1521 
1522   /// Returns true if the target machine supports all of the reduction
1523   /// variables found for the given VF.
1524   bool canVectorizeReductions(ElementCount VF) const {
1525     return (all_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool {
1526       const RecurrenceDescriptor &RdxDesc = Reduction.second;
1527       return TTI.isLegalToVectorizeReduction(RdxDesc, VF);
1528     }));
1529   }
1530 
1531   /// Given costs for both strategies, return true if the scalar predication
1532   /// lowering should be used for div/rem.  This incorporates an override
1533   /// option so it is not simply a cost comparison.
1534   bool isDivRemScalarWithPredication(InstructionCost ScalarCost,
1535                                      InstructionCost SafeDivisorCost) const {
1536     switch (ForceSafeDivisor) {
1537     case cl::BOU_UNSET:
1538       return ScalarCost < SafeDivisorCost;
1539     case cl::BOU_TRUE:
1540       return false;
1541     case cl::BOU_FALSE:
1542       return true;
1543     };
1544     llvm_unreachable("impossible case value");
1545   }
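  // Editor's illustrative example (a hedged sketch): for
  //
  //   if (b[i] != 0) q[i] = a[i] / b[i];
  //
  // the two strategies compared above are (a) scalarizing the division behind
  // per-lane predication and (b) selecting a known-safe divisor (e.g. 1) for
  // the masked-off lanes and issuing one wide division. The cheaper option is
  // chosen unless the ForceSafeDivisor option pins the decision.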
1546 
1547   /// Returns true if \p I is an instruction which requires predication and
1548   /// for which our chosen predication strategy is scalarization (i.e. we
1549   /// don't have an alternate strategy such as masking available).
1550   /// \p VF is the vectorization factor that will be used to vectorize \p I.
1551   bool isScalarWithPredication(Instruction *I, ElementCount VF) const;
1552 
1553   /// Returns true if \p I is an instruction that needs to be predicated
1554   /// at runtime.  The result is independent of the predication mechanism.
1555   /// Superset of instructions that return true for isScalarWithPredication.
1556   bool isPredicatedInst(Instruction *I) const;
1557 
1558   /// Return the costs for our two available strategies for lowering a
1559   /// div/rem operation which requires speculating at least one lane.
1560   /// First result is for scalarization (will be invalid for scalable
1561   /// vectors); second is for the safe-divisor strategy.
1562   std::pair<InstructionCost, InstructionCost>
1563   getDivRemSpeculationCost(Instruction *I,
1564                            ElementCount VF) const;
1565 
1566   /// Returns true if \p I is a memory instruction with consecutive memory
1567   /// access that can be widened.
1568   bool memoryInstructionCanBeWidened(Instruction *I, ElementCount VF);
1569 
1570   /// Returns true if \p I is a memory instruction in an interleaved-group
1571   /// of memory accesses that can be vectorized with wide vector loads/stores
1572   /// and shuffles.
1573   bool interleavedAccessCanBeWidened(Instruction *I, ElementCount VF);
1574 
1575   /// Check if \p Instr belongs to any interleaved access group.
1576   bool isAccessInterleaved(Instruction *Instr) {
1577     return InterleaveInfo.isInterleaved(Instr);
1578   }
1579 
1580   /// Get the interleaved access group that \p Instr belongs to.
1581   const InterleaveGroup<Instruction> *
1582   getInterleavedAccessGroup(Instruction *Instr) {
1583     return InterleaveInfo.getInterleaveGroup(Instr);
1584   }
1585 
1586   /// Returns true if we're required to use a scalar epilogue for at least
1587   /// the final iteration of the original loop.
1588   bool requiresScalarEpilogue(bool IsVectorizing) const {
1589     if (!isScalarEpilogueAllowed())
1590       return false;
1591     // If we might exit from anywhere but the latch, we must run the exiting
1592     // iteration in scalar form.
1593     if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch())
1594       return true;
1595     return IsVectorizing && InterleaveInfo.requiresScalarEpilogue();
1596   }
1597 
1598   /// Returns true if we're required to use a scalar epilogue for at least
1599   /// the final iteration of the original loop for all VFs in \p Range.
1600   /// A scalar epilogue must either be required for all VFs in \p Range or for
1601   /// none.
1602   bool requiresScalarEpilogue(VFRange Range) const {
1603     auto RequiresScalarEpilogue = [this](ElementCount VF) {
1604       return requiresScalarEpilogue(VF.isVector());
1605     };
1606     bool IsRequired = all_of(Range, RequiresScalarEpilogue);
1607     assert(
1608         (IsRequired || none_of(Range, RequiresScalarEpilogue)) &&
1609         "all VFs in range must agree on whether a scalar epilogue is required");
1610     return IsRequired;
1611   }
1612 
1613   /// Returns true if a scalar epilogue is not allowed due to optsize or a
1614   /// loop hint annotation.
1615   bool isScalarEpilogueAllowed() const {
1616     return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed;
1617   }
1618 
1619   /// Returns the TailFoldingStyle that is best for the current loop.
1620   TailFoldingStyle
1621   getTailFoldingStyle(bool IVUpdateMayOverflow = true) const {
1622     if (!CanFoldTailByMasking)
1623       return TailFoldingStyle::None;
1624 
1625     if (ForceTailFoldingStyle.getNumOccurrences())
1626       return ForceTailFoldingStyle;
1627 
1628     return TTI.getPreferredTailFoldingStyle(IVUpdateMayOverflow);
1629   }
1630 
1631   /// Returns true if all loop blocks should be masked to fold tail loop.
1632   bool foldTailByMasking() const {
1633     return getTailFoldingStyle() != TailFoldingStyle::None;
1634   }
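  // Editor's illustrative example (a hedged sketch): with a trip count of 10
  // and VF = 4, a scalar epilogue runs the last 2 iterations in scalar form,
  // whereas folding the tail by masking runs 3 vector iterations, with the
  // final iteration executing under a mask that disables the 2 lanes past the
  // trip count, so no scalar remainder loop is needed.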
1635 
1636   /// Returns true if the instructions in this block requires predication
1637   /// for any reason, e.g. because tail folding now requires a predicate
1638   /// or because the block in the original loop was predicated.
1639   bool blockNeedsPredicationForAnyReason(BasicBlock *BB) const {
1640     return foldTailByMasking() || Legal->blockNeedsPredication(BB);
1641   }
1642 
1643   /// Returns true if the Phi is part of an inloop reduction.
1644   bool isInLoopReduction(PHINode *Phi) const {
1645     return InLoopReductions.contains(Phi);
1646   }
1647 
1648   /// Estimate cost of an intrinsic call instruction CI if it were vectorized
1649   /// with factor VF.  Return the cost of the instruction, including
1650   /// scalarization overhead if it's needed.
1651   InstructionCost getVectorIntrinsicCost(CallInst *CI, ElementCount VF) const;
1652 
1653   /// Estimate cost of a call instruction CI if it were vectorized with factor
1654   /// VF. Return the cost of the instruction, including scalarization overhead
1655   /// if it's needed.
1656   InstructionCost getVectorCallCost(CallInst *CI, ElementCount VF) const;
1657 
1658   /// Invalidates decisions already taken by the cost model.
1659   void invalidateCostModelingDecisions() {
1660     WideningDecisions.clear();
1661     CallWideningDecisions.clear();
1662     Uniforms.clear();
1663     Scalars.clear();
1664   }
1665 
1666   /// The vectorization cost is a combination of the cost itself and a boolean
1667   /// indicating whether any of the contributing operations will actually
1668   /// operate on vector values after type legalization in the backend. If this
1669   /// latter value is false, then all operations will be scalarized (i.e. no
1670   /// vectorization has actually taken place).
1671   using VectorizationCostTy = std::pair<InstructionCost, bool>;
1672 
1673   /// Returns the expected execution cost. The unit of the cost does
1674   /// not matter because we use the 'cost' units to compare different
1675   /// vector widths. The cost that is returned is *not* normalized by
1676   /// the factor width. If \p Invalid is not nullptr, this function
1677   /// will add a pair(Instruction*, ElementCount) to \p Invalid for
1678   /// each instruction that has an Invalid cost for the given VF.
1679   VectorizationCostTy
1680   expectedCost(ElementCount VF,
1681                SmallVectorImpl<InstructionVFPair> *Invalid = nullptr);
1682 
1683   bool hasPredStores() const { return NumPredStores > 0; }
1684 
1685   /// Returns true if epilogue vectorization is considered profitable, and
1686   /// false otherwise.
1687   /// \p VF is the vectorization factor chosen for the original loop.
1688   bool isEpilogueVectorizationProfitable(const ElementCount VF) const;
1689 
1690 private:
1691   unsigned NumPredStores = 0;
1692 
1693   /// \return An upper bound for the vectorization factors for both
1694   /// fixed and scalable vectorization, where the minimum-known number of
1695   /// elements is a power-of-2 larger than zero. If scalable vectorization is
1696   /// disabled or unsupported, then the scalable part will be equal to
1697   /// ElementCount::getScalable(0).
1698   FixedScalableVFPair computeFeasibleMaxVF(unsigned MaxTripCount,
1699                                            ElementCount UserVF,
1700                                            bool FoldTailByMasking);
1701 
1702   /// \return the maximized element count based on the targets vector
1703   /// registers and the loop trip-count, but limited to a maximum safe VF.
1704   /// This is a helper function of computeFeasibleMaxVF.
1705   ElementCount getMaximizedVFForTarget(unsigned MaxTripCount,
1706                                        unsigned SmallestType,
1707                                        unsigned WidestType,
1708                                        ElementCount MaxSafeVF,
1709                                        bool FoldTailByMasking);
1710 
1711   /// \return the maximum legal scalable VF, based on the safe max number
1712   /// of elements.
1713   ElementCount getMaxLegalScalableVF(unsigned MaxSafeElements);
1714 
1715   /// Returns the execution time cost of an instruction for a given vector
1716   /// width. Vector width of one means scalar.
1717   VectorizationCostTy getInstructionCost(Instruction *I, ElementCount VF);
1718 
1719   /// The cost-computation logic from getInstructionCost which provides
1720   /// the vector type as an output parameter.
1721   InstructionCost getInstructionCost(Instruction *I, ElementCount VF,
1722                                      Type *&VectorTy);
1723 
1724   /// Return the cost of instructions in an inloop reduction pattern, if I is
1725   /// part of that pattern.
1726   std::optional<InstructionCost>
1727   getReductionPatternCost(Instruction *I, ElementCount VF, Type *VectorTy,
1728                           TTI::TargetCostKind CostKind) const;
1729 
1730   /// Calculate vectorization cost of memory instruction \p I.
1731   InstructionCost getMemoryInstructionCost(Instruction *I, ElementCount VF);
1732 
1733   /// The cost computation for scalarized memory instruction.
1734   InstructionCost getMemInstScalarizationCost(Instruction *I, ElementCount VF);
1735 
1736   /// The cost computation for interleaving group of memory instructions.
1737   InstructionCost getInterleaveGroupCost(Instruction *I, ElementCount VF);
1738 
1739   /// The cost computation for Gather/Scatter instruction.
1740   InstructionCost getGatherScatterCost(Instruction *I, ElementCount VF);
1741 
1742   /// The cost computation for widening instruction \p I with consecutive
1743   /// memory access.
1744   InstructionCost getConsecutiveMemOpCost(Instruction *I, ElementCount VF);
1745 
1746   /// The cost calculation for Load/Store instruction \p I with uniform pointer -
1747   /// Load: scalar load + broadcast.
1748   /// Store: scalar store + (loop invariant value stored? 0 : extract of last
1749   /// element)
1750   InstructionCost getUniformMemOpCost(Instruction *I, ElementCount VF);
1751 
1752   /// Estimate the overhead of scalarizing an instruction. This is a
1753   /// convenience wrapper for the type-based getScalarizationOverhead API.
1754   InstructionCost getScalarizationOverhead(Instruction *I, ElementCount VF,
1755                                            TTI::TargetCostKind CostKind) const;
1756 
1757   /// Returns true if an artificially high cost for emulated masked memrefs
1758   /// should be used.
1759   bool useEmulatedMaskMemRefHack(Instruction *I, ElementCount VF);
1760 
1761   /// Map of scalar integer values to the smallest bitwidth they can be legally
1762   /// represented as. The vector equivalents of these values should be truncated
1763   /// to this type.
1764   MapVector<Instruction *, uint64_t> MinBWs;
1765 
1766   /// A type representing the costs for instructions if they were to be
1767   /// scalarized rather than vectorized. The entries are Instruction-Cost
1768   /// pairs.
1769   using ScalarCostsTy = DenseMap<Instruction *, InstructionCost>;
1770 
1771   /// A set containing all BasicBlocks that are known to be present after
1772   /// vectorization as a predicated block.
1773   DenseMap<ElementCount, SmallPtrSet<BasicBlock *, 4>>
1774       PredicatedBBsAfterVectorization;
1775 
1776   /// Records whether it is allowed to have the original scalar loop execute at
1777   /// least once. This may be needed as a fallback loop in case runtime
1778   /// aliasing/dependence checks fail, or to handle the tail/remainder
1779   /// iterations when the trip count is unknown or doesn't divide by the VF,
1780   /// or as a peel-loop to handle gaps in interleave-groups.
1781   /// Under optsize and when the trip count is very small we don't allow any
1782   /// iterations to execute in the scalar loop.
1783   ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
1784 
1785   /// All blocks of loop are to be masked to fold tail of scalar iterations.
1786   bool CanFoldTailByMasking = false;
1787 
1788   /// A map holding scalar costs for different vectorization factors. The
1789   /// presence of a cost for an instruction in the mapping indicates that the
1790   /// instruction will be scalarized when vectorizing with the associated
1791   /// vectorization factor. The entries are VF-ScalarCostTy pairs.
1792   DenseMap<ElementCount, ScalarCostsTy> InstsToScalarize;
1793 
1794   /// Holds the instructions known to be uniform after vectorization.
1795   /// The data is collected per VF.
1796   DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Uniforms;
1797 
1798   /// Holds the instructions known to be scalar after vectorization.
1799   /// The data is collected per VF.
1800   DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Scalars;
1801 
1802   /// Holds the instructions (address computations) that are forced to be
1803   /// scalarized.
1804   DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> ForcedScalars;
1805 
1806   /// PHINodes of the reductions that should be expanded in-loop.
1807   SmallPtrSet<PHINode *, 4> InLoopReductions;
1808 
1809   /// A Map of inloop reduction operations and their immediate chain operand.
1810   /// FIXME: This can be removed once reductions can be costed correctly in
1811   /// VPlan. This was added to allow quick lookup of the inloop operations.
1812   DenseMap<Instruction *, Instruction *> InLoopReductionImmediateChains;
1813 
1814   /// Returns the expected difference in cost from scalarizing the expression
1815   /// feeding a predicated instruction \p PredInst. The instructions to
1816   /// scalarize and their scalar costs are collected in \p ScalarCosts. A
1817   /// non-negative return value implies the expression will be scalarized.
1818   /// Currently, only single-use chains are considered for scalarization.
1819   InstructionCost computePredInstDiscount(Instruction *PredInst,
1820                                           ScalarCostsTy &ScalarCosts,
1821                                           ElementCount VF);
1822 
1823   /// Collect the instructions that are uniform after vectorization. An
1824   /// instruction is uniform if we represent it with a single scalar value in
1825   /// the vectorized loop corresponding to each vector iteration. Examples of
1826   /// uniform instructions include pointer operands of consecutive or
1827   /// interleaved memory accesses. Note that although uniformity implies an
1828   /// instruction will be scalar, the reverse is not true. In general, a
1829   /// scalarized instruction will be represented by VF scalar values in the
1830   /// vectorized loop, each corresponding to an iteration of the original
1831   /// scalar loop.
1832   void collectLoopUniforms(ElementCount VF);
1833 
1834   /// Collect the instructions that are scalar after vectorization. An
1835   /// instruction is scalar if it is known to be uniform or will be scalarized
1836   /// during vectorization. collectLoopScalars should only add non-uniform nodes
1837   /// to the list if they are used by a load/store instruction that is marked as
1838   /// CM_Scalarize. Non-uniform scalarized instructions will be represented by
1839   /// VF values in the vectorized loop, each corresponding to an iteration of
1840   /// the original scalar loop.
1841   void collectLoopScalars(ElementCount VF);
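  // Editor's illustrative example (a hedged sketch): for a consecutive access
  // a[i], the address computation is uniform-after-vectorization, i.e. one
  // scalar address per vector iteration, while an address feeding an access
  // decided as CM_Scalarize is merely scalar-after-vectorization and is
  // replicated VF times, once per lane.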
1842 
1843   /// Keeps cost model vectorization decision and cost for instructions.
1844   /// Right now it is used for memory instructions only.
1845   using DecisionList = DenseMap<std::pair<Instruction *, ElementCount>,
1846                                 std::pair<InstWidening, InstructionCost>>;
1847 
1848   DecisionList WideningDecisions;
1849 
1850   using CallDecisionList =
1851       DenseMap<std::pair<CallInst *, ElementCount>, CallWideningDecision>;
1852 
1853   CallDecisionList CallWideningDecisions;
1854 
1855   /// Returns true if \p V is expected to be vectorized and it needs to be
1856   /// extracted.
1857   bool needsExtract(Value *V, ElementCount VF) const {
1858     Instruction *I = dyn_cast<Instruction>(V);
1859     if (VF.isScalar() || !I || !TheLoop->contains(I) ||
1860         TheLoop->isLoopInvariant(I))
1861       return false;
1862 
1863     // Assume we can vectorize V (and hence we need extraction) if the
1864     // scalars are not computed yet. This can happen, because it is called
1865     // via getScalarizationOverhead from setCostBasedWideningDecision, before
1866     // the scalars are collected. That should be a safe assumption in most
1867     // cases, because we check if the operands have vectorizable types
1868     // beforehand in LoopVectorizationLegality.
1869     return !Scalars.contains(VF) || !isScalarAfterVectorization(I, VF);
1870   };
1871 
1872   /// Returns a range containing only operands needing to be extracted.
1873   SmallVector<Value *, 4> filterExtractingOperands(Instruction::op_range Ops,
1874                                                    ElementCount VF) const {
1875     return SmallVector<Value *, 4>(make_filter_range(
1876         Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); }));
1877   }
1878 
1879 public:
1880   /// The loop that we evaluate.
1881   Loop *TheLoop;
1882 
1883   /// Predicated scalar evolution analysis.
1884   PredicatedScalarEvolution &PSE;
1885 
1886   /// Loop Info analysis.
1887   LoopInfo *LI;
1888 
1889   /// Vectorization legality.
1890   LoopVectorizationLegality *Legal;
1891 
1892   /// Vector target information.
1893   const TargetTransformInfo &TTI;
1894 
1895   /// Target Library Info.
1896   const TargetLibraryInfo *TLI;
1897 
1898   /// Demanded bits analysis.
1899   DemandedBits *DB;
1900 
1901   /// Assumption cache.
1902   AssumptionCache *AC;
1903 
1904   /// Interface to emit optimization remarks.
1905   OptimizationRemarkEmitter *ORE;
1906 
1907   const Function *TheFunction;
1908 
1909   /// Loop Vectorize Hint.
1910   const LoopVectorizeHints *Hints;
1911 
1912   /// The interleave access information contains groups of interleaved accesses
1913   /// with the same stride and close to each other.
1914   InterleavedAccessInfo &InterleaveInfo;
1915 
1916   /// Values to ignore in the cost model.
1917   SmallPtrSet<const Value *, 16> ValuesToIgnore;
1918 
1919   /// Values to ignore in the cost model when VF > 1.
1920   SmallPtrSet<const Value *, 16> VecValuesToIgnore;
1921 
1922   /// All element types found in the loop.
1923   SmallPtrSet<Type *, 16> ElementTypesInLoop;
1924 };
1925 } // end namespace llvm
1926 
1927 namespace {
1928 /// Helper struct to manage generating runtime checks for vectorization.
1929 ///
1930 /// The runtime checks are created up-front in temporary blocks to allow
1931 /// estimating their cost more accurately, and are un-linked from the existing
1932 /// IR. After deciding to vectorize, the checks are moved back. If deciding not
1933 /// to vectorize, the temporary blocks are completely removed.
1934 class GeneratedRTChecks {
1935   /// Basic block which contains the generated SCEV checks, if any.
1936   BasicBlock *SCEVCheckBlock = nullptr;
1937 
1938   /// The value representing the result of the generated SCEV checks. If it is
1939   /// nullptr, either no SCEV checks have been generated or they have been used.
1940   Value *SCEVCheckCond = nullptr;
1941 
1942   /// Basic block which contains the generated memory runtime checks, if any.
1943   BasicBlock *MemCheckBlock = nullptr;
1944 
1945   /// The value representing the result of the generated memory runtime checks.
1946   /// If it is nullptr, either no memory runtime checks have been generated or
1947   /// they have been used.
1948   Value *MemRuntimeCheckCond = nullptr;
1949 
1950   DominatorTree *DT;
1951   LoopInfo *LI;
1952   TargetTransformInfo *TTI;
1953 
1954   SCEVExpander SCEVExp;
1955   SCEVExpander MemCheckExp;
1956 
1957   bool CostTooHigh = false;
1958   const bool AddBranchWeights;
1959 
1960   Loop *OuterLoop = nullptr;
1961 
1962 public:
1963   GeneratedRTChecks(ScalarEvolution &SE, DominatorTree *DT, LoopInfo *LI,
1964                     TargetTransformInfo *TTI, const DataLayout &DL,
1965                     bool AddBranchWeights)
1966       : DT(DT), LI(LI), TTI(TTI), SCEVExp(SE, DL, "scev.check"),
1967         MemCheckExp(SE, DL, "scev.check"), AddBranchWeights(AddBranchWeights) {}
1968 
1969   /// Generate runtime checks in SCEVCheckBlock and MemCheckBlock, so we can
1970   /// accurately estimate the cost of the runtime checks. The blocks are
1971   /// un-linked from the IR and added back during vector code generation. If
1972   /// there is no vector code generation, the check blocks are removed
1973   /// completely.
1974   void Create(Loop *L, const LoopAccessInfo &LAI,
1975               const SCEVPredicate &UnionPred, ElementCount VF, unsigned IC) {
1976 
1977     // Hard cutoff to limit compile-time increase in case a very large number of
1978     // runtime checks needs to be generated.
1979     // TODO: Skip cutoff if the loop is guaranteed to execute, e.g. due to
1980     // profile info.
1981     CostTooHigh =
1982         LAI.getNumRuntimePointerChecks() > VectorizeMemoryCheckThreshold;
1983     if (CostTooHigh)
1984       return;
1985 
1986     BasicBlock *LoopHeader = L->getHeader();
1987     BasicBlock *Preheader = L->getLoopPreheader();
1988 
1989     // Use SplitBlock to create blocks for SCEV & memory runtime checks to
1990     // ensure the blocks are properly added to LoopInfo & DominatorTree. Those
1991     // may be used by SCEVExpander. The blocks will be un-linked from their
1992     // predecessors and removed from LI & DT at the end of the function.
1993     if (!UnionPred.isAlwaysTrue()) {
1994       SCEVCheckBlock = SplitBlock(Preheader, Preheader->getTerminator(), DT, LI,
1995                                   nullptr, "vector.scevcheck");
1996 
1997       SCEVCheckCond = SCEVExp.expandCodeForPredicate(
1998           &UnionPred, SCEVCheckBlock->getTerminator());
1999     }
2000 
2001     const auto &RtPtrChecking = *LAI.getRuntimePointerChecking();
2002     if (RtPtrChecking.Need) {
2003       auto *Pred = SCEVCheckBlock ? SCEVCheckBlock : Preheader;
2004       MemCheckBlock = SplitBlock(Pred, Pred->getTerminator(), DT, LI, nullptr,
2005                                  "vector.memcheck");
2006 
2007       auto DiffChecks = RtPtrChecking.getDiffChecks();
2008       if (DiffChecks) {
2009         Value *RuntimeVF = nullptr;
2010         MemRuntimeCheckCond = addDiffRuntimeChecks(
2011             MemCheckBlock->getTerminator(), *DiffChecks, MemCheckExp,
2012             [VF, &RuntimeVF](IRBuilderBase &B, unsigned Bits) {
2013               if (!RuntimeVF)
2014                 RuntimeVF = getRuntimeVF(B, B.getIntNTy(Bits), VF);
2015               return RuntimeVF;
2016             },
2017             IC);
2018       } else {
2019         MemRuntimeCheckCond = addRuntimeChecks(
2020             MemCheckBlock->getTerminator(), L, RtPtrChecking.getChecks(),
2021             MemCheckExp, VectorizerParams::HoistRuntimeChecks);
2022       }
2023       assert(MemRuntimeCheckCond &&
2024              "no RT checks generated although RtPtrChecking "
2025              "claimed checks are required");
2026     }
2027 
2028     if (!MemCheckBlock && !SCEVCheckBlock)
2029       return;
2030 
2031     // Unhook the temporary block with the checks, update various places
2032     // accordingly.
2033     if (SCEVCheckBlock)
2034       SCEVCheckBlock->replaceAllUsesWith(Preheader);
2035     if (MemCheckBlock)
2036       MemCheckBlock->replaceAllUsesWith(Preheader);
2037 
2038     if (SCEVCheckBlock) {
2039       SCEVCheckBlock->getTerminator()->moveBefore(Preheader->getTerminator());
2040       new UnreachableInst(Preheader->getContext(), SCEVCheckBlock);
2041       Preheader->getTerminator()->eraseFromParent();
2042     }
2043     if (MemCheckBlock) {
2044       MemCheckBlock->getTerminator()->moveBefore(Preheader->getTerminator());
2045       new UnreachableInst(Preheader->getContext(), MemCheckBlock);
2046       Preheader->getTerminator()->eraseFromParent();
2047     }
2048 
2049     DT->changeImmediateDominator(LoopHeader, Preheader);
2050     if (MemCheckBlock) {
2051       DT->eraseNode(MemCheckBlock);
2052       LI->removeBlock(MemCheckBlock);
2053     }
2054     if (SCEVCheckBlock) {
2055       DT->eraseNode(SCEVCheckBlock);
2056       LI->removeBlock(SCEVCheckBlock);
2057     }
2058 
2059     // Outer loop is used as part of the later cost calculations.
2060     OuterLoop = L->getParentLoop();
2061   }
2062 
2063   InstructionCost getCost() {
2064     if (SCEVCheckBlock || MemCheckBlock)
2065       LLVM_DEBUG(dbgs() << "Calculating cost of runtime checks:\n");
2066 
2067     if (CostTooHigh) {
2068       InstructionCost Cost;
2069       Cost.setInvalid();
2070       LLVM_DEBUG(dbgs() << "  number of checks exceeded threshold\n");
2071       return Cost;
2072     }
2073 
2074     InstructionCost RTCheckCost = 0;
2075     if (SCEVCheckBlock)
2076       for (Instruction &I : *SCEVCheckBlock) {
2077         if (SCEVCheckBlock->getTerminator() == &I)
2078           continue;
2079         InstructionCost C =
2080             TTI->getInstructionCost(&I, TTI::TCK_RecipThroughput);
2081         LLVM_DEBUG(dbgs() << "  " << C << "  for " << I << "\n");
2082         RTCheckCost += C;
2083       }
2084     if (MemCheckBlock) {
2085       InstructionCost MemCheckCost = 0;
2086       for (Instruction &I : *MemCheckBlock) {
2087         if (MemCheckBlock->getTerminator() == &I)
2088           continue;
2089         InstructionCost C =
2090             TTI->getInstructionCost(&I, TTI::TCK_RecipThroughput);
2091         LLVM_DEBUG(dbgs() << "  " << C << "  for " << I << "\n");
2092         MemCheckCost += C;
2093       }
2094 
2095       // If the runtime memory checks are being created inside an outer loop
2096       // we should find out if these checks are outer loop invariant. If so,
2097       // the checks will likely be hoisted out and so the effective cost will
2098       // reduce according to the outer loop trip count.
2099       if (OuterLoop) {
2100         ScalarEvolution *SE = MemCheckExp.getSE();
2101         // TODO: If profitable, we could refine this further by analysing every
2102         // individual memory check, since there could be a mixture of loop
2103         // variant and invariant checks that mean the final condition is
2104         // variant.
2105         const SCEV *Cond = SE->getSCEV(MemRuntimeCheckCond);
2106         if (SE->isLoopInvariant(Cond, OuterLoop)) {
2107           // It seems reasonable to assume that we can reduce the effective
2108           // cost of the checks even when we know nothing about the trip
2109           // count. Assume that the outer loop executes at least twice.
2110           unsigned BestTripCount = 2;
2111 
2112           // If exact trip count is known use that.
2113           if (unsigned SmallTC = SE->getSmallConstantTripCount(OuterLoop))
2114             BestTripCount = SmallTC;
2115           else if (LoopVectorizeWithBlockFrequency) {
2116             // Else use profile data if available.
2117             if (auto EstimatedTC = getLoopEstimatedTripCount(OuterLoop))
2118               BestTripCount = *EstimatedTC;
2119           }
2120 
2121           InstructionCost NewMemCheckCost = MemCheckCost / BestTripCount;
2122 
2123           // Let's ensure the cost is always at least 1.
2124           NewMemCheckCost = std::max(*NewMemCheckCost.getValue(),
2125                                      (InstructionCost::CostType)1);
2126 
2127           LLVM_DEBUG(dbgs()
2128                      << "We expect runtime memory checks to be hoisted "
2129                      << "out of the outer loop. Cost reduced from "
2130                      << MemCheckCost << " to " << NewMemCheckCost << '\n');
2131 
2132           MemCheckCost = NewMemCheckCost;
2133         }
2134       }
2135 
2136       RTCheckCost += MemCheckCost;
2137     }
2138 
2139     if (SCEVCheckBlock || MemCheckBlock)
2140       LLVM_DEBUG(dbgs() << "Total cost of runtime checks: " << RTCheckCost
2141                         << "\n");
2142 
2143     return RTCheckCost;
2144   }
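  // Editor's worked example (a hedged sketch): if MemCheckCost is 8 and the
  // outer loop's best known trip count is 4, the loop-invariant checks are
  // assumed to be hoisted, and the effective cost becomes 8 / 4 = 2 (clamped
  // to a minimum of 1 by the code above).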
2145 
2146   /// Remove the created SCEV & memory runtime check blocks & instructions, if
2147   /// unused.
2148   ~GeneratedRTChecks() {
2149     SCEVExpanderCleaner SCEVCleaner(SCEVExp);
2150     SCEVExpanderCleaner MemCheckCleaner(MemCheckExp);
2151     if (!SCEVCheckCond)
2152       SCEVCleaner.markResultUsed();
2153 
2154     if (!MemRuntimeCheckCond)
2155       MemCheckCleaner.markResultUsed();
2156 
2157     if (MemRuntimeCheckCond) {
2158       auto &SE = *MemCheckExp.getSE();
2159       // Memory runtime check generation creates compares that use expanded
2160       // values. Remove them before running the SCEVExpanderCleaners.
2161       for (auto &I : make_early_inc_range(reverse(*MemCheckBlock))) {
2162         if (MemCheckExp.isInsertedInstruction(&I))
2163           continue;
2164         SE.forgetValue(&I);
2165         I.eraseFromParent();
2166       }
2167     }
2168     MemCheckCleaner.cleanup();
2169     SCEVCleaner.cleanup();
2170 
2171     if (SCEVCheckCond)
2172       SCEVCheckBlock->eraseFromParent();
2173     if (MemRuntimeCheckCond)
2174       MemCheckBlock->eraseFromParent();
2175   }
2176 
2177   /// Adds the generated SCEVCheckBlock before \p LoopVectorPreHeader and
2178   /// adjusts the branches to branch to the vector preheader or \p Bypass,
2179   /// depending on the generated condition.
2180   BasicBlock *emitSCEVChecks(BasicBlock *Bypass,
2181                              BasicBlock *LoopVectorPreHeader,
2182                              BasicBlock *LoopExitBlock) {
2183     if (!SCEVCheckCond)
2184       return nullptr;
2185 
2186     Value *Cond = SCEVCheckCond;
2187     // Mark the check as used, to prevent it from being removed during cleanup.
2188     SCEVCheckCond = nullptr;
2189     if (auto *C = dyn_cast<ConstantInt>(Cond))
2190       if (C->isZero())
2191         return nullptr;
2192 
2193     auto *Pred = LoopVectorPreHeader->getSinglePredecessor();
2194 
2195     BranchInst::Create(LoopVectorPreHeader, SCEVCheckBlock);
2196     // Create new preheader for vector loop.
2197     if (OuterLoop)
2198       OuterLoop->addBasicBlockToLoop(SCEVCheckBlock, *LI);
2199 
2200     SCEVCheckBlock->getTerminator()->eraseFromParent();
2201     SCEVCheckBlock->moveBefore(LoopVectorPreHeader);
2202     Pred->getTerminator()->replaceSuccessorWith(LoopVectorPreHeader,
2203                                                 SCEVCheckBlock);
2204 
2205     DT->addNewBlock(SCEVCheckBlock, Pred);
2206     DT->changeImmediateDominator(LoopVectorPreHeader, SCEVCheckBlock);
2207 
2208     BranchInst &BI = *BranchInst::Create(Bypass, LoopVectorPreHeader, Cond);
2209     if (AddBranchWeights)
2210       setBranchWeights(BI, SCEVCheckBypassWeights);
2211     ReplaceInstWithInst(SCEVCheckBlock->getTerminator(), &BI);
2212     return SCEVCheckBlock;
2213   }
2214 
2215   /// Adds the generated MemCheckBlock before \p LoopVectorPreHeader and adjusts
2216   /// the branches to branch to the vector preheader or \p Bypass, depending on
2217   /// the generated condition.
2218   BasicBlock *emitMemRuntimeChecks(BasicBlock *Bypass,
2219                                    BasicBlock *LoopVectorPreHeader) {
2220     // Check if we generated code that checks at runtime whether arrays overlap.
2221     if (!MemRuntimeCheckCond)
2222       return nullptr;
2223 
2224     auto *Pred = LoopVectorPreHeader->getSinglePredecessor();
2225     Pred->getTerminator()->replaceSuccessorWith(LoopVectorPreHeader,
2226                                                 MemCheckBlock);
2227 
2228     DT->addNewBlock(MemCheckBlock, Pred);
2229     DT->changeImmediateDominator(LoopVectorPreHeader, MemCheckBlock);
2230     MemCheckBlock->moveBefore(LoopVectorPreHeader);
2231 
2232     if (OuterLoop)
2233       OuterLoop->addBasicBlockToLoop(MemCheckBlock, *LI);
2234 
2235     BranchInst &BI =
2236         *BranchInst::Create(Bypass, LoopVectorPreHeader, MemRuntimeCheckCond);
2237     if (AddBranchWeights) {
2238       setBranchWeights(BI, MemCheckBypassWeights);
2239     }
2240     ReplaceInstWithInst(MemCheckBlock->getTerminator(), &BI);
2241     MemCheckBlock->getTerminator()->setDebugLoc(
2242         Pred->getTerminator()->getDebugLoc());
2243 
2244     // Mark the check as used, to prevent it from being removed during cleanup.
2245     MemRuntimeCheckCond = nullptr;
2246     return MemCheckBlock;
2247   }
2248 };
2249 } // namespace
2250 
2251 static bool useActiveLaneMask(TailFoldingStyle Style) {
2252   return Style == TailFoldingStyle::Data ||
2253          Style == TailFoldingStyle::DataAndControlFlow ||
2254          Style == TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck;
2255 }
2256 
2257 static bool useActiveLaneMaskForControlFlow(TailFoldingStyle Style) {
2258   return Style == TailFoldingStyle::DataAndControlFlow ||
2259          Style == TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck;
2260 }
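// Editor's illustrative sketch (the exact IR depends on VF and types): with an
// active-lane-mask style, the tail-folded vector loop computes its predicate
// from the induction value and the trip count, roughly
//
//   %mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 %iv, i64 %n)
//
// and applies %mask to the widened memory accesses; the "AndControlFlow"
// variants additionally use the mask to drive the loop's exit condition.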
2261 
2262 // Return true if \p OuterLp is an outer loop annotated with hints for explicit
2263 // vectorization. The loop needs to be annotated with #pragma omp simd
2264 // simdlen(#) or #pragma clang vectorize(enable) vectorize_width(#). If the
2265 // vector length information is not provided, vectorization is not considered
2266 // explicit. Interleave hints are not allowed either. These limitations will be
2267 // relaxed in the future.
2268 // Please note that we are currently forced to abuse the pragma 'clang
2269 // vectorize' semantics. This pragma provides *auto-vectorization hints*
2270 // (i.e., LV must check that vectorization is legal) whereas pragma 'omp simd'
2271 // provides *explicit vectorization hints* (LV can bypass legal checks and
2272 // assume that vectorization is legal). However, both hints are implemented
2273 // using the same metadata (llvm.loop.vectorize, processed by
2274 // LoopVectorizeHints). This will be fixed in the future when the native IR
2275 // representation for pragma 'omp simd' is introduced.
2276 static bool isExplicitVecOuterLoop(Loop *OuterLp,
2277                                    OptimizationRemarkEmitter *ORE) {
2278   assert(!OuterLp->isInnermost() && "This is not an outer loop");
2279   LoopVectorizeHints Hints(OuterLp, true /*DisableInterleaving*/, *ORE);
2280 
2281   // Only outer loops with an explicit vectorization hint are supported.
2282   // Unannotated outer loops are ignored.
2283   if (Hints.getForce() == LoopVectorizeHints::FK_Undefined)
2284     return false;
2285 
2286   Function *Fn = OuterLp->getHeader()->getParent();
2287   if (!Hints.allowVectorization(Fn, OuterLp,
2288                                 true /*VectorizeOnlyWhenForced*/)) {
2289     LLVM_DEBUG(dbgs() << "LV: Loop hints prevent outer loop vectorization.\n");
2290     return false;
2291   }
2292 
2293   if (Hints.getInterleave() > 1) {
2294     // TODO: Interleave support is future work.
2295     LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for "
2296                          "outer loops.\n");
2297     Hints.emitRemarkWithHints();
2298     return false;
2299   }
2300 
2301   return true;
2302 }
2303 
2304 static void collectSupportedLoops(Loop &L, LoopInfo *LI,
2305                                   OptimizationRemarkEmitter *ORE,
2306                                   SmallVectorImpl<Loop *> &V) {
2307   // Collect inner loops and outer loops without irreducible control flow. For
2308   // now, only collect outer loops that have explicit vectorization hints. If we
2309   // are stress testing the VPlan H-CFG construction, we collect the outermost
2310   // loop of every loop nest.
2311   if (L.isInnermost() || VPlanBuildStressTest ||
2312       (EnableVPlanNativePath && isExplicitVecOuterLoop(&L, ORE))) {
2313     LoopBlocksRPO RPOT(&L);
2314     RPOT.perform(LI);
2315     if (!containsIrreducibleCFG<const BasicBlock *>(RPOT, *LI)) {
2316       V.push_back(&L);
2317       // TODO: Collect inner loops inside marked outer loops in case
2318       // vectorization fails for the outer loop. Do not invoke
2319       // 'containsIrreducibleCFG' again for inner loops when the outer loop is
2320       // already known to be reducible. We can use an inherited attribute for
2321       // that.
2322       return;
2323     }
2324   }
2325   for (Loop *InnerL : L)
2326     collectSupportedLoops(*InnerL, LI, ORE, V);
2327 }
2328 
2329 //===----------------------------------------------------------------------===//
2330 // Implementation of LoopVectorizationLegality, InnerLoopVectorizer and
2331 // LoopVectorizationCostModel and LoopVectorizationPlanner.
2332 //===----------------------------------------------------------------------===//
2333 
2334 /// Compute the transformed value of Index at offset StartValue using step
2335 /// StepValue.
2336 /// For integer induction, returns StartValue + Index * StepValue.
2337 /// For pointer induction, returns StartValue[Index * StepValue].
2338 /// FIXME: The newly created binary instructions should contain nsw/nuw
2339 /// flags, which can be found from the original scalar operations.
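     /// As an illustrative sketch (operand names are hypothetical), an integer
     /// induction with StartValue %start, Index %idx and Step 4 expands to
     /// roughly:
     ///   %offset = mul i64 %idx, 4
     ///   %result = add i64 %start, %offset
     /// while a pointer induction emits roughly:
     ///   %result = getelementptr i8, ptr %start, i64 %offset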
2340 static Value *
2341 emitTransformedIndex(IRBuilderBase &B, Value *Index, Value *StartValue,
2342                      Value *Step,
2343                      InductionDescriptor::InductionKind InductionKind,
2344                      const BinaryOperator *InductionBinOp) {
2345   Type *StepTy = Step->getType();
2346   Value *CastedIndex = StepTy->isIntegerTy()
2347                            ? B.CreateSExtOrTrunc(Index, StepTy)
2348                            : B.CreateCast(Instruction::SIToFP, Index, StepTy);
2349   if (CastedIndex != Index) {
2350     CastedIndex->setName(CastedIndex->getName() + ".cast");
2351     Index = CastedIndex;
2352   }
2353 
2354   // Note: the IR at this point is broken. We cannot use SE to create any new
2355   // SCEV and then expand it, hoping that SCEV's simplification will give us
2356   // more optimal code. Unfortunately, attempting to do so on invalid IR may
2357   // lead to various SCEV crashes. So all we can do is use the builder and rely
2358   // on InstCombine for future simplifications. Here we handle some trivial
2359   // cases only.
2360   auto CreateAdd = [&B](Value *X, Value *Y) {
2361     assert(X->getType() == Y->getType() && "Types don't match!");
2362     if (auto *CX = dyn_cast<ConstantInt>(X))
2363       if (CX->isZero())
2364         return Y;
2365     if (auto *CY = dyn_cast<ConstantInt>(Y))
2366       if (CY->isZero())
2367         return X;
2368     return B.CreateAdd(X, Y);
2369   };
2370 
2371   // We allow X to be a vector type, in which case Y will potentially be
2372   // splatted into a vector with the same element count.
2373   auto CreateMul = [&B](Value *X, Value *Y) {
2374     assert(X->getType()->getScalarType() == Y->getType() &&
2375            "Types don't match!");
2376     if (auto *CX = dyn_cast<ConstantInt>(X))
2377       if (CX->isOne())
2378         return Y;
2379     if (auto *CY = dyn_cast<ConstantInt>(Y))
2380       if (CY->isOne())
2381         return X;
2382     VectorType *XVTy = dyn_cast<VectorType>(X->getType());
2383     if (XVTy && !isa<VectorType>(Y->getType()))
2384       Y = B.CreateVectorSplat(XVTy->getElementCount(), Y);
2385     return B.CreateMul(X, Y);
2386   };
2387 
2388   switch (InductionKind) {
2389   case InductionDescriptor::IK_IntInduction: {
2390     assert(!isa<VectorType>(Index->getType()) &&
2391            "Vector indices not supported for integer inductions yet");
2392     assert(Index->getType() == StartValue->getType() &&
2393            "Index type does not match StartValue type");
2394     if (isa<ConstantInt>(Step) && cast<ConstantInt>(Step)->isMinusOne())
2395       return B.CreateSub(StartValue, Index);
2396     auto *Offset = CreateMul(Index, Step);
2397     return CreateAdd(StartValue, Offset);
2398   }
2399   case InductionDescriptor::IK_PtrInduction:
2400     return B.CreatePtrAdd(StartValue, CreateMul(Index, Step));
2401   case InductionDescriptor::IK_FpInduction: {
2402     assert(!isa<VectorType>(Index->getType()) &&
2403            "Vector indices not supported for FP inductions yet");
2404     assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value");
2405     assert(InductionBinOp &&
2406            (InductionBinOp->getOpcode() == Instruction::FAdd ||
2407             InductionBinOp->getOpcode() == Instruction::FSub) &&
2408            "Original bin op should be defined for FP induction");
2409 
2410     Value *MulExp = B.CreateFMul(Step, Index);
2411     return B.CreateBinOp(InductionBinOp->getOpcode(), StartValue, MulExp,
2412                          "induction");
2413   }
2414   case InductionDescriptor::IK_NoInduction:
2415     return nullptr;
2416   }
2417   llvm_unreachable("invalid enum");
2418 }
2419 
2420 std::optional<unsigned> getMaxVScale(const Function &F,
2421                                      const TargetTransformInfo &TTI) {
2422   if (std::optional<unsigned> MaxVScale = TTI.getMaxVScale())
2423     return MaxVScale;
2424 
2425   if (F.hasFnAttribute(Attribute::VScaleRange))
2426     return F.getFnAttribute(Attribute::VScaleRange).getVScaleRangeMax();
2427 
2428   return std::nullopt;
2429 }
2430 
2431 /// For the given VF and UF and maximum trip count computed for the loop, return
2432 /// whether the induction variable might overflow in the vectorized loop. If not,
2433 /// then we know a runtime overflow check always evaluates to false and can be
2434 /// removed.
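     /// Worked example with hypothetical numbers: for an i32 widest induction
     /// type, a known maximum trip count of 1000 and VF * UF = 8, the test below
     /// becomes (0xFFFFFFFF - 1000) >u 8, which holds, so the overflow check can
     /// be dropped.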
2435 static bool isIndvarOverflowCheckKnownFalse(
2436     const LoopVectorizationCostModel *Cost,
2437     ElementCount VF, std::optional<unsigned> UF = std::nullopt) {
2438   // Always be conservative if we don't know the exact unroll factor.
2439   unsigned MaxUF = UF ? *UF : Cost->TTI.getMaxInterleaveFactor(VF);
2440 
2441   Type *IdxTy = Cost->Legal->getWidestInductionType();
2442   APInt MaxUIntTripCount = cast<IntegerType>(IdxTy)->getMask();
2443 
2444   // The runtime overflow check is known to be false iff the (max) trip-count
2445   // is known and (max) trip-count + (VF * UF) does not overflow in the type of
2446   // the vector loop induction variable.
2447   if (unsigned TC =
2448           Cost->PSE.getSE()->getSmallConstantMaxTripCount(Cost->TheLoop)) {
2449     uint64_t MaxVF = VF.getKnownMinValue();
2450     if (VF.isScalable()) {
2451       std::optional<unsigned> MaxVScale =
2452           getMaxVScale(*Cost->TheFunction, Cost->TTI);
2453       if (!MaxVScale)
2454         return false;
2455       MaxVF *= *MaxVScale;
2456     }
2457 
2458     return (MaxUIntTripCount - TC).ugt(MaxVF * MaxUF);
2459   }
2460 
2461   return false;
2462 }
2463 
2464 // Return whether we allow using masked interleave-groups (for dealing with
2465 // strided loads/stores that reside in predicated blocks, or for dealing
2466 // with gaps).
2467 static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) {
2468   // If an override option has been passed in for interleaved accesses, use it.
2469   if (EnableMaskedInterleavedMemAccesses.getNumOccurrences() > 0)
2470     return EnableMaskedInterleavedMemAccesses;
2471 
2472   return TTI.enableMaskedInterleavedAccessVectorization();
2473 }
2474 
2475 // Try to vectorize the interleave group that \p Instr belongs to.
2476 //
2477 // E.g. Translate following interleaved load group (factor = 3):
2478 //   for (i = 0; i < N; i+=3) {
2479 //     R = Pic[i];             // Member of index 0
2480 //     G = Pic[i+1];           // Member of index 1
2481 //     B = Pic[i+2];           // Member of index 2
2482 //     ... // do something to R, G, B
2483 //   }
2484 // To:
2485 //   %wide.vec = load <12 x i32>                       ; Read 4 tuples of R,G,B
2486 //   %R.vec = shuffle %wide.vec, poison, <0, 3, 6, 9>   ; R elements
2487 //   %G.vec = shuffle %wide.vec, poison, <1, 4, 7, 10>  ; G elements
2488 //   %B.vec = shuffle %wide.vec, poison, <2, 5, 8, 11>  ; B elements
2489 //
2490 // Or translate following interleaved store group (factor = 3):
2491 //   for (i = 0; i < N; i+=3) {
2492 //     ... do something to R, G, B
2493 //     Pic[i]   = R;           // Member of index 0
2494 //     Pic[i+1] = G;           // Member of index 1
2495 //     Pic[i+2] = B;           // Member of index 2
2496 //   }
2497 // To:
2498 //   %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7>
2499 //   %B_U.vec = shuffle %B.vec, poison, <0, 1, 2, 3, u, u, u, u>
2500 //   %interleaved.vec = shuffle %R_G.vec, %B_U.vec,
2501 //        <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11>    ; Interleave R,G,B elements
2502 //   store <12 x i32> %interleaved.vec              ; Write 4 tuples of R,G,B
2503 void InnerLoopVectorizer::vectorizeInterleaveGroup(
2504     const InterleaveGroup<Instruction> *Group, ArrayRef<VPValue *> VPDefs,
2505     VPTransformState &State, VPValue *Addr, ArrayRef<VPValue *> StoredValues,
2506     VPValue *BlockInMask, bool NeedsMaskForGaps) {
2507   Instruction *Instr = Group->getInsertPos();
2508   const DataLayout &DL = Instr->getModule()->getDataLayout();
2509 
2510   // Prepare for the vector type of the interleaved load/store.
2511   Type *ScalarTy = getLoadStoreType(Instr);
2512   unsigned InterleaveFactor = Group->getFactor();
2513   auto *VecTy = VectorType::get(ScalarTy, VF * InterleaveFactor);
2514 
2515   // Prepare for the new pointers.
2516   SmallVector<Value *, 2> AddrParts;
2517   unsigned Index = Group->getIndex(Instr);
2518 
2519   // TODO: extend the masked interleaved-group support to reversed access.
2520   assert((!BlockInMask || !Group->isReverse()) &&
2521          "Reversed masked interleave-group not supported.");
2522 
2523   Value *Idx;
2524   // If the group is reverse, adjust the index to refer to the last vector lane
2525   // instead of the first. We adjust the index from the first vector lane,
2526   // rather than directly getting the pointer for lane VF - 1, because the
2527   // pointer operand of the interleaved access is supposed to be uniform. For
2528   // uniform instructions, we're only required to generate a value for the
2529   // first vector lane in each unroll iteration.
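       // Illustrative example (a fixed VF is assumed for simplicity): with
       // VF = 4, Factor = 3 and the insert position at member index 1, the
       // reverse case computes Idx = -(((4 - 1) * 3) + 1) = -10, while the
       // forward case simply uses Idx = -1.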
2530   if (Group->isReverse()) {
2531     Value *RuntimeVF = getRuntimeVF(Builder, Builder.getInt32Ty(), VF);
2532     Idx = Builder.CreateSub(RuntimeVF, Builder.getInt32(1));
2533     Idx = Builder.CreateMul(Idx, Builder.getInt32(Group->getFactor()));
2534     Idx = Builder.CreateAdd(Idx, Builder.getInt32(Index));
2535     Idx = Builder.CreateNeg(Idx);
2536   } else
2537     Idx = Builder.getInt32(-Index);
2538 
2539   for (unsigned Part = 0; Part < UF; Part++) {
2540     Value *AddrPart = State.get(Addr, VPIteration(Part, 0));
2541     if (auto *I = dyn_cast<Instruction>(AddrPart))
2542       State.setDebugLocFrom(I->getDebugLoc());
2543 
2544     // Note that the current instruction could be at any member index; we need
2545     // to adjust the address to the member of index 0.
2546     //
2547     // E.g.  a = A[i+1];     // Member of index 1 (Current instruction)
2548     //       b = A[i];       // Member of index 0
2549     // The current pointer points to A[i+1]; adjust it to A[i].
2550     //
2551     // E.g.  A[i+1] = a;     // Member of index 1
2552     //       A[i]   = b;     // Member of index 0
2553     //       A[i+2] = c;     // Member of index 2 (Current instruction)
2554     // The current pointer points to A[i+2]; adjust it to A[i].
2555 
2556     bool InBounds = false;
2557     if (auto *gep = dyn_cast<GetElementPtrInst>(AddrPart->stripPointerCasts()))
2558       InBounds = gep->isInBounds();
2559     AddrPart = Builder.CreateGEP(ScalarTy, AddrPart, Idx, "", InBounds);
2560     AddrParts.push_back(AddrPart);
2561   }
2562 
2563   State.setDebugLocFrom(Instr->getDebugLoc());
2564   Value *PoisonVec = PoisonValue::get(VecTy);
2565 
2566   auto CreateGroupMask = [this, &BlockInMask, &State, &InterleaveFactor](
2567                              unsigned Part, Value *MaskForGaps) -> Value * {
2568     if (VF.isScalable()) {
2569       assert(!MaskForGaps && "Interleaved groups with gaps are not supported.");
2570       assert(InterleaveFactor == 2 &&
2571              "Unsupported deinterleave factor for scalable vectors");
2572       auto *BlockInMaskPart = State.get(BlockInMask, Part);
2573       SmallVector<Value *, 2> Ops = {BlockInMaskPart, BlockInMaskPart};
2574       auto *MaskTy =
2575           VectorType::get(Builder.getInt1Ty(), VF.getKnownMinValue() * 2, true);
2576       return Builder.CreateIntrinsic(
2577           MaskTy, Intrinsic::experimental_vector_interleave2, Ops,
2578           /*FMFSource=*/nullptr, "interleaved.mask");
2579     }
2580 
2581     if (!BlockInMask)
2582       return MaskForGaps;
2583 
2584     Value *BlockInMaskPart = State.get(BlockInMask, Part);
2585     Value *ShuffledMask = Builder.CreateShuffleVector(
2586         BlockInMaskPart,
2587         createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()),
2588         "interleaved.mask");
2589     return MaskForGaps ? Builder.CreateBinOp(Instruction::And, ShuffledMask,
2590                                              MaskForGaps)
2591                        : ShuffledMask;
2592   };
2593 
2594   // Vectorize the interleaved load group.
2595   if (isa<LoadInst>(Instr)) {
2596     Value *MaskForGaps = nullptr;
2597     if (NeedsMaskForGaps) {
2598       MaskForGaps =
2599           createBitMaskForGaps(Builder, VF.getKnownMinValue(), *Group);
2600       assert(MaskForGaps && "Mask for Gaps is required but it is null");
2601     }
2602 
2603     // For each unroll part, create a wide load for the group.
2604     SmallVector<Value *, 2> NewLoads;
2605     for (unsigned Part = 0; Part < UF; Part++) {
2606       Instruction *NewLoad;
2607       if (BlockInMask || MaskForGaps) {
2608         assert(useMaskedInterleavedAccesses(*TTI) &&
2609                "masked interleaved groups are not allowed.");
2610         Value *GroupMask = CreateGroupMask(Part, MaskForGaps);
2611         NewLoad =
2612             Builder.CreateMaskedLoad(VecTy, AddrParts[Part], Group->getAlign(),
2613                                      GroupMask, PoisonVec, "wide.masked.vec");
2614       }
2615       else
2616         NewLoad = Builder.CreateAlignedLoad(VecTy, AddrParts[Part],
2617                                             Group->getAlign(), "wide.vec");
2618       Group->addMetadata(NewLoad);
2619       NewLoads.push_back(NewLoad);
2620     }
2621 
2622     if (VecTy->isScalableTy()) {
2623       assert(InterleaveFactor == 2 &&
2624              "Unsupported deinterleave factor for scalable vectors");
2625 
2626       for (unsigned Part = 0; Part < UF; ++Part) {
2627         // Scalable vectors cannot use arbitrary shufflevectors (only splats),
2628         // so must use intrinsics to deinterleave.
2629         Value *DI = Builder.CreateIntrinsic(
2630             Intrinsic::experimental_vector_deinterleave2, VecTy, NewLoads[Part],
2631             /*FMFSource=*/nullptr, "strided.vec");
2632         unsigned J = 0;
2633         for (unsigned I = 0; I < InterleaveFactor; ++I) {
2634           Instruction *Member = Group->getMember(I);
2635 
2636           if (!Member)
2637             continue;
2638 
2639           Value *StridedVec = Builder.CreateExtractValue(DI, I);
2640           // If this member has a different type, cast the result to it.
2641           if (Member->getType() != ScalarTy) {
2642             VectorType *OtherVTy = VectorType::get(Member->getType(), VF);
2643             StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL);
2644           }
2645 
2646           if (Group->isReverse())
2647             StridedVec = Builder.CreateVectorReverse(StridedVec, "reverse");
2648 
2649           State.set(VPDefs[J], StridedVec, Part);
2650           ++J;
2651         }
2652       }
2653 
2654       return;
2655     }
2656 
2657     // For each member in the group, shuffle out the appropriate data from the
2658     // wide loads.
2659     unsigned J = 0;
2660     for (unsigned I = 0; I < InterleaveFactor; ++I) {
2661       Instruction *Member = Group->getMember(I);
2662 
2663       // Skip the gaps in the group.
2664       if (!Member)
2665         continue;
2666 
2667       auto StrideMask =
2668           createStrideMask(I, InterleaveFactor, VF.getKnownMinValue());
2669       for (unsigned Part = 0; Part < UF; Part++) {
2670         Value *StridedVec = Builder.CreateShuffleVector(
2671             NewLoads[Part], StrideMask, "strided.vec");
2672 
2673         // If this member has a different type, cast the result to it.
2674         if (Member->getType() != ScalarTy) {
2675           assert(!VF.isScalable() && "VF is assumed to be non scalable.");
2676           VectorType *OtherVTy = VectorType::get(Member->getType(), VF);
2677           StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL);
2678         }
2679 
2680         if (Group->isReverse())
2681           StridedVec = Builder.CreateVectorReverse(StridedVec, "reverse");
2682 
2683         State.set(VPDefs[J], StridedVec, Part);
2684       }
2685       ++J;
2686     }
2687     return;
2688   }
2689 
2690   // The subvector type for the current instruction.
2691   auto *SubVT = VectorType::get(ScalarTy, VF);
2692 
2693   // Vectorize the interleaved store group.
2694   Value *MaskForGaps =
2695       createBitMaskForGaps(Builder, VF.getKnownMinValue(), *Group);
2696   assert((!MaskForGaps || useMaskedInterleavedAccesses(*TTI)) &&
2697          "masked interleaved groups are not allowed.");
2698   assert((!MaskForGaps || !VF.isScalable()) &&
2699          "masking gaps for scalable vectors is not yet supported.");
2700   for (unsigned Part = 0; Part < UF; Part++) {
2701     // Collect the stored vector from each member.
2702     SmallVector<Value *, 4> StoredVecs;
2703     unsigned StoredIdx = 0;
2704     for (unsigned i = 0; i < InterleaveFactor; i++) {
2705       assert((Group->getMember(i) || MaskForGaps) &&
2706              "Fail to get a member from an interleaved store group");
2707       Instruction *Member = Group->getMember(i);
2708 
2709       // Skip the gaps in the group.
2710       if (!Member) {
2711         Value *Undef = PoisonValue::get(SubVT);
2712         StoredVecs.push_back(Undef);
2713         continue;
2714       }
2715 
2716       Value *StoredVec = State.get(StoredValues[StoredIdx], Part);
2717       ++StoredIdx;
2718 
2719       if (Group->isReverse())
2720         StoredVec = Builder.CreateVectorReverse(StoredVec, "reverse");
2721 
2722       // If this member has a different type, cast it to a unified type.
2723 
2724       if (StoredVec->getType() != SubVT)
2725         StoredVec = createBitOrPointerCast(StoredVec, SubVT, DL);
2726 
2727       StoredVecs.push_back(StoredVec);
2728     }
2729 
2730     // Interleave all the smaller vectors into one wider vector.
2731     Value *IVec = interleaveVectors(Builder, StoredVecs, "interleaved.vec");
2732     Instruction *NewStoreInstr;
2733     if (BlockInMask || MaskForGaps) {
2734       Value *GroupMask = CreateGroupMask(Part, MaskForGaps);
2735       NewStoreInstr = Builder.CreateMaskedStore(IVec, AddrParts[Part],
2736                                                 Group->getAlign(), GroupMask);
2737     } else
2738       NewStoreInstr =
2739           Builder.CreateAlignedStore(IVec, AddrParts[Part], Group->getAlign());
2740 
2741     Group->addMetadata(NewStoreInstr);
2742   }
2743 }
2744 
2745 void InnerLoopVectorizer::scalarizeInstruction(const Instruction *Instr,
2746                                                VPReplicateRecipe *RepRecipe,
2747                                                const VPIteration &Instance,
2748                                                VPTransformState &State) {
2749   assert(!Instr->getType()->isAggregateType() && "Can't handle vectors");
2750 
2751   // llvm.experimental.noalias.scope.decl intrinsics must only be duplicated for
2752   // the first lane and part.
2753   if (isa<NoAliasScopeDeclInst>(Instr))
2754     if (!Instance.isFirstIteration())
2755       return;
2756 
2757   // Does this instruction return a value?
2758   bool IsVoidRetTy = Instr->getType()->isVoidTy();
2759 
2760   Instruction *Cloned = Instr->clone();
2761   if (!IsVoidRetTy) {
2762     Cloned->setName(Instr->getName() + ".cloned");
2763 #if !defined(NDEBUG)
2764     // Verify that VPlan type inference results agree with the type of the
2765     // generated values.
2766     assert(State.TypeAnalysis.inferScalarType(RepRecipe) == Cloned->getType() &&
2767            "inferred type and type from generated instructions do not match");
2768 #endif
2769   }
2770 
2771   RepRecipe->setFlags(Cloned);
2772 
2773   if (auto DL = Instr->getDebugLoc())
2774     State.setDebugLocFrom(DL);
2775 
2776   // Replace the operands of the cloned instructions with their scalar
2777   // equivalents in the new loop.
2778   for (const auto &I : enumerate(RepRecipe->operands())) {
2779     auto InputInstance = Instance;
2780     VPValue *Operand = I.value();
2781     if (vputils::isUniformAfterVectorization(Operand))
2782       InputInstance.Lane = VPLane::getFirstLane();
2783     Cloned->setOperand(I.index(), State.get(Operand, InputInstance));
2784   }
2785   State.addNewMetadata(Cloned, Instr);
2786 
2787   // Place the cloned scalar in the new loop.
2788   State.Builder.Insert(Cloned);
2789 
2790   State.set(RepRecipe, Cloned, Instance);
2791 
2792   // If we just cloned a new assumption, add it to the assumption cache.
2793   if (auto *II = dyn_cast<AssumeInst>(Cloned))
2794     AC->registerAssumption(II);
2795 
2796   // End if-block.
2797   bool IfPredicateInstr = RepRecipe->getParent()->getParent()->isReplicator();
2798   if (IfPredicateInstr)
2799     PredicatedInstructions.push_back(Cloned);
2800 }
2801 
2802 Value *
2803 InnerLoopVectorizer::getOrCreateVectorTripCount(BasicBlock *InsertBlock) {
2804   if (VectorTripCount)
2805     return VectorTripCount;
2806 
2807   Value *TC = getTripCount();
2808   IRBuilder<> Builder(InsertBlock->getTerminator());
2809 
2810   Type *Ty = TC->getType();
2811   // This is where we can make the step a runtime constant.
2812   Value *Step = createStepForVF(Builder, Ty, VF, UF);
2813 
2814   // If the tail is to be folded by masking, round the number of iterations N
2815   // up to a multiple of Step instead of rounding down. This is done by first
2816   // adding Step-1 and then rounding down. Note that it's ok if this addition
2817   // overflows: the vector induction variable will eventually wrap to zero given
2818   // that it starts at zero and its Step is a power of two; the loop will then
2819   // exit, with the last early-exit vector comparison also producing all-true.
2820   // For scalable vectors the VF is not guaranteed to be a power of 2, but this
2821   // is accounted for in emitIterationCountCheck that adds an overflow check.
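       // E.g., when folding the tail with a hypothetical TC = 10 and
       // Step = VF * UF = 8, TC becomes 10 + 7 = 17 below, and the rounding
       // that follows yields n.vec = 17 - (17 % 8) = 16, i.e. 10 rounded up to
       // the next multiple of 8.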
2822   if (Cost->foldTailByMasking()) {
2823     assert(isPowerOf2_32(VF.getKnownMinValue() * UF) &&
2824            "VF*UF must be a power of 2 when folding tail by masking");
2825     Value *NumLanes = getRuntimeVF(Builder, Ty, VF * UF);
2826     TC = Builder.CreateAdd(
2827         TC, Builder.CreateSub(NumLanes, ConstantInt::get(Ty, 1)), "n.rnd.up");
2828   }
2829 
2830   // Now we need to generate the expression for the part of the loop that the
2831   // vectorized body will execute. This is equal to N - (N % Step) if scalar
2832   // iterations are not required for correctness, or N - Step, otherwise. Step
2833   // is equal to the vectorization factor (number of SIMD elements) times the
2834   // unroll factor (number of SIMD instructions).
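       // For example (hypothetical values, no tail folding): with TC = 10 and
       // Step = VF * UF = 8, n.mod.vf = 2 and n.vec = 8, so the vector loop
       // covers 8 iterations and the scalar remainder loop handles the other 2.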
2835   Value *R = Builder.CreateURem(TC, Step, "n.mod.vf");
2836 
2837   // There are cases where we *must* run at least one iteration in the remainder
2838   // loop.  See the cost model for when this can happen.  If the step evenly
2839   // divides the trip count, we set the remainder to be equal to the step. If
2840   // the step does not evenly divide the trip count, no adjustment is necessary
2841   // since there will already be scalar iterations. Note that the minimum
2842   // iterations check ensures that N >= Step.
2843   if (Cost->requiresScalarEpilogue(VF.isVector())) {
2844     auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0));
2845     R = Builder.CreateSelect(IsZero, Step, R);
2846   }
2847 
2848   VectorTripCount = Builder.CreateSub(TC, R, "n.vec");
2849 
2850   return VectorTripCount;
2851 }
2852 
2853 Value *InnerLoopVectorizer::createBitOrPointerCast(Value *V, VectorType *DstVTy,
2854                                                    const DataLayout &DL) {
2855   // Verify that V is a vector type with same number of elements as DstVTy.
2856   auto *DstFVTy = cast<VectorType>(DstVTy);
2857   auto VF = DstFVTy->getElementCount();
2858   auto *SrcVecTy = cast<VectorType>(V->getType());
2859   assert(VF == SrcVecTy->getElementCount() && "Vector dimensions do not match");
2860   Type *SrcElemTy = SrcVecTy->getElementType();
2861   Type *DstElemTy = DstFVTy->getElementType();
2862   assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) &&
2863          "Vector elements must have same size");
2864 
2865   // Do a direct cast if element types are castable.
2866   if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) {
2867     return Builder.CreateBitOrPointerCast(V, DstFVTy);
2868   }
2869   // V cannot be directly casted to desired vector type.
2870   // May happen when V is a floating point vector but DstVTy is a vector of
2871   // pointers or vice-versa. Handle this using a two-step bitcast using an
2872   // intermediate Integer type for the bitcast i.e. Ptr <-> Int <-> Float.
2873   assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) &&
2874          "Only one type should be a pointer type");
2875   assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) &&
2876          "Only one type should be a floating point type");
2877   Type *IntTy =
2878       IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy));
2879   auto *VecIntTy = VectorType::get(IntTy, VF);
2880   Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy);
2881   return Builder.CreateBitOrPointerCast(CastVal, DstFVTy);
2882 }
2883 
2884 void InnerLoopVectorizer::emitIterationCountCheck(BasicBlock *Bypass) {
2885   Value *Count = getTripCount();
2886   // Reuse existing vector loop preheader for TC checks.
2887   // Note that new preheader block is generated for vector loop.
2888   BasicBlock *const TCCheckBlock = LoopVectorPreHeader;
2889   IRBuilder<> Builder(TCCheckBlock->getTerminator());
2890 
2891   // Generate code to check if the loop's trip count is less than VF * UF, or
2892   // equal to it in case a scalar epilogue is required; this implies that the
2893   // vector trip count is zero. This check also covers the case where adding one
2894   // to the backedge-taken count overflowed leading to an incorrect trip count
2895   // of zero. In this case we will also jump to the scalar loop.
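       // For instance (hypothetical values), when the tail is not folded and
       // VF = 4, UF = 2, the check becomes roughly "icmp ult i64 %count, 8"
       // ("ule" when a scalar epilogue is required); the branch created below
       // goes to the scalar loop when the check is true.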
2896   auto P = Cost->requiresScalarEpilogue(VF.isVector()) ? ICmpInst::ICMP_ULE
2897                                                        : ICmpInst::ICMP_ULT;
2898 
2899   // If tail is to be folded, vector loop takes care of all iterations.
2900   Type *CountTy = Count->getType();
2901   Value *CheckMinIters = Builder.getFalse();
2902   auto CreateStep = [&]() -> Value * {
2903     // Create the step as max(MinProfitableTripCount, UF * VF).
2904     if (UF * VF.getKnownMinValue() >= MinProfitableTripCount.getKnownMinValue())
2905       return createStepForVF(Builder, CountTy, VF, UF);
2906 
2907     Value *MinProfTC =
2908         createStepForVF(Builder, CountTy, MinProfitableTripCount, 1);
2909     if (!VF.isScalable())
2910       return MinProfTC;
2911     return Builder.CreateBinaryIntrinsic(
2912         Intrinsic::umax, MinProfTC, createStepForVF(Builder, CountTy, VF, UF));
2913   };
2914 
2915   TailFoldingStyle Style = Cost->getTailFoldingStyle();
2916   if (Style == TailFoldingStyle::None)
2917     CheckMinIters =
2918         Builder.CreateICmp(P, Count, CreateStep(), "min.iters.check");
2919   else if (VF.isScalable() &&
2920            !isIndvarOverflowCheckKnownFalse(Cost, VF, UF) &&
2921            Style != TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck) {
2922     // vscale is not necessarily a power-of-2, which means we cannot guarantee
2923     // an overflow to zero when updating induction variables and so an
2924     // additional overflow check is required before entering the vector loop.
2925 
2926     // Get the maximum unsigned value for the type.
2927     Value *MaxUIntTripCount =
2928         ConstantInt::get(CountTy, cast<IntegerType>(CountTy)->getMask());
2929     Value *LHS = Builder.CreateSub(MaxUIntTripCount, Count);
2930 
2931     // Don't execute the vector loop if (UMax - n) < (VF * UF).
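         // E.g. (hypothetical), for an i32 trip count %n and a step expanding
         // to 8 x vscale, the generated check is roughly
         //   (0xFFFFFFFF - %n) <u (8 * vscale)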
2932     CheckMinIters = Builder.CreateICmp(ICmpInst::ICMP_ULT, LHS, CreateStep());
2933   }
2934 
2935   // Create new preheader for vector loop.
2936   LoopVectorPreHeader =
2937       SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), DT, LI, nullptr,
2938                  "vector.ph");
2939 
2940   assert(DT->properlyDominates(DT->getNode(TCCheckBlock),
2941                                DT->getNode(Bypass)->getIDom()) &&
2942          "TC check is expected to dominate Bypass");
2943 
2944   // Update dominator for Bypass & LoopExit (if needed).
2945   DT->changeImmediateDominator(Bypass, TCCheckBlock);
2946   if (!Cost->requiresScalarEpilogue(VF.isVector()))
2947     // If there is an epilogue which must run, there's no edge from the
2948     // middle block to exit blocks  and thus no need to update the immediate
2949     // dominator of the exit blocks.
2950     DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock);
2951 
2952   BranchInst &BI =
2953       *BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters);
2954   if (hasBranchWeightMD(*OrigLoop->getLoopLatch()->getTerminator()))
2955     setBranchWeights(BI, MinItersBypassWeights);
2956   ReplaceInstWithInst(TCCheckBlock->getTerminator(), &BI);
2957   LoopBypassBlocks.push_back(TCCheckBlock);
2958 }
2959 
2960 BasicBlock *InnerLoopVectorizer::emitSCEVChecks(BasicBlock *Bypass) {
2961   BasicBlock *const SCEVCheckBlock =
2962       RTChecks.emitSCEVChecks(Bypass, LoopVectorPreHeader, LoopExitBlock);
2963   if (!SCEVCheckBlock)
2964     return nullptr;
2965 
2966   assert(!(SCEVCheckBlock->getParent()->hasOptSize() ||
2967            (OptForSizeBasedOnProfile &&
2968             Cost->Hints->getForce() != LoopVectorizeHints::FK_Enabled)) &&
2969          "Cannot SCEV check stride or overflow when optimizing for size");
2970 
2971 
2972   // Update dominator only if this is first RT check.
2973   if (LoopBypassBlocks.empty()) {
2974     DT->changeImmediateDominator(Bypass, SCEVCheckBlock);
2975     if (!Cost->requiresScalarEpilogue(VF.isVector()))
2976       // If there is an epilogue which must run, there's no edge from the
2977       // middle block to exit blocks  and thus no need to update the immediate
2978       // dominator of the exit blocks.
2979       DT->changeImmediateDominator(LoopExitBlock, SCEVCheckBlock);
2980   }
2981 
2982   LoopBypassBlocks.push_back(SCEVCheckBlock);
2983   AddedSafetyChecks = true;
2984   return SCEVCheckBlock;
2985 }
2986 
2987 BasicBlock *InnerLoopVectorizer::emitMemRuntimeChecks(BasicBlock *Bypass) {
2988   // VPlan-native path does not do any analysis for runtime checks currently.
2989   if (EnableVPlanNativePath)
2990     return nullptr;
2991 
2992   BasicBlock *const MemCheckBlock =
2993       RTChecks.emitMemRuntimeChecks(Bypass, LoopVectorPreHeader);
2994 
2995   // Check whether we generated code that checks at runtime if arrays overlap.
2996   // We put the checks into a separate block to make the more common case of few
2997   // elements faster.
2998   if (!MemCheckBlock)
2999     return nullptr;
3000 
3001   if (MemCheckBlock->getParent()->hasOptSize() || OptForSizeBasedOnProfile) {
3002     assert(Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled &&
3003            "Cannot emit memory checks when optimizing for size, unless forced "
3004            "to vectorize.");
3005     ORE->emit([&]() {
3006       return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationCodeSize",
3007                                         OrigLoop->getStartLoc(),
3008                                         OrigLoop->getHeader())
3009              << "Code-size may be reduced by not forcing "
3010                 "vectorization, or by source-code modifications "
3011                 "eliminating the need for runtime checks "
3012                 "(e.g., adding 'restrict').";
3013     });
3014   }
3015 
3016   LoopBypassBlocks.push_back(MemCheckBlock);
3017 
3018   AddedSafetyChecks = true;
3019 
3020   return MemCheckBlock;
3021 }
3022 
3023 void InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) {
3024   LoopScalarBody = OrigLoop->getHeader();
3025   LoopVectorPreHeader = OrigLoop->getLoopPreheader();
3026   assert(LoopVectorPreHeader && "Invalid loop structure");
3027   LoopExitBlock = OrigLoop->getUniqueExitBlock(); // may be nullptr
3028   assert((LoopExitBlock || Cost->requiresScalarEpilogue(VF.isVector())) &&
3029          "multiple exit loop without required epilogue?");
3030 
3031   LoopMiddleBlock =
3032       SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
3033                  LI, nullptr, Twine(Prefix) + "middle.block");
3034   LoopScalarPreHeader =
3035       SplitBlock(LoopMiddleBlock, LoopMiddleBlock->getTerminator(), DT, LI,
3036                  nullptr, Twine(Prefix) + "scalar.ph");
3037 
3038   auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator();
3039 
3040   // Set up the middle block terminator.  Two cases:
3041   // 1) If we know that we must execute the scalar epilogue, emit an
3042   //    unconditional branch.
3043   // 2) Otherwise, we must have a single unique exit block (due to how we
3044   //    implement the multiple exit case).  In this case, set up a conditional
3045   //    branch from the middle block to the loop scalar preheader, and the
3046   //    exit block.  completeLoopSkeleton will update the condition to use an
3047   //    iteration check, if required to decide whether to execute the remainder.
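       // Concretely, the terminator built here is roughly either
       //   br label %scalar.ph
       // or
       //   br i1 true, label %exit.block, label %scalar.ph
       // (block names illustrative); the constant condition is rewritten later
       // by completeLoopSkeleton when a runtime check is needed.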
3048   BranchInst *BrInst =
3049       Cost->requiresScalarEpilogue(VF.isVector())
3050           ? BranchInst::Create(LoopScalarPreHeader)
3051           : BranchInst::Create(LoopExitBlock, LoopScalarPreHeader,
3052                                Builder.getTrue());
3053   BrInst->setDebugLoc(ScalarLatchTerm->getDebugLoc());
3054   ReplaceInstWithInst(LoopMiddleBlock->getTerminator(), BrInst);
3055 
3056   // Update dominator for loop exit. During skeleton creation, only the vector
3057   // pre-header and the middle block are created. The vector loop is entirely
3058   // created during VPlan execution.
3059   if (!Cost->requiresScalarEpilogue(VF.isVector()))
3060     // If there is an epilogue which must run, there's no edge from the
3061     // middle block to exit blocks  and thus no need to update the immediate
3062     // dominator of the exit blocks.
3063     DT->changeImmediateDominator(LoopExitBlock, LoopMiddleBlock);
3064 }
3065 
3066 PHINode *InnerLoopVectorizer::createInductionResumeValue(
3067     PHINode *OrigPhi, const InductionDescriptor &II, Value *Step,
3068     ArrayRef<BasicBlock *> BypassBlocks,
3069     std::pair<BasicBlock *, Value *> AdditionalBypass) {
3070   Value *VectorTripCount = getOrCreateVectorTripCount(LoopVectorPreHeader);
3071   assert(VectorTripCount && "Expected valid arguments");
3072 
3073   Instruction *OldInduction = Legal->getPrimaryInduction();
3074   Value *&EndValue = IVEndValues[OrigPhi];
3075   Value *EndValueFromAdditionalBypass = AdditionalBypass.second;
3076   if (OrigPhi == OldInduction) {
3077     // We know what the end value is.
3078     EndValue = VectorTripCount;
3079   } else {
3080     IRBuilder<> B(LoopVectorPreHeader->getTerminator());
3081 
3082     // Fast-math-flags propagate from the original induction instruction.
3083     if (II.getInductionBinOp() && isa<FPMathOperator>(II.getInductionBinOp()))
3084       B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags());
3085 
3086     EndValue = emitTransformedIndex(B, VectorTripCount, II.getStartValue(),
3087                                     Step, II.getKind(), II.getInductionBinOp());
3088     EndValue->setName("ind.end");
3089 
3090     // Compute the end value for the additional bypass (if applicable).
3091     if (AdditionalBypass.first) {
3092       B.SetInsertPoint(AdditionalBypass.first,
3093                        AdditionalBypass.first->getFirstInsertionPt());
3094       EndValueFromAdditionalBypass =
3095           emitTransformedIndex(B, AdditionalBypass.second, II.getStartValue(),
3096                                Step, II.getKind(), II.getInductionBinOp());
3097       EndValueFromAdditionalBypass->setName("ind.end");
3098     }
3099   }
3100 
3101   // Create phi nodes to merge from the  backedge-taken check block.
3102   PHINode *BCResumeVal = PHINode::Create(OrigPhi->getType(), 3, "bc.resume.val",
3103                                          LoopScalarPreHeader->getTerminator());
3104   // Copy original phi DL over to the new one.
3105   BCResumeVal->setDebugLoc(OrigPhi->getDebugLoc());
3106 
3107   // The new PHI merges the original incoming value, in case of a bypass,
3108   // or the value at the end of the vectorized loop.
3109   BCResumeVal->addIncoming(EndValue, LoopMiddleBlock);
3110 
3111   // Fix the scalar body counter (PHI node).
3112   // The old induction's phi node in the scalar body needs the truncated
3113   // value.
3114   for (BasicBlock *BB : BypassBlocks)
3115     BCResumeVal->addIncoming(II.getStartValue(), BB);
3116 
3117   if (AdditionalBypass.first)
3118     BCResumeVal->setIncomingValueForBlock(AdditionalBypass.first,
3119                                           EndValueFromAdditionalBypass);
3120   return BCResumeVal;
3121 }
3122 
3123 /// Return the expanded step for \p ID using \p ExpandedSCEVs to look up SCEV
3124 /// expansion results.
3125 static Value *getExpandedStep(const InductionDescriptor &ID,
3126                               const SCEV2ValueTy &ExpandedSCEVs) {
3127   const SCEV *Step = ID.getStep();
3128   if (auto *C = dyn_cast<SCEVConstant>(Step))
3129     return C->getValue();
3130   if (auto *U = dyn_cast<SCEVUnknown>(Step))
3131     return U->getValue();
3132   auto I = ExpandedSCEVs.find(Step);
3133   assert(I != ExpandedSCEVs.end() && "SCEV must be expanded at this point");
3134   return I->second;
3135 }
3136 
3137 void InnerLoopVectorizer::createInductionResumeValues(
3138     const SCEV2ValueTy &ExpandedSCEVs,
3139     std::pair<BasicBlock *, Value *> AdditionalBypass) {
3140   assert(((AdditionalBypass.first && AdditionalBypass.second) ||
3141           (!AdditionalBypass.first && !AdditionalBypass.second)) &&
3142          "Inconsistent information about additional bypass.");
3143   // We are going to resume the execution of the scalar loop.
3144   // Go over all of the induction variables that we found and fix the
3145   // PHIs that are left in the scalar version of the loop.
3146   // The starting values of PHI nodes depend on the counter of the last
3147   // iteration in the vectorized loop.
3148   // If we come from a bypass edge then we need to start from the original
3149   // start value.
3150   for (const auto &InductionEntry : Legal->getInductionVars()) {
3151     PHINode *OrigPhi = InductionEntry.first;
3152     const InductionDescriptor &II = InductionEntry.second;
3153     PHINode *BCResumeVal = createInductionResumeValue(
3154         OrigPhi, II, getExpandedStep(II, ExpandedSCEVs), LoopBypassBlocks,
3155         AdditionalBypass);
3156     OrigPhi->setIncomingValueForBlock(LoopScalarPreHeader, BCResumeVal);
3157   }
3158 }
3159 
3160 BasicBlock *InnerLoopVectorizer::completeLoopSkeleton() {
3161   // The trip counts should be cached by now.
3162   Value *Count = getTripCount();
3163   Value *VectorTripCount = getOrCreateVectorTripCount(LoopVectorPreHeader);
3164 
3165   auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator();
3166 
3167   // Add a check in the middle block to see if we have completed
3168   // all of the iterations in the first vector loop.  Three cases:
3169   // 1) If we require a scalar epilogue, there is no conditional branch as
3170   //    we unconditionally branch to the scalar preheader.  Do nothing.
3171   // 2) If (N - N%VF) == N, then we *don't* need to run the remainder.
3172   //    Thus if tail is to be folded, we know we don't need to run the
3173   //    remainder and we can use the previous value for the condition (true).
3174   // 3) Otherwise, construct a runtime check.
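       // In case 3 the middle-block terminator ends up looking roughly like
       //   %cmp.n = icmp eq i64 %trip.count, %n.vec
       //   br i1 %cmp.n, label %exit.block, label %scalar.ph
       // (value and block names here are illustrative only).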
3175   if (!Cost->requiresScalarEpilogue(VF.isVector()) &&
3176       !Cost->foldTailByMasking()) {
3177     // Here we use the same DebugLoc as the scalar loop latch terminator instead
3178     // of the corresponding compare because they may have ended up with
3179     // different line numbers and we want to avoid awkward line stepping while
3180     // debugging. Eg. if the compare has got a line number inside the loop.
3181     // TODO: At the moment, CreateICmpEQ will simplify conditions with constant
3182     // operands. Perform simplification directly on VPlan once the branch is
3183     // modeled there.
3184     IRBuilder<> B(LoopMiddleBlock->getTerminator());
3185     B.SetCurrentDebugLocation(ScalarLatchTerm->getDebugLoc());
3186     Value *CmpN = B.CreateICmpEQ(Count, VectorTripCount, "cmp.n");
3187     BranchInst &BI = *cast<BranchInst>(LoopMiddleBlock->getTerminator());
3188     BI.setCondition(CmpN);
3189     if (hasBranchWeightMD(*ScalarLatchTerm)) {
3190       // Assume that `Count % VectorTripCount` is equally distributed.
3191       unsigned TripCount = UF * VF.getKnownMinValue();
3192       assert(TripCount > 0 && "trip count should not be zero");
3193       const uint32_t Weights[] = {1, TripCount - 1};
3194       setBranchWeights(BI, Weights);
3195     }
3196   }
3197 
3198 #ifdef EXPENSIVE_CHECKS
3199   assert(DT->verify(DominatorTree::VerificationLevel::Fast));
3200 #endif
3201 
3202   return LoopVectorPreHeader;
3203 }
3204 
3205 std::pair<BasicBlock *, Value *>
3206 InnerLoopVectorizer::createVectorizedLoopSkeleton(
3207     const SCEV2ValueTy &ExpandedSCEVs) {
3208   /*
3209    In this function we generate a new loop. The new loop will contain
3210    the vectorized instructions while the old loop will continue to run the
3211    scalar remainder.
3212 
3213        [ ] <-- old preheader - loop iteration number check and SCEVs in Plan's
3214      /  |      preheader are expanded here. Eventually all required SCEV
3215     /   |      expansion should happen here.
3216    /    v
3217   |    [ ] <-- vector loop bypass (may consist of multiple blocks).
3218   |  /  |
3219   | /   v
3220   ||   [ ]     <-- vector pre header.
3221   |/    |
3222   |     v
3223   |    [  ] \
3224   |    [  ]_|   <-- vector loop (created during VPlan execution).
3225   |     |
3226   |     v
3227   \   -[ ]   <--- middle-block.
3228    \/   |
3229    /\   v
3230    | ->[ ]     <--- new preheader.
3231    |    |
3232  (opt)  v      <-- edge from middle to exit iff epilogue is not required.
3233    |   [ ] \
3234    |   [ ]_|   <-- old scalar loop to handle remainder (scalar epilogue).
3235     \   |
3236      \  v
3237       >[ ]     <-- exit block(s).
3238    ...
3239    */
3240 
3241   // Create an empty vector loop, and prepare basic blocks for the runtime
3242   // checks.
3243   createVectorLoopSkeleton("");
3244 
3245   // Now, compare the new count to zero. If it is zero skip the vector loop and
3246   // jump to the scalar loop. This check also covers the case where the
3247   // backedge-taken count is uint##_max: adding one to it will overflow leading
3248   // to an incorrect trip count of zero. In this (rare) case we will also jump
3249   // to the scalar loop.
3250   emitIterationCountCheck(LoopScalarPreHeader);
3251 
3252   // Generate the code to check any assumptions that we've made for SCEV
3253   // expressions.
3254   emitSCEVChecks(LoopScalarPreHeader);
3255 
3256   // Generate the code that checks at runtime whether arrays overlap. We put the
3257   // checks into a separate block to make the more common case of few elements
3258   // faster.
3259   emitMemRuntimeChecks(LoopScalarPreHeader);
3260 
3261   // Emit phis for the new starting index of the scalar loop.
3262   createInductionResumeValues(ExpandedSCEVs);
3263 
3264   return {completeLoopSkeleton(), nullptr};
3265 }
3266 
3267 // Fix up external users of the induction variable. At this point, we are
3268 // in LCSSA form, with all external PHIs that use the IV having one input value,
3269 // coming from the remainder loop. We need those PHIs to also have a correct
3270 // value for the IV when arriving directly from the middle block.
3271 void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi,
3272                                        const InductionDescriptor &II,
3273                                        Value *VectorTripCount, Value *EndValue,
3274                                        BasicBlock *MiddleBlock,
3275                                        BasicBlock *VectorHeader, VPlan &Plan,
3276                                        VPTransformState &State) {
3277   // There are two kinds of external IV usages - those that use the value
3278   // computed in the last iteration (the PHI) and those that use the penultimate
3279   // value (the value that feeds into the phi from the loop latch).
3280   // We allow both, but they obviously have different values.
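       // Illustrative IR (names hypothetical): given
       //   %iv      = phi i64 [ 0, %ph ], [ %iv.next, %latch ]
       //   %iv.next = add i64 %iv, 1
       // an LCSSA phi outside the loop that uses %iv.next receives EndValue,
       // while one that uses %iv receives the recomputed penultimate value,
       // i.e. EndValue - Step.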
3281 
3282   assert(OrigLoop->getUniqueExitBlock() && "Expected a single exit block");
3283 
3284   DenseMap<Value *, Value *> MissingVals;
3285 
3286   // An external user of the last iteration's value should see the value that
3287   // the remainder loop uses to initialize its own IV.
3288   Value *PostInc = OrigPhi->getIncomingValueForBlock(OrigLoop->getLoopLatch());
3289   for (User *U : PostInc->users()) {
3290     Instruction *UI = cast<Instruction>(U);
3291     if (!OrigLoop->contains(UI)) {
3292       assert(isa<PHINode>(UI) && "Expected LCSSA form");
3293       MissingVals[UI] = EndValue;
3294     }
3295   }
3296 
3297   // An external user of the penultimate value needs to see EndValue - Step.
3298   // The simplest way to get this is to recompute it from the constituent SCEVs,
3299   // that is Start + (Step * (CRD - 1)).
3300   for (User *U : OrigPhi->users()) {
3301     auto *UI = cast<Instruction>(U);
3302     if (!OrigLoop->contains(UI)) {
3303       assert(isa<PHINode>(UI) && "Expected LCSSA form");
3304       IRBuilder<> B(MiddleBlock->getTerminator());
3305 
3306       // Fast-math-flags propagate from the original induction instruction.
3307       if (II.getInductionBinOp() && isa<FPMathOperator>(II.getInductionBinOp()))
3308         B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags());
3309 
3310       Value *CountMinusOne = B.CreateSub(
3311           VectorTripCount, ConstantInt::get(VectorTripCount->getType(), 1));
3312       CountMinusOne->setName("cmo");
3313 
3314       VPValue *StepVPV = Plan.getSCEVExpansion(II.getStep());
3315       assert(StepVPV && "step must have been expanded during VPlan execution");
3316       Value *Step = StepVPV->isLiveIn() ? StepVPV->getLiveInIRValue()
3317                                         : State.get(StepVPV, {0, 0});
3318       Value *Escape =
3319           emitTransformedIndex(B, CountMinusOne, II.getStartValue(), Step,
3320                                II.getKind(), II.getInductionBinOp());
3321       Escape->setName("ind.escape");
3322       MissingVals[UI] = Escape;
3323     }
3324   }
3325 
3326   for (auto &I : MissingVals) {
3327     PHINode *PHI = cast<PHINode>(I.first);
3328     // One corner case we have to handle is two IVs "chasing" each-other,
3329     // that is %IV2 = phi [...], [ %IV1, %latch ]
3330     // In this case, if IV1 has an external use, we need to avoid adding both
3331     // "last value of IV1" and "penultimate value of IV2". So, verify that we
3332     // don't already have an incoming value for the middle block.
3333     if (PHI->getBasicBlockIndex(MiddleBlock) == -1) {
3334       PHI->addIncoming(I.second, MiddleBlock);
3335       Plan.removeLiveOut(PHI);
3336     }
3337   }
3338 }
3339 
3340 namespace {
3341 
3342 struct CSEDenseMapInfo {
3343   static bool canHandle(const Instruction *I) {
3344     return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) ||
3345            isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I);
3346   }
3347 
3348   static inline Instruction *getEmptyKey() {
3349     return DenseMapInfo<Instruction *>::getEmptyKey();
3350   }
3351 
3352   static inline Instruction *getTombstoneKey() {
3353     return DenseMapInfo<Instruction *>::getTombstoneKey();
3354   }
3355 
3356   static unsigned getHashValue(const Instruction *I) {
3357     assert(canHandle(I) && "Unknown instruction!");
3358     return hash_combine(I->getOpcode(), hash_combine_range(I->value_op_begin(),
3359                                                            I->value_op_end()));
3360   }
3361 
3362   static bool isEqual(const Instruction *LHS, const Instruction *RHS) {
3363     if (LHS == getEmptyKey() || RHS == getEmptyKey() ||
3364         LHS == getTombstoneKey() || RHS == getTombstoneKey())
3365       return LHS == RHS;
3366     return LHS->isIdenticalTo(RHS);
3367   }
3368 };
3369 
3370 } // end anonymous namespace
3371 
3372 /// Perform CSE of induction variable instructions.
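     /// For example (illustrative), if the vectorized block contains two
     /// identical
     ///   %e = extractelement <4 x i32> %v, i32 0
     /// instructions, the second one is replaced by the first and erased.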
3373 static void cse(BasicBlock *BB) {
3374   // Perform simple cse.
3375   SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap;
3376   for (Instruction &In : llvm::make_early_inc_range(*BB)) {
3377     if (!CSEDenseMapInfo::canHandle(&In))
3378       continue;
3379 
3380     // Check if we can replace this instruction with any of the
3381     // visited instructions.
3382     if (Instruction *V = CSEMap.lookup(&In)) {
3383       In.replaceAllUsesWith(V);
3384       In.eraseFromParent();
3385       continue;
3386     }
3387 
3388     CSEMap[&In] = &In;
3389   }
3390 }
3391 
3392 InstructionCost
3393 LoopVectorizationCostModel::getVectorCallCost(CallInst *CI,
3394                                               ElementCount VF) const {
3395   // We only need to calculate a cost if the VF is scalar; for actual vectors
3396   // we should already have a pre-calculated cost at each VF.
3397   if (!VF.isScalar())
3398     return CallWideningDecisions.at(std::make_pair(CI, VF)).Cost;
3399 
3400   TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
3401   Type *RetTy = CI->getType();
3402   if (RecurrenceDescriptor::isFMulAddIntrinsic(CI))
3403     if (auto RedCost = getReductionPatternCost(CI, VF, RetTy, CostKind))
3404       return *RedCost;
3405 
3406   SmallVector<Type *, 4> Tys;
3407   for (auto &ArgOp : CI->args())
3408     Tys.push_back(ArgOp->getType());
3409 
3410   InstructionCost ScalarCallCost =
3411       TTI.getCallInstrCost(CI->getCalledFunction(), RetTy, Tys, CostKind);
3412 
3413   // If this is an intrinsic we may have a lower cost for it.
3414   if (getVectorIntrinsicIDForCall(CI, TLI)) {
3415     InstructionCost IntrinsicCost = getVectorIntrinsicCost(CI, VF);
3416     return std::min(ScalarCallCost, IntrinsicCost);
3417   }
3418   return ScalarCallCost;
3419 }
3420 
3421 static Type *MaybeVectorizeType(Type *Elt, ElementCount VF) {
3422   if (VF.isScalar() || (!Elt->isIntOrPtrTy() && !Elt->isFloatingPointTy()))
3423     return Elt;
3424   return VectorType::get(Elt, VF);
3425 }
3426 
3427 InstructionCost
3428 LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI,
3429                                                    ElementCount VF) const {
3430   Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
3431   assert(ID && "Expected intrinsic call!");
3432   Type *RetTy = MaybeVectorizeType(CI->getType(), VF);
3433   FastMathFlags FMF;
3434   if (auto *FPMO = dyn_cast<FPMathOperator>(CI))
3435     FMF = FPMO->getFastMathFlags();
3436 
3437   SmallVector<const Value *> Arguments(CI->args());
3438   FunctionType *FTy = CI->getCalledFunction()->getFunctionType();
3439   SmallVector<Type *> ParamTys;
3440   std::transform(FTy->param_begin(), FTy->param_end(),
3441                  std::back_inserter(ParamTys),
3442                  [&](Type *Ty) { return MaybeVectorizeType(Ty, VF); });
3443 
3444   IntrinsicCostAttributes CostAttrs(ID, RetTy, Arguments, ParamTys, FMF,
3445                                     dyn_cast<IntrinsicInst>(CI));
3446   return TTI.getIntrinsicInstrCost(CostAttrs,
3447                                    TargetTransformInfo::TCK_RecipThroughput);
3448 }
3449 
3450 static Type *smallestIntegerVectorType(Type *T1, Type *T2) {
3451   auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType());
3452   auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType());
3453   return I1->getBitWidth() < I2->getBitWidth() ? T1 : T2;
3454 }
3455 
3456 static Type *largestIntegerVectorType(Type *T1, Type *T2) {
3457   auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType());
3458   auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType());
3459   return I1->getBitWidth() > I2->getBitWidth() ? T1 : T2;
3460 }
3461 
3462 void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State,
3463                                             VPlan &Plan) {
3464   // Fix widened non-induction PHIs by setting up the PHI operands.
3465   if (EnableVPlanNativePath)
3466     fixNonInductionPHIs(Plan, State);
3467 
3468   // At this point every instruction in the original loop is widened to a
3469   // vector form. Now we need to fix the recurrences in the loop. These PHI
3470   // nodes are currently empty because we did not want to introduce cycles.
3471   // This is the second stage of vectorizing recurrences. Note that fixing
3472   // reduction phis are already modeled in VPlan.
3473   // TODO: Also model fixing fixed-order recurrence phis in VPlan.
3474   VPRegionBlock *VectorRegion = State.Plan->getVectorLoopRegion();
3475   VPBasicBlock *HeaderVPBB = VectorRegion->getEntryBasicBlock();
3476   for (VPRecipeBase &R : HeaderVPBB->phis()) {
3477     if (auto *FOR = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&R))
3478       fixFixedOrderRecurrence(FOR, State);
3479   }
3480 
3481   // Forget the original basic block.
3482   PSE.getSE()->forgetLoop(OrigLoop);
3483   PSE.getSE()->forgetBlockAndLoopDispositions();
3484 
3485   // After vectorization, the exit blocks of the original loop will have
3486   // additional predecessors. Invalidate SCEVs for the exit phis in case SE
3487   // looked through single-entry phis.
3488   SmallVector<BasicBlock *> ExitBlocks;
3489   OrigLoop->getExitBlocks(ExitBlocks);
3490   for (BasicBlock *Exit : ExitBlocks)
3491     for (PHINode &PN : Exit->phis())
3492       PSE.getSE()->forgetLcssaPhiWithNewPredecessor(OrigLoop, &PN);
3493 
3494   VPBasicBlock *LatchVPBB = VectorRegion->getExitingBasicBlock();
3495   Loop *VectorLoop = LI->getLoopFor(State.CFG.VPBB2IRBB[LatchVPBB]);
3496   if (Cost->requiresScalarEpilogue(VF.isVector())) {
3497     // No edge from the middle block to the unique exit block has been inserted
3498     // and there is nothing to fix from the vector loop; phis should have
3499     // incoming values from the scalar loop only.
3500   } else {
3501     // TODO: Check VPLiveOuts to see if IV users need fixing instead of checking
3502     // the cost model.
3503 
3504     // If we inserted an edge from the middle block to the unique exit block,
3505     // update uses outside the loop (phis) to account for the newly inserted
3506     // edge.
3507 
3508     // Fix-up external users of the induction variables.
3509     for (const auto &Entry : Legal->getInductionVars())
3510       fixupIVUsers(Entry.first, Entry.second,
3511                    getOrCreateVectorTripCount(VectorLoop->getLoopPreheader()),
3512                    IVEndValues[Entry.first], LoopMiddleBlock,
3513                    VectorLoop->getHeader(), Plan, State);
3514   }
3515 
3516   // Fix LCSSA phis not already fixed earlier. Extracts may need to be generated
3517   // in the exit block, so update the builder.
3518   State.Builder.SetInsertPoint(State.CFG.ExitBB,
3519                                State.CFG.ExitBB->getFirstNonPHIIt());
3520   for (const auto &KV : Plan.getLiveOuts())
3521     KV.second->fixPhi(Plan, State);
3522 
3523   for (Instruction *PI : PredicatedInstructions)
3524     sinkScalarOperands(&*PI);
3525 
3526   // Remove redundant induction instructions.
3527   cse(VectorLoop->getHeader());
3528 
3529   // Set/update profile weights for the vector and remainder loops as original
3530   // loop iterations are now distributed among them. Note that original loop
3531   // represented by LoopScalarBody becomes remainder loop after vectorization.
3532   //
3533   // For cases like foldTailByMasking() and requiresScalarEpilogue() we may
3534   // end up with a slightly roughened result, but that should be OK since the
3535   // profile is not inherently precise anyway. Note also that a possible
3536   // bypass of the vector code caused by legality checks is ignored,
3537   // optimistically assigning all the weight to the vector loop.
3538   //
3539   // For scalable vectorization we can't know at compile time how many
3540   // iterations of the loop are handled in one vector iteration, so instead
3541   // assume a pessimistic vscale of '1'.
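  // For example (illustrative values), with VF = 4 and UF = 2 the vector loop
  // is treated as covering 8 original iterations per vector iteration when
  // redistributing the weights below.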
3542   setProfileInfoAfterUnrolling(LI->getLoopFor(LoopScalarBody), VectorLoop,
3543                                LI->getLoopFor(LoopScalarBody),
3544                                VF.getKnownMinValue() * UF);
3545 }
3546 
3547 void InnerLoopVectorizer::fixFixedOrderRecurrence(
3548     VPFirstOrderRecurrencePHIRecipe *PhiR, VPTransformState &State) {
3549   // This is the second phase of vectorizing first-order recurrences. An
3550   // overview of the transformation is described below. Suppose we have the
3551   // following loop.
3552   //
3553   //   for (int i = 0; i < n; ++i)
3554   //     b[i] = a[i] - a[i - 1];
3555   //
3556   // There is a first-order recurrence on "a". For this loop, the shorthand
3557   // scalar IR looks like:
3558   //
3559   //   scalar.ph:
3560   //     s_init = a[-1]
3561   //     br scalar.body
3562   //
3563   //   scalar.body:
3564   //     i = phi [0, scalar.ph], [i+1, scalar.body]
3565   //     s1 = phi [s_init, scalar.ph], [s2, scalar.body]
3566   //     s2 = a[i]
3567   //     b[i] = s2 - s1
3568   //     br cond, scalar.body, ...
3569   //
3570   // In this example, s1 is a recurrence because its value depends on the
3571   // previous iteration. In the first phase of vectorization, we created a
3572   // vector phi v1 for s1. We now complete the vectorization and produce the
3573   // shorthand vector IR shown below (for VF = 4, UF = 1).
3574   //
3575   //   vector.ph:
3576   //     v_init = vector(..., ..., ..., a[-1])
3577   //     br vector.body
3578   //
3579   //   vector.body
3580   //     i = phi [0, vector.ph], [i+4, vector.body]
3581   //     v1 = phi [v_init, vector.ph], [v2, vector.body]
3582   //     v2 = a[i, i+1, i+2, i+3];
3583   //     v3 = vector(v1(3), v2(0, 1, 2))
3584   //     b[i, i+1, i+2, i+3] = v2 - v3
3585   //     br cond, vector.body, middle.block
3586   //
3587   //   middle.block:
3588   //     x = v2(3)
3589   //     br scalar.ph
3590   //
3591   //   scalar.ph:
3592   //     s_init = phi [x, middle.block], [a[-1], otherwise]
3593   //     br scalar.body
3594   //
3595   // After the vector loop finishes executing, we extract the next value of
3596   // the recurrence (x) to use as the initial value in the scalar loop.
3597 
3598   // Extract the last vector element in the middle block. This will be the
3599   // initial value for the recurrence when jumping to the scalar loop.
3600   VPValue *PreviousDef = PhiR->getBackedgeValue();
3601   Value *Incoming = State.get(PreviousDef, UF - 1);
3602   auto *ExtractForScalar = Incoming;
3603   auto *IdxTy = Builder.getInt32Ty();
3604   Value *RuntimeVF = nullptr;
3605   if (VF.isVector()) {
3606     auto *One = ConstantInt::get(IdxTy, 1);
3607     Builder.SetInsertPoint(LoopMiddleBlock->getTerminator());
3608     RuntimeVF = getRuntimeVF(Builder, IdxTy, VF);
3609     auto *LastIdx = Builder.CreateSub(RuntimeVF, One);
3610     ExtractForScalar =
3611         Builder.CreateExtractElement(Incoming, LastIdx, "vector.recur.extract");
3612   }
3613 
3614   auto RecurSplice = cast<VPInstruction>(*PhiR->user_begin());
3615   assert(PhiR->getNumUsers() == 1 &&
3616          RecurSplice->getOpcode() ==
3617              VPInstruction::FirstOrderRecurrenceSplice &&
3618          "recurrence phi must have a single user: FirstOrderRecurrenceSplice");
3619   SmallVector<VPLiveOut *> LiveOuts;
3620   for (VPUser *U : RecurSplice->users())
3621     if (auto *LiveOut = dyn_cast<VPLiveOut>(U))
3622       LiveOuts.push_back(LiveOut);
3623 
3624   if (!LiveOuts.empty()) {
3625     // Extract the second last element in the middle block if the
3626     // Phi is used outside the loop. We need to extract the phi itself
3627     // and not the last element (the phi update in the current iteration). This
3628     // will be the value when jumping to the exit block from the
3629     // LoopMiddleBlock, when the scalar loop is not run at all.
3630     Value *ExtractForPhiUsedOutsideLoop = nullptr;
3631     if (VF.isVector()) {
3632       auto *Idx = Builder.CreateSub(RuntimeVF, ConstantInt::get(IdxTy, 2));
3633       ExtractForPhiUsedOutsideLoop = Builder.CreateExtractElement(
3634           Incoming, Idx, "vector.recur.extract.for.phi");
3635     } else {
3636       assert(UF > 1 && "VF and UF cannot both be 1");
3637       // When the loop is unrolled without vectorizing, initialize
3638       // ExtractForPhiUsedOutsideLoop with the unroll part just prior to the
3639       // last unrolled value of `Incoming`. This is analogous to the
3640       // vectorized case above: extracting the second last element when VF > 1.
3641       ExtractForPhiUsedOutsideLoop = State.get(PreviousDef, UF - 2);
3642     }
3643 
3644     for (VPLiveOut *LiveOut : LiveOuts) {
3645       assert(!Cost->requiresScalarEpilogue(VF.isVector()));
3646       PHINode *LCSSAPhi = LiveOut->getPhi();
3647       LCSSAPhi->addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock);
3648       State.Plan->removeLiveOut(LCSSAPhi);
3649     }
3650   }
3651 
3652   // Fix the initial value of the original recurrence in the scalar loop.
3653   Builder.SetInsertPoint(LoopScalarPreHeader, LoopScalarPreHeader->begin());
3654   PHINode *Phi = cast<PHINode>(PhiR->getUnderlyingValue());
3655   auto *Start = Builder.CreatePHI(Phi->getType(), 2, "scalar.recur.init");
3656   auto *ScalarInit = PhiR->getStartValue()->getLiveInIRValue();
3657   for (auto *BB : predecessors(LoopScalarPreHeader)) {
3658     auto *Incoming = BB == LoopMiddleBlock ? ExtractForScalar : ScalarInit;
3659     Start->addIncoming(Incoming, BB);
3660   }
3661 
3662   Phi->setIncomingValueForBlock(LoopScalarPreHeader, Start);
3663   Phi->setName("scalar.recur");
3664 }
3665 
3666 void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) {
3667   // The basic block and loop containing the predicated instruction.
3668   auto *PredBB = PredInst->getParent();
3669   auto *VectorLoop = LI->getLoopFor(PredBB);
3670 
3671   // Initialize a worklist with the operands of the predicated instruction.
3672   SetVector<Value *> Worklist(PredInst->op_begin(), PredInst->op_end());
3673 
3674   // Holds instructions that we need to analyze again. An instruction may be
3675   // reanalyzed if we don't yet know if we can sink it or not.
3676   SmallVector<Instruction *, 8> InstsToReanalyze;
3677 
3678   // Returns true if a given use occurs in the predicated block. Phi nodes use
3679   // their operands in their corresponding predecessor blocks.
3680   auto isBlockOfUsePredicated = [&](Use &U) -> bool {
3681     auto *I = cast<Instruction>(U.getUser());
3682     BasicBlock *BB = I->getParent();
3683     if (auto *Phi = dyn_cast<PHINode>(I))
3684       BB = Phi->getIncomingBlock(
3685           PHINode::getIncomingValueNumForOperand(U.getOperandNo()));
3686     return BB == PredBB;
3687   };
3688 
3689   // Iteratively sink the scalarized operands of the predicated instruction
3690   // into the block we created for it. When an instruction is sunk, its
3691   // operands are then added to the worklist. The algorithm ends when a full
3692   // pass through the worklist fails to sink a single instruction.
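  // For example (an illustrative case), a scalar GEP whose only use is the
  // address operand of the predicated store can be moved into the predicated
  // block, so its result is only computed when the predicate is true.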
3693   bool Changed;
3694   do {
3695     // Add the instructions that need to be reanalyzed to the worklist, and
3696     // reset the changed indicator.
3697     Worklist.insert(InstsToReanalyze.begin(), InstsToReanalyze.end());
3698     InstsToReanalyze.clear();
3699     Changed = false;
3700 
3701     while (!Worklist.empty()) {
3702       auto *I = dyn_cast<Instruction>(Worklist.pop_back_val());
3703 
3704       // We can't sink an instruction if it is a phi node, is not in the loop,
3705       // may have side effects or may read from memory.
3706       // TODO: Could do more granular checking to allow sinking a load past non-store instructions.
3707       if (!I || isa<PHINode>(I) || !VectorLoop->contains(I) ||
3708           I->mayHaveSideEffects() || I->mayReadFromMemory())
3709         continue;
3710 
3711       // If the instruction is already in PredBB, check if we can sink its
3712       // operands. In that case, VPlan's sinkScalarOperands() succeeded in
3713       // sinking the scalar instruction I, hence it appears in PredBB; but it
3714       // may have failed to sink I's operands (recursively), which we try
3715       // (again) here.
3716       if (I->getParent() == PredBB) {
3717         Worklist.insert(I->op_begin(), I->op_end());
3718         continue;
3719       }
3720 
3721       // It's legal to sink the instruction if all its uses occur in the
3722       // predicated block. Otherwise, there's nothing to do yet, and we may
3723       // need to reanalyze the instruction.
3724       if (!llvm::all_of(I->uses(), isBlockOfUsePredicated)) {
3725         InstsToReanalyze.push_back(I);
3726         continue;
3727       }
3728 
3729       // Move the instruction to the beginning of the predicated block, and add
3730       // its operands to the worklist.
3731       I->moveBefore(&*PredBB->getFirstInsertionPt());
3732       Worklist.insert(I->op_begin(), I->op_end());
3733 
3734       // The sinking may have enabled other instructions to be sunk, so we will
3735       // need to iterate.
3736       Changed = true;
3737     }
3738   } while (Changed);
3739 }
3740 
3741 void InnerLoopVectorizer::fixNonInductionPHIs(VPlan &Plan,
3742                                               VPTransformState &State) {
3743   auto Iter = vp_depth_first_deep(Plan.getEntry());
3744   for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) {
3745     for (VPRecipeBase &P : VPBB->phis()) {
3746       VPWidenPHIRecipe *VPPhi = dyn_cast<VPWidenPHIRecipe>(&P);
3747       if (!VPPhi)
3748         continue;
3749       PHINode *NewPhi = cast<PHINode>(State.get(VPPhi, 0));
3750       // Make sure the builder has a valid insert point.
3751       Builder.SetInsertPoint(NewPhi);
3752       for (unsigned i = 0; i < VPPhi->getNumOperands(); ++i) {
3753         VPValue *Inc = VPPhi->getIncomingValue(i);
3754         VPBasicBlock *VPBB = VPPhi->getIncomingBlock(i);
3755         NewPhi->addIncoming(State.get(Inc, 0), State.CFG.VPBB2IRBB[VPBB]);
3756       }
3757     }
3758   }
3759 }
3760 
3761 bool InnerLoopVectorizer::useOrderedReductions(
3762     const RecurrenceDescriptor &RdxDesc) {
3763   return Cost->useOrderedReductions(RdxDesc);
3764 }
3765 
3766 void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) {
3767   // We should not collect Scalars more than once per VF. Right now, this
3768   // function is called from collectUniformsAndScalars(), which already does
3769   // this check. Collecting Scalars for VF=1 does not make any sense.
3770   assert(VF.isVector() && !Scalars.contains(VF) &&
3771          "This function should not be visited twice for the same VF");
3772 
3773   // This avoids any chances of creating a REPLICATE recipe during planning
3774   // since that would result in generation of scalarized code during execution,
3775   // which is not supported for scalable vectors.
3776   if (VF.isScalable()) {
3777     Scalars[VF].insert(Uniforms[VF].begin(), Uniforms[VF].end());
3778     return;
3779   }
3780 
3781   SmallSetVector<Instruction *, 8> Worklist;
3782 
3783   // These sets are used to seed the analysis with pointers used by memory
3784   // accesses that will remain scalar.
3785   SmallSetVector<Instruction *, 8> ScalarPtrs;
3786   SmallPtrSet<Instruction *, 8> PossibleNonScalarPtrs;
3787   auto *Latch = TheLoop->getLoopLatch();
3788 
3789   // A helper that returns true if the use of Ptr by MemAccess will be scalar.
3790   // The pointer operands of loads and stores will be scalar as long as the
3791   // memory access is not a gather or scatter operation. The value operand of a
3792   // store will remain scalar if the store is scalarized.
3793   auto isScalarUse = [&](Instruction *MemAccess, Value *Ptr) {
3794     InstWidening WideningDecision = getWideningDecision(MemAccess, VF);
3795     assert(WideningDecision != CM_Unknown &&
3796            "Widening decision should be ready at this moment");
3797     if (auto *Store = dyn_cast<StoreInst>(MemAccess))
3798       if (Ptr == Store->getValueOperand())
3799         return WideningDecision == CM_Scalarize;
3800     assert(Ptr == getLoadStorePointerOperand(MemAccess) &&
3801            "Ptr is neither a value nor a pointer operand");
3802     return WideningDecision != CM_GatherScatter;
3803   };
3804 
3805   // A helper that returns true if the given value is a bitcast or
3806   // getelementptr instruction contained in the loop.
3807   auto isLoopVaryingBitCastOrGEP = [&](Value *V) {
3808     return ((isa<BitCastInst>(V) && V->getType()->isPointerTy()) ||
3809             isa<GetElementPtrInst>(V)) &&
3810            !TheLoop->isLoopInvariant(V);
3811   };
3812 
3813   // A helper that evaluates a memory access's use of a pointer. If the use will
3814   // be a scalar use and the pointer is only used by memory accesses, we place
3815   // the pointer in ScalarPtrs. Otherwise, the pointer is placed in
3816   // PossibleNonScalarPtrs.
3817   auto evaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) {
3818     // We only care about bitcast and getelementptr instructions contained in
3819     // the loop.
3820     if (!isLoopVaryingBitCastOrGEP(Ptr))
3821       return;
3822 
3823     // If the pointer has already been identified as scalar (e.g., if it was
3824     // also identified as uniform), there's nothing to do.
3825     auto *I = cast<Instruction>(Ptr);
3826     if (Worklist.count(I))
3827       return;
3828 
3829     // If the use of the pointer will be a scalar use, and all users of the
3830     // pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise,
3831     // place the pointer in PossibleNonScalarPtrs.
3832     if (isScalarUse(MemAccess, Ptr) && llvm::all_of(I->users(), [&](User *U) {
3833           return isa<LoadInst>(U) || isa<StoreInst>(U);
3834         }))
3835       ScalarPtrs.insert(I);
3836     else
3837       PossibleNonScalarPtrs.insert(I);
3838   };
3839 
3840   // We seed the scalars analysis with two classes of instructions: (1)
3841   // instructions marked uniform-after-vectorization and (2) bitcast,
3842   // getelementptr and (pointer) phi instructions used by memory accesses
3843   // requiring a scalar use.
3844   //
3845   // (1) Add to the worklist all instructions that have been identified as
3846   // uniform-after-vectorization.
3847   Worklist.insert(Uniforms[VF].begin(), Uniforms[VF].end());
3848 
3849   // (2) Add to the worklist all bitcast and getelementptr instructions used by
3850   // memory accesses requiring a scalar use. The pointer operands of loads and
3851   // stores will be scalar as long as the memory access is not a gather or
3852   // scatter operation. The value operand of a store will remain scalar if the
3853   // store is scalarized.
3854   for (auto *BB : TheLoop->blocks())
3855     for (auto &I : *BB) {
3856       if (auto *Load = dyn_cast<LoadInst>(&I)) {
3857         evaluatePtrUse(Load, Load->getPointerOperand());
3858       } else if (auto *Store = dyn_cast<StoreInst>(&I)) {
3859         evaluatePtrUse(Store, Store->getPointerOperand());
3860         evaluatePtrUse(Store, Store->getValueOperand());
3861       }
3862     }
3863   for (auto *I : ScalarPtrs)
3864     if (!PossibleNonScalarPtrs.count(I)) {
3865       LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n");
3866       Worklist.insert(I);
3867     }
3868 
3869   // Insert the forced scalars.
3870   // FIXME: Currently VPWidenPHIRecipe() often creates a dead vector
3871   // induction variable when the PHI user is scalarized.
3872   auto ForcedScalar = ForcedScalars.find(VF);
3873   if (ForcedScalar != ForcedScalars.end())
3874     for (auto *I : ForcedScalar->second) {
3875       LLVM_DEBUG(dbgs() << "LV: Found (forced) scalar instruction: " << *I << "\n");
3876       Worklist.insert(I);
3877     }
3878 
3879   // Expand the worklist by looking through any bitcasts and getelementptr
3880   // instructions we've already identified as scalar. This is similar to the
3881   // expansion step in collectLoopUniforms(); however, here we're only
3882   // expanding to include additional bitcasts and getelementptr instructions.
3883   unsigned Idx = 0;
3884   while (Idx != Worklist.size()) {
3885     Instruction *Dst = Worklist[Idx++];
3886     if (!isLoopVaryingBitCastOrGEP(Dst->getOperand(0)))
3887       continue;
3888     auto *Src = cast<Instruction>(Dst->getOperand(0));
3889     if (llvm::all_of(Src->users(), [&](User *U) -> bool {
3890           auto *J = cast<Instruction>(U);
3891           return !TheLoop->contains(J) || Worklist.count(J) ||
3892                  ((isa<LoadInst>(J) || isa<StoreInst>(J)) &&
3893                   isScalarUse(J, Src));
3894         })) {
3895       Worklist.insert(Src);
3896       LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n");
3897     }
3898   }
3899 
3900   // An induction variable will remain scalar if all users of the induction
3901   // variable and induction variable update remain scalar.
3902   for (const auto &Induction : Legal->getInductionVars()) {
3903     auto *Ind = Induction.first;
3904     auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
3905 
3906     // If tail-folding is applied, the primary induction variable will be used
3907     // to feed a vector compare.
3908     if (Ind == Legal->getPrimaryInduction() && foldTailByMasking())
3909       continue;
3910 
3911     // Returns true if \p Indvar is a pointer induction that is used directly by
3912     // load/store instruction \p I.
3913     auto IsDirectLoadStoreFromPtrIndvar = [&](Instruction *Indvar,
3914                                               Instruction *I) {
3915       return Induction.second.getKind() ==
3916                  InductionDescriptor::IK_PtrInduction &&
3917              (isa<LoadInst>(I) || isa<StoreInst>(I)) &&
3918              Indvar == getLoadStorePointerOperand(I) && isScalarUse(I, Indvar);
3919     };
3920 
3921     // Determine if all users of the induction variable are scalar after
3922     // vectorization.
3923     auto ScalarInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
3924       auto *I = cast<Instruction>(U);
3925       return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) ||
3926              IsDirectLoadStoreFromPtrIndvar(Ind, I);
3927     });
3928     if (!ScalarInd)
3929       continue;
3930 
3931     // Determine if all users of the induction variable update instruction are
3932     // scalar after vectorization.
3933     auto ScalarIndUpdate =
3934         llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
3935           auto *I = cast<Instruction>(U);
3936           return I == Ind || !TheLoop->contains(I) || Worklist.count(I) ||
3937                  IsDirectLoadStoreFromPtrIndvar(IndUpdate, I);
3938         });
3939     if (!ScalarIndUpdate)
3940       continue;
3941 
3942     // The induction variable and its update instruction will remain scalar.
3943     Worklist.insert(Ind);
3944     Worklist.insert(IndUpdate);
3945     LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n");
3946     LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate
3947                       << "\n");
3948   }
3949 
3950   Scalars[VF].insert(Worklist.begin(), Worklist.end());
3951 }
3952 
3953 bool LoopVectorizationCostModel::isScalarWithPredication(
3954     Instruction *I, ElementCount VF) const {
3955   if (!isPredicatedInst(I))
3956     return false;
3957 
3958   // Do we have a non-scalar lowering for this predicated
3959   // instruction? No - it is scalar with predication.
3960   switch (I->getOpcode()) {
3961   default:
3962     return true;
3963   case Instruction::Call:
3964     if (VF.isScalar())
3965       return true;
3966     return CallWideningDecisions.at(std::make_pair(cast<CallInst>(I), VF))
3967                .Kind == CM_Scalarize;
3968   case Instruction::Load:
3969   case Instruction::Store: {
3970     auto *Ptr = getLoadStorePointerOperand(I);
3971     auto *Ty = getLoadStoreType(I);
3972     Type *VTy = Ty;
3973     if (VF.isVector())
3974       VTy = VectorType::get(Ty, VF);
3975     const Align Alignment = getLoadStoreAlignment(I);
3976     return isa<LoadInst>(I) ? !(isLegalMaskedLoad(Ty, Ptr, Alignment) ||
3977                                 TTI.isLegalMaskedGather(VTy, Alignment))
3978                             : !(isLegalMaskedStore(Ty, Ptr, Alignment) ||
3979                                 TTI.isLegalMaskedScatter(VTy, Alignment));
3980   }
3981   case Instruction::UDiv:
3982   case Instruction::SDiv:
3983   case Instruction::SRem:
3984   case Instruction::URem: {
3985     // We have the option to use the safe-divisor idiom to avoid predication.
3986     // The cost based decision here will always select safe-divisor for
3987     // scalable vectors as scalarization isn't legal.
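    // For example (an illustrative sketch), instead of scalarizing a masked
    // `udiv %x, %d`, the divisor can be rewritten as `select %mask, %d, 1` so
    // the whole vector division can execute unconditionally.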
3988     const auto [ScalarCost, SafeDivisorCost] = getDivRemSpeculationCost(I, VF);
3989     return isDivRemScalarWithPredication(ScalarCost, SafeDivisorCost);
3990   }
3991   }
3992 }
3993 
3994 bool LoopVectorizationCostModel::isPredicatedInst(Instruction *I) const {
3995   if (!blockNeedsPredicationForAnyReason(I->getParent()))
3996     return false;
3997 
3998   // Can we prove this instruction is safe to unconditionally execute?
3999   // If not, we must use some form of predication.
4000   switch (I->getOpcode()) {
4001   default:
4002     return false;
4003   case Instruction::Load:
4004   case Instruction::Store: {
4005     if (!Legal->isMaskRequired(I))
4006       return false;
4007     // When we know the load's address is loop invariant and the instruction
4008     // in the original scalar loop was unconditionally executed, then we
4009     // don't need to mark it as a predicated instruction. Tail folding may
4010     // introduce additional predication, but we're guaranteed to always have
4011     // at least one active lane.  We call Legal->blockNeedsPredication here
4012     // because it doesn't query tail-folding.  For stores, we need to prove
4013     // not only speculation safety (which follows from the same argument as
4014     // loads), but also that the value being stored is correct.  The easiest
4015     // form of the latter is to require that all values stored are the same.
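    // For example (an illustrative case), a store of a loop-invariant value to
    // a loop-invariant address in an unconditionally executed block of the
    // original loop does not need predication even when the tail is folded.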
4016     if (Legal->isInvariant(getLoadStorePointerOperand(I)) &&
4017         (isa<LoadInst>(I) ||
4018          (isa<StoreInst>(I) &&
4019           TheLoop->isLoopInvariant(cast<StoreInst>(I)->getValueOperand()))) &&
4020         !Legal->blockNeedsPredication(I->getParent()))
4021       return false;
4022     return true;
4023   }
4024   case Instruction::UDiv:
4025   case Instruction::SDiv:
4026   case Instruction::SRem:
4027   case Instruction::URem:
4028     // TODO: We can use the loop-preheader as context point here and get
4029     // context sensitive reasoning
4030     return !isSafeToSpeculativelyExecute(I);
4031   case Instruction::Call:
4032     return Legal->isMaskRequired(I);
4033   }
4034 }
4035 
4036 std::pair<InstructionCost, InstructionCost>
4037 LoopVectorizationCostModel::getDivRemSpeculationCost(Instruction *I,
4038                                                     ElementCount VF) const {
4039   assert(I->getOpcode() == Instruction::UDiv ||
4040          I->getOpcode() == Instruction::SDiv ||
4041          I->getOpcode() == Instruction::SRem ||
4042          I->getOpcode() == Instruction::URem);
4043   assert(!isSafeToSpeculativelyExecute(I));
4044 
4045   const TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
4046 
4047   // Scalarization isn't legal for scalable vector types
4048   InstructionCost ScalarizationCost = InstructionCost::getInvalid();
4049   if (!VF.isScalable()) {
4050     // Get the scalarization cost and scale this amount by the probability of
4051     // executing the predicated block. If the instruction is not predicated,
4052     // we fall through to the next case.
4053     ScalarizationCost = 0;
4054 
4055     // These instructions have a non-void type, so account for the phi nodes
4056     // that we will create. This cost is likely to be zero. The phi node
4057     // cost, if any, should be scaled by the block probability because it
4058     // models a copy at the end of each predicated block.
4059     ScalarizationCost += VF.getKnownMinValue() *
4060       TTI.getCFInstrCost(Instruction::PHI, CostKind);
4061 
4062     // The cost of the non-predicated instruction.
4063     ScalarizationCost += VF.getKnownMinValue() *
4064       TTI.getArithmeticInstrCost(I->getOpcode(), I->getType(), CostKind);
4065 
4066     // The cost of insertelement and extractelement instructions needed for
4067     // scalarization.
4068     ScalarizationCost += getScalarizationOverhead(I, VF, CostKind);
4069 
4070     // Scale the cost by the probability of executing the predicated blocks.
4071     // This assumes the predicated block for each vector lane is equally
4072     // likely.
4073     ScalarizationCost = ScalarizationCost / getReciprocalPredBlockProb();
4074   }
4075   InstructionCost SafeDivisorCost = 0;
4076 
4077   auto *VecTy = ToVectorTy(I->getType(), VF);
4078 
4079   // The cost of the select guard to ensure all lanes are well defined
4080   // after we speculate above any internal control flow.
4081   SafeDivisorCost += TTI.getCmpSelInstrCost(
4082     Instruction::Select, VecTy,
4083     ToVectorTy(Type::getInt1Ty(I->getContext()), VF),
4084     CmpInst::BAD_ICMP_PREDICATE, CostKind);
4085 
4086   // Certain instructions can be cheaper to vectorize if they have a constant
4087   // second vector operand. One example of this are shifts on x86.
4088   Value *Op2 = I->getOperand(1);
4089   auto Op2Info = TTI.getOperandInfo(Op2);
4090   if (Op2Info.Kind == TargetTransformInfo::OK_AnyValue &&
4091       Legal->isInvariant(Op2))
4092     Op2Info.Kind = TargetTransformInfo::OK_UniformValue;
4093 
4094   SmallVector<const Value *, 4> Operands(I->operand_values());
4095   SafeDivisorCost += TTI.getArithmeticInstrCost(
4096     I->getOpcode(), VecTy, CostKind,
4097     {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
4098     Op2Info, Operands, I);
4099   return {ScalarizationCost, SafeDivisorCost};
4100 }
4101 
4102 bool LoopVectorizationCostModel::interleavedAccessCanBeWidened(
4103     Instruction *I, ElementCount VF) {
4104   assert(isAccessInterleaved(I) && "Expecting interleaved access.");
4105   assert(getWideningDecision(I, VF) == CM_Unknown &&
4106          "Decision should not be set yet.");
4107   auto *Group = getInterleavedAccessGroup(I);
4108   assert(Group && "Must have a group.");
4109 
4110   // If the instruction's allocated size doesn't equal its type size, it
4111   // requires padding and will be scalarized.
4112   auto &DL = I->getModule()->getDataLayout();
4113   auto *ScalarTy = getLoadStoreType(I);
4114   if (hasIrregularType(ScalarTy, DL))
4115     return false;
4116 
4117   // If the group involves a non-integral pointer, we may not be able to
4118   // losslessly cast all values to a common type.
4119   unsigned InterleaveFactor = Group->getFactor();
4120   bool ScalarNI = DL.isNonIntegralPointerType(ScalarTy);
4121   for (unsigned i = 0; i < InterleaveFactor; i++) {
4122     Instruction *Member = Group->getMember(i);
4123     if (!Member)
4124       continue;
4125     auto *MemberTy = getLoadStoreType(Member);
4126     bool MemberNI = DL.isNonIntegralPointerType(MemberTy);
4127     // Don't coerce non-integral pointers to integers or vice versa.
4128     if (MemberNI != ScalarNI) {
4129       // TODO: Consider adding special nullptr value case here
4130       return false;
4131     } else if (MemberNI && ScalarNI &&
4132                ScalarTy->getPointerAddressSpace() !=
4133                MemberTy->getPointerAddressSpace()) {
4134       return false;
4135     }
4136   }
4137 
4138   // Check if masking is required.
4139   // A Group may need masking for one of two reasons: it resides in a block that
4140   // needs predication, or it was decided to use masking to deal with gaps
4141   // (either a gap at the end of a load-access that may result in a speculative
4142   // load, or any gaps in a store-access).
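  // For example (an illustrative case), a store group with factor 2 whose only
  // member is at index 0 leaves a gap at index 1 and therefore requires
  // masking.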
4143   bool PredicatedAccessRequiresMasking =
4144       blockNeedsPredicationForAnyReason(I->getParent()) &&
4145       Legal->isMaskRequired(I);
4146   bool LoadAccessWithGapsRequiresEpilogMasking =
4147       isa<LoadInst>(I) && Group->requiresScalarEpilogue() &&
4148       !isScalarEpilogueAllowed();
4149   bool StoreAccessWithGapsRequiresMasking =
4150       isa<StoreInst>(I) && (Group->getNumMembers() < Group->getFactor());
4151   if (!PredicatedAccessRequiresMasking &&
4152       !LoadAccessWithGapsRequiresEpilogMasking &&
4153       !StoreAccessWithGapsRequiresMasking)
4154     return true;
4155 
4156   // If masked interleaving is required, we expect that the user/target had
4157   // enabled it, because otherwise it either wouldn't have been created or
4158   // it should have been invalidated by the CostModel.
4159   assert(useMaskedInterleavedAccesses(TTI) &&
4160          "Masked interleave-groups for predicated accesses are not enabled.");
4161 
4162   if (Group->isReverse())
4163     return false;
4164 
4165   auto *Ty = getLoadStoreType(I);
4166   const Align Alignment = getLoadStoreAlignment(I);
4167   return isa<LoadInst>(I) ? TTI.isLegalMaskedLoad(Ty, Alignment)
4168                           : TTI.isLegalMaskedStore(Ty, Alignment);
4169 }
4170 
4171 bool LoopVectorizationCostModel::memoryInstructionCanBeWidened(
4172     Instruction *I, ElementCount VF) {
4173   // Get and ensure we have a valid memory instruction.
4174   assert((isa<LoadInst, StoreInst>(I)) && "Invalid memory instruction");
4175 
4176   auto *Ptr = getLoadStorePointerOperand(I);
4177   auto *ScalarTy = getLoadStoreType(I);
4178 
4179   // In order to be widened, the pointer should be consecutive, first of all.
4180   if (!Legal->isConsecutivePtr(ScalarTy, Ptr))
4181     return false;
4182 
4183   // If the instruction is a store located in a predicated block, it will be
4184   // scalarized.
4185   if (isScalarWithPredication(I, VF))
4186     return false;
4187 
4188   // If the instruction's allocated size doesn't equal its type size, it
4189   // requires padding and will be scalarized.
4190   auto &DL = I->getModule()->getDataLayout();
4191   if (hasIrregularType(ScalarTy, DL))
4192     return false;
4193 
4194   return true;
4195 }
4196 
4197 void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) {
4198   // We should not collect Uniforms more than once per VF. Right now,
4199   // this function is called from collectUniformsAndScalars(), which
4200   // already does this check. Collecting Uniforms for VF=1 does not make any
4201   // sense.
4202 
4203   assert(VF.isVector() && !Uniforms.contains(VF) &&
4204          "This function should not be visited twice for the same VF");
4205 
4206   // Create the entry for this VF up front: even if we find no uniform
4207   // values, we won't analyze this VF again since Uniforms.count(VF) will be 1.
4208   Uniforms[VF].clear();
4209 
4210   // We now know that the loop is vectorizable!
4211   // Collect instructions inside the loop that will remain uniform after
4212   // vectorization.
4213 
4214   // Global values, params and instructions outside of current loop are out of
4215   // scope.
4216   auto isOutOfScope = [&](Value *V) -> bool {
4217     Instruction *I = dyn_cast<Instruction>(V);
4218     return (!I || !TheLoop->contains(I));
4219   };
4220 
4221   // Worklist containing uniform instructions demanding lane 0.
4222   SetVector<Instruction *> Worklist;
4223   BasicBlock *Latch = TheLoop->getLoopLatch();
4224 
4225   // Add uniform instructions demanding lane 0 to the worklist. Instructions
4226   // that are scalar with predication must not be considered uniform after
4227   // vectorization, because that would create an erroneous replicating region
4228   // where only a single instance out of VF should be formed.
4229   // TODO: optimize such rare cases if found important, see PR40816.
4230   auto addToWorklistIfAllowed = [&](Instruction *I) -> void {
4231     if (isOutOfScope(I)) {
4232       LLVM_DEBUG(dbgs() << "LV: Found not uniform due to scope: "
4233                         << *I << "\n");
4234       return;
4235     }
4236     if (isScalarWithPredication(I, VF)) {
4237       LLVM_DEBUG(dbgs() << "LV: Found not uniform being ScalarWithPredication: "
4238                         << *I << "\n");
4239       return;
4240     }
4241     LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *I << "\n");
4242     Worklist.insert(I);
4243   };
4244 
4245   // Start with the conditional branch. If the branch condition is an
4246   // instruction contained in the loop that is only used by the branch, it is
4247   // uniform.
4248   auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0));
4249   if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse())
4250     addToWorklistIfAllowed(Cmp);
4251 
4252   auto PrevVF = VF.divideCoefficientBy(2);
4253   // Return true if all lanes perform the same memory operation, and we can
4254   // thus choose to execute only one.
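  // A typical example (illustrative) is a load from a loop-invariant address,
  // e.g. `s += a[0]` inside the loop, where every lane would load the same
  // value.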
4255   auto isUniformMemOpUse = [&](Instruction *I) {
4256     // If the value was already known to not be uniform for the previous
4257     // (smaller VF), it cannot be uniform for the larger VF.
4258     if (PrevVF.isVector()) {
4259       auto Iter = Uniforms.find(PrevVF);
4260       if (Iter != Uniforms.end() && !Iter->second.contains(I))
4261         return false;
4262     }
4263     if (!Legal->isUniformMemOp(*I, VF))
4264       return false;
4265     if (isa<LoadInst>(I))
4266       // Loading the same address always produces the same result - at least
4267       // assuming aliasing and ordering which have already been checked.
4268       return true;
4269     // Storing the same value on every iteration.
4270     return TheLoop->isLoopInvariant(cast<StoreInst>(I)->getValueOperand());
4271   };
4272 
4273   auto isUniformDecision = [&](Instruction *I, ElementCount VF) {
4274     InstWidening WideningDecision = getWideningDecision(I, VF);
4275     assert(WideningDecision != CM_Unknown &&
4276            "Widening decision should be ready at this moment");
4277 
4278     if (isUniformMemOpUse(I))
4279       return true;
4280 
4281     return (WideningDecision == CM_Widen ||
4282             WideningDecision == CM_Widen_Reverse ||
4283             WideningDecision == CM_Interleave);
4284   };
4285 
4286   // Returns true if Ptr is the pointer operand of a memory access instruction
4287   // I, I is known to not require scalarization, and the pointer is not also
4288   // stored.
4289   auto isVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool {
4290     if (isa<StoreInst>(I) && I->getOperand(0) == Ptr)
4291       return false;
4292     return getLoadStorePointerOperand(I) == Ptr &&
4293            (isUniformDecision(I, VF) || Legal->isInvariant(Ptr));
4294   };
4295 
4296   // Holds a list of values which are known to have at least one uniform use.
4297   // Note that there may be other uses which aren't uniform.  A "uniform use"
4298   // here is something which only demands lane 0 of the unrolled iterations;
4299   // it does not imply that all lanes produce the same value (e.g. this is not
4300   // the usual meaning of uniform)
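  // For example (an illustrative case), the pointer operand of a consecutive
  // (widened) load only needs its lane-0 value to form the vector access, even
  // though conceptually the per-lane addresses differ.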
4301   SetVector<Value *> HasUniformUse;
4302 
4303   // Scan the loop for instructions which are either a) known to have only
4304   // lane 0 demanded or b) are uses which demand only lane 0 of their operand.
4305   for (auto *BB : TheLoop->blocks())
4306     for (auto &I : *BB) {
4307       if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(&I)) {
4308         switch (II->getIntrinsicID()) {
4309         case Intrinsic::sideeffect:
4310         case Intrinsic::experimental_noalias_scope_decl:
4311         case Intrinsic::assume:
4312         case Intrinsic::lifetime_start:
4313         case Intrinsic::lifetime_end:
4314           if (TheLoop->hasLoopInvariantOperands(&I))
4315             addToWorklistIfAllowed(&I);
4316           break;
4317         default:
4318           break;
4319         }
4320       }
4321 
4322       // ExtractValue instructions must be uniform, because the operands are
4323       // known to be loop-invariant.
4324       if (auto *EVI = dyn_cast<ExtractValueInst>(&I)) {
4325         assert(isOutOfScope(EVI->getAggregateOperand()) &&
4326                "Expected aggregate value to be loop invariant");
4327         addToWorklistIfAllowed(EVI);
4328         continue;
4329       }
4330 
4331       // If there's no pointer operand, there's nothing to do.
4332       auto *Ptr = getLoadStorePointerOperand(&I);
4333       if (!Ptr)
4334         continue;
4335 
4336       if (isUniformMemOpUse(&I))
4337         addToWorklistIfAllowed(&I);
4338 
4339       if (isVectorizedMemAccessUse(&I, Ptr))
4340         HasUniformUse.insert(Ptr);
4341     }
4342 
4343   // Add to the worklist any operands which have *only* uniform (e.g. lane 0
4344   // demanding) users.  Since loops are assumed to be in LCSSA form, this
4345   // disallows uses outside the loop as well.
4346   for (auto *V : HasUniformUse) {
4347     if (isOutOfScope(V))
4348       continue;
4349     auto *I = cast<Instruction>(V);
4350     auto UsersAreMemAccesses =
4351       llvm::all_of(I->users(), [&](User *U) -> bool {
4352         return isVectorizedMemAccessUse(cast<Instruction>(U), V);
4353       });
4354     if (UsersAreMemAccesses)
4355       addToWorklistIfAllowed(I);
4356   }
4357 
4358   // Expand Worklist in topological order: whenever a new instruction
4359   // is added, its users should already be inside Worklist.  This ensures
4360   // that a uniform instruction will only be used by uniform instructions.
4361   unsigned idx = 0;
4362   while (idx != Worklist.size()) {
4363     Instruction *I = Worklist[idx++];
4364 
4365     for (auto *OV : I->operand_values()) {
4366       // isOutOfScope operands cannot be uniform instructions.
4367       if (isOutOfScope(OV))
4368         continue;
4369       // First-order recurrence phis should typically be considered
4370       // non-uniform.
4371       auto *OP = dyn_cast<PHINode>(OV);
4372       if (OP && Legal->isFixedOrderRecurrence(OP))
4373         continue;
4374       // If all the users of the operand are uniform, then add the
4375       // operand into the uniform worklist.
4376       auto *OI = cast<Instruction>(OV);
4377       if (llvm::all_of(OI->users(), [&](User *U) -> bool {
4378             auto *J = cast<Instruction>(U);
4379             return Worklist.count(J) || isVectorizedMemAccessUse(J, OI);
4380           }))
4381         addToWorklistIfAllowed(OI);
4382     }
4383   }
4384 
4385   // For an instruction to be added into Worklist above, all its users inside
4386   // the loop should also be in Worklist. However, this condition cannot be
4387   // true for phi nodes that form a cyclic dependence. We must process phi
4388   // nodes separately. An induction variable will remain uniform if all users
4389   // of the induction variable and induction variable update remain uniform.
4390   // The code below handles both pointer and non-pointer induction variables.
4391   for (const auto &Induction : Legal->getInductionVars()) {
4392     auto *Ind = Induction.first;
4393     auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
4394 
4395     // Determine if all users of the induction variable are uniform after
4396     // vectorization.
4397     auto UniformInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
4398       auto *I = cast<Instruction>(U);
4399       return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) ||
4400              isVectorizedMemAccessUse(I, Ind);
4401     });
4402     if (!UniformInd)
4403       continue;
4404 
4405     // Determine if all users of the induction variable update instruction are
4406     // uniform after vectorization.
4407     auto UniformIndUpdate =
4408         llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
4409           auto *I = cast<Instruction>(U);
4410           return I == Ind || !TheLoop->contains(I) || Worklist.count(I) ||
4411                  isVectorizedMemAccessUse(I, IndUpdate);
4412         });
4413     if (!UniformIndUpdate)
4414       continue;
4415 
4416     // The induction variable and its update instruction will remain uniform.
4417     addToWorklistIfAllowed(Ind);
4418     addToWorklistIfAllowed(IndUpdate);
4419   }
4420 
4421   Uniforms[VF].insert(Worklist.begin(), Worklist.end());
4422 }
4423 
4424 bool LoopVectorizationCostModel::runtimeChecksRequired() {
4425   LLVM_DEBUG(dbgs() << "LV: Performing code size checks.\n");
4426 
4427   if (Legal->getRuntimePointerChecking()->Need) {
4428     reportVectorizationFailure("Runtime ptr check is required with -Os/-Oz",
4429         "runtime pointer checks needed. Enable vectorization of this "
4430         "loop with '#pragma clang loop vectorize(enable)' when "
4431         "compiling with -Os/-Oz",
4432         "CantVersionLoopWithOptForSize", ORE, TheLoop);
4433     return true;
4434   }
4435 
4436   if (!PSE.getPredicate().isAlwaysTrue()) {
4437     reportVectorizationFailure("Runtime SCEV check is required with -Os/-Oz",
4438         "runtime SCEV checks needed. Enable vectorization of this "
4439         "loop with '#pragma clang loop vectorize(enable)' when "
4440         "compiling with -Os/-Oz",
4441         "CantVersionLoopWithOptForSize", ORE, TheLoop);
4442     return true;
4443   }
4444 
4445   // FIXME: Avoid specializing for stride==1 instead of bailing out.
4446   if (!Legal->getLAI()->getSymbolicStrides().empty()) {
4447     reportVectorizationFailure("Runtime stride check for small trip count",
4448         "runtime stride == 1 checks needed. Enable vectorization of "
4449         "this loop without such check by compiling with -Os/-Oz",
4450         "CantVersionLoopWithOptForSize", ORE, TheLoop);
4451     return true;
4452   }
4453 
4454   return false;
4455 }
4456 
4457 ElementCount
4458 LoopVectorizationCostModel::getMaxLegalScalableVF(unsigned MaxSafeElements) {
4459   if (!TTI.supportsScalableVectors() && !ForceTargetSupportsScalableVectors)
4460     return ElementCount::getScalable(0);
4461 
4462   if (Hints->isScalableVectorizationDisabled()) {
4463     reportVectorizationInfo("Scalable vectorization is explicitly disabled",
4464                             "ScalableVectorizationDisabled", ORE, TheLoop);
4465     return ElementCount::getScalable(0);
4466   }
4467 
4468   LLVM_DEBUG(dbgs() << "LV: Scalable vectorization is available\n");
4469 
4470   auto MaxScalableVF = ElementCount::getScalable(
4471       std::numeric_limits<ElementCount::ScalarTy>::max());
4472 
4473   // Test that the loop-vectorizer can legalize all operations for this MaxVF.
4474   // FIXME: While for scalable vectors this is currently sufficient, this should
4475   // be replaced by a more detailed mechanism that filters out specific VFs,
4476   // instead of invalidating vectorization for a whole set of VFs based on the
4477   // MaxVF.
4478 
4479   // Disable scalable vectorization if the loop contains unsupported reductions.
4480   if (!canVectorizeReductions(MaxScalableVF)) {
4481     reportVectorizationInfo(
4482         "Scalable vectorization not supported for the reduction "
4483         "operations found in this loop.",
4484         "ScalableVFUnfeasible", ORE, TheLoop);
4485     return ElementCount::getScalable(0);
4486   }
4487 
4488   // Disable scalable vectorization if the loop contains any instructions
4489   // with element types not supported for scalable vectors.
4490   if (any_of(ElementTypesInLoop, [&](Type *Ty) {
4491         return !Ty->isVoidTy() &&
4492                !this->TTI.isElementTypeLegalForScalableVector(Ty);
4493       })) {
4494     reportVectorizationInfo("Scalable vectorization is not supported "
4495                             "for all element types found in this loop.",
4496                             "ScalableVFUnfeasible", ORE, TheLoop);
4497     return ElementCount::getScalable(0);
4498   }
4499 
4500   if (Legal->isSafeForAnyVectorWidth())
4501     return MaxScalableVF;
4502 
4503   // Limit MaxScalableVF by the maximum safe dependence distance.
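  // For example (illustrative values), with MaxSafeElements = 8 and a known
  // maximum vscale of 4, the result is clamped to "vscale x 2".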
4504   if (std::optional<unsigned> MaxVScale = getMaxVScale(*TheFunction, TTI))
4505     MaxScalableVF = ElementCount::getScalable(MaxSafeElements / *MaxVScale);
4506   else
4507     MaxScalableVF = ElementCount::getScalable(0);
4508 
4509   if (!MaxScalableVF)
4510     reportVectorizationInfo(
4511         "Max legal vector width too small, scalable vectorization "
4512         "unfeasible.",
4513         "ScalableVFUnfeasible", ORE, TheLoop);
4514 
4515   return MaxScalableVF;
4516 }
4517 
4518 FixedScalableVFPair LoopVectorizationCostModel::computeFeasibleMaxVF(
4519     unsigned MaxTripCount, ElementCount UserVF, bool FoldTailByMasking) {
4520   MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI);
4521   unsigned SmallestType, WidestType;
4522   std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes();
4523 
4524   // Get the maximum safe dependence distance in bits computed by LAA.
4525   // It is computed by MaxVF * sizeOf(type) * 8, where type is taken from
4526   // the memory access that is most restrictive (involved in the smallest
4527   // dependence distance).
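  // For example (illustrative values), with a maximum safe width of 256 bits
  // and a widest type of 32 bits, MaxSafeElements evaluates to 8.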
4528   unsigned MaxSafeElements =
4529       llvm::bit_floor(Legal->getMaxSafeVectorWidthInBits() / WidestType);
4530 
4531   auto MaxSafeFixedVF = ElementCount::getFixed(MaxSafeElements);
4532   auto MaxSafeScalableVF = getMaxLegalScalableVF(MaxSafeElements);
4533 
4534   LLVM_DEBUG(dbgs() << "LV: The max safe fixed VF is: " << MaxSafeFixedVF
4535                     << ".\n");
4536   LLVM_DEBUG(dbgs() << "LV: The max safe scalable VF is: " << MaxSafeScalableVF
4537                     << ".\n");
4538 
4539   // First analyze the UserVF, fall back if the UserVF should be ignored.
4540   if (UserVF) {
4541     auto MaxSafeUserVF =
4542         UserVF.isScalable() ? MaxSafeScalableVF : MaxSafeFixedVF;
4543 
4544     if (ElementCount::isKnownLE(UserVF, MaxSafeUserVF)) {
4545       // If `VF=vscale x N` is safe, then so is `VF=N`
4546       if (UserVF.isScalable())
4547         return FixedScalableVFPair(
4548             ElementCount::getFixed(UserVF.getKnownMinValue()), UserVF);
4549       else
4550         return UserVF;
4551     }
4552 
4553     assert(ElementCount::isKnownGT(UserVF, MaxSafeUserVF));
4554 
4555     // Only clamp if the UserVF is not scalable. If the UserVF is scalable, it
4556     // is better to ignore the hint and let the compiler choose a suitable VF.
4557     if (!UserVF.isScalable()) {
4558       LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
4559                         << " is unsafe, clamping to max safe VF="
4560                         << MaxSafeFixedVF << ".\n");
4561       ORE->emit([&]() {
4562         return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
4563                                           TheLoop->getStartLoc(),
4564                                           TheLoop->getHeader())
4565                << "User-specified vectorization factor "
4566                << ore::NV("UserVectorizationFactor", UserVF)
4567                << " is unsafe, clamping to maximum safe vectorization factor "
4568                << ore::NV("VectorizationFactor", MaxSafeFixedVF);
4569       });
4570       return MaxSafeFixedVF;
4571     }
4572 
4573     if (!TTI.supportsScalableVectors() && !ForceTargetSupportsScalableVectors) {
4574       LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
4575                         << " is ignored because scalable vectors are not "
4576                            "available.\n");
4577       ORE->emit([&]() {
4578         return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
4579                                           TheLoop->getStartLoc(),
4580                                           TheLoop->getHeader())
4581                << "User-specified vectorization factor "
4582                << ore::NV("UserVectorizationFactor", UserVF)
4583                << " is ignored because the target does not support scalable "
4584                   "vectors. The compiler will pick a more suitable value.";
4585       });
4586     } else {
4587       LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
4588                         << " is unsafe. Ignoring scalable UserVF.\n");
4589       ORE->emit([&]() {
4590         return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
4591                                           TheLoop->getStartLoc(),
4592                                           TheLoop->getHeader())
4593                << "User-specified vectorization factor "
4594                << ore::NV("UserVectorizationFactor", UserVF)
4595                << " is unsafe. Ignoring the hint to let the compiler pick a "
4596                   "more suitable value.";
4597       });
4598     }
4599   }
4600 
4601   LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType
4602                     << " / " << WidestType << " bits.\n");
4603 
4604   FixedScalableVFPair Result(ElementCount::getFixed(1),
4605                              ElementCount::getScalable(0));
4606   if (auto MaxVF =
4607           getMaximizedVFForTarget(MaxTripCount, SmallestType, WidestType,
4608                                   MaxSafeFixedVF, FoldTailByMasking))
4609     Result.FixedVF = MaxVF;
4610 
4611   if (auto MaxVF =
4612           getMaximizedVFForTarget(MaxTripCount, SmallestType, WidestType,
4613                                   MaxSafeScalableVF, FoldTailByMasking))
4614     if (MaxVF.isScalable()) {
4615       Result.ScalableVF = MaxVF;
4616       LLVM_DEBUG(dbgs() << "LV: Found feasible scalable VF = " << MaxVF
4617                         << "\n");
4618     }
4619 
4620   return Result;
4621 }
4622 
4623 FixedScalableVFPair
4624 LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
4625   if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) {
4626     // TODO: It may be useful to do this, since it's still likely to be
4627     // dynamically uniform if the target can skip.
4628     reportVectorizationFailure(
4629         "Not inserting runtime ptr check for divergent target",
4630         "runtime pointer checks needed. Not enabled for divergent target",
4631         "CantVersionLoopWithDivergentTarget", ORE, TheLoop);
4632     return FixedScalableVFPair::getNone();
4633   }
4634 
4635   unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
4636   unsigned MaxTC = PSE.getSE()->getSmallConstantMaxTripCount(TheLoop);
4637   LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n');
4638   if (TC == 1) {
4639     reportVectorizationFailure("Single iteration (non) loop",
4640         "loop trip count is one, irrelevant for vectorization",
4641         "SingleIterationLoop", ORE, TheLoop);
4642     return FixedScalableVFPair::getNone();
4643   }
4644 
4645   switch (ScalarEpilogueStatus) {
4646   case CM_ScalarEpilogueAllowed:
4647     return computeFeasibleMaxVF(MaxTC, UserVF, false);
4648   case CM_ScalarEpilogueNotAllowedUsePredicate:
4649     [[fallthrough]];
4650   case CM_ScalarEpilogueNotNeededUsePredicate:
4651     LLVM_DEBUG(
4652         dbgs() << "LV: vector predicate hint/switch found.\n"
4653                << "LV: Not allowing scalar epilogue, creating predicated "
4654                << "vector loop.\n");
4655     break;
4656   case CM_ScalarEpilogueNotAllowedLowTripLoop:
4657     // fallthrough as a special case of OptForSize
4658   case CM_ScalarEpilogueNotAllowedOptSize:
4659     if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedOptSize)
4660       LLVM_DEBUG(
4661           dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n");
4662     else
4663       LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to low trip "
4664                         << "count.\n");
4665 
4666     // Bail if runtime checks are required, which are not good when optimising
4667     // for size.
4668     if (runtimeChecksRequired())
4669       return FixedScalableVFPair::getNone();
4670 
4671     break;
4672   }
4673 
4674   // The only loops we can vectorize without a scalar epilogue are loops with
4675   // a bottom-test and a single exiting block. We'd have to handle the fact
4676   // that not every instruction executes on the last iteration.  This will
4677   // require a lane mask which varies through the vector loop body.  (TODO)
4678   if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) {
4679     // If there was a tail-folding hint/switch, but we can't fold the tail by
4680     // masking, fallback to a vectorization with a scalar epilogue.
4681     if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) {
4682       LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a "
4683                            "scalar epilogue instead.\n");
4684       ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
4685       return computeFeasibleMaxVF(MaxTC, UserVF, false);
4686     }
4687     return FixedScalableVFPair::getNone();
4688   }
4689 
4690   // Now try the tail folding
4691 
4692   // Invalidate interleave groups that require an epilogue if we can't mask
4693   // the interleave-group.
4694   if (!useMaskedInterleavedAccesses(TTI)) {
4695     assert(WideningDecisions.empty() && Uniforms.empty() && Scalars.empty() &&
4696            "No decisions should have been taken at this point");
4697     // Note: There is no need to invalidate any cost modeling decisions here,
4698     // as none were taken so far.
4699     InterleaveInfo.invalidateGroupsRequiringScalarEpilogue();
4700   }
4701 
4702   FixedScalableVFPair MaxFactors = computeFeasibleMaxVF(MaxTC, UserVF, true);
4703 
4704   // Avoid tail folding if the trip count is known to be a multiple of any VF
4705   // we choose.
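       // For illustration (values assumed, not taken from any real target): with
       // a constant trip count of 64, a maximum power-of-2 runtime VF of 8 and a
       // user interleave count of 2, the remainder computed below is
       // 64 urem (8 * 2) == 0, so no scalar tail remains and tail folding is
       // unnecessary.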
4706   std::optional<unsigned> MaxPowerOf2RuntimeVF =
4707       MaxFactors.FixedVF.getFixedValue();
4708   if (MaxFactors.ScalableVF) {
4709     std::optional<unsigned> MaxVScale = getMaxVScale(*TheFunction, TTI);
4710     if (MaxVScale && TTI.isVScaleKnownToBeAPowerOfTwo()) {
4711       MaxPowerOf2RuntimeVF = std::max<unsigned>(
4712           *MaxPowerOf2RuntimeVF,
4713           *MaxVScale * MaxFactors.ScalableVF.getKnownMinValue());
4714     } else
4715       MaxPowerOf2RuntimeVF = std::nullopt; // Stick with tail-folding for now.
4716   }
4717 
4718   if (MaxPowerOf2RuntimeVF && *MaxPowerOf2RuntimeVF > 0) {
4719     assert((UserVF.isNonZero() || isPowerOf2_32(*MaxPowerOf2RuntimeVF)) &&
4720            "MaxFixedVF must be a power of 2");
4721     unsigned MaxVFtimesIC =
4722         UserIC ? *MaxPowerOf2RuntimeVF * UserIC : *MaxPowerOf2RuntimeVF;
4723     ScalarEvolution *SE = PSE.getSE();
4724     const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount();
4725     const SCEV *ExitCount = SE->getAddExpr(
4726         BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType()));
4727     const SCEV *Rem = SE->getURemExpr(
4728         SE->applyLoopGuards(ExitCount, TheLoop),
4729         SE->getConstant(BackedgeTakenCount->getType(), MaxVFtimesIC));
4730     if (Rem->isZero()) {
4731       // Accept MaxFixedVF if we do not have a tail.
4732       LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n");
4733       return MaxFactors;
4734     }
4735   }
4736 
4737   // If we don't know the precise trip count, or if the trip count that we
4738   // found modulo the vectorization factor is not zero, try to fold the tail
4739   // by masking.
4740   // FIXME: look for a smaller MaxVF that does divide TC rather than masking.
4741   if (Legal->prepareToFoldTailByMasking()) {
4742     CanFoldTailByMasking = true;
4743     return MaxFactors;
4744   }
4745 
4746   // If there was a tail-folding hint/switch, but we can't fold the tail by
4747   // masking, fallback to a vectorization with a scalar epilogue.
4748   if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) {
4749     LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a "
4750                          "scalar epilogue instead.\n");
4751     ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
4752     return MaxFactors;
4753   }
4754 
4755   if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedUsePredicate) {
4756     LLVM_DEBUG(dbgs() << "LV: Can't fold tail by masking: don't vectorize\n");
4757     return FixedScalableVFPair::getNone();
4758   }
4759 
4760   if (TC == 0) {
4761     reportVectorizationFailure(
4762         "Unable to calculate the loop count due to complex control flow",
4763         "unable to calculate the loop count due to complex control flow",
4764         "UnknownLoopCountComplexCFG", ORE, TheLoop);
4765     return FixedScalableVFPair::getNone();
4766   }
4767 
4768   reportVectorizationFailure(
4769       "Cannot optimize for size and vectorize at the same time.",
4770       "cannot optimize for size and vectorize at the same time. "
4771       "Enable vectorization of this loop with '#pragma clang loop "
4772       "vectorize(enable)' when compiling with -Os/-Oz",
4773       "NoTailLoopWithOptForSize", ORE, TheLoop);
4774   return FixedScalableVFPair::getNone();
4775 }
4776 
4777 ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget(
4778     unsigned MaxTripCount, unsigned SmallestType, unsigned WidestType,
4779     ElementCount MaxSafeVF, bool FoldTailByMasking) {
4780   bool ComputeScalableMaxVF = MaxSafeVF.isScalable();
4781   const TypeSize WidestRegister = TTI.getRegisterBitWidth(
4782       ComputeScalableMaxVF ? TargetTransformInfo::RGK_ScalableVector
4783                            : TargetTransformInfo::RGK_FixedWidthVector);
4784 
4785   // Convenience function to return the minimum of two ElementCounts.
4786   auto MinVF = [](const ElementCount &LHS, const ElementCount &RHS) {
4787     assert((LHS.isScalable() == RHS.isScalable()) &&
4788            "Scalable flags must match");
4789     return ElementCount::isKnownLT(LHS, RHS) ? LHS : RHS;
4790   };
4791 
4792   // Ensure MaxVF is a power of 2; the dependence distance bound may not be.
4793   // Note that both WidestRegister and WidestType may not be powers of 2.
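       // For illustration (assumed, target-independent numbers): with 128-bit
       // vector registers and a widest in-loop type of 32 bits, the computation
       // below gives bit_floor(128 / 32) = 4 lanes.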
4794   auto MaxVectorElementCount = ElementCount::get(
4795       llvm::bit_floor(WidestRegister.getKnownMinValue() / WidestType),
4796       ComputeScalableMaxVF);
4797   MaxVectorElementCount = MinVF(MaxVectorElementCount, MaxSafeVF);
4798   LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: "
4799                     << (MaxVectorElementCount * WidestType) << " bits.\n");
4800 
4801   if (!MaxVectorElementCount) {
4802     LLVM_DEBUG(dbgs() << "LV: The target has no "
4803                       << (ComputeScalableMaxVF ? "scalable" : "fixed")
4804                       << " vector registers.\n");
4805     return ElementCount::getFixed(1);
4806   }
4807 
4808   unsigned WidestRegisterMinEC = MaxVectorElementCount.getKnownMinValue();
4809   if (MaxVectorElementCount.isScalable() &&
4810       TheFunction->hasFnAttribute(Attribute::VScaleRange)) {
4811     auto Attr = TheFunction->getFnAttribute(Attribute::VScaleRange);
4812     auto Min = Attr.getVScaleRangeMin();
4813     WidestRegisterMinEC *= Min;
4814   }
4815 
4816   // When a scalar epilogue is required, at least one iteration of the scalar
4817   // loop has to execute. Adjust MaxTripCount accordingly to avoid picking a
4818   // max VF that results in a dead vector loop.
4819   if (MaxTripCount > 0 && requiresScalarEpilogue(true))
4820     MaxTripCount -= 1;
4821 
4822   if (MaxTripCount && MaxTripCount <= WidestRegisterMinEC &&
4823       (!FoldTailByMasking || isPowerOf2_32(MaxTripCount))) {
4824     // If upper bound loop trip count (TC) is known at compile time there is no
4825     // point in choosing VF greater than TC (as done in the loop below). Select
4826     // maximum power of two which doesn't exceed TC. If MaxVectorElementCount is
4827     // scalable, we only fall back on a fixed VF when the TC is less than or
4828     // equal to the known number of lanes.
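         // For illustration (assumed trip count): if MaxTripCount is 10, the
         // clamp below yields bit_floor(10) = 8, so no VF larger than 8 is kept.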
4829     auto ClampedUpperTripCount = llvm::bit_floor(MaxTripCount);
4830     LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to maximum power of two not "
4831                          "exceeding the constant trip count: "
4832                       << ClampedUpperTripCount << "\n");
4833     return ElementCount::get(
4834         ClampedUpperTripCount,
4835         FoldTailByMasking ? MaxVectorElementCount.isScalable() : false);
4836   }
4837 
4838   TargetTransformInfo::RegisterKind RegKind =
4839       ComputeScalableMaxVF ? TargetTransformInfo::RGK_ScalableVector
4840                            : TargetTransformInfo::RGK_FixedWidthVector;
4841   ElementCount MaxVF = MaxVectorElementCount;
4842   if (MaximizeBandwidth ||
4843       (MaximizeBandwidth.getNumOccurrences() == 0 &&
4844        (TTI.shouldMaximizeVectorBandwidth(RegKind) ||
4845         (UseWiderVFIfCallVariantsPresent && Legal->hasVectorCallVariants())))) {
4846     auto MaxVectorElementCountMaxBW = ElementCount::get(
4847         llvm::bit_floor(WidestRegister.getKnownMinValue() / SmallestType),
4848         ComputeScalableMaxVF);
4849     MaxVectorElementCountMaxBW = MinVF(MaxVectorElementCountMaxBW, MaxSafeVF);
4850 
4851     // Collect all viable vectorization factors larger than the default MaxVF
4852     // (i.e. MaxVectorElementCount).
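         // For illustration (assumed widths): if MaxVectorElementCount is 4 and
         // MaxVectorElementCountMaxBW is 16, the loop below collects the
         // candidate VFs 8 and 16.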
4853     SmallVector<ElementCount, 8> VFs;
4854     for (ElementCount VS = MaxVectorElementCount * 2;
4855          ElementCount::isKnownLE(VS, MaxVectorElementCountMaxBW); VS *= 2)
4856       VFs.push_back(VS);
4857 
4858     // For each VF calculate its register usage.
4859     auto RUs = calculateRegisterUsage(VFs);
4860 
4861     // Select the largest VF which doesn't require more registers than existing
4862     // ones.
4863     for (int i = RUs.size() - 1; i >= 0; --i) {
4864       bool Selected = true;
4865       for (auto &pair : RUs[i].MaxLocalUsers) {
4866         unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first);
4867         if (pair.second > TargetNumRegisters)
4868           Selected = false;
4869       }
4870       if (Selected) {
4871         MaxVF = VFs[i];
4872         break;
4873       }
4874     }
4875     if (ElementCount MinVF =
4876             TTI.getMinimumVF(SmallestType, ComputeScalableMaxVF)) {
4877       if (ElementCount::isKnownLT(MaxVF, MinVF)) {
4878         LLVM_DEBUG(dbgs() << "LV: Overriding calculated MaxVF(" << MaxVF
4879                           << ") with target's minimum: " << MinVF << '\n');
4880         MaxVF = MinVF;
4881       }
4882     }
4883 
4884     // Invalidate any widening decisions we might have made, in case the loop
4885     // requires prediction (decided later), but we have already made some
4886     // load/store widening decisions.
4887     invalidateCostModelingDecisions();
4888   }
4889   return MaxVF;
4890 }
4891 
4892 /// Convenience function that returns the value of vscale_range iff
4893 /// vscale_range.min == vscale_range.max, and otherwise returns the value
4894 /// returned by the corresponding TTI method.
4895 static std::optional<unsigned>
4896 getVScaleForTuning(const Loop *L, const TargetTransformInfo &TTI) {
4897   const Function *Fn = L->getHeader()->getParent();
4898   if (Fn->hasFnAttribute(Attribute::VScaleRange)) {
4899     auto Attr = Fn->getFnAttribute(Attribute::VScaleRange);
4900     auto Min = Attr.getVScaleRangeMin();
4901     auto Max = Attr.getVScaleRangeMax();
4902     if (Max && Min == Max)
4903       return Max;
4904   }
4905 
4906   return TTI.getVScaleForTuning();
4907 }
4908 
4909 bool LoopVectorizationPlanner::isMoreProfitable(
4910     const VectorizationFactor &A, const VectorizationFactor &B) const {
4911   InstructionCost CostA = A.Cost;
4912   InstructionCost CostB = B.Cost;
4913 
4914   unsigned MaxTripCount = PSE.getSE()->getSmallConstantMaxTripCount(OrigLoop);
4915 
4916   if (!A.Width.isScalable() && !B.Width.isScalable() && MaxTripCount) {
4917     // If the trip count is a known (possibly small) constant, the trip count
4918     // will be rounded up to an integer number of iterations under
4919     // FoldTailByMasking. The total cost in that case will be
4920     // VecCost*ceil(TripCount/VF). When not folding the tail, the total
4921     // cost will be VecCost*floor(TC/VF) + ScalarCost*(TC%VF). There will be
4922     // some extra overheads, but for the purpose of comparing the costs of
4923     // different VFs we can use this to compare the total loop-body cost
4924     // expected after vectorization.
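         // Worked example with assumed, illustrative costs: for MaxTripCount = 10,
         // VF = 4, VectorCost = 8 and ScalarCost = 3, folding the tail costs
         // 8 * ceil(10 / 4) = 24, while using a scalar epilogue costs
         // 8 * floor(10 / 4) + 3 * (10 % 4) = 16 + 6 = 22.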
4925     auto GetCostForTC = [MaxTripCount, this](unsigned VF,
4926                                              InstructionCost VectorCost,
4927                                              InstructionCost ScalarCost) {
4928       return CM.foldTailByMasking() ? VectorCost * divideCeil(MaxTripCount, VF)
4929                                     : VectorCost * (MaxTripCount / VF) +
4930                                           ScalarCost * (MaxTripCount % VF);
4931     };
4932     auto RTCostA = GetCostForTC(A.Width.getFixedValue(), CostA, A.ScalarCost);
4933     auto RTCostB = GetCostForTC(B.Width.getFixedValue(), CostB, B.ScalarCost);
4934 
4935     return RTCostA < RTCostB;
4936   }
4937 
4938   // Improve estimate for the vector width if it is scalable.
4939   unsigned EstimatedWidthA = A.Width.getKnownMinValue();
4940   unsigned EstimatedWidthB = B.Width.getKnownMinValue();
4941   if (std::optional<unsigned> VScale = getVScaleForTuning(OrigLoop, TTI)) {
4942     if (A.Width.isScalable())
4943       EstimatedWidthA *= *VScale;
4944     if (B.Width.isScalable())
4945       EstimatedWidthB *= *VScale;
4946   }
4947 
4948   // Assume vscale may be larger than 1 (or the value being tuned for),
4949   // so that scalable vectorization is slightly favorable over fixed-width
4950   // vectorization.
4951   if (A.Width.isScalable() && !B.Width.isScalable())
4952     return (CostA * B.Width.getFixedValue()) <= (CostB * EstimatedWidthA);
4953 
4954   // To avoid the need for FP division:
4955   //      (CostA / A.Width) < (CostB / B.Width)
4956   // <=>  (CostA * B.Width) < (CostB * A.Width)
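       // For illustration (assumed values): if CostA = 8 with EstimatedWidthA = 8
       // and CostB = 6 with EstimatedWidthB = 4, then 8 * 4 = 32 < 6 * 8 = 48, so
       // A (1 unit per lane) is preferred over B (1.5 units per lane).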
4957   return (CostA * EstimatedWidthB) < (CostB * EstimatedWidthA);
4958 }
4959 
4960 static void emitInvalidCostRemarks(SmallVector<InstructionVFPair> InvalidCosts,
4961                                    OptimizationRemarkEmitter *ORE,
4962                                    Loop *TheLoop) {
4963   if (InvalidCosts.empty())
4964     return;
4965 
4966   // Emit a report of VFs with invalid costs in the loop.
4967 
4968   // Group the remarks per instruction, keeping the instruction order from
4969   // InvalidCosts.
4970   std::map<Instruction *, unsigned> Numbering;
4971   unsigned I = 0;
4972   for (auto &Pair : InvalidCosts)
4973     if (!Numbering.count(Pair.first))
4974       Numbering[Pair.first] = I++;
4975 
4976   // Sort the list, first on instruction(number) then on VF.
4977   sort(InvalidCosts, [&Numbering](InstructionVFPair &A, InstructionVFPair &B) {
4978     if (Numbering[A.first] != Numbering[B.first])
4979       return Numbering[A.first] < Numbering[B.first];
4980     ElementCountComparator ECC;
4981     return ECC(A.second, B.second);
4982   });
4983 
4984   // For a list of ordered instruction-vf pairs:
4985   //   [(load, vf1), (load, vf2), (store, vf1)]
4986   // Group the instructions together to emit separate remarks for:
4987   //   load  (vf1, vf2)
4988   //   store (vf1)
4989   auto Tail = ArrayRef<InstructionVFPair>(InvalidCosts);
4990   auto Subset = ArrayRef<InstructionVFPair>();
4991   do {
4992     if (Subset.empty())
4993       Subset = Tail.take_front(1);
4994 
4995     Instruction *I = Subset.front().first;
4996 
4997     // If the next instruction is different, or if there are no other pairs,
4998     // emit a remark for the collated subset. e.g.
4999     //   [(load, vf1), (load, vf2)]
5000     // to emit:
5001     //  remark: invalid costs for 'load' at VF=(vf1, vf2)
5002     if (Subset == Tail || Tail[Subset.size()].first != I) {
5003       std::string OutString;
5004       raw_string_ostream OS(OutString);
5005       assert(!Subset.empty() && "Unexpected empty range");
5006       OS << "Instruction with invalid costs prevented vectorization at VF=(";
5007       for (const auto &Pair : Subset)
5008         OS << (Pair.second == Subset.front().second ? "" : ", ") << Pair.second;
5009       OS << "):";
5010       if (auto *CI = dyn_cast<CallInst>(I))
5011         OS << " call to " << CI->getCalledFunction()->getName();
5012       else
5013         OS << " " << I->getOpcodeName();
5014       OS.flush();
5015       reportVectorizationInfo(OutString, "InvalidCost", ORE, TheLoop, I);
5016       Tail = Tail.drop_front(Subset.size());
5017       Subset = {};
5018     } else
5019       // Grow the subset by one element
5020       Subset = Tail.take_front(Subset.size() + 1);
5021   } while (!Tail.empty());
5022 }
5023 
5024 VectorizationFactor LoopVectorizationPlanner::selectVectorizationFactor(
5025     const ElementCountSet &VFCandidates) {
5026   InstructionCost ExpectedCost =
5027       CM.expectedCost(ElementCount::getFixed(1)).first;
5028   LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << ExpectedCost << ".\n");
5029   assert(ExpectedCost.isValid() && "Unexpected invalid cost for scalar loop");
5030   assert(VFCandidates.count(ElementCount::getFixed(1)) &&
5031          "Expected Scalar VF to be a candidate");
5032 
5033   const VectorizationFactor ScalarCost(ElementCount::getFixed(1), ExpectedCost,
5034                                        ExpectedCost);
5035   VectorizationFactor ChosenFactor = ScalarCost;
5036 
5037   bool ForceVectorization = Hints.getForce() == LoopVectorizeHints::FK_Enabled;
5038   if (ForceVectorization && VFCandidates.size() > 1) {
5039     // Ignore scalar width, because the user explicitly wants vectorization.
5040     // Initialize cost to max so that VF = 2 is, at least, chosen during cost
5041     // evaluation.
5042     ChosenFactor.Cost = InstructionCost::getMax();
5043   }
5044 
5045   SmallVector<InstructionVFPair> InvalidCosts;
5046   for (const auto &i : VFCandidates) {
5047     // The cost for scalar VF=1 is already calculated, so ignore it.
5048     if (i.isScalar())
5049       continue;
5050 
5051     LoopVectorizationCostModel::VectorizationCostTy C =
5052         CM.expectedCost(i, &InvalidCosts);
5053     VectorizationFactor Candidate(i, C.first, ScalarCost.ScalarCost);
5054 
5055 #ifndef NDEBUG
5056     unsigned AssumedMinimumVscale =
5057         getVScaleForTuning(OrigLoop, TTI).value_or(1);
5058     unsigned Width =
5059         Candidate.Width.isScalable()
5060             ? Candidate.Width.getKnownMinValue() * AssumedMinimumVscale
5061             : Candidate.Width.getFixedValue();
5062     LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << i
5063                       << " costs: " << (Candidate.Cost / Width));
5064     if (i.isScalable())
5065       LLVM_DEBUG(dbgs() << " (assuming a minimum vscale of "
5066                         << AssumedMinimumVscale << ")");
5067     LLVM_DEBUG(dbgs() << ".\n");
5068 #endif
5069 
5070     if (!C.second && !ForceVectorization) {
5071       LLVM_DEBUG(
5072           dbgs() << "LV: Not considering vector loop of width " << i
5073                  << " because it will not generate any vector instructions.\n");
5074       continue;
5075     }
5076 
5077     // If profitable, add it to the ProfitableVFs list.
5078     if (isMoreProfitable(Candidate, ScalarCost))
5079       ProfitableVFs.push_back(Candidate);
5080 
5081     if (isMoreProfitable(Candidate, ChosenFactor))
5082       ChosenFactor = Candidate;
5083   }
5084 
5085   emitInvalidCostRemarks(InvalidCosts, ORE, OrigLoop);
5086 
5087   if (!EnableCondStoresVectorization && CM.hasPredStores()) {
5088     reportVectorizationFailure(
5089         "There are conditional stores.",
5090         "store that is conditionally executed prevents vectorization",
5091         "ConditionalStore", ORE, OrigLoop);
5092     ChosenFactor = ScalarCost;
5093   }
5094 
5095   LLVM_DEBUG(if (ForceVectorization && !ChosenFactor.Width.isScalar() &&
5096                  !isMoreProfitable(ChosenFactor, ScalarCost)) dbgs()
5097              << "LV: Vectorization seems to be not beneficial, "
5098              << "but was forced by a user.\n");
5099   LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << ChosenFactor.Width << ".\n");
5100   return ChosenFactor;
5101 }
5102 
5103 bool LoopVectorizationPlanner::isCandidateForEpilogueVectorization(
5104     ElementCount VF) const {
5105   // Cross iteration phis such as reductions need special handling and are
5106   // currently unsupported.
5107   if (any_of(OrigLoop->getHeader()->phis(),
5108              [&](PHINode &Phi) { return Legal->isFixedOrderRecurrence(&Phi); }))
5109     return false;
5110 
5111   // Phis with uses outside of the loop require special handling and are
5112   // currently unsupported.
5113   for (const auto &Entry : Legal->getInductionVars()) {
5114     // Look for uses of the value of the induction at the last iteration.
5115     Value *PostInc =
5116         Entry.first->getIncomingValueForBlock(OrigLoop->getLoopLatch());
5117     for (User *U : PostInc->users())
5118       if (!OrigLoop->contains(cast<Instruction>(U)))
5119         return false;
5120     // Look for uses of penultimate value of the induction.
5121     for (User *U : Entry.first->users())
5122       if (!OrigLoop->contains(cast<Instruction>(U)))
5123         return false;
5124   }
5125 
5126   // Epilogue vectorization code has not been audited to ensure it handles
5127   // non-latch exits properly. It may be fine, but it needs to be audited and
5128   // tested.
5129   if (OrigLoop->getExitingBlock() != OrigLoop->getLoopLatch())
5130     return false;
5131 
5132   return true;
5133 }
5134 
5135 bool LoopVectorizationCostModel::isEpilogueVectorizationProfitable(
5136     const ElementCount VF) const {
5137   // FIXME: We need a much better cost-model to take different parameters such
5138   // as register pressure, code size increase and cost of extra branches into
5139   // account. For now we apply a very crude heuristic and only consider loops
5140   // with vectorization factors larger than a certain value.
5141 
5142   // Allow the target to opt out entirely.
5143   if (!TTI.preferEpilogueVectorization())
5144     return false;
5145 
5146   // We also consider epilogue vectorization unprofitable for targets that
5147   // don't consider interleaving beneficial (e.g., MVE).
5148   if (TTI.getMaxInterleaveFactor(VF) <= 1)
5149     return false;
5150 
5151   unsigned Multiplier = 1;
5152   if (VF.isScalable())
5153     Multiplier = getVScaleForTuning(TheLoop, TTI).value_or(1);
5154   if ((Multiplier * VF.getKnownMinValue()) >= EpilogueVectorizationMinVF)
5155     return true;
5156   return false;
5157 }
5158 
5159 VectorizationFactor LoopVectorizationPlanner::selectEpilogueVectorizationFactor(
5160     const ElementCount MainLoopVF, unsigned IC) {
5161   VectorizationFactor Result = VectorizationFactor::Disabled();
5162   if (!EnableEpilogueVectorization) {
5163     LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is disabled.\n");
5164     return Result;
5165   }
5166 
5167   if (!CM.isScalarEpilogueAllowed()) {
5168     LLVM_DEBUG(dbgs() << "LEV: Unable to vectorize epilogue because no "
5169                          "epilogue is allowed.\n");
5170     return Result;
5171   }
5172 
5173   // Not really a cost consideration, but check for unsupported cases here to
5174   // simplify the logic.
5175   if (!isCandidateForEpilogueVectorization(MainLoopVF)) {
5176     LLVM_DEBUG(dbgs() << "LEV: Unable to vectorize epilogue because the loop "
5177                          "is not a supported candidate.\n");
5178     return Result;
5179   }
5180 
5181   if (EpilogueVectorizationForceVF > 1) {
5182     LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization factor is forced.\n");
5183     ElementCount ForcedEC = ElementCount::getFixed(EpilogueVectorizationForceVF);
5184     if (hasPlanWithVF(ForcedEC))
5185       return {ForcedEC, 0, 0};
5186     else {
5187       LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization forced factor is not "
5188                            "viable.\n");
5189       return Result;
5190     }
5191   }
5192 
5193   if (OrigLoop->getHeader()->getParent()->hasOptSize() ||
5194       OrigLoop->getHeader()->getParent()->hasMinSize()) {
5195     LLVM_DEBUG(
5196         dbgs() << "LEV: Epilogue vectorization skipped due to opt for size.\n");
5197     return Result;
5198   }
5199 
5200   if (!CM.isEpilogueVectorizationProfitable(MainLoopVF)) {
5201     LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is not profitable for "
5202                          "this loop\n");
5203     return Result;
5204   }
5205 
5206   // If MainLoopVF = vscale x 2, and vscale is expected to be 4, then we know
5207   // the main loop handles 8 lanes per iteration. We could still benefit from
5208   // vectorizing the epilogue loop with VF=4.
5209   ElementCount EstimatedRuntimeVF = MainLoopVF;
5210   if (MainLoopVF.isScalable()) {
5211     EstimatedRuntimeVF = ElementCount::getFixed(MainLoopVF.getKnownMinValue());
5212     if (std::optional<unsigned> VScale = getVScaleForTuning(OrigLoop, TTI))
5213       EstimatedRuntimeVF *= *VScale;
5214   }
5215 
5216   ScalarEvolution &SE = *PSE.getSE();
5217   Type *TCType = Legal->getWidestInductionType();
5218   const SCEV *RemainingIterations = nullptr;
5219   for (auto &NextVF : ProfitableVFs) {
5220     // Skip candidate VFs without a corresponding VPlan.
5221     if (!hasPlanWithVF(NextVF.Width))
5222       continue;
5223 
5224     // Skip candidate VFs with widths >= the estimated runtime VF (scalable
5225     // vectors) or the VF of the main loop (fixed vectors).
5226     if ((!NextVF.Width.isScalable() && MainLoopVF.isScalable() &&
5227          ElementCount::isKnownGE(NextVF.Width, EstimatedRuntimeVF)) ||
5228         ElementCount::isKnownGE(NextVF.Width, MainLoopVF))
5229       continue;
5230 
5231     // If NextVF is greater than the number of remaining iterations, the
5232     // epilogue loop would be dead. Skip such factors.
5233     if (!MainLoopVF.isScalable() && !NextVF.Width.isScalable()) {
5234       // TODO: extend to support scalable VFs.
5235       if (!RemainingIterations) {
5236         const SCEV *TC = createTripCountSCEV(TCType, PSE, OrigLoop);
5237         RemainingIterations = SE.getURemExpr(
5238             TC, SE.getConstant(TCType, MainLoopVF.getKnownMinValue() * IC));
5239       }
5240       if (SE.isKnownPredicate(
5241               CmpInst::ICMP_UGT,
5242               SE.getConstant(TCType, NextVF.Width.getKnownMinValue()),
5243               RemainingIterations))
5244         continue;
5245     }
5246 
5247     if (Result.Width.isScalar() || isMoreProfitable(NextVF, Result))
5248       Result = NextVF;
5249   }
5250 
5251   if (Result != VectorizationFactor::Disabled())
5252     LLVM_DEBUG(dbgs() << "LEV: Vectorizing epilogue loop with VF = "
5253                       << Result.Width << "\n");
5254   return Result;
5255 }
5256 
5257 std::pair<unsigned, unsigned>
5258 LoopVectorizationCostModel::getSmallestAndWidestTypes() {
5259   unsigned MinWidth = -1U;
5260   unsigned MaxWidth = 8;
5261   const DataLayout &DL = TheFunction->getParent()->getDataLayout();
5262   // For in-loop reductions, no element types are added to ElementTypesInLoop
5263   // if there are no loads/stores in the loop. In this case, check through the
5264   // reduction variables to determine the maximum width.
5265   if (ElementTypesInLoop.empty() && !Legal->getReductionVars().empty()) {
5266     // Reset MaxWidth so that we can find the smallest type used by recurrences
5267     // in the loop.
5268     MaxWidth = -1U;
5269     for (const auto &PhiDescriptorPair : Legal->getReductionVars()) {
5270       const RecurrenceDescriptor &RdxDesc = PhiDescriptorPair.second;
5271       // When finding the min width used by the recurrence we need to account
5272       // for casts on the input operands of the recurrence.
5273       MaxWidth = std::min<unsigned>(
5274           MaxWidth, std::min<unsigned>(
5275                         RdxDesc.getMinWidthCastToRecurrenceTypeInBits(),
5276                         RdxDesc.getRecurrenceType()->getScalarSizeInBits()));
5277     }
5278   } else {
5279     for (Type *T : ElementTypesInLoop) {
5280       MinWidth = std::min<unsigned>(
5281           MinWidth, DL.getTypeSizeInBits(T->getScalarType()).getFixedValue());
5282       MaxWidth = std::max<unsigned>(
5283           MaxWidth, DL.getTypeSizeInBits(T->getScalarType()).getFixedValue());
5284     }
5285   }
5286   return {MinWidth, MaxWidth};
5287 }
5288 
5289 void LoopVectorizationCostModel::collectElementTypesForWidening() {
5290   ElementTypesInLoop.clear();
5291   // For each block.
5292   for (BasicBlock *BB : TheLoop->blocks()) {
5293     // For each instruction in the loop.
5294     for (Instruction &I : BB->instructionsWithoutDebug()) {
5295       Type *T = I.getType();
5296 
5297       // Skip ignored values.
5298       if (ValuesToIgnore.count(&I))
5299         continue;
5300 
5301       // Only examine Loads, Stores and PHINodes.
5302       if (!isa<LoadInst>(I) && !isa<StoreInst>(I) && !isa<PHINode>(I))
5303         continue;
5304 
5305       // Examine PHI nodes that are reduction variables. Update the type to
5306       // account for the recurrence type.
5307       if (auto *PN = dyn_cast<PHINode>(&I)) {
5308         if (!Legal->isReductionVariable(PN))
5309           continue;
5310         const RecurrenceDescriptor &RdxDesc =
5311             Legal->getReductionVars().find(PN)->second;
5312         if (PreferInLoopReductions || useOrderedReductions(RdxDesc) ||
5313             TTI.preferInLoopReduction(RdxDesc.getOpcode(),
5314                                       RdxDesc.getRecurrenceType(),
5315                                       TargetTransformInfo::ReductionFlags()))
5316           continue;
5317         T = RdxDesc.getRecurrenceType();
5318       }
5319 
5320       // Examine the stored values.
5321       if (auto *ST = dyn_cast<StoreInst>(&I))
5322         T = ST->getValueOperand()->getType();
5323 
5324       assert(T->isSized() &&
5325              "Expected the load/store/recurrence type to be sized");
5326 
5327       ElementTypesInLoop.insert(T);
5328     }
5329   }
5330 }
5331 
5332 unsigned
5333 LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF,
5334                                                   InstructionCost LoopCost) {
5335   // -- The interleave heuristics --
5336   // We interleave the loop in order to expose ILP and reduce the loop overhead.
5337   // There are many micro-architectural considerations that we can't predict
5338   // at this level. For example, frontend pressure (on decode or fetch) due to
5339   // code size, or the number and capabilities of the execution ports.
5340   //
5341   // We use the following heuristics to select the interleave count:
5342   // 1. If the code has reductions, then we interleave to break the cross
5343   // iteration dependency.
5344   // 2. If the loop is really small, then we interleave to reduce the loop
5345   // overhead.
5346   // 3. We don't interleave if we think that we will spill registers to memory
5347   // due to the increased register pressure.
5348 
5349   if (!isScalarEpilogueAllowed())
5350     return 1;
5351 
5352   // The dependence distance already limits VF; interleaving could violate it.
5353   if (!Legal->isSafeForAnyVectorWidth())
5354     return 1;
5355 
5356   auto BestKnownTC = getSmallBestKnownTC(*PSE.getSE(), TheLoop);
5357   const bool HasReductions = !Legal->getReductionVars().empty();
5358   // Do not interleave loops with a relatively small known or estimated trip
5359   // count. But we will interleave when InterleaveSmallLoopScalarReduction is
5360   // enabled, and the code has scalar reductions (HasReductions && VF == 1),
5361   // because with the above conditions interleaving can expose ILP and break
5362   // cross iteration dependences for reductions.
5363   if (BestKnownTC && (*BestKnownTC < TinyTripCountInterleaveThreshold) &&
5364       !(InterleaveSmallLoopScalarReduction && HasReductions && VF.isScalar()))
5365     return 1;
5366 
5367   // If we did not calculate the cost for VF (because the user selected the VF)
5368   // then we calculate the cost of VF here.
5369   if (LoopCost == 0) {
5370     LoopCost = expectedCost(VF).first;
5371     assert(LoopCost.isValid() && "Expected to have chosen a VF with valid cost");
5372 
5373     // Loop body is free and there is no need for interleaving.
5374     if (LoopCost == 0)
5375       return 1;
5376   }
5377 
5378   RegisterUsage R = calculateRegisterUsage({VF})[0];
5379   // We divide by these constants so assume that we have at least one
5380   // instruction that uses at least one register.
5381   for (auto& pair : R.MaxLocalUsers) {
5382     pair.second = std::max(pair.second, 1U);
5383   }
5384 
5385   // We calculate the interleave count using the following formula.
5386   // Subtract the number of loop invariants from the number of available
5387   // registers. These registers are used by all of the interleaved instances.
5388   // Next, divide the remaining registers by the number of registers that is
5389   // required by the loop, in order to estimate how many parallel instances
5390   // fit without causing spills. All of this is rounded down if necessary to be
5391   // a power of two. We want power of two interleave count to simplify any
5392   // addressing operations or alignment considerations.
5393   // We also want power of two interleave counts to ensure that the induction
5394   // variable of the vector loop wraps to zero, when tail is folded by masking;
5395   // this currently happens when OptForSize, in which case IC is set to 1 above.
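       // For illustration (assumed register counts): with 32 registers in a
       // class, 4 of them used by loop invariants and a maximum of 7 values live
       // at once, the count would be bit_floor((32 - 4) / 7) = bit_floor(4) = 4.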
5396   unsigned IC = UINT_MAX;
5397 
5398   for (auto& pair : R.MaxLocalUsers) {
5399     unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first);
5400     LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters
5401                       << " registers of "
5402                       << TTI.getRegisterClassName(pair.first) << " register class\n");
5403     if (VF.isScalar()) {
5404       if (ForceTargetNumScalarRegs.getNumOccurrences() > 0)
5405         TargetNumRegisters = ForceTargetNumScalarRegs;
5406     } else {
5407       if (ForceTargetNumVectorRegs.getNumOccurrences() > 0)
5408         TargetNumRegisters = ForceTargetNumVectorRegs;
5409     }
5410     unsigned MaxLocalUsers = pair.second;
5411     unsigned LoopInvariantRegs = 0;
5412     if (R.LoopInvariantRegs.find(pair.first) != R.LoopInvariantRegs.end())
5413       LoopInvariantRegs = R.LoopInvariantRegs[pair.first];
5414 
5415     unsigned TmpIC = llvm::bit_floor((TargetNumRegisters - LoopInvariantRegs) /
5416                                      MaxLocalUsers);
5417     // Don't count the induction variable as interleaved.
5418     if (EnableIndVarRegisterHeur) {
5419       TmpIC = llvm::bit_floor((TargetNumRegisters - LoopInvariantRegs - 1) /
5420                               std::max(1U, (MaxLocalUsers - 1)));
5421     }
5422 
5423     IC = std::min(IC, TmpIC);
5424   }
5425 
5426   // Clamp the interleave ranges to reasonable counts.
5427   unsigned MaxInterleaveCount = TTI.getMaxInterleaveFactor(VF);
5428 
5429   // Check if the user has overridden the max.
5430   if (VF.isScalar()) {
5431     if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0)
5432       MaxInterleaveCount = ForceTargetMaxScalarInterleaveFactor;
5433   } else {
5434     if (ForceTargetMaxVectorInterleaveFactor.getNumOccurrences() > 0)
5435       MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor;
5436   }
5437 
5438   unsigned EstimatedVF = VF.getKnownMinValue();
5439   if (VF.isScalable()) {
5440     if (std::optional<unsigned> VScale = getVScaleForTuning(TheLoop, TTI))
5441       EstimatedVF *= *VScale;
5442   }
5443   assert(EstimatedVF >= 1 && "Estimated VF shouldn't be less than 1");
5444 
5445   unsigned KnownTC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
5446   if (KnownTC) {
5447     // If trip count is known we select between two prospective ICs, where
5448     // 1) the aggressive IC is capped by the trip count divided by VF
5449     // 2) the conservative IC is capped by the trip count divided by (VF * 2)
5450     // The final IC is selected in a way that the epilogue loop trip count is
5451     // minimized while maximizing the IC itself, so that we either run the
5452     // vector loop at least once if it generates a small epilogue loop, or else
5453     // we run the vector loop at least twice.
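         // For illustration (assumed values): with KnownTC = 16, EstimatedVF = 4
         // and MaxInterleaveCount = 8, InterleaveCountUB = 4 and
         // InterleaveCountLB = 2. Both leave no scalar tail (16 % 16 == 0 and
         // 16 % 8 == 0), so the larger value 4 is kept and the vector loop runs
         // exactly once.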
5454 
5455     unsigned InterleaveCountUB = bit_floor(
5456         std::max(1u, std::min(KnownTC / EstimatedVF, MaxInterleaveCount)));
5457     unsigned InterleaveCountLB = bit_floor(std::max(
5458         1u, std::min(KnownTC / (EstimatedVF * 2), MaxInterleaveCount)));
5459     MaxInterleaveCount = InterleaveCountLB;
5460 
5461     if (InterleaveCountUB != InterleaveCountLB) {
5462       unsigned TailTripCountUB = (KnownTC % (EstimatedVF * InterleaveCountUB));
5463       unsigned TailTripCountLB = (KnownTC % (EstimatedVF * InterleaveCountLB));
5464       // If both produce same scalar tail, maximize the IC to do the same work
5465       // in fewer vector loop iterations
5466       if (TailTripCountUB == TailTripCountLB)
5467         MaxInterleaveCount = InterleaveCountUB;
5468     }
5469   } else if (BestKnownTC) {
5470     // If trip count is an estimated compile time constant, limit the
5471     // IC to be capped by the trip count divided by VF * 2, such that the vector
5472     // loop runs at least twice to make interleaving seem profitable when there
5473     // is an epilogue loop present. Since exact Trip count is not known we
5474     // choose to be conservative in our IC estimate.
5475     MaxInterleaveCount = bit_floor(std::max(
5476         1u, std::min(*BestKnownTC / (EstimatedVF * 2), MaxInterleaveCount)));
5477   }
5478 
5479   assert(MaxInterleaveCount > 0 &&
5480          "Maximum interleave count must be greater than 0");
5481 
5482   // Clamp the calculated IC to be between the 1 and the max interleave count
5483   // that the target and trip count allows.
5484   if (IC > MaxInterleaveCount)
5485     IC = MaxInterleaveCount;
5486   else
5487     // Make sure IC is greater than 0.
5488     IC = std::max(1u, IC);
5489 
5490   assert(IC > 0 && "Interleave count must be greater than 0.");
5491 
5492   // Interleave if we vectorized this loop and there is a reduction that could
5493   // benefit from interleaving.
5494   if (VF.isVector() && HasReductions) {
5495     LLVM_DEBUG(dbgs() << "LV: Interleaving because of reductions.\n");
5496     return IC;
5497   }
5498 
5499   // For any scalar loop that either requires runtime checks or predication we
5500   // are better off leaving this to the unroller. Note that if we've already
5501   // vectorized the loop we will have done the runtime check and so interleaving
5502   // won't require further checks.
5503   bool ScalarInterleavingRequiresPredication =
5504       (VF.isScalar() && any_of(TheLoop->blocks(), [this](BasicBlock *BB) {
5505          return Legal->blockNeedsPredication(BB);
5506        }));
5507   bool ScalarInterleavingRequiresRuntimePointerCheck =
5508       (VF.isScalar() && Legal->getRuntimePointerChecking()->Need);
5509 
5510   // We want to interleave small loops in order to reduce the loop overhead and
5511   // potentially expose ILP opportunities.
5512   LLVM_DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n'
5513                     << "LV: IC is " << IC << '\n'
5514                     << "LV: VF is " << VF << '\n');
5515   const bool AggressivelyInterleaveReductions =
5516       TTI.enableAggressiveInterleaving(HasReductions);
5517   if (!ScalarInterleavingRequiresRuntimePointerCheck &&
5518       !ScalarInterleavingRequiresPredication && LoopCost < SmallLoopCost) {
5519     // We assume that the cost overhead is 1 and we use the cost model
5520     // to estimate the cost of the loop and interleave until the cost of the
5521     // loop overhead is about 5% of the cost of the loop.
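         // For illustration (assumed costs): if SmallLoopCost were 20 and the
         // loop body cost were 6, this would give
         // min(IC, bit_floor(20 / 6)) = min(IC, 2).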
5522     unsigned SmallIC = std::min(IC, (unsigned)llvm::bit_floor<uint64_t>(
5523                                         SmallLoopCost / *LoopCost.getValue()));
5524 
5525     // Interleave until store/load ports (estimated by max interleave count) are
5526     // saturated.
5527     unsigned NumStores = Legal->getNumStores();
5528     unsigned NumLoads = Legal->getNumLoads();
5529     unsigned StoresIC = IC / (NumStores ? NumStores : 1);
5530     unsigned LoadsIC = IC / (NumLoads ? NumLoads : 1);
5531 
5532     // There is little point in interleaving for reductions containing selects
5533     // and compares when VF=1 since it may just create more overhead than it's
5534     // worth for loops with small trip counts. This is because we still have to
5535     // do the final reduction after the loop.
5536     bool HasSelectCmpReductions =
5537         HasReductions &&
5538         any_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool {
5539           const RecurrenceDescriptor &RdxDesc = Reduction.second;
5540           return RecurrenceDescriptor::isAnyOfRecurrenceKind(
5541               RdxDesc.getRecurrenceKind());
5542         });
5543     if (HasSelectCmpReductions) {
5544       LLVM_DEBUG(dbgs() << "LV: Not interleaving select-cmp reductions.\n");
5545       return 1;
5546     }
5547 
5548     // If we have a scalar reduction (vector reductions are already dealt with
5549     // by this point), we can increase the critical path length if the loop
5550     // we're interleaving is inside another loop. For tree-wise reductions
5551     // set the limit to 2, and for ordered reductions it's best to disable
5552     // interleaving entirely.
5553     if (HasReductions && TheLoop->getLoopDepth() > 1) {
5554       bool HasOrderedReductions =
5555           any_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool {
5556             const RecurrenceDescriptor &RdxDesc = Reduction.second;
5557             return RdxDesc.isOrdered();
5558           });
5559       if (HasOrderedReductions) {
5560         LLVM_DEBUG(
5561             dbgs() << "LV: Not interleaving scalar ordered reductions.\n");
5562         return 1;
5563       }
5564 
5565       unsigned F = static_cast<unsigned>(MaxNestedScalarReductionIC);
5566       SmallIC = std::min(SmallIC, F);
5567       StoresIC = std::min(StoresIC, F);
5568       LoadsIC = std::min(LoadsIC, F);
5569     }
5570 
5571     if (EnableLoadStoreRuntimeInterleave &&
5572         std::max(StoresIC, LoadsIC) > SmallIC) {
5573       LLVM_DEBUG(
5574           dbgs() << "LV: Interleaving to saturate store or load ports.\n");
5575       return std::max(StoresIC, LoadsIC);
5576     }
5577 
5578     // If there are scalar reductions and TTI has enabled aggressive
5579     // interleaving for reductions, we will interleave to expose ILP.
5580     if (InterleaveSmallLoopScalarReduction && VF.isScalar() &&
5581         AggressivelyInterleaveReductions) {
5582       LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
5583       // Interleave no less than SmallIC but not as aggressive as the normal IC
5584       // to satisfy the rare situation when resources are too limited.
5585       return std::max(IC / 2, SmallIC);
5586     } else {
5587       LLVM_DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n");
5588       return SmallIC;
5589     }
5590   }
5591 
5592   // Interleave if this is a large loop (small loops are already dealt with by
5593   // this point) that could benefit from interleaving.
5594   if (AggressivelyInterleaveReductions) {
5595     LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
5596     return IC;
5597   }
5598 
5599   LLVM_DEBUG(dbgs() << "LV: Not Interleaving.\n");
5600   return 1;
5601 }
5602 
5603 SmallVector<LoopVectorizationCostModel::RegisterUsage, 8>
5604 LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<ElementCount> VFs) {
5605   // This function calculates the register usage by measuring the highest number
5606   // of values that are alive at a single location. Obviously, this is a very
5607   // rough estimation. We scan the loop in topological order and
5608   // assign a number to each instruction. We use RPO to ensure that defs are
5609   // met before their users. We assume that each instruction that has in-loop
5610   // users starts an interval. We record every time that an in-loop value is
5611   // used, so we have a list of the first and last occurrences of each
5612   // instruction. Next, we transpose this data structure into a multi map that
5613   // holds the list of intervals that *end* at a specific location. This multi
5614   // map allows us to perform a linear search. We scan the instructions linearly
5615   // and record each time that a new interval starts, by placing it in a set.
5616   // If we find this value in the multi-map then we remove it from the set.
5617   // The max register usage is the maximum size of the set.
5618   // We also search for instructions that are defined outside the loop, but are
5619   // used inside the loop. We need this number separately from the max-interval
5620   // usage number because when we unroll, loop-invariant values do not take
5621   // more registers.
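       // For illustration, a hypothetical three-instruction body:
       //   %a = load ...      ; interval for %a opens here
       //   %b = add %a, 1     ; %a still live
       //   %c = mul %a, %b    ; last use of both %a and %b
       // When %c is reached both %a and %b are open intervals, so the maximum
       // register usage recorded for their class is 2.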
5622   LoopBlocksDFS DFS(TheLoop);
5623   DFS.perform(LI);
5624 
5625   RegisterUsage RU;
5626 
5627   // Each 'key' in the map opens a new interval. The values
5628   // of the map are the index of the 'last seen' usage of the
5629   // instruction that is the key.
5630   using IntervalMap = DenseMap<Instruction *, unsigned>;
5631 
5632   // Maps instruction to its index.
5633   SmallVector<Instruction *, 64> IdxToInstr;
5634   // Marks the end of each interval.
5635   IntervalMap EndPoint;
5636   // Saves the list of instruction indices that are used in the loop.
5637   SmallPtrSet<Instruction *, 8> Ends;
5638   // Saves the list of values that are used in the loop but are defined outside
5639   // the loop (not including non-instruction values such as arguments and
5640   // constants).
5641   SmallSetVector<Instruction *, 8> LoopInvariants;
5642 
5643   for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
5644     for (Instruction &I : BB->instructionsWithoutDebug()) {
5645       IdxToInstr.push_back(&I);
5646 
5647       // Save the end location of each USE.
5648       for (Value *U : I.operands()) {
5649         auto *Instr = dyn_cast<Instruction>(U);
5650 
5651         // Ignore non-instruction values such as arguments, constants, etc.
5652         // FIXME: Might need some motivation why these values are ignored. If
5653         // for example an argument is used inside the loop it will increase the
5654         // register pressure (so shouldn't we add it to LoopInvariants).
5655         if (!Instr)
5656           continue;
5657 
5658         // If this instruction is outside the loop then record it and continue.
5659         if (!TheLoop->contains(Instr)) {
5660           LoopInvariants.insert(Instr);
5661           continue;
5662         }
5663 
5664         // Overwrite previous end points.
5665         EndPoint[Instr] = IdxToInstr.size();
5666         Ends.insert(Instr);
5667       }
5668     }
5669   }
5670 
5671   // Saves the list of intervals that end with the index in 'key'.
5672   using InstrList = SmallVector<Instruction *, 2>;
5673   DenseMap<unsigned, InstrList> TransposeEnds;
5674 
5675   // Transpose the EndPoints to a list of values that end at each index.
5676   for (auto &Interval : EndPoint)
5677     TransposeEnds[Interval.second].push_back(Interval.first);
5678 
5679   SmallPtrSet<Instruction *, 8> OpenIntervals;
5680   SmallVector<RegisterUsage, 8> RUs(VFs.size());
5681   SmallVector<SmallMapVector<unsigned, unsigned, 4>, 8> MaxUsages(VFs.size());
5682 
5683   LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n");
5684 
5685   const auto &TTICapture = TTI;
5686   auto GetRegUsage = [&TTICapture](Type *Ty, ElementCount VF) -> unsigned {
5687     if (Ty->isTokenTy() || !VectorType::isValidElementType(Ty))
5688       return 0;
5689     return TTICapture.getRegUsageForType(VectorType::get(Ty, VF));
5690   };
5691 
5692   for (unsigned int i = 0, s = IdxToInstr.size(); i < s; ++i) {
5693     Instruction *I = IdxToInstr[i];
5694 
5695     // Remove all of the instructions that end at this location.
5696     InstrList &List = TransposeEnds[i];
5697     for (Instruction *ToRemove : List)
5698       OpenIntervals.erase(ToRemove);
5699 
5700     // Ignore instructions that are never used within the loop.
5701     if (!Ends.count(I))
5702       continue;
5703 
5704     // Skip ignored values.
5705     if (ValuesToIgnore.count(I))
5706       continue;
5707 
5708     collectInLoopReductions();
5709 
5710     // For each VF find the maximum usage of registers.
5711     for (unsigned j = 0, e = VFs.size(); j < e; ++j) {
5712       // Count the number of registers used, per register class, given all open
5713       // intervals.
5714       // Note that elements in this SmallMapVector will be default constructed
5715       // as 0. So we can use "RegUsage[ClassID] += n" in the code below even if
5716       // there is no previous entry for ClassID.
5717       SmallMapVector<unsigned, unsigned, 4> RegUsage;
5718 
5719       if (VFs[j].isScalar()) {
5720         for (auto *Inst : OpenIntervals) {
5721           unsigned ClassID =
5722               TTI.getRegisterClassForType(false, Inst->getType());
5723           // FIXME: The target might use more than one register for the type
5724           // even in the scalar case.
5725           RegUsage[ClassID] += 1;
5726         }
5727       } else {
5728         collectUniformsAndScalars(VFs[j]);
5729         for (auto *Inst : OpenIntervals) {
5730           // Skip ignored values for VF > 1.
5731           if (VecValuesToIgnore.count(Inst))
5732             continue;
5733           if (isScalarAfterVectorization(Inst, VFs[j])) {
5734             unsigned ClassID =
5735                 TTI.getRegisterClassForType(false, Inst->getType());
5736             // FIXME: The target might use more than one register for the type
5737             // even in the scalar case.
5738             RegUsage[ClassID] += 1;
5739           } else {
5740             unsigned ClassID =
5741                 TTI.getRegisterClassForType(true, Inst->getType());
5742             RegUsage[ClassID] += GetRegUsage(Inst->getType(), VFs[j]);
5743           }
5744         }
5745       }
5746 
5747       for (auto& pair : RegUsage) {
5748         auto &Entry = MaxUsages[j][pair.first];
5749         Entry = std::max(Entry, pair.second);
5750       }
5751     }
5752 
5753     LLVM_DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # "
5754                       << OpenIntervals.size() << '\n');
5755 
5756     // Add the current instruction to the list of open intervals.
5757     OpenIntervals.insert(I);
5758   }
5759 
5760   for (unsigned i = 0, e = VFs.size(); i < e; ++i) {
5761     // Note that elements in this SmallMapVector will be default constructed
5762     // as 0. So we can use "Invariant[ClassID] += n" in the code below even if
5763     // there is no previous entry for ClassID.
5764     SmallMapVector<unsigned, unsigned, 4> Invariant;
5765 
5766     for (auto *Inst : LoopInvariants) {
5767       // FIXME: The target might use more than one register for the type
5768       // even in the scalar case.
5769       bool IsScalar = all_of(Inst->users(), [&](User *U) {
5770         auto *I = cast<Instruction>(U);
5771         return TheLoop != LI->getLoopFor(I->getParent()) ||
5772                isScalarAfterVectorization(I, VFs[i]);
5773       });
5774 
5775       ElementCount VF = IsScalar ? ElementCount::getFixed(1) : VFs[i];
5776       unsigned ClassID =
5777           TTI.getRegisterClassForType(VF.isVector(), Inst->getType());
5778       Invariant[ClassID] += GetRegUsage(Inst->getType(), VF);
5779     }
5780 
5781     LLVM_DEBUG({
5782       dbgs() << "LV(REG): VF = " << VFs[i] << '\n';
5783       dbgs() << "LV(REG): Found max usage: " << MaxUsages[i].size()
5784              << " item\n";
5785       for (const auto &pair : MaxUsages[i]) {
5786         dbgs() << "LV(REG): RegisterClass: "
5787                << TTI.getRegisterClassName(pair.first) << ", " << pair.second
5788                << " registers\n";
5789       }
5790       dbgs() << "LV(REG): Found invariant usage: " << Invariant.size()
5791              << " item\n";
5792       for (const auto &pair : Invariant) {
5793         dbgs() << "LV(REG): RegisterClass: "
5794                << TTI.getRegisterClassName(pair.first) << ", " << pair.second
5795                << " registers\n";
5796       }
5797     });
5798 
5799     RU.LoopInvariantRegs = Invariant;
5800     RU.MaxLocalUsers = MaxUsages[i];
5801     RUs[i] = RU;
5802   }
5803 
5804   return RUs;
5805 }
5806 
5807 bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I,
5808                                                            ElementCount VF) {
5809   // TODO: Cost model for emulated masked load/store is completely
5810   // broken. This hack guides the cost model to use an artificially
5811   // high enough value to practically disable vectorization with such
5812   // operations, except where previously deployed legality hack allowed
5813   // using very low cost values. This is to avoid regressions coming simply
5814   // from moving "masked load/store" check from legality to cost model.
5815   // Masked Load/Gather emulation was previously never allowed.
5816   // Emulation of a limited number of Masked Store/Scatter ops was allowed.
5817   assert((isPredicatedInst(I)) &&
5818          "Expecting a scalar emulated instruction");
5819   return isa<LoadInst>(I) ||
5820          (isa<StoreInst>(I) &&
5821           NumPredStores > NumberOfStoresToPredicate);
5822 }
5823 
5824 void LoopVectorizationCostModel::collectInstsToScalarize(ElementCount VF) {
5825   // If we aren't vectorizing the loop, or if we've already collected the
5826   // instructions to scalarize, there's nothing to do. Collection may already
5827   // have occurred if we have a user-selected VF and are now computing the
5828   // expected cost for interleaving.
5829   if (VF.isScalar() || VF.isZero() || InstsToScalarize.contains(VF))
5830     return;
5831 
5832   // Initialize a mapping for VF in InstsToScalarize. If we find that it's
5833   // not profitable to scalarize any instructions, the presence of VF in the
5834   // map will indicate that we've analyzed it already.
5835   ScalarCostsTy &ScalarCostsVF = InstsToScalarize[VF];
5836 
5837   PredicatedBBsAfterVectorization[VF].clear();
5838 
5839   // Find all the instructions that are scalar with predication in the loop and
5840   // determine if it would be better to not if-convert the blocks they are in.
5841   // If so, we also record the instructions to scalarize.
5842   for (BasicBlock *BB : TheLoop->blocks()) {
5843     if (!blockNeedsPredicationForAnyReason(BB))
5844       continue;
5845     for (Instruction &I : *BB)
5846       if (isScalarWithPredication(&I, VF)) {
5847         ScalarCostsTy ScalarCosts;
5848         // Do not apply discount if scalable, because that would lead to
5849         // invalid scalarization costs.
5850         // Do not apply discount logic if hacked cost is needed
5851         // for emulated masked memrefs.
5852         if (!VF.isScalable() && !useEmulatedMaskMemRefHack(&I, VF) &&
5853             computePredInstDiscount(&I, ScalarCosts, VF) >= 0)
5854           ScalarCostsVF.insert(ScalarCosts.begin(), ScalarCosts.end());
5855         // Remember that BB will remain after vectorization.
5856         PredicatedBBsAfterVectorization[VF].insert(BB);
5857       }
5858   }
5859 }
5860 
5861 InstructionCost LoopVectorizationCostModel::computePredInstDiscount(
5862     Instruction *PredInst, ScalarCostsTy &ScalarCosts, ElementCount VF) {
5863   assert(!isUniformAfterVectorization(PredInst, VF) &&
5864          "Instruction marked uniform-after-vectorization will be predicated");
5865 
5866   // Initialize the discount to zero, meaning that the scalar version and the
5867   // vector version cost the same.
5868   InstructionCost Discount = 0;
5869 
5870   // Holds instructions to analyze. The instructions we visit are mapped in
5871   // ScalarCosts. Those instructions are the ones that would be scalarized if
5872   // we find that the scalar version costs less.
5873   SmallVector<Instruction *, 8> Worklist;
5874 
5875   // Returns true if the given instruction can be scalarized.
5876   auto canBeScalarized = [&](Instruction *I) -> bool {
5877     // We only attempt to scalarize instructions forming a single-use chain
5878     // from the original predicated block that would otherwise be vectorized.
5879     // Although not strictly necessary, we give up on instructions we know will
5880     // already be scalar to avoid traversing chains that are unlikely to be
5881     // beneficial.
5882     if (!I->hasOneUse() || PredInst->getParent() != I->getParent() ||
5883         isScalarAfterVectorization(I, VF))
5884       return false;
5885 
5886     // If the instruction is scalar with predication, it will be analyzed
5887     // separately. We ignore it within the context of PredInst.
5888     if (isScalarWithPredication(I, VF))
5889       return false;
5890 
5891     // If any of the instruction's operands are uniform after vectorization,
5892     // the instruction cannot be scalarized. This prevents, for example, a
5893     // masked load from being scalarized.
5894     //
5895     // We assume we will only emit a value for lane zero of an instruction
5896     // marked uniform after vectorization, rather than VF identical values.
5897     // Thus, if we scalarize an instruction that uses a uniform, we would
5898     // create uses of values corresponding to the lanes we aren't emitting code
5899     // for. This behavior can be changed by allowing getScalarValue to clone
5900     // the lane zero values for uniforms rather than asserting.
5901     for (Use &U : I->operands())
5902       if (auto *J = dyn_cast<Instruction>(U.get()))
5903         if (isUniformAfterVectorization(J, VF))
5904           return false;
5905 
5906     // Otherwise, we can scalarize the instruction.
5907     return true;
5908   };
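  // Illustrative example (not from the original source): in a predicated block
  // such as
  //   %d = add i32 %x, 1        ; single use, same block
  //   store i32 %d, ptr %p      ; predicated store (PredInst)
  // the 'add' forms a single-use chain into the predicated store and can be
  // pulled into the scalarized chain, whereas the chain is not extended through
  // instructions that are already scalar after vectorization or that use a
  // uniform-after-vectorization value.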
5909 
5910   // Compute the expected cost discount from scalarizing the entire expression
5911   // feeding the predicated instruction. We currently only consider expressions
5912   // that are single-use instruction chains.
5913   Worklist.push_back(PredInst);
5914   while (!Worklist.empty()) {
5915     Instruction *I = Worklist.pop_back_val();
5916 
5917     // If we've already analyzed the instruction, there's nothing to do.
5918     if (ScalarCosts.contains(I))
5919       continue;
5920 
5921     // Compute the cost of the vector instruction. Note that this cost already
5922     // includes the scalarization overhead of the predicated instruction.
5923     InstructionCost VectorCost = getInstructionCost(I, VF).first;
5924 
5925     // Compute the cost of the scalarized instruction. This cost is the cost of
5926     // the instruction as if it wasn't if-converted and instead remained in the
5927     // predicated block. We will scale this cost by block probability after
5928     // computing the scalarization overhead.
5929     InstructionCost ScalarCost =
5930         VF.getFixedValue() *
5931         getInstructionCost(I, ElementCount::getFixed(1)).first;
5932 
5933     // Compute the scalarization overhead of needed insertelement instructions
5934     // and phi nodes.
5935     TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
5936     if (isScalarWithPredication(I, VF) && !I->getType()->isVoidTy()) {
5937       ScalarCost += TTI.getScalarizationOverhead(
5938           cast<VectorType>(ToVectorTy(I->getType(), VF)),
5939           APInt::getAllOnes(VF.getFixedValue()), /*Insert*/ true,
5940           /*Extract*/ false, CostKind);
5941       ScalarCost +=
5942           VF.getFixedValue() * TTI.getCFInstrCost(Instruction::PHI, CostKind);
5943     }
5944 
5945     // Compute the scalarization overhead of needed extractelement
5946     // instructions. For each of the instruction's operands, if the operand can
5947     // be scalarized, add it to the worklist; otherwise, account for the
5948     // overhead.
5949     for (Use &U : I->operands())
5950       if (auto *J = dyn_cast<Instruction>(U.get())) {
5951         assert(VectorType::isValidElementType(J->getType()) &&
5952                "Instruction has non-scalar type");
5953         if (canBeScalarized(J))
5954           Worklist.push_back(J);
5955         else if (needsExtract(J, VF)) {
5956           ScalarCost += TTI.getScalarizationOverhead(
5957               cast<VectorType>(ToVectorTy(J->getType(), VF)),
5958               APInt::getAllOnes(VF.getFixedValue()), /*Insert*/ false,
5959               /*Extract*/ true, CostKind);
5960         }
5961       }
5962 
5963     // Scale the total scalar cost by block probability.
5964     ScalarCost /= getReciprocalPredBlockProb();
5965 
5966     // Compute the discount. A non-negative discount means the vector version
5967     // of the instruction costs more, and scalarizing would be beneficial.
5968     Discount += VectorCost - ScalarCost;
5969     ScalarCosts[I] = ScalarCost;
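    // Illustrative arithmetic (hypothetical TTI numbers, assuming the default
    // reciprocal block probability of 2, i.e. a 50% chance of executing the
    // predicated block): if VectorCost is 8 and the unscaled scalar cost is 12,
    // the scaled ScalarCost becomes 12 / 2 = 6 and the running Discount grows
    // by 8 - 6 = 2 in favor of scalarization.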
5970   }
5971 
5972   return Discount;
5973 }
5974 
5975 LoopVectorizationCostModel::VectorizationCostTy
5976 LoopVectorizationCostModel::expectedCost(
5977     ElementCount VF, SmallVectorImpl<InstructionVFPair> *Invalid) {
5978   VectorizationCostTy Cost;
5979 
5980   // For each block.
5981   for (BasicBlock *BB : TheLoop->blocks()) {
5982     VectorizationCostTy BlockCost;
5983 
5984     // For each instruction in the old loop.
5985     for (Instruction &I : BB->instructionsWithoutDebug()) {
5986       // Skip ignored values.
5987       if (ValuesToIgnore.count(&I) ||
5988           (VF.isVector() && VecValuesToIgnore.count(&I)))
5989         continue;
5990 
5991       VectorizationCostTy C = getInstructionCost(&I, VF);
5992 
5993       // Check if we should override the cost.
5994       if (C.first.isValid() &&
5995           ForceTargetInstructionCost.getNumOccurrences() > 0)
5996         C.first = InstructionCost(ForceTargetInstructionCost);
5997 
5998       // Keep a list of instructions with invalid costs.
5999       if (Invalid && !C.first.isValid())
6000         Invalid->emplace_back(&I, VF);
6001 
6002       BlockCost.first += C.first;
6003       BlockCost.second |= C.second;
6004       LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C.first
6005                         << " for VF " << VF << " For instruction: " << I
6006                         << '\n');
6007     }
6008 
6009     // If we are vectorizing a predicated block, it will have been
6010     // if-converted. This means that the block's instructions (aside from
6011     // stores and instructions that may divide by zero) will now be
6012     // unconditionally executed. For the scalar case, we may not always execute
6013     // the predicated block, if it is an if-else block. Thus, scale the block's
6014     // cost by the probability of executing it. blockNeedsPredication from
6015     // Legal is used so as to not include all blocks in tail folded loops.
6016     if (VF.isScalar() && Legal->blockNeedsPredication(BB))
6017       BlockCost.first /= getReciprocalPredBlockProb();
6018 
6019     Cost.first += BlockCost.first;
6020     Cost.second |= BlockCost.second;
6021   }
6022 
6023   return Cost;
6024 }
6025 
6026 /// Gets Address Access SCEV after verifying that the access pattern
6027 /// is loop invariant except the induction variable dependence.
6028 ///
6029 /// This SCEV can be sent to the Target in order to estimate the address
6030 /// calculation cost.
6031 static const SCEV *getAddressAccessSCEV(
6032               Value *Ptr,
6033               LoopVectorizationLegality *Legal,
6034               PredicatedScalarEvolution &PSE,
6035               const Loop *TheLoop) {
6036 
6037   auto *Gep = dyn_cast<GetElementPtrInst>(Ptr);
6038   if (!Gep)
6039     return nullptr;
6040 
6041   // We are looking for a gep with all loop invariant indices except for one
6042   // which should be an induction variable.
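  // Illustrative example (not from the original source): for
  //   %gep = getelementptr double, ptr %base, i64 %iv
  // with %base loop-invariant and %iv the primary induction variable, every
  // index passes the check below, and the pointer's SCEV (an add recurrence
  // over TheLoop) is returned for the target to cost.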
6043   auto SE = PSE.getSE();
6044   unsigned NumOperands = Gep->getNumOperands();
6045   for (unsigned i = 1; i < NumOperands; ++i) {
6046     Value *Opd = Gep->getOperand(i);
6047     if (!SE->isLoopInvariant(SE->getSCEV(Opd), TheLoop) &&
6048         !Legal->isInductionVariable(Opd))
6049       return nullptr;
6050   }
6051 
6052   // Now we know we have a GEP ptr, %inv, %ind, %inv. return the Ptr SCEV.
6053   return PSE.getSCEV(Ptr);
6054 }
6055 
6056 InstructionCost
6057 LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I,
6058                                                         ElementCount VF) {
6059   assert(VF.isVector() &&
6060          "Scalarization cost of instruction implies vectorization.");
6061   if (VF.isScalable())
6062     return InstructionCost::getInvalid();
6063 
6064   Type *ValTy = getLoadStoreType(I);
6065   auto SE = PSE.getSE();
6066 
6067   unsigned AS = getLoadStoreAddressSpace(I);
6068   Value *Ptr = getLoadStorePointerOperand(I);
6069   Type *PtrTy = ToVectorTy(Ptr->getType(), VF);
6070   // NOTE: PtrTy is a vector to signal `TTI::getAddressComputationCost`
6071   //       that it is being called from this specific place.
6072 
6073   // Figure out whether the access is strided and get the stride value
6074   // if it's known at compile time.
6075   const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, Legal, PSE, TheLoop);
6076 
6077   // Get the cost of the scalar memory instruction and address computation.
6078   InstructionCost Cost =
6079       VF.getKnownMinValue() * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV);
6080 
6081   // Don't pass *I here, since it is scalar but will actually be part of a
6082   // vectorized loop where the user of it is a vectorized instruction.
6083   TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
6084   const Align Alignment = getLoadStoreAlignment(I);
6085   Cost += VF.getKnownMinValue() * TTI.getMemoryOpCost(I->getOpcode(),
6086                                                       ValTy->getScalarType(),
6087                                                       Alignment, AS, CostKind);
6088 
6089   // Get the overhead of the extractelement and insertelement instructions
6090   // we might create due to scalarization.
6091   Cost += getScalarizationOverhead(I, VF, CostKind);
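  // Illustrative arithmetic (hypothetical TTI numbers): with VF = 4, an
  // address-computation cost of 1 and a scalar load cost of 1, the running
  // cost so far is 4 * 1 + 4 * 1 = 8, plus whatever insert/extract overhead
  // the target reported above.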
6092 
6093   // If we have a predicated load/store, it will need extra i1 extracts and
6094   // conditional branches, but may not be executed for each vector lane. Scale
6095   // the cost by the probability of executing the predicated block.
6096   if (isPredicatedInst(I)) {
6097     Cost /= getReciprocalPredBlockProb();
6098 
6099     // Add the cost of an i1 extract and a branch
6100     auto *Vec_i1Ty =
6101         VectorType::get(IntegerType::getInt1Ty(ValTy->getContext()), VF);
6102     Cost += TTI.getScalarizationOverhead(
6103         Vec_i1Ty, APInt::getAllOnes(VF.getKnownMinValue()),
6104         /*Insert=*/false, /*Extract=*/true, CostKind);
6105     Cost += TTI.getCFInstrCost(Instruction::Br, CostKind);
6106 
6107     if (useEmulatedMaskMemRefHack(I, VF))
6108       // Artificially setting to a high enough value to practically disable
6109       // vectorization with such operations.
6110       Cost = 3000000;
6111   }
6112 
6113   return Cost;
6114 }
6115 
6116 InstructionCost
6117 LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I,
6118                                                     ElementCount VF) {
6119   Type *ValTy = getLoadStoreType(I);
6120   auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
6121   Value *Ptr = getLoadStorePointerOperand(I);
6122   unsigned AS = getLoadStoreAddressSpace(I);
6123   int ConsecutiveStride = Legal->isConsecutivePtr(ValTy, Ptr);
6124   enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
6125 
6126   assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
6127          "Stride should be 1 or -1 for consecutive memory access");
6128   const Align Alignment = getLoadStoreAlignment(I);
6129   InstructionCost Cost = 0;
6130   if (Legal->isMaskRequired(I)) {
6131     Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS,
6132                                       CostKind);
6133   } else {
6134     TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(I->getOperand(0));
6135     Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS,
6136                                 CostKind, OpInfo, I);
6137   }
6138 
6139   bool Reverse = ConsecutiveStride < 0;
6140   if (Reverse)
6141     Cost += TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy,
6142                                std::nullopt, CostKind, 0);
6143   return Cost;
6144 }
6145 
6146 InstructionCost
6147 LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I,
6148                                                 ElementCount VF) {
6149   assert(Legal->isUniformMemOp(*I, VF));
6150 
6151   Type *ValTy = getLoadStoreType(I);
6152   auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
6153   const Align Alignment = getLoadStoreAlignment(I);
6154   unsigned AS = getLoadStoreAddressSpace(I);
6155   enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
6156   if (isa<LoadInst>(I)) {
6157     return TTI.getAddressComputationCost(ValTy) +
6158            TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS,
6159                                CostKind) +
6160            TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VectorTy);
6161   }
6162   StoreInst *SI = cast<StoreInst>(I);
6163 
6164   bool isLoopInvariantStoreValue = Legal->isInvariant(SI->getValueOperand());
6165   return TTI.getAddressComputationCost(ValTy) +
6166          TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS,
6167                              CostKind) +
6168          (isLoopInvariantStoreValue
6169               ? 0
6170               : TTI.getVectorInstrCost(Instruction::ExtractElement, VectorTy,
6171                                        CostKind, VF.getKnownMinValue() - 1));
6172 }
6173 
6174 InstructionCost
6175 LoopVectorizationCostModel::getGatherScatterCost(Instruction *I,
6176                                                  ElementCount VF) {
6177   Type *ValTy = getLoadStoreType(I);
6178   auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
6179   const Align Alignment = getLoadStoreAlignment(I);
6180   const Value *Ptr = getLoadStorePointerOperand(I);
6181 
6182   return TTI.getAddressComputationCost(VectorTy) +
6183          TTI.getGatherScatterOpCost(
6184              I->getOpcode(), VectorTy, Ptr, Legal->isMaskRequired(I), Alignment,
6185              TargetTransformInfo::TCK_RecipThroughput, I);
6186 }
6187 
6188 InstructionCost
6189 LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I,
6190                                                    ElementCount VF) {
6191   Type *ValTy = getLoadStoreType(I);
6192   auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
6193   unsigned AS = getLoadStoreAddressSpace(I);
6194   enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
6195 
6196   auto Group = getInterleavedAccessGroup(I);
6197   assert(Group && "Fail to get an interleaved access group.");
6198 
6199   unsigned InterleaveFactor = Group->getFactor();
6200   auto *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor);
6201 
6202   // Holds the indices of existing members in the interleaved group.
6203   SmallVector<unsigned, 4> Indices;
6204   for (unsigned IF = 0; IF < InterleaveFactor; IF++)
6205     if (Group->getMember(IF))
6206       Indices.push_back(IF);
6207 
6208   // Calculate the cost of the whole interleaved group.
6209   bool UseMaskForGaps =
6210       (Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed()) ||
6211       (isa<StoreInst>(I) && (Group->getNumMembers() < Group->getFactor()));
6212   InstructionCost Cost = TTI.getInterleavedMemoryOpCost(
6213       I->getOpcode(), WideVecTy, Group->getFactor(), Indices, Group->getAlign(),
6214       AS, CostKind, Legal->isMaskRequired(I), UseMaskForGaps);
6215 
6216   if (Group->isReverse()) {
6217     // TODO: Add support for reversed masked interleaved access.
6218     assert(!Legal->isMaskRequired(I) &&
6219            "Reverse masked interleaved access not supported.");
6220     Cost += Group->getNumMembers() *
6221             TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy,
6222                                std::nullopt, CostKind, 0);
6223   }
6224   return Cost;
6225 }
6226 
6227 std::optional<InstructionCost>
6228 LoopVectorizationCostModel::getReductionPatternCost(
6229     Instruction *I, ElementCount VF, Type *Ty,
6230     TTI::TargetCostKind CostKind) const {
6231   using namespace llvm::PatternMatch;
6232   // Early exit for no inloop reductions
6233   if (InLoopReductions.empty() || VF.isScalar() || !isa<VectorType>(Ty))
6234     return std::nullopt;
6235   auto *VectorTy = cast<VectorType>(Ty);
6236 
6237   // We are looking for a pattern of, and finding the minimal acceptable cost:
6238   //  reduce(mul(ext(A), ext(B))) or
6239   //  reduce(mul(A, B)) or
6240   //  reduce(ext(A)) or
6241   //  reduce(A).
6242   // The basic idea is that we walk down the tree to do that, finding the root
6243   // reduction instruction in InLoopReductionImmediateChains. From there we find
6244   // the pattern of mul/ext and test the cost of the entire pattern vs the cost
6245   // of the components. If the reduction cost is lower, then we return it for the
6246   // reduction instruction and 0 for the other instructions in the pattern. If
6247   // it is not, we return an invalid cost specifying that the original cost method
6248   // should be used.
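  // Illustrative IR shape (not from the original source) that reaches the
  // reduce.add(ext(mul(ext(A), ext(B)))) case below:
  //   %ea  = sext i8 %a to i32
  //   %eb  = sext i8 %b to i32
  //   %mul = mul i32 %ea, %eb
  //   %red = add i32 %acc, %mul   ; in-loop reduction chain
  // Starting from an ext or mul, RetI is walked up to the add, which must be
  // registered in InLoopReductionImmediateChains for the pattern to apply.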
6249   Instruction *RetI = I;
6250   if (match(RetI, m_ZExtOrSExt(m_Value()))) {
6251     if (!RetI->hasOneUser())
6252       return std::nullopt;
6253     RetI = RetI->user_back();
6254   }
6255 
6256   if (match(RetI, m_OneUse(m_Mul(m_Value(), m_Value()))) &&
6257       RetI->user_back()->getOpcode() == Instruction::Add) {
6258     RetI = RetI->user_back();
6259   }
6260 
6261   // Test if the found instruction is a reduction, and if not return an invalid
6262   // cost specifying the parent to use the original cost modelling.
6263   if (!InLoopReductionImmediateChains.count(RetI))
6264     return std::nullopt;
6265 
6266   // Find the reduction this chain is a part of and calculate the basic cost of
6267   // the reduction on its own.
6268   Instruction *LastChain = InLoopReductionImmediateChains.at(RetI);
6269   Instruction *ReductionPhi = LastChain;
6270   while (!isa<PHINode>(ReductionPhi))
6271     ReductionPhi = InLoopReductionImmediateChains.at(ReductionPhi);
6272 
6273   const RecurrenceDescriptor &RdxDesc =
6274       Legal->getReductionVars().find(cast<PHINode>(ReductionPhi))->second;
6275 
6276   InstructionCost BaseCost = TTI.getArithmeticReductionCost(
6277       RdxDesc.getOpcode(), VectorTy, RdxDesc.getFastMathFlags(), CostKind);
6278 
6279   // For a call to the llvm.fmuladd intrinsic we need to add the cost of a
6280   // normal fmul instruction to the cost of the fadd reduction.
6281   if (RdxDesc.getRecurrenceKind() == RecurKind::FMulAdd)
6282     BaseCost +=
6283         TTI.getArithmeticInstrCost(Instruction::FMul, VectorTy, CostKind);
6284 
6285   // If we're using ordered reductions then we can just return the base cost
6286   // here, since getArithmeticReductionCost calculates the full ordered
6287   // reduction cost when FP reassociation is not allowed.
6288   if (useOrderedReductions(RdxDesc))
6289     return BaseCost;
6290 
6291   // Get the operand that was not the reduction chain and match it to one of the
6292   // patterns, returning the better cost if it is found.
6293   Instruction *RedOp = RetI->getOperand(1) == LastChain
6294                            ? dyn_cast<Instruction>(RetI->getOperand(0))
6295                            : dyn_cast<Instruction>(RetI->getOperand(1));
6296 
6297   VectorTy = VectorType::get(I->getOperand(0)->getType(), VectorTy);
6298 
6299   Instruction *Op0, *Op1;
6300   if (RedOp && RdxDesc.getOpcode() == Instruction::Add &&
6301       match(RedOp,
6302             m_ZExtOrSExt(m_Mul(m_Instruction(Op0), m_Instruction(Op1)))) &&
6303       match(Op0, m_ZExtOrSExt(m_Value())) &&
6304       Op0->getOpcode() == Op1->getOpcode() &&
6305       Op0->getOperand(0)->getType() == Op1->getOperand(0)->getType() &&
6306       !TheLoop->isLoopInvariant(Op0) && !TheLoop->isLoopInvariant(Op1) &&
6307       (Op0->getOpcode() == RedOp->getOpcode() || Op0 == Op1)) {
6308 
6309     // Matched reduce.add(ext(mul(ext(A), ext(B)))
6310     // Note that the extend opcodes need to all match, or if A==B they will have
6311     // been converted to zext(mul(sext(A), sext(A))) as it is known positive,
6312     // which is equally fine.
6313     bool IsUnsigned = isa<ZExtInst>(Op0);
6314     auto *ExtType = VectorType::get(Op0->getOperand(0)->getType(), VectorTy);
6315     auto *MulType = VectorType::get(Op0->getType(), VectorTy);
6316 
6317     InstructionCost ExtCost =
6318         TTI.getCastInstrCost(Op0->getOpcode(), MulType, ExtType,
6319                              TTI::CastContextHint::None, CostKind, Op0);
6320     InstructionCost MulCost =
6321         TTI.getArithmeticInstrCost(Instruction::Mul, MulType, CostKind);
6322     InstructionCost Ext2Cost =
6323         TTI.getCastInstrCost(RedOp->getOpcode(), VectorTy, MulType,
6324                              TTI::CastContextHint::None, CostKind, RedOp);
6325 
6326     InstructionCost RedCost = TTI.getMulAccReductionCost(
6327         IsUnsigned, RdxDesc.getRecurrenceType(), ExtType, CostKind);
6328 
6329     if (RedCost.isValid() &&
6330         RedCost < ExtCost * 2 + MulCost + Ext2Cost + BaseCost)
6331       return I == RetI ? RedCost : 0;
6332   } else if (RedOp && match(RedOp, m_ZExtOrSExt(m_Value())) &&
6333              !TheLoop->isLoopInvariant(RedOp)) {
6334     // Matched reduce(ext(A))
6335     bool IsUnsigned = isa<ZExtInst>(RedOp);
6336     auto *ExtType = VectorType::get(RedOp->getOperand(0)->getType(), VectorTy);
6337     InstructionCost RedCost = TTI.getExtendedReductionCost(
6338         RdxDesc.getOpcode(), IsUnsigned, RdxDesc.getRecurrenceType(), ExtType,
6339         RdxDesc.getFastMathFlags(), CostKind);
6340 
6341     InstructionCost ExtCost =
6342         TTI.getCastInstrCost(RedOp->getOpcode(), VectorTy, ExtType,
6343                              TTI::CastContextHint::None, CostKind, RedOp);
6344     if (RedCost.isValid() && RedCost < BaseCost + ExtCost)
6345       return I == RetI ? RedCost : 0;
6346   } else if (RedOp && RdxDesc.getOpcode() == Instruction::Add &&
6347              match(RedOp, m_Mul(m_Instruction(Op0), m_Instruction(Op1)))) {
6348     if (match(Op0, m_ZExtOrSExt(m_Value())) &&
6349         Op0->getOpcode() == Op1->getOpcode() &&
6350         !TheLoop->isLoopInvariant(Op0) && !TheLoop->isLoopInvariant(Op1)) {
6351       bool IsUnsigned = isa<ZExtInst>(Op0);
6352       Type *Op0Ty = Op0->getOperand(0)->getType();
6353       Type *Op1Ty = Op1->getOperand(0)->getType();
6354       Type *LargestOpTy =
6355           Op0Ty->getIntegerBitWidth() < Op1Ty->getIntegerBitWidth() ? Op1Ty
6356                                                                     : Op0Ty;
6357       auto *ExtType = VectorType::get(LargestOpTy, VectorTy);
6358 
6359       // Matched reduce.add(mul(ext(A), ext(B))), where the two ext may be of
6360       // different sizes. We take the largest type as the ext to reduce, and add
6361       // the remaining cost as, for example reduce(mul(ext(ext(A)), ext(B))).
6362       InstructionCost ExtCost0 = TTI.getCastInstrCost(
6363           Op0->getOpcode(), VectorTy, VectorType::get(Op0Ty, VectorTy),
6364           TTI::CastContextHint::None, CostKind, Op0);
6365       InstructionCost ExtCost1 = TTI.getCastInstrCost(
6366           Op1->getOpcode(), VectorTy, VectorType::get(Op1Ty, VectorTy),
6367           TTI::CastContextHint::None, CostKind, Op1);
6368       InstructionCost MulCost =
6369           TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind);
6370 
6371       InstructionCost RedCost = TTI.getMulAccReductionCost(
6372           IsUnsigned, RdxDesc.getRecurrenceType(), ExtType, CostKind);
6373       InstructionCost ExtraExtCost = 0;
6374       if (Op0Ty != LargestOpTy || Op1Ty != LargestOpTy) {
6375         Instruction *ExtraExtOp = (Op0Ty != LargestOpTy) ? Op0 : Op1;
6376         ExtraExtCost = TTI.getCastInstrCost(
6377             ExtraExtOp->getOpcode(), ExtType,
6378             VectorType::get(ExtraExtOp->getOperand(0)->getType(), VectorTy),
6379             TTI::CastContextHint::None, CostKind, ExtraExtOp);
6380       }
6381 
6382       if (RedCost.isValid() &&
6383           (RedCost + ExtraExtCost) < (ExtCost0 + ExtCost1 + MulCost + BaseCost))
6384         return I == RetI ? RedCost : 0;
6385     } else if (!match(I, m_ZExtOrSExt(m_Value()))) {
6386       // Matched reduce.add(mul())
6387       InstructionCost MulCost =
6388           TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind);
6389 
6390       InstructionCost RedCost = TTI.getMulAccReductionCost(
6391           true, RdxDesc.getRecurrenceType(), VectorTy, CostKind);
6392 
6393       if (RedCost.isValid() && RedCost < MulCost + BaseCost)
6394         return I == RetI ? RedCost : 0;
6395     }
6396   }
6397 
6398   return I == RetI ? std::optional<InstructionCost>(BaseCost) : std::nullopt;
6399 }
6400 
6401 InstructionCost
6402 LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I,
6403                                                      ElementCount VF) {
6404   // Calculate scalar cost only. Vectorization cost should be ready at this
6405   // moment.
6406   if (VF.isScalar()) {
6407     Type *ValTy = getLoadStoreType(I);
6408     const Align Alignment = getLoadStoreAlignment(I);
6409     unsigned AS = getLoadStoreAddressSpace(I);
6410 
6411     TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(I->getOperand(0));
6412     return TTI.getAddressComputationCost(ValTy) +
6413            TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS,
6414                                TTI::TCK_RecipThroughput, OpInfo, I);
6415   }
6416   return getWideningCost(I, VF);
6417 }
6418 
6419 LoopVectorizationCostModel::VectorizationCostTy
6420 LoopVectorizationCostModel::getInstructionCost(Instruction *I,
6421                                                ElementCount VF) {
6422   // If we know that this instruction will remain uniform, check the cost of
6423   // the scalar version.
6424   if (isUniformAfterVectorization(I, VF))
6425     VF = ElementCount::getFixed(1);
6426 
6427   if (VF.isVector() && isProfitableToScalarize(I, VF))
6428     return VectorizationCostTy(InstsToScalarize[VF][I], false);
6429 
6430   // Forced scalars do not have any scalarization overhead.
6431   auto ForcedScalar = ForcedScalars.find(VF);
6432   if (VF.isVector() && ForcedScalar != ForcedScalars.end()) {
6433     auto InstSet = ForcedScalar->second;
6434     if (InstSet.count(I))
6435       return VectorizationCostTy(
6436           (getInstructionCost(I, ElementCount::getFixed(1)).first *
6437            VF.getKnownMinValue()),
6438           false);
6439   }
6440 
6441   Type *VectorTy;
6442   InstructionCost C = getInstructionCost(I, VF, VectorTy);
6443 
6444   bool TypeNotScalarized = false;
6445   if (VF.isVector() && VectorTy->isVectorTy()) {
6446     if (unsigned NumParts = TTI.getNumberOfParts(VectorTy)) {
6447       if (VF.isScalable())
6448         // <vscale x 1 x iN> is assumed to be profitable over iN because
6449         // scalable registers are a distinct register class from scalar ones.
6450         // If we ever find a target which wants to lower scalable vectors
6451         // back to scalars, we'll need to update this code to explicitly
6452         // ask TTI about the register class uses for each part.
6453         TypeNotScalarized = NumParts <= VF.getKnownMinValue();
6454       else
6455         TypeNotScalarized = NumParts < VF.getKnownMinValue();
6456     } else
6457       C = InstructionCost::getInvalid();
6458   }
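  // Illustrative example (not from the original source): for a fixed VF of 4
  // where VectorTy legalizes to a single register, NumParts is 1 < 4 and the
  // type counts as not scalarized; if legalization needed 4 or more parts, the
  // instruction is considered scalarized for the purposes of this flag.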
6459   return VectorizationCostTy(C, TypeNotScalarized);
6460 }
6461 
6462 InstructionCost LoopVectorizationCostModel::getScalarizationOverhead(
6463     Instruction *I, ElementCount VF, TTI::TargetCostKind CostKind) const {
6464 
6465   // There is no mechanism yet to create a scalable scalarization loop,
6466   // so this is currently Invalid.
6467   if (VF.isScalable())
6468     return InstructionCost::getInvalid();
6469 
6470   if (VF.isScalar())
6471     return 0;
6472 
6473   InstructionCost Cost = 0;
6474   Type *RetTy = ToVectorTy(I->getType(), VF);
6475   if (!RetTy->isVoidTy() &&
6476       (!isa<LoadInst>(I) || !TTI.supportsEfficientVectorElementLoadStore()))
6477     Cost += TTI.getScalarizationOverhead(
6478         cast<VectorType>(RetTy), APInt::getAllOnes(VF.getKnownMinValue()),
6479         /*Insert*/ true,
6480         /*Extract*/ false, CostKind);
6481 
6482   // Some targets keep addresses scalar.
6483   if (isa<LoadInst>(I) && !TTI.prefersVectorizedAddressing())
6484     return Cost;
6485 
6486   // Some targets support efficient element stores.
6487   if (isa<StoreInst>(I) && TTI.supportsEfficientVectorElementLoadStore())
6488     return Cost;
6489 
6490   // Collect operands to consider.
6491   CallInst *CI = dyn_cast<CallInst>(I);
6492   Instruction::op_range Ops = CI ? CI->args() : I->operands();
6493 
6494   // Skip operands that do not require extraction/scalarization and do not incur
6495   // any overhead.
6496   SmallVector<Type *> Tys;
6497   for (auto *V : filterExtractingOperands(Ops, VF))
6498     Tys.push_back(MaybeVectorizeType(V->getType(), VF));
6499   return Cost + TTI.getOperandsScalarizationOverhead(
6500                     filterExtractingOperands(Ops, VF), Tys, CostKind);
6501 }
6502 
6503 void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) {
6504   if (VF.isScalar())
6505     return;
6506   NumPredStores = 0;
6507   for (BasicBlock *BB : TheLoop->blocks()) {
6508     // For each instruction in the old loop.
6509     for (Instruction &I : *BB) {
6510       Value *Ptr =  getLoadStorePointerOperand(&I);
6511       if (!Ptr)
6512         continue;
6513 
6514       // TODO: We should generate better code and update the cost model for
6515       // predicated uniform stores. Today they are treated as any other
6516       // predicated store (see added test cases in
6517       // invariant-store-vectorization.ll).
6518       if (isa<StoreInst>(&I) && isScalarWithPredication(&I, VF))
6519         NumPredStores++;
6520 
6521       if (Legal->isUniformMemOp(I, VF)) {
6522         auto isLegalToScalarize = [&]() {
6523           if (!VF.isScalable())
6524             // Scalarization of fixed length vectors "just works".
6525             return true;
6526 
6527           // We have dedicated lowering for unpredicated uniform loads and
6528           // stores.  Note that even with tail folding we know that at least
6529           // one lane is active (i.e. generalized predication is not possible
6530           // here), and the logic below depends on this fact.
6531           if (!foldTailByMasking())
6532             return true;
6533 
6534           // For scalable vectors, a uniform memop load is always
6535           // uniform-by-parts and we know how to scalarize that.
6536           if (isa<LoadInst>(I))
6537             return true;
6538 
6539           // A uniform store isn't necessarily uniform-by-parts,
6540           // so we can't assume scalarization.
6541           auto &SI = cast<StoreInst>(I);
6542           return TheLoop->isLoopInvariant(SI.getValueOperand());
6543         };
6544 
6545         const InstructionCost GatherScatterCost =
6546           isLegalGatherOrScatter(&I, VF) ?
6547           getGatherScatterCost(&I, VF) : InstructionCost::getInvalid();
6548 
6549         // Load: Scalar load + broadcast
6550         // Store: Scalar store + isLoopInvariantStoreValue ? 0 : extract
6551         // FIXME: This cost is a significant under-estimate for tail folded
6552         // memory ops.
6553         const InstructionCost ScalarizationCost = isLegalToScalarize() ?
6554           getUniformMemOpCost(&I, VF) : InstructionCost::getInvalid();
6555 
6556         // Choose the better solution for the current VF. Note that Invalid
6557         // costs compare as maximally large. If both are invalid, we get an
6558         // invalid cost, which signals a failure and a vectorization abort.
6559         if (GatherScatterCost < ScalarizationCost)
6560           setWideningDecision(&I, VF, CM_GatherScatter, GatherScatterCost);
6561         else
6562           setWideningDecision(&I, VF, CM_Scalarize, ScalarizationCost);
6563         continue;
6564       }
6565 
6566       // We assume that widening is the best solution when possible.
6567       if (memoryInstructionCanBeWidened(&I, VF)) {
6568         InstructionCost Cost = getConsecutiveMemOpCost(&I, VF);
6569         int ConsecutiveStride = Legal->isConsecutivePtr(
6570             getLoadStoreType(&I), getLoadStorePointerOperand(&I));
6571         assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
6572                "Expected consecutive stride.");
6573         InstWidening Decision =
6574             ConsecutiveStride == 1 ? CM_Widen : CM_Widen_Reverse;
6575         setWideningDecision(&I, VF, Decision, Cost);
6576         continue;
6577       }
6578 
6579       // Choose between Interleaving, Gather/Scatter or Scalarization.
6580       InstructionCost InterleaveCost = InstructionCost::getInvalid();
6581       unsigned NumAccesses = 1;
6582       if (isAccessInterleaved(&I)) {
6583         auto Group = getInterleavedAccessGroup(&I);
6584         assert(Group && "Fail to get an interleaved access group.");
6585 
6586         // Make one decision for the whole group.
6587         if (getWideningDecision(&I, VF) != CM_Unknown)
6588           continue;
6589 
6590         NumAccesses = Group->getNumMembers();
6591         if (interleavedAccessCanBeWidened(&I, VF))
6592           InterleaveCost = getInterleaveGroupCost(&I, VF);
6593       }
6594 
6595       InstructionCost GatherScatterCost =
6596           isLegalGatherOrScatter(&I, VF)
6597               ? getGatherScatterCost(&I, VF) * NumAccesses
6598               : InstructionCost::getInvalid();
6599 
6600       InstructionCost ScalarizationCost =
6601           getMemInstScalarizationCost(&I, VF) * NumAccesses;
6602 
6603       // Choose the better solution for the current VF, record this decision,
6604       // and use it during vectorization.
6605       InstructionCost Cost;
6606       InstWidening Decision;
6607       if (InterleaveCost <= GatherScatterCost &&
6608           InterleaveCost < ScalarizationCost) {
6609         Decision = CM_Interleave;
6610         Cost = InterleaveCost;
6611       } else if (GatherScatterCost < ScalarizationCost) {
6612         Decision = CM_GatherScatter;
6613         Cost = GatherScatterCost;
6614       } else {
6615         Decision = CM_Scalarize;
6616         Cost = ScalarizationCost;
6617       }
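      // Illustrative comparison (hypothetical costs): with InterleaveCost = 6,
      // GatherScatterCost = 10 and ScalarizationCost = 14 the decision above
      // is CM_Interleave with cost 6; note that a tie between interleaving and
      // gather/scatter favors interleaving because of the <= comparison.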
6618       // If the instruction belongs to an interleave group, the whole group
6619       // receives the same decision. The whole group receives the cost, but
6620       // the cost will actually be assigned to one instruction.
6621       if (auto Group = getInterleavedAccessGroup(&I))
6622         setWideningDecision(Group, VF, Decision, Cost);
6623       else
6624         setWideningDecision(&I, VF, Decision, Cost);
6625     }
6626   }
6627 
6628   // Make sure that any load of address and any other address computation
6629   // remains scalar unless there is gather/scatter support. This avoids
6630   // inevitable extracts into address registers, and also has the benefit of
6631   // activating LSR more, since that pass can't optimize vectorized
6632   // addresses.
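  // Illustrative example (not from the original source): if a loaded value is
  // used only to form another address, e.g.
  //   %idx = load i64, ptr %p
  //   %q   = getelementptr i32, ptr %base, i64 %idx
  //   %v   = load i32, ptr %q
  // then, on a target that does not prefer vectorized addressing, the %idx
  // load is switched to CM_Scalarize below so the address computation stays in
  // scalar registers.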
6633   if (TTI.prefersVectorizedAddressing())
6634     return;
6635 
6636   // Start with all scalar pointer uses.
6637   SmallPtrSet<Instruction *, 8> AddrDefs;
6638   for (BasicBlock *BB : TheLoop->blocks())
6639     for (Instruction &I : *BB) {
6640       Instruction *PtrDef =
6641         dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I));
6642       if (PtrDef && TheLoop->contains(PtrDef) &&
6643           getWideningDecision(&I, VF) != CM_GatherScatter)
6644         AddrDefs.insert(PtrDef);
6645     }
6646 
6647   // Add all instructions used to generate the addresses.
6648   SmallVector<Instruction *, 4> Worklist;
6649   append_range(Worklist, AddrDefs);
6650   while (!Worklist.empty()) {
6651     Instruction *I = Worklist.pop_back_val();
6652     for (auto &Op : I->operands())
6653       if (auto *InstOp = dyn_cast<Instruction>(Op))
6654         if ((InstOp->getParent() == I->getParent()) && !isa<PHINode>(InstOp) &&
6655             AddrDefs.insert(InstOp).second)
6656           Worklist.push_back(InstOp);
6657   }
6658 
6659   for (auto *I : AddrDefs) {
6660     if (isa<LoadInst>(I)) {
6661       // Setting the desired widening decision should ideally be handled by
6662       // the cost functions, but since this involves the task of finding out
6663       // if the loaded register is involved in an address computation, it is
6664       // instead changed here when we know this is the case.
6665       InstWidening Decision = getWideningDecision(I, VF);
6666       if (Decision == CM_Widen || Decision == CM_Widen_Reverse)
6667         // Scalarize a widened load of address.
6668         setWideningDecision(
6669             I, VF, CM_Scalarize,
6670             (VF.getKnownMinValue() *
6671              getMemoryInstructionCost(I, ElementCount::getFixed(1))));
6672       else if (auto Group = getInterleavedAccessGroup(I)) {
6673         // Scalarize an interleave group of address loads.
6674         for (unsigned I = 0; I < Group->getFactor(); ++I) {
6675           if (Instruction *Member = Group->getMember(I))
6676             setWideningDecision(
6677                 Member, VF, CM_Scalarize,
6678                 (VF.getKnownMinValue() *
6679                  getMemoryInstructionCost(Member, ElementCount::getFixed(1))));
6680         }
6681       }
6682     } else
6683       // Make sure I gets scalarized and a cost estimate without
6684       // scalarization overhead.
6685       ForcedScalars[VF].insert(I);
6686   }
6687 }
6688 
6689 void LoopVectorizationCostModel::setVectorizedCallDecision(ElementCount VF) {
6690   assert(!VF.isScalar() &&
6691          "Trying to set a vectorization decision for a scalar VF");
6692 
6693   for (BasicBlock *BB : TheLoop->blocks()) {
6694     // For each instruction in the old loop.
6695     for (Instruction &I : *BB) {
6696       CallInst *CI = dyn_cast<CallInst>(&I);
6697 
6698       if (!CI)
6699         continue;
6700 
6701       InstructionCost ScalarCost = InstructionCost::getInvalid();
6702       InstructionCost VectorCost = InstructionCost::getInvalid();
6703       InstructionCost IntrinsicCost = InstructionCost::getInvalid();
6704       TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
6705 
6706       Function *ScalarFunc = CI->getCalledFunction();
6707       Type *ScalarRetTy = CI->getType();
6708       SmallVector<Type *, 4> Tys, ScalarTys;
6709       bool MaskRequired = Legal->isMaskRequired(CI);
6710       for (auto &ArgOp : CI->args())
6711         ScalarTys.push_back(ArgOp->getType());
6712 
6713       // Compute corresponding vector type for return value and arguments.
6714       Type *RetTy = ToVectorTy(ScalarRetTy, VF);
6715       for (Type *ScalarTy : ScalarTys)
6716         Tys.push_back(ToVectorTy(ScalarTy, VF));
6717 
6718       // An in-loop reduction using an fmuladd intrinsic is a special case;
6719       // we don't want the normal cost for that intrinsic.
6720       if (RecurrenceDescriptor::isFMulAddIntrinsic(CI))
6721         if (auto RedCost = getReductionPatternCost(CI, VF, RetTy, CostKind)) {
6722           setCallWideningDecision(CI, VF, CM_IntrinsicCall, nullptr,
6723                                   getVectorIntrinsicIDForCall(CI, TLI),
6724                                   std::nullopt, *RedCost);
6725           continue;
6726         }
6727 
6728       // Estimate cost of scalarized vector call. The source operands are
6729       // assumed to be vectors, so we need to extract individual elements from
6730       // there, execute VF scalar calls, and then gather the result into the
6731       // vector return value.
6732       InstructionCost ScalarCallCost =
6733           TTI.getCallInstrCost(ScalarFunc, ScalarRetTy, ScalarTys, CostKind);
6734 
6735       // Compute costs of unpacking argument values for the scalar calls and
6736       // packing the return values to a vector.
6737       InstructionCost ScalarizationCost =
6738           getScalarizationOverhead(CI, VF, CostKind);
6739 
6740       ScalarCost = ScalarCallCost * VF.getKnownMinValue() + ScalarizationCost;
6741 
6742       // Find the cost of vectorizing the call, if we can find a suitable
6743       // vector variant of the function.
6744       bool UsesMask = false;
6745       VFInfo FuncInfo;
6746       Function *VecFunc = nullptr;
6747       // Search through any available variants for one we can use at this VF.
6748       for (VFInfo &Info : VFDatabase::getMappings(*CI)) {
6749         // Must match requested VF.
6750         if (Info.Shape.VF != VF)
6751           continue;
6752 
6753         // Must take a mask argument if one is required
6754         if (MaskRequired && !Info.isMasked())
6755           continue;
6756 
6757         // Check that all parameter kinds are supported
6758         bool ParamsOk = true;
6759         for (VFParameter Param : Info.Shape.Parameters) {
6760           switch (Param.ParamKind) {
6761           case VFParamKind::Vector:
6762             break;
6763           case VFParamKind::OMP_Uniform: {
6764             Value *ScalarParam = CI->getArgOperand(Param.ParamPos);
6765             // Make sure the scalar parameter in the loop is invariant.
6766             if (!PSE.getSE()->isLoopInvariant(PSE.getSCEV(ScalarParam),
6767                                               TheLoop))
6768               ParamsOk = false;
6769             break;
6770           }
6771           case VFParamKind::OMP_Linear: {
6772             Value *ScalarParam = CI->getArgOperand(Param.ParamPos);
6773             // Find the stride for the scalar parameter in this loop and see if
6774             // it matches the stride for the variant.
6775             // TODO: do we need to figure out the cost of an extract to get the
6776             // first lane? Or do we hope that it will be folded away?
6777             ScalarEvolution *SE = PSE.getSE();
6778             const auto *SAR =
6779                 dyn_cast<SCEVAddRecExpr>(SE->getSCEV(ScalarParam));
6780 
6781             if (!SAR || SAR->getLoop() != TheLoop) {
6782               ParamsOk = false;
6783               break;
6784             }
6785 
6786             const SCEVConstant *Step =
6787                 dyn_cast<SCEVConstant>(SAR->getStepRecurrence(*SE));
6788 
6789             if (!Step ||
6790                 Step->getAPInt().getSExtValue() != Param.LinearStepOrPos)
6791               ParamsOk = false;
6792 
6793             break;
6794           }
6795           case VFParamKind::GlobalPredicate:
6796             UsesMask = true;
6797             break;
6798           default:
6799             ParamsOk = false;
6800             break;
6801           }
6802         }
6803 
6804         if (!ParamsOk)
6805           continue;
6806 
6807         // Found a suitable candidate, stop here.
6808         VecFunc = CI->getModule()->getFunction(Info.VectorName);
6809         FuncInfo = Info;
6810         break;
6811       }
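      // Illustrative example (names hypothetical, not from the original
      // source): for a call to foo(double) at VF = 4, a mapping might
      // advertise a variant taking and returning <4 x double>; if the loop
      // requires a mask but the variant is unmasked, it was rejected above,
      // and unsupported parameter kinds likewise set ParamsOk to false.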
6812 
6813       // Add in the cost of synthesizing a mask if one wasn't required.
6814       InstructionCost MaskCost = 0;
6815       if (VecFunc && UsesMask && !MaskRequired)
6816         MaskCost = TTI.getShuffleCost(
6817             TargetTransformInfo::SK_Broadcast,
6818             VectorType::get(IntegerType::getInt1Ty(
6819                                 VecFunc->getFunctionType()->getContext()),
6820                             VF));
6821 
6822       if (TLI && VecFunc && !CI->isNoBuiltin())
6823         VectorCost =
6824             TTI.getCallInstrCost(nullptr, RetTy, Tys, CostKind) + MaskCost;
6825 
6826       // Find the cost of an intrinsic; some targets may have instructions that
6827       // perform the operation without needing an actual call.
6828       Intrinsic::ID IID = getVectorIntrinsicIDForCall(CI, TLI);
6829       if (IID != Intrinsic::not_intrinsic)
6830         IntrinsicCost = getVectorIntrinsicCost(CI, VF);
6831 
6832       InstructionCost Cost = ScalarCost;
6833       InstWidening Decision = CM_Scalarize;
6834 
6835       if (VectorCost <= Cost) {
6836         Cost = VectorCost;
6837         Decision = CM_VectorCall;
6838       }
6839 
6840       if (IntrinsicCost <= Cost) {
6841         Cost = IntrinsicCost;
6842         Decision = CM_IntrinsicCall;
6843       }
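      // Note the ordering above: because both comparisons use <=, a tie
      // favors the vector call over scalarization and the intrinsic over
      // both, since each later check may overwrite the earlier decision.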
6844 
6845       setCallWideningDecision(CI, VF, Decision, VecFunc, IID,
6846                               FuncInfo.getParamIndexForOptionalMask(), Cost);
6847     }
6848   }
6849 }
6850 
6851 InstructionCost
6852 LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF,
6853                                                Type *&VectorTy) {
6854   Type *RetTy = I->getType();
6855   if (canTruncateToMinimalBitwidth(I, VF))
6856     RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]);
6857   auto SE = PSE.getSE();
6858   TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
6859 
6860   auto hasSingleCopyAfterVectorization = [this](Instruction *I,
6861                                                 ElementCount VF) -> bool {
6862     if (VF.isScalar())
6863       return true;
6864 
6865     auto Scalarized = InstsToScalarize.find(VF);
6866     assert(Scalarized != InstsToScalarize.end() &&
6867            "VF not yet analyzed for scalarization profitability");
6868     return !Scalarized->second.count(I) &&
6869            llvm::all_of(I->users(), [&](User *U) {
6870              auto *UI = cast<Instruction>(U);
6871              return !Scalarized->second.count(UI);
6872            });
6873   };
6874   (void) hasSingleCopyAfterVectorization;
6875 
6876   if (isScalarAfterVectorization(I, VF)) {
6877     // With the exception of GEPs and PHIs, after scalarization there should
6878     // only be one copy of the instruction generated in the loop. This is
6879     // because the VF is either 1, or any instructions that need scalarizing
6880     // have already been dealt with by the time we get here. As a result,
6881     // it means we don't have to multiply the instruction cost by VF.
6882     assert(I->getOpcode() == Instruction::GetElementPtr ||
6883            I->getOpcode() == Instruction::PHI ||
6884            (I->getOpcode() == Instruction::BitCast &&
6885             I->getType()->isPointerTy()) ||
6886            hasSingleCopyAfterVectorization(I, VF));
6887     VectorTy = RetTy;
6888   } else
6889     VectorTy = ToVectorTy(RetTy, VF);
6890 
6891   // TODO: We need to estimate the cost of intrinsic calls.
6892   switch (I->getOpcode()) {
6893   case Instruction::GetElementPtr:
6894     // We mark this instruction as zero-cost because the cost of GEPs in
6895     // vectorized code depends on whether the corresponding memory instruction
6896     // is scalarized or not. Therefore, we handle GEPs with the memory
6897     // instruction cost.
6898     return 0;
6899   case Instruction::Br: {
6900     // In cases of scalarized and predicated instructions, there will be VF
6901     // predicated blocks in the vectorized loop. Each branch around these
6902     // blocks also requires an extract of its vector compare i1 element.
6903     bool ScalarPredicatedBB = false;
6904     BranchInst *BI = cast<BranchInst>(I);
6905     if (VF.isVector() && BI->isConditional() &&
6906         (PredicatedBBsAfterVectorization[VF].count(BI->getSuccessor(0)) ||
6907          PredicatedBBsAfterVectorization[VF].count(BI->getSuccessor(1))))
6908       ScalarPredicatedBB = true;
6909 
6910     if (ScalarPredicatedBB) {
6911       // Not possible to scalarize scalable vector with predicated instructions.
6912       if (VF.isScalable())
6913         return InstructionCost::getInvalid();
6914       // Return cost for branches around scalarized and predicated blocks.
6915       auto *Vec_i1Ty =
6916           VectorType::get(IntegerType::getInt1Ty(RetTy->getContext()), VF);
6917       return (
6918           TTI.getScalarizationOverhead(
6919               Vec_i1Ty, APInt::getAllOnes(VF.getFixedValue()),
6920               /*Insert*/ false, /*Extract*/ true, CostKind) +
6921           (TTI.getCFInstrCost(Instruction::Br, CostKind) * VF.getFixedValue()));
6922     } else if (I->getParent() == TheLoop->getLoopLatch() || VF.isScalar())
6923       // The back-edge branch will remain, as will all scalar branches.
6924       return TTI.getCFInstrCost(Instruction::Br, CostKind);
6925     else
6926       // This branch will be eliminated by if-conversion.
6927       return 0;
6928     // Note: We currently assume zero cost for an unconditional branch inside
6929     // a predicated block since it will become a fall-through, although we
6930     // may decide in the future to call TTI for all branches.
6931   }
6932   case Instruction::PHI: {
6933     auto *Phi = cast<PHINode>(I);
6934 
6935     // First-order recurrences are replaced by vector shuffles inside the loop.
6936     if (VF.isVector() && Legal->isFixedOrderRecurrence(Phi)) {
6937       SmallVector<int> Mask(VF.getKnownMinValue());
6938       std::iota(Mask.begin(), Mask.end(), VF.getKnownMinValue() - 1);
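      // For example, with a fixed VF of 4 the mask is <3, 4, 5, 6>: the last
      // element of the previous iteration's vector followed by the first three
      // elements of the current one, i.e. a splice at index VF - 1.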
6939       return TTI.getShuffleCost(TargetTransformInfo::SK_Splice,
6940                                 cast<VectorType>(VectorTy), Mask, CostKind,
6941                                 VF.getKnownMinValue() - 1);
6942     }
6943 
6944     // Phi nodes in non-header blocks (not inductions, reductions, etc.) are
6945     // converted into select instructions. We require N - 1 selects per phi
6946     // node, where N is the number of incoming values.
6947     if (VF.isVector() && Phi->getParent() != TheLoop->getHeader())
6948       return (Phi->getNumIncomingValues() - 1) *
6949              TTI.getCmpSelInstrCost(
6950                  Instruction::Select, ToVectorTy(Phi->getType(), VF),
6951                  ToVectorTy(Type::getInt1Ty(Phi->getContext()), VF),
6952                  CmpInst::BAD_ICMP_PREDICATE, CostKind);
6953 
6954     return TTI.getCFInstrCost(Instruction::PHI, CostKind);
6955   }
6956   case Instruction::UDiv:
6957   case Instruction::SDiv:
6958   case Instruction::URem:
6959   case Instruction::SRem:
6960     if (VF.isVector() && isPredicatedInst(I)) {
6961       const auto [ScalarCost, SafeDivisorCost] = getDivRemSpeculationCost(I, VF);
6962       return isDivRemScalarWithPredication(ScalarCost, SafeDivisorCost) ?
6963         ScalarCost : SafeDivisorCost;
6964     }
6965     // We've proven all lanes safe to speculate, fall through.
6966     [[fallthrough]];
6967   case Instruction::Add:
6968   case Instruction::FAdd:
6969   case Instruction::Sub:
6970   case Instruction::FSub:
6971   case Instruction::Mul:
6972   case Instruction::FMul:
6973   case Instruction::FDiv:
6974   case Instruction::FRem:
6975   case Instruction::Shl:
6976   case Instruction::LShr:
6977   case Instruction::AShr:
6978   case Instruction::And:
6979   case Instruction::Or:
6980   case Instruction::Xor: {
6981     // If we're speculating on the stride being 1, the multiplication may
6982     // fold away.  We can generalize this for all operations using the notion
6983     // of neutral elements.  (TODO)
6984     if (I->getOpcode() == Instruction::Mul &&
6985         (PSE.getSCEV(I->getOperand(0))->isOne() ||
6986          PSE.getSCEV(I->getOperand(1))->isOne()))
6987       return 0;
6988 
6989     // Detect reduction patterns
6990     if (auto RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind))
6991       return *RedCost;
6992 
6993     // Certain instructions can be cheaper to vectorize if they have a constant
6994     // second vector operand. One example of this are shifts on x86.
6995     Value *Op2 = I->getOperand(1);
6996     auto Op2Info = TTI.getOperandInfo(Op2);
6997     if (Op2Info.Kind == TargetTransformInfo::OK_AnyValue &&
6998         Legal->isInvariant(Op2))
6999       Op2Info.Kind = TargetTransformInfo::OK_UniformValue;
7000 
7001     SmallVector<const Value *, 4> Operands(I->operand_values());
7002     auto InstrCost = TTI.getArithmeticInstrCost(
7003         I->getOpcode(), VectorTy, CostKind,
7004         {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
7005         Op2Info, Operands, I);
7006 
7007     // Some targets can replace frem with vector library calls.
7008     InstructionCost VecCallCost = InstructionCost::getInvalid();
7009     if (I->getOpcode() == Instruction::FRem) {
7010       LibFunc Func;
7011       if (TLI->getLibFunc(I->getOpcode(), I->getType(), Func) &&
7012           TLI->isFunctionVectorizable(TLI->getName(Func), VF)) {
7013         SmallVector<Type *, 4> OpTypes;
7014         for (auto &Op : I->operands())
7015           OpTypes.push_back(Op->getType());
7016         VecCallCost =
7017             TTI.getCallInstrCost(nullptr, VectorTy, OpTypes, CostKind);
7018       }
7019     }
7020     return std::min(InstrCost, VecCallCost);
7021   }
7022   case Instruction::FNeg: {
7023     return TTI.getArithmeticInstrCost(
7024         I->getOpcode(), VectorTy, CostKind,
7025         {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
7026         {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
7027         I->getOperand(0), I);
7028   }
7029   case Instruction::Select: {
7030     SelectInst *SI = cast<SelectInst>(I);
7031     const SCEV *CondSCEV = SE->getSCEV(SI->getCondition());
7032     bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop));
7033 
7034     const Value *Op0, *Op1;
7035     using namespace llvm::PatternMatch;
7036     if (!ScalarCond && (match(I, m_LogicalAnd(m_Value(Op0), m_Value(Op1))) ||
7037                         match(I, m_LogicalOr(m_Value(Op0), m_Value(Op1))))) {
7038       // select x, y, false --> x & y
7039       // select x, true, y --> x | y
7040       const auto [Op1VK, Op1VP] = TTI::getOperandInfo(Op0);
7041       const auto [Op2VK, Op2VP] = TTI::getOperandInfo(Op1);
7042       assert(Op0->getType()->getScalarSizeInBits() == 1 &&
7043               Op1->getType()->getScalarSizeInBits() == 1);
7044 
7045       SmallVector<const Value *, 2> Operands{Op0, Op1};
7046       return TTI.getArithmeticInstrCost(
7047           match(I, m_LogicalOr()) ? Instruction::Or : Instruction::And, VectorTy,
7048           CostKind, {Op1VK, Op1VP}, {Op2VK, Op2VP}, Operands, I);
7049     }
7050 
7051     Type *CondTy = SI->getCondition()->getType();
7052     if (!ScalarCond)
7053       CondTy = VectorType::get(CondTy, VF);
7054 
7055     CmpInst::Predicate Pred = CmpInst::BAD_ICMP_PREDICATE;
7056     if (auto *Cmp = dyn_cast<CmpInst>(SI->getCondition()))
7057       Pred = Cmp->getPredicate();
7058     return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy, Pred,
7059                                   CostKind, I);
7060   }
7061   case Instruction::ICmp:
7062   case Instruction::FCmp: {
7063     Type *ValTy = I->getOperand(0)->getType();
7064     Instruction *Op0AsInstruction = dyn_cast<Instruction>(I->getOperand(0));
7065     if (canTruncateToMinimalBitwidth(Op0AsInstruction, VF))
7066       ValTy = IntegerType::get(ValTy->getContext(), MinBWs[Op0AsInstruction]);
7067     VectorTy = ToVectorTy(ValTy, VF);
7068     return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, nullptr,
7069                                   cast<CmpInst>(I)->getPredicate(), CostKind,
7070                                   I);
7071   }
7072   case Instruction::Store:
7073   case Instruction::Load: {
7074     ElementCount Width = VF;
7075     if (Width.isVector()) {
7076       InstWidening Decision = getWideningDecision(I, Width);
7077       assert(Decision != CM_Unknown &&
7078              "CM decision should be taken at this point");
7079       if (getWideningCost(I, VF) == InstructionCost::getInvalid())
7080         return InstructionCost::getInvalid();
7081       if (Decision == CM_Scalarize)
7082         Width = ElementCount::getFixed(1);
7083     }
7084     VectorTy = ToVectorTy(getLoadStoreType(I), Width);
7085     return getMemoryInstructionCost(I, VF);
7086   }
7087   case Instruction::BitCast:
7088     if (I->getType()->isPointerTy())
7089       return 0;
7090     [[fallthrough]];
7091   case Instruction::ZExt:
7092   case Instruction::SExt:
7093   case Instruction::FPToUI:
7094   case Instruction::FPToSI:
7095   case Instruction::FPExt:
7096   case Instruction::PtrToInt:
7097   case Instruction::IntToPtr:
7098   case Instruction::SIToFP:
7099   case Instruction::UIToFP:
7100   case Instruction::Trunc:
7101   case Instruction::FPTrunc: {
7102     // Computes the CastContextHint from a Load/Store instruction.
7103     auto ComputeCCH = [&](Instruction *I) -> TTI::CastContextHint {
7104       assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
7105              "Expected a load or a store!");
7106 
7107       if (VF.isScalar() || !TheLoop->contains(I))
7108         return TTI::CastContextHint::Normal;
7109 
7110       switch (getWideningDecision(I, VF)) {
7111       case LoopVectorizationCostModel::CM_GatherScatter:
7112         return TTI::CastContextHint::GatherScatter;
7113       case LoopVectorizationCostModel::CM_Interleave:
7114         return TTI::CastContextHint::Interleave;
7115       case LoopVectorizationCostModel::CM_Scalarize:
7116       case LoopVectorizationCostModel::CM_Widen:
7117         return Legal->isMaskRequired(I) ? TTI::CastContextHint::Masked
7118                                         : TTI::CastContextHint::Normal;
7119       case LoopVectorizationCostModel::CM_Widen_Reverse:
7120         return TTI::CastContextHint::Reversed;
7121       case LoopVectorizationCostModel::CM_Unknown:
7122         llvm_unreachable("Instr did not go through cost modelling?");
7123       case LoopVectorizationCostModel::CM_VectorCall:
7124       case LoopVectorizationCostModel::CM_IntrinsicCall:
7125         llvm_unreachable_internal("Instr has invalid widening decision");
7126       }
7127 
7128       llvm_unreachable("Unhandled case!");
7129     };
7130 
7131     unsigned Opcode = I->getOpcode();
7132     TTI::CastContextHint CCH = TTI::CastContextHint::None;
7133     // For Trunc, the context is the only user, which must be a StoreInst.
7134     if (Opcode == Instruction::Trunc || Opcode == Instruction::FPTrunc) {
7135       if (I->hasOneUse())
7136         if (StoreInst *Store = dyn_cast<StoreInst>(*I->user_begin()))
7137           CCH = ComputeCCH(Store);
7138     }
7139     // For Z/Sext, the context is the operand, which must be a LoadInst.
7140     else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt ||
7141              Opcode == Instruction::FPExt) {
7142       if (LoadInst *Load = dyn_cast<LoadInst>(I->getOperand(0)))
7143         CCH = ComputeCCH(Load);
7144     }
7145 
7146     // We optimize the truncation of induction variables having constant
7147     // integer steps. The cost of these truncations is the same as the scalar
7148     // operation.
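    // For example (illustrative), "trunc i64 %iv to i32" for an induction %iv
    // with a constant step can be generated directly as a narrower i32
    // induction, so only a scalar truncation is costed below.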
7149     if (isOptimizableIVTruncate(I, VF)) {
7150       auto *Trunc = cast<TruncInst>(I);
7151       return TTI.getCastInstrCost(Instruction::Trunc, Trunc->getDestTy(),
7152                                   Trunc->getSrcTy(), CCH, CostKind, Trunc);
7153     }
7154 
7155     // Detect reduction patterns
7156     if (auto RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind))
7157       return *RedCost;
7158 
7159     Type *SrcScalarTy = I->getOperand(0)->getType();
7160     Type *SrcVecTy =
7161         VectorTy->isVectorTy() ? ToVectorTy(SrcScalarTy, VF) : SrcScalarTy;
7162     if (canTruncateToMinimalBitwidth(I, VF)) {
7163       // This cast is going to be shrunk. This may remove the cast or turn it
7164       // into a slightly different cast. For example, if MinBW == 16,
7165       // "zext i8 %1 to i32" becomes "zext i8 %1 to i16".
7166       //
7167       // Calculate the modified src and dest types.
7168       Type *MinVecTy = VectorTy;
7169       if (Opcode == Instruction::Trunc) {
7170         SrcVecTy = smallestIntegerVectorType(SrcVecTy, MinVecTy);
7171         VectorTy =
7172             largestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy);
7173       } else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt) {
7174         // Leave SrcVecTy unchanged - we only shrink the destination element
7175         // type.
7176         VectorTy =
7177             smallestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy);
7178       }
7179     }
7180 
7181     return TTI.getCastInstrCost(Opcode, VectorTy, SrcVecTy, CCH, CostKind, I);
7182   }
7183   case Instruction::Call:
7184     return getVectorCallCost(cast<CallInst>(I), VF);
7185   case Instruction::ExtractValue:
7186     return TTI.getInstructionCost(I, TTI::TCK_RecipThroughput);
7187   case Instruction::Alloca:
7188     // We cannot easily widen alloca to a scalable alloca, as
7189     // the result would need to be a vector of pointers.
7190     if (VF.isScalable())
7191       return InstructionCost::getInvalid();
7192     [[fallthrough]];
7193   default:
7194     // This opcode is unknown. Assume that it is the same as 'mul'.
7195     return TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind);
7196   } // end of switch.
7197 }
7198 
7199 void LoopVectorizationCostModel::collectValuesToIgnore() {
7200   // Ignore ephemeral values.
7201   CodeMetrics::collectEphemeralValues(TheLoop, AC, ValuesToIgnore);
7202 
7203   // Find all stores to invariant variables. Since they are going to sink
7204   // outside the loop, we do not need to calculate their cost.
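  // For example (illustrative), in a loop that repeatedly stores a running
  // reduction to the same location:
  //   for (i = 0; i < n; i++) { sum += a[i]; *p = sum; }
  // only the final store survives after sinking, so the in-loop stores are
  // ignored for costing.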
7205   for (BasicBlock *BB : TheLoop->blocks())
7206     for (Instruction &I : *BB) {
7207       StoreInst *SI;
7208       if ((SI = dyn_cast<StoreInst>(&I)) &&
7209           Legal->isInvariantAddressOfReduction(SI->getPointerOperand()))
7210         ValuesToIgnore.insert(&I);
7211     }
7212 
7213   // Ignore type-promoting instructions we identified during reduction
7214   // detection.
7215   for (const auto &Reduction : Legal->getReductionVars()) {
7216     const RecurrenceDescriptor &RedDes = Reduction.second;
7217     const SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts();
7218     VecValuesToIgnore.insert(Casts.begin(), Casts.end());
7219   }
7220   // Ignore type-casting instructions we identified during induction
7221   // detection.
7222   for (const auto &Induction : Legal->getInductionVars()) {
7223     const InductionDescriptor &IndDes = Induction.second;
7224     const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts();
7225     VecValuesToIgnore.insert(Casts.begin(), Casts.end());
7226   }
7227 }
7228 
7229 void LoopVectorizationCostModel::collectInLoopReductions() {
7230   for (const auto &Reduction : Legal->getReductionVars()) {
7231     PHINode *Phi = Reduction.first;
7232     const RecurrenceDescriptor &RdxDesc = Reduction.second;
7233 
7234     // We don't collect reductions that are type promoted (yet).
7235     if (RdxDesc.getRecurrenceType() != Phi->getType())
7236       continue;
7237 
7238     // If the target would prefer this reduction to happen "in-loop", then we
7239     // want to record it as such.
7240     unsigned Opcode = RdxDesc.getOpcode();
7241     if (!PreferInLoopReductions && !useOrderedReductions(RdxDesc) &&
7242         !TTI.preferInLoopReduction(Opcode, Phi->getType(),
7243                                    TargetTransformInfo::ReductionFlags()))
7244       continue;
7245 
7246     // Check that we can correctly put the reductions into the loop, by
7247     // finding the chain of operations that leads from the phi to the loop
7248     // exit value.
7249     SmallVector<Instruction *, 4> ReductionOperations =
7250         RdxDesc.getReductionOpChain(Phi, TheLoop);
7251     bool InLoop = !ReductionOperations.empty();
7252 
7253     if (InLoop) {
7254       InLoopReductions.insert(Phi);
7255       // Add the elements to InLoopReductionImmediateChains for cost modelling.
7256       Instruction *LastChain = Phi;
7257       for (auto *I : ReductionOperations) {
7258         InLoopReductionImmediateChains[I] = LastChain;
7259         LastChain = I;
7260       }
7261     }
7262     LLVM_DEBUG(dbgs() << "LV: Using " << (InLoop ? "inloop" : "out of loop")
7263                       << " reduction for phi: " << *Phi << "\n");
7264   }
7265 }
7266 
7267 VPValue *VPBuilder::createICmp(CmpInst::Predicate Pred, VPValue *A, VPValue *B,
7268                                DebugLoc DL, const Twine &Name) {
7269   assert(Pred >= CmpInst::FIRST_ICMP_PREDICATE &&
7270          Pred <= CmpInst::LAST_ICMP_PREDICATE && "invalid predicate");
7271   return tryInsertInstruction(
7272       new VPInstruction(Instruction::ICmp, Pred, A, B, DL, Name));
7273 }
7274 
7275 // This function will select a scalable VF if the target supports scalable
7276 // vectors and a fixed one otherwise.
7277 // TODO: we could return a pair of values that specify the max VF and
7278 // min VF, to be used in `buildVPlans(MinVF, MaxVF)` instead of
7279 // `buildVPlans(VF, VF)`. We cannot do it because VPlan at the moment
7280 // doesn't have a cost model that can choose which plan to execute if
7281 // more than one is generated.
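// For example (illustrative), with 128-bit vector registers and a widest
// scalar type of 32 bits this returns a VF of 4 (a scalable VF of
// "vscale x 4" if the target enables scalable vectorization).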
7282 static ElementCount determineVPlanVF(const TargetTransformInfo &TTI,
7283                                      LoopVectorizationCostModel &CM) {
7284   unsigned WidestType;
7285   std::tie(std::ignore, WidestType) = CM.getSmallestAndWidestTypes();
7286 
7287   TargetTransformInfo::RegisterKind RegKind =
7288       TTI.enableScalableVectorization()
7289           ? TargetTransformInfo::RGK_ScalableVector
7290           : TargetTransformInfo::RGK_FixedWidthVector;
7291 
7292   TypeSize RegSize = TTI.getRegisterBitWidth(RegKind);
7293   unsigned N = RegSize.getKnownMinValue() / WidestType;
7294   return ElementCount::get(N, RegSize.isScalable());
7295 }
7296 
7297 VectorizationFactor
7298 LoopVectorizationPlanner::planInVPlanNativePath(ElementCount UserVF) {
7299   ElementCount VF = UserVF;
7300   // Outer loop handling: outer loops may require CFG and instruction-level
7301   // transformations before even evaluating whether vectorization is
7302   // profitable. Since we cannot modify the incoming IR, we need to build
7303   // VPlan upfront in the vectorization pipeline.
7304   if (!OrigLoop->isInnermost()) {
7305     // If the user doesn't provide a vectorization factor, determine a
7306     // reasonable one.
7307     if (UserVF.isZero()) {
7308       VF = determineVPlanVF(TTI, CM);
7309       LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n");
7310 
7311       // Make sure we have a VF > 1 for stress testing.
7312       if (VPlanBuildStressTest && (VF.isScalar() || VF.isZero())) {
7313         LLVM_DEBUG(dbgs() << "LV: VPlan stress testing: "
7314                           << "overriding computed VF.\n");
7315         VF = ElementCount::getFixed(4);
7316       }
7317     } else if (UserVF.isScalable() && !TTI.supportsScalableVectors() &&
7318                !ForceTargetSupportsScalableVectors) {
7319       LLVM_DEBUG(dbgs() << "LV: Not vectorizing. Scalable VF requested, but "
7320                         << "not supported by the target.\n");
7321       reportVectorizationFailure(
7322           "Scalable vectorization requested but not supported by the target",
7323           "the scalable user-specified vectorization width for outer-loop "
7324           "vectorization cannot be used because the target does not support "
7325           "scalable vectors.",
7326           "ScalableVFUnfeasible", ORE, OrigLoop);
7327       return VectorizationFactor::Disabled();
7328     }
7329     assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
7330     assert(isPowerOf2_32(VF.getKnownMinValue()) &&
7331            "VF needs to be a power of two");
7332     LLVM_DEBUG(dbgs() << "LV: Using " << (!UserVF.isZero() ? "user " : "")
7333                       << "VF " << VF << " to build VPlans.\n");
7334     buildVPlans(VF, VF);
7335 
7336     // For VPlan build stress testing, we bail out after VPlan construction.
7337     if (VPlanBuildStressTest)
7338       return VectorizationFactor::Disabled();
7339 
7340     return {VF, 0 /*Cost*/, 0 /* ScalarCost */};
7341   }
7342 
7343   LLVM_DEBUG(
7344       dbgs() << "LV: Not vectorizing. Inner loops aren't supported in the "
7345                 "VPlan-native path.\n");
7346   return VectorizationFactor::Disabled();
7347 }
7348 
7349 std::optional<VectorizationFactor>
7350 LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
7351   assert(OrigLoop->isInnermost() && "Inner loop expected.");
7352   CM.collectValuesToIgnore();
7353   CM.collectElementTypesForWidening();
7354 
7355   FixedScalableVFPair MaxFactors = CM.computeMaxVF(UserVF, UserIC);
7356   if (!MaxFactors) // Cases that should not be vectorized or interleaved.
7357     return std::nullopt;
7358 
7359   // Invalidate interleave groups if all blocks of loop will be predicated.
7360   if (CM.blockNeedsPredicationForAnyReason(OrigLoop->getHeader()) &&
7361       !useMaskedInterleavedAccesses(TTI)) {
7362     LLVM_DEBUG(
7363         dbgs()
7364         << "LV: Invalidate all interleaved groups due to fold-tail by masking "
7365            "which requires masked-interleaved support.\n");
7366     if (CM.InterleaveInfo.invalidateGroups())
7367       // Invalidating interleave groups also requires invalidating all decisions
7368       // based on them, which includes widening decisions and uniform and scalar
7369       // values.
7370       CM.invalidateCostModelingDecisions();
7371   }
7372 
7373   ElementCount MaxUserVF =
7374       UserVF.isScalable() ? MaxFactors.ScalableVF : MaxFactors.FixedVF;
7375   bool UserVFIsLegal = ElementCount::isKnownLE(UserVF, MaxUserVF);
7376   if (!UserVF.isZero() && UserVFIsLegal) {
7377     assert(isPowerOf2_32(UserVF.getKnownMinValue()) &&
7378            "VF needs to be a power of two");
7379     // Collect the instructions (and their associated costs) that will be more
7380     // profitable to scalarize.
7381     CM.collectInLoopReductions();
7382     if (CM.selectUserVectorizationFactor(UserVF)) {
7383       LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n");
7384       buildVPlansWithVPRecipes(UserVF, UserVF);
7385       if (!hasPlanWithVF(UserVF)) {
7386         LLVM_DEBUG(dbgs() << "LV: No VPlan could be built for " << UserVF
7387                           << ".\n");
7388         return std::nullopt;
7389       }
7390 
7391       LLVM_DEBUG(printPlans(dbgs()));
7392       return {{UserVF, 0, 0}};
7393     } else
7394       reportVectorizationInfo("UserVF ignored because of invalid costs.",
7395                               "InvalidCost", ORE, OrigLoop);
7396   }
7397 
7398   // Populate the set of Vectorization Factor Candidates.
7399   ElementCountSet VFCandidates;
7400   for (auto VF = ElementCount::getFixed(1);
7401        ElementCount::isKnownLE(VF, MaxFactors.FixedVF); VF *= 2)
7402     VFCandidates.insert(VF);
7403   for (auto VF = ElementCount::getScalable(1);
7404        ElementCount::isKnownLE(VF, MaxFactors.ScalableVF); VF *= 2)
7405     VFCandidates.insert(VF);
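  // For example (illustrative), with MaxFactors = {fixed 16, scalable 4} the
  // candidate set is {1, 2, 4, 8, 16} plus {vscale x 1, vscale x 2,
  // vscale x 4}.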
7406 
7407   CM.collectInLoopReductions();
7408   for (const auto &VF : VFCandidates) {
7409     // Collect Uniform and Scalar instructions after vectorization with VF.
7410     CM.collectUniformsAndScalars(VF);
7411 
7412     // Collect the instructions (and their associated costs) that will be more
7413     // profitable to scalarize.
7414     if (VF.isVector())
7415       CM.collectInstsToScalarize(VF);
7416   }
7417 
7418   buildVPlansWithVPRecipes(ElementCount::getFixed(1), MaxFactors.FixedVF);
7419   buildVPlansWithVPRecipes(ElementCount::getScalable(1), MaxFactors.ScalableVF);
7420 
7421   LLVM_DEBUG(printPlans(dbgs()));
7422   if (!MaxFactors.hasVector())
7423     return VectorizationFactor::Disabled();
7424 
7425   // Select the optimal vectorization factor.
7426   VectorizationFactor VF = selectVectorizationFactor(VFCandidates);
7427   assert((VF.Width.isScalar() || VF.ScalarCost > 0) &&
          "when vectorizing, the scalar cost must be non-zero.");
7428   if (!hasPlanWithVF(VF.Width)) {
7429     LLVM_DEBUG(dbgs() << "LV: No VPlan could be built for " << VF.Width
7430                       << ".\n");
7431     return std::nullopt;
7432   }
7433   return VF;
7434 }
7435 
7436 VPlan &LoopVectorizationPlanner::getBestPlanFor(ElementCount VF) const {
7437   assert(count_if(VPlans,
7438                   [VF](const VPlanPtr &Plan) { return Plan->hasVF(VF); }) ==
7439              1 &&
7440          "Best VF has not a single VPlan.");
7441 
7442   for (const VPlanPtr &Plan : VPlans) {
7443     if (Plan->hasVF(VF))
7444       return *Plan.get();
7445   }
7446   llvm_unreachable("No plan found!");
7447 }
7448 
7449 static void AddRuntimeUnrollDisableMetaData(Loop *L) {
7450   SmallVector<Metadata *, 4> MDs;
7451   // Reserve first location for self reference to the LoopID metadata node.
7452   MDs.push_back(nullptr);
7453   bool IsUnrollMetadata = false;
7454   MDNode *LoopID = L->getLoopID();
7455   if (LoopID) {
7456     // First find existing loop unrolling disable metadata.
7457     for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) {
7458       auto *MD = dyn_cast<MDNode>(LoopID->getOperand(i));
7459       if (MD) {
7460         const auto *S = dyn_cast<MDString>(MD->getOperand(0));
7461         IsUnrollMetadata =
7462             S && S->getString().starts_with("llvm.loop.unroll.disable");
7463       }
7464       MDs.push_back(LoopID->getOperand(i));
7465     }
7466   }
7467 
7468   if (!IsUnrollMetadata) {
7469     // Add runtime unroll disable metadata.
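    // The result (illustrative) is a self-referential loop ID such as:
    //   !0 = distinct !{!0, ..., !1}
    //   !1 = !{!"llvm.loop.unroll.runtime.disable"}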
7470     LLVMContext &Context = L->getHeader()->getContext();
7471     SmallVector<Metadata *, 1> DisableOperands;
7472     DisableOperands.push_back(
7473         MDString::get(Context, "llvm.loop.unroll.runtime.disable"));
7474     MDNode *DisableNode = MDNode::get(Context, DisableOperands);
7475     MDs.push_back(DisableNode);
7476     MDNode *NewLoopID = MDNode::get(Context, MDs);
7477     // Set operand 0 to refer to the loop id itself.
7478     NewLoopID->replaceOperandWith(0, NewLoopID);
7479     L->setLoopID(NewLoopID);
7480   }
7481 }
7482 
7483 // Check if \p RedResult is a ComputeReductionResult instruction, and if it
7484 // is, create a merge phi node for it and add it to \p ReductionResumeValues.
7485 static void createAndCollectMergePhiForReduction(
7486     VPInstruction *RedResult,
7487     DenseMap<const RecurrenceDescriptor *, Value *> &ReductionResumeValues,
7488     VPTransformState &State, Loop *OrigLoop, BasicBlock *LoopMiddleBlock) {
7489   if (!RedResult ||
7490       RedResult->getOpcode() != VPInstruction::ComputeReductionResult)
7491     return;
7492 
7493   auto *PhiR = cast<VPReductionPHIRecipe>(RedResult->getOperand(0));
7494   const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor();
7495 
7496   TrackingVH<Value> ReductionStartValue = RdxDesc.getRecurrenceStartValue();
7497   Value *FinalValue =
7498       State.get(RedResult, VPIteration(State.UF - 1, VPLane::getFirstLane()));
7499   auto *ResumePhi =
7500       dyn_cast<PHINode>(PhiR->getStartValue()->getUnderlyingValue());
7501 
7502   // TODO: bc.merge.rdx should not be created here, instead it should be
7503   // modeled in VPlan.
7504   BasicBlock *LoopScalarPreHeader = OrigLoop->getLoopPreheader();
7505   // Create a phi node that merges control-flow from the backedge-taken check
7506   // block and the middle block.
7507   auto *BCBlockPhi = PHINode::Create(FinalValue->getType(), 2, "bc.merge.rdx",
7508                                      LoopScalarPreHeader->getTerminator());
7509 
7510   // If we are fixing reductions in the epilogue loop then we should already
7511   // have created a bc.merge.rdx Phi after the main vector body. Ensure that
7512   // we carry over the incoming values correctly.
7513   for (auto *Incoming : predecessors(LoopScalarPreHeader)) {
7514     if (Incoming == LoopMiddleBlock)
7515       BCBlockPhi->addIncoming(FinalValue, Incoming);
7516     else if (ResumePhi && is_contained(ResumePhi->blocks(), Incoming))
7517       BCBlockPhi->addIncoming(ResumePhi->getIncomingValueForBlock(Incoming),
7518                               Incoming);
7519     else
7520       BCBlockPhi->addIncoming(ReductionStartValue, Incoming);
7521   }
7522 
7523   auto *OrigPhi = cast<PHINode>(PhiR->getUnderlyingValue());
7524   // TODO: This fixup should instead be modeled in VPlan.
7525   // Fix the scalar loop reduction variable with the incoming reduction sum
7526   // from the vector body and from the backedge value.
7527   int IncomingEdgeBlockIdx =
7528       OrigPhi->getBasicBlockIndex(OrigLoop->getLoopLatch());
7529   assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index");
7530   // Pick the other block.
7531   int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1);
7532   OrigPhi->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi);
7533   Instruction *LoopExitInst = RdxDesc.getLoopExitInstr();
7534   OrigPhi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst);
7535 
7536   ReductionResumeValues[&RdxDesc] = BCBlockPhi;
7537 }
7538 
7539 std::pair<DenseMap<const SCEV *, Value *>,
7540           DenseMap<const RecurrenceDescriptor *, Value *>>
7541 LoopVectorizationPlanner::executePlan(
7542     ElementCount BestVF, unsigned BestUF, VPlan &BestVPlan,
7543     InnerLoopVectorizer &ILV, DominatorTree *DT, bool IsEpilogueVectorization,
7544     const DenseMap<const SCEV *, Value *> *ExpandedSCEVs) {
7545   assert(BestVPlan.hasVF(BestVF) &&
7546          "Trying to execute plan with unsupported VF");
7547   assert(BestVPlan.hasUF(BestUF) &&
7548          "Trying to execute plan with unsupported UF");
7549   assert(
7550       (IsEpilogueVectorization || !ExpandedSCEVs) &&
7551       "expanded SCEVs to reuse can only be used during epilogue vectorization");
7552 
7553   LLVM_DEBUG(dbgs() << "Executing best plan with VF=" << BestVF
7554                     << ", UF=" << BestUF << '\n');
7555 
7556   if (!IsEpilogueVectorization)
7557     VPlanTransforms::optimizeForVFAndUF(BestVPlan, BestVF, BestUF, PSE);
7558 
7559   // Perform the actual loop transformation.
7560   VPTransformState State(BestVF, BestUF, LI, DT, ILV.Builder, &ILV, &BestVPlan,
7561                          OrigLoop->getHeader()->getContext());
7562 
7563   // 0. Generate SCEV-dependent code into the preheader, including TripCount,
7564   // before making any changes to the CFG.
7565   if (!BestVPlan.getPreheader()->empty()) {
7566     State.CFG.PrevBB = OrigLoop->getLoopPreheader();
7567     State.Builder.SetInsertPoint(OrigLoop->getLoopPreheader()->getTerminator());
7568     BestVPlan.getPreheader()->execute(&State);
7569   }
7570   if (!ILV.getTripCount())
7571     ILV.setTripCount(State.get(BestVPlan.getTripCount(), {0, 0}));
7572   else
7573     assert(IsEpilogueVectorization && "should only re-use the existing trip "
7574                                       "count during epilogue vectorization");
7575 
7576   // 1. Set up the skeleton for vectorization, including vector pre-header and
7577   // middle block. The vector loop is created during VPlan execution.
7578   Value *CanonicalIVStartValue;
7579   std::tie(State.CFG.PrevBB, CanonicalIVStartValue) =
7580       ILV.createVectorizedLoopSkeleton(ExpandedSCEVs ? *ExpandedSCEVs
7581                                                      : State.ExpandedSCEVs);
7582 
7583   // Only use noalias metadata when using memory checks guaranteeing no overlap
7584   // across all iterations.
7585   const LoopAccessInfo *LAI = ILV.Legal->getLAI();
7586   std::unique_ptr<LoopVersioning> LVer = nullptr;
7587   if (LAI && !LAI->getRuntimePointerChecking()->getChecks().empty() &&
7588       !LAI->getRuntimePointerChecking()->getDiffChecks()) {
7589 
7590     //  We currently don't use LoopVersioning for the actual loop cloning but we
7591     //  still use it to add the noalias metadata.
7592     //  TODO: Find a better way to re-use LoopVersioning functionality to add
7593     //        metadata.
7594     LVer = std::make_unique<LoopVersioning>(
7595         *LAI, LAI->getRuntimePointerChecking()->getChecks(), OrigLoop, LI, DT,
7596         PSE.getSE());
7597     State.LVer = &*LVer;
7598     State.LVer->prepareNoAliasMetadata();
7599   }
7600 
7601   ILV.collectPoisonGeneratingRecipes(State);
7602 
7603   ILV.printDebugTracesAtStart();
7604 
7605   //===------------------------------------------------===//
7606   //
7607   // Notice: any optimization or new instruction that goes
7608   // into the code below should also be implemented in
7609   // the cost-model.
7610   //
7611   //===------------------------------------------------===//
7612 
7613   // 2. Copy and widen instructions from the old loop into the new loop.
7614   BestVPlan.prepareToExecute(ILV.getTripCount(),
7615                              ILV.getOrCreateVectorTripCount(nullptr),
7616                              CanonicalIVStartValue, State);
7617 
7618   BestVPlan.execute(&State);
7619 
7620   // 2.5 Collect reduction resume values.
7621   DenseMap<const RecurrenceDescriptor *, Value *> ReductionResumeValues;
7622   auto *ExitVPBB =
7623       cast<VPBasicBlock>(BestVPlan.getVectorLoopRegion()->getSingleSuccessor());
7624   for (VPRecipeBase &R : *ExitVPBB) {
7625     createAndCollectMergePhiForReduction(dyn_cast<VPInstruction>(&R),
7626                                          ReductionResumeValues, State, OrigLoop,
7627                                          State.CFG.VPBB2IRBB[ExitVPBB]);
7628   }
7629 
7630   // 2.6. Maintain Loop Hints
7631   // Keep all loop hints from the original loop on the vector loop (we'll
7632   // replace the vectorizer-specific hints below).
7633   MDNode *OrigLoopID = OrigLoop->getLoopID();
7634 
7635   std::optional<MDNode *> VectorizedLoopID =
7636       makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
7637                                       LLVMLoopVectorizeFollowupVectorized});
7638 
7639   VPBasicBlock *HeaderVPBB =
7640       BestVPlan.getVectorLoopRegion()->getEntryBasicBlock();
7641   Loop *L = LI->getLoopFor(State.CFG.VPBB2IRBB[HeaderVPBB]);
7642   if (VectorizedLoopID)
7643     L->setLoopID(*VectorizedLoopID);
7644   else {
7645     // Keep all loop hints from the original loop on the vector loop (we'll
7646     // replace the vectorizer-specific hints below).
7647     if (MDNode *LID = OrigLoop->getLoopID())
7648       L->setLoopID(LID);
7649 
7650     LoopVectorizeHints Hints(L, true, *ORE);
7651     Hints.setAlreadyVectorized();
7652   }
7653   TargetTransformInfo::UnrollingPreferences UP;
7654   TTI.getUnrollingPreferences(L, *PSE.getSE(), UP, ORE);
7655   if (!UP.UnrollVectorizedLoop || CanonicalIVStartValue)
7656     AddRuntimeUnrollDisableMetaData(L);
7657 
7658   // 3. Fix the vectorized code: take care of header phi's, live-outs,
7659   //    predication, updating analyses.
7660   ILV.fixVectorizedLoop(State, BestVPlan);
7661 
7662   ILV.printDebugTracesAtEnd();
7663 
7664   return {State.ExpandedSCEVs, ReductionResumeValues};
7665 }
7666 
7667 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
7668 void LoopVectorizationPlanner::printPlans(raw_ostream &O) {
7669   for (const auto &Plan : VPlans)
7670     if (PrintVPlansInDotFormat)
7671       Plan->printDOT(O);
7672     else
7673       Plan->print(O);
7674 }
7675 #endif
7676 
7677 //===--------------------------------------------------------------------===//
7678 // EpilogueVectorizerMainLoop
7679 //===--------------------------------------------------------------------===//
7680 
7681 /// This function is partially responsible for generating the control flow
7682 /// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization.
7683 std::pair<BasicBlock *, Value *>
7684 EpilogueVectorizerMainLoop::createEpilogueVectorizedLoopSkeleton(
7685     const SCEV2ValueTy &ExpandedSCEVs) {
7686   createVectorLoopSkeleton("");
7687 
7688   // Generate the code to check the minimum iteration count of the vector
7689   // epilogue (see below).
7690   EPI.EpilogueIterationCountCheck =
7691       emitIterationCountCheck(LoopScalarPreHeader, true);
7692   EPI.EpilogueIterationCountCheck->setName("iter.check");
7693 
7694   // Generate the code to check any assumptions that we've made for SCEV
7695   // expressions.
7696   EPI.SCEVSafetyCheck = emitSCEVChecks(LoopScalarPreHeader);
7697 
7698   // Generate the code that checks at runtime if arrays overlap. We put the
7699   // checks into a separate block to make the more common case of few elements
7700   // faster.
7701   EPI.MemSafetyCheck = emitMemRuntimeChecks(LoopScalarPreHeader);
7702 
7703   // Generate the iteration count check for the main loop, *after* the check
7704   // for the epilogue loop, so that the path-length is shorter for the case
7705   // that goes directly through the vector epilogue. The longer-path length for
7706   // the main loop is compensated for, by the gain from vectorizing the larger
7707   // trip count. Note: the branch will get updated later on when we vectorize
7708   // the epilogue.
7709   EPI.MainLoopIterationCountCheck =
7710       emitIterationCountCheck(LoopScalarPreHeader, false);
7711 
7712   // Generate the induction variable.
7713   EPI.VectorTripCount = getOrCreateVectorTripCount(LoopVectorPreHeader);
7714 
7715   // Skip induction resume value creation here because they will be created in
7716   // the second pass for the scalar loop. The induction resume values for the
7717   // inductions in the epilogue loop are created before executing the plan for
7718   // the epilogue loop.
7719 
7720   return {completeLoopSkeleton(), nullptr};
7721 }
7722 
7723 void EpilogueVectorizerMainLoop::printDebugTracesAtStart() {
7724   LLVM_DEBUG({
7725     dbgs() << "Create Skeleton for epilogue vectorized loop (first pass)\n"
7726            << "Main Loop VF:" << EPI.MainLoopVF
7727            << ", Main Loop UF:" << EPI.MainLoopUF
7728            << ", Epilogue Loop VF:" << EPI.EpilogueVF
7729            << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n";
7730   });
7731 }
7732 
7733 void EpilogueVectorizerMainLoop::printDebugTracesAtEnd() {
7734   DEBUG_WITH_TYPE(VerboseDebug, {
7735     dbgs() << "intermediate fn:\n"
7736            << *OrigLoop->getHeader()->getParent() << "\n";
7737   });
7738 }
7739 
7740 BasicBlock *
7741 EpilogueVectorizerMainLoop::emitIterationCountCheck(BasicBlock *Bypass,
7742                                                     bool ForEpilogue) {
7743   assert(Bypass && "Expected valid bypass basic block.");
7744   ElementCount VFactor = ForEpilogue ? EPI.EpilogueVF : VF;
7745   unsigned UFactor = ForEpilogue ? EPI.EpilogueUF : UF;
7746   Value *Count = getTripCount();
7747   // Reuse existing vector loop preheader for TC checks.
7748   // Note that new preheader block is generated for vector loop.
7749   BasicBlock *const TCCheckBlock = LoopVectorPreHeader;
7750   IRBuilder<> Builder(TCCheckBlock->getTerminator());
7751 
7752   // Generate code to check if the loop's trip count is less than VF * UF of the
7753   // main vector loop.
7754   auto P = Cost->requiresScalarEpilogue(ForEpilogue ? EPI.EpilogueVF.isVector()
7755                                                     : VF.isVector())
7756                ? ICmpInst::ICMP_ULE
7757                : ICmpInst::ICMP_ULT;
7758 
7759   Value *CheckMinIters = Builder.CreateICmp(
7760       P, Count, createStepForVF(Builder, Count->getType(), VFactor, UFactor),
7761       "min.iters.check");
7762 
7763   if (!ForEpilogue)
7764     TCCheckBlock->setName("vector.main.loop.iter.check");
7765 
7766   // Create new preheader for vector loop.
7767   LoopVectorPreHeader = SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(),
7768                                    DT, LI, nullptr, "vector.ph");
7769 
7770   if (ForEpilogue) {
7771     assert(DT->properlyDominates(DT->getNode(TCCheckBlock),
7772                                  DT->getNode(Bypass)->getIDom()) &&
7773            "TC check is expected to dominate Bypass");
7774 
7775     // Update dominator for Bypass & LoopExit.
7776     DT->changeImmediateDominator(Bypass, TCCheckBlock);
7777     if (!Cost->requiresScalarEpilogue(EPI.EpilogueVF.isVector()))
7778       // For loops with multiple exits, there's no edge from the middle block
7779       // to exit blocks (as the epilogue must run) and thus no need to update
7780       // the immediate dominator of the exit blocks.
7781       DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock);
7782 
7783     LoopBypassBlocks.push_back(TCCheckBlock);
7784 
7785     // Save the trip count so we don't have to regenerate it in the
7786     // vec.epilog.iter.check. This is safe to do because the trip count
7787     // generated here dominates the vector epilog iter check.
7788     EPI.TripCount = Count;
7789   }
7790 
7791   BranchInst &BI =
7792       *BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters);
7793   if (hasBranchWeightMD(*OrigLoop->getLoopLatch()->getTerminator()))
7794     setBranchWeights(BI, MinItersBypassWeights);
7795   ReplaceInstWithInst(TCCheckBlock->getTerminator(), &BI);
7796 
7797   return TCCheckBlock;
7798 }
7799 
7800 //===--------------------------------------------------------------------===//
7801 // EpilogueVectorizerEpilogueLoop
7802 //===--------------------------------------------------------------------===//
7803 
7804 /// This function is partially responsible for generating the control flow
7805 /// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization.
7806 std::pair<BasicBlock *, Value *>
7807 EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton(
7808     const SCEV2ValueTy &ExpandedSCEVs) {
7809   createVectorLoopSkeleton("vec.epilog.");
7810 
7811   // Now, compare the remaining count and if there aren't enough iterations to
7812   // execute the vectorized epilogue, skip to the scalar part.
7813   BasicBlock *VecEpilogueIterationCountCheck = LoopVectorPreHeader;
7814   VecEpilogueIterationCountCheck->setName("vec.epilog.iter.check");
7815   LoopVectorPreHeader =
7816       SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
7817                  LI, nullptr, "vec.epilog.ph");
7818   emitMinimumVectorEpilogueIterCountCheck(LoopScalarPreHeader,
7819                                           VecEpilogueIterationCountCheck);
7820 
7821   // Adjust the control flow taking the state info from the main loop
7822   // vectorization into account.
7823   assert(EPI.MainLoopIterationCountCheck && EPI.EpilogueIterationCountCheck &&
7824          "expected this to be saved from the previous pass.");
7825   EPI.MainLoopIterationCountCheck->getTerminator()->replaceUsesOfWith(
7826       VecEpilogueIterationCountCheck, LoopVectorPreHeader);
7827 
7828   DT->changeImmediateDominator(LoopVectorPreHeader,
7829                                EPI.MainLoopIterationCountCheck);
7830 
7831   EPI.EpilogueIterationCountCheck->getTerminator()->replaceUsesOfWith(
7832       VecEpilogueIterationCountCheck, LoopScalarPreHeader);
7833 
7834   if (EPI.SCEVSafetyCheck)
7835     EPI.SCEVSafetyCheck->getTerminator()->replaceUsesOfWith(
7836         VecEpilogueIterationCountCheck, LoopScalarPreHeader);
7837   if (EPI.MemSafetyCheck)
7838     EPI.MemSafetyCheck->getTerminator()->replaceUsesOfWith(
7839         VecEpilogueIterationCountCheck, LoopScalarPreHeader);
7840 
7841   DT->changeImmediateDominator(
7842       VecEpilogueIterationCountCheck,
7843       VecEpilogueIterationCountCheck->getSinglePredecessor());
7844 
7845   DT->changeImmediateDominator(LoopScalarPreHeader,
7846                                EPI.EpilogueIterationCountCheck);
7847   if (!Cost->requiresScalarEpilogue(EPI.EpilogueVF.isVector()))
7848     // If there is an epilogue which must run, there's no edge from the
7849     // middle block to exit blocks and thus no need to update the immediate
7850     // dominator of the exit blocks.
7851     DT->changeImmediateDominator(LoopExitBlock,
7852                                  EPI.EpilogueIterationCountCheck);
7853 
7854   // Keep track of bypass blocks, as they feed start values to the induction and
7855   // reduction phis in the scalar loop preheader.
7856   if (EPI.SCEVSafetyCheck)
7857     LoopBypassBlocks.push_back(EPI.SCEVSafetyCheck);
7858   if (EPI.MemSafetyCheck)
7859     LoopBypassBlocks.push_back(EPI.MemSafetyCheck);
7860   LoopBypassBlocks.push_back(EPI.EpilogueIterationCountCheck);
7861 
7862   // The vec.epilog.iter.check block may contain Phi nodes from inductions or
7863   // reductions which merge control-flow from the latch block and the middle
7864   // block. Update the incoming values here and move the Phi into the preheader.
7865   SmallVector<PHINode *, 4> PhisInBlock;
7866   for (PHINode &Phi : VecEpilogueIterationCountCheck->phis())
7867     PhisInBlock.push_back(&Phi);
7868 
7869   for (PHINode *Phi : PhisInBlock) {
7870     Phi->moveBefore(LoopVectorPreHeader->getFirstNonPHI());
7871     Phi->replaceIncomingBlockWith(
7872         VecEpilogueIterationCountCheck->getSinglePredecessor(),
7873         VecEpilogueIterationCountCheck);
7874 
7875     // If the phi doesn't have an incoming value from the
7876     // EpilogueIterationCountCheck, we are done. Otherwise remove the incoming
7877     // value and also those from other check blocks. This is needed for
7878     // reduction phis only.
7879     if (none_of(Phi->blocks(), [&](BasicBlock *IncB) {
7880           return EPI.EpilogueIterationCountCheck == IncB;
7881         }))
7882       continue;
7883     Phi->removeIncomingValue(EPI.EpilogueIterationCountCheck);
7884     if (EPI.SCEVSafetyCheck)
7885       Phi->removeIncomingValue(EPI.SCEVSafetyCheck);
7886     if (EPI.MemSafetyCheck)
7887       Phi->removeIncomingValue(EPI.MemSafetyCheck);
7888   }
7889 
7890   // Generate a resume induction for the vector epilogue and put it in the
7891   // vector epilogue preheader.
7892   Type *IdxTy = Legal->getWidestInductionType();
7893   PHINode *EPResumeVal = PHINode::Create(IdxTy, 2, "vec.epilog.resume.val");
7894   EPResumeVal->insertBefore(LoopVectorPreHeader->getFirstNonPHIIt());
7895   EPResumeVal->addIncoming(EPI.VectorTripCount, VecEpilogueIterationCountCheck);
7896   EPResumeVal->addIncoming(ConstantInt::get(IdxTy, 0),
7897                            EPI.MainLoopIterationCountCheck);
7898 
7899   // Generate induction resume values. These variables save the new starting
7900   // indexes for the scalar loop. They are used to test if there are any tail
7901   // iterations left once the vector loop has completed.
7902   // Note that when the vectorized epilogue is skipped due to iteration count
7903   // check, then the resume value for the induction variable comes from
7904   // the trip count of the main vector loop, hence passing the AdditionalBypass
7905   // argument.
7906   createInductionResumeValues(ExpandedSCEVs,
7907                               {VecEpilogueIterationCountCheck,
7908                                EPI.VectorTripCount} /* AdditionalBypass */);
7909 
7910   return {completeLoopSkeleton(), EPResumeVal};
7911 }
7912 
7913 BasicBlock *
7914 EpilogueVectorizerEpilogueLoop::emitMinimumVectorEpilogueIterCountCheck(
7915     BasicBlock *Bypass, BasicBlock *Insert) {
7916 
7917   assert(EPI.TripCount &&
7918          "Expected trip count to have been saved in the first pass.");
7919   assert(
7920       (!isa<Instruction>(EPI.TripCount) ||
7921        DT->dominates(cast<Instruction>(EPI.TripCount)->getParent(), Insert)) &&
7922       "saved trip count does not dominate insertion point.");
7923   Value *TC = EPI.TripCount;
7924   IRBuilder<> Builder(Insert->getTerminator());
7925   Value *Count = Builder.CreateSub(TC, EPI.VectorTripCount, "n.vec.remaining");
7926 
7927   // Generate code to check if the loop's trip count is less than VF * UF of the
7928   // vector epilogue loop.
7929   auto P = Cost->requiresScalarEpilogue(EPI.EpilogueVF.isVector())
7930                ? ICmpInst::ICMP_ULE
7931                : ICmpInst::ICMP_ULT;
7932 
7933   Value *CheckMinIters =
7934       Builder.CreateICmp(P, Count,
7935                          createStepForVF(Builder, Count->getType(),
7936                                          EPI.EpilogueVF, EPI.EpilogueUF),
7937                          "min.epilog.iters.check");
7938 
7939   BranchInst &BI =
7940       *BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters);
7941   if (hasBranchWeightMD(*OrigLoop->getLoopLatch()->getTerminator())) {
7942     unsigned MainLoopStep = UF * VF.getKnownMinValue();
7943     unsigned EpilogueLoopStep =
7944         EPI.EpilogueUF * EPI.EpilogueVF.getKnownMinValue();
7945     // We assume the remaining `Count` is equally distributed in
7946     // [0, MainLoopStep)
7947     // So the probability for `Count < EpilogueLoopStep` should be
7948     // min(MainLoopStep, EpilogueLoopStep) / MainLoopStep
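    // For example (illustrative), with MainLoopStep = 16 and
    // EpilogueLoopStep = 4, the estimated probability of skipping the
    // epilogue is 4/16, giving branch weights {4, 12}.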
7949     unsigned EstimatedSkipCount = std::min(MainLoopStep, EpilogueLoopStep);
7950     const uint32_t Weights[] = {EstimatedSkipCount,
7951                                 MainLoopStep - EstimatedSkipCount};
7952     setBranchWeights(BI, Weights);
7953   }
7954   ReplaceInstWithInst(Insert->getTerminator(), &BI);
7955 
7956   LoopBypassBlocks.push_back(Insert);
7957   return Insert;
7958 }
7959 
7960 void EpilogueVectorizerEpilogueLoop::printDebugTracesAtStart() {
7961   LLVM_DEBUG({
7962     dbgs() << "Create Skeleton for epilogue vectorized loop (second pass)\n"
7963            << "Epilogue Loop VF:" << EPI.EpilogueVF
7964            << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n";
7965   });
7966 }
7967 
7968 void EpilogueVectorizerEpilogueLoop::printDebugTracesAtEnd() {
7969   DEBUG_WITH_TYPE(VerboseDebug, {
7970     dbgs() << "final fn:\n" << *OrigLoop->getHeader()->getParent() << "\n";
7971   });
7972 }
7973 
7974 bool LoopVectorizationPlanner::getDecisionAndClampRange(
7975     const std::function<bool(ElementCount)> &Predicate, VFRange &Range) {
7976   assert(!Range.isEmpty() && "Trying to test an empty VF range.");
7977   bool PredicateAtRangeStart = Predicate(Range.Start);
7978 
7979   for (ElementCount TmpVF : VFRange(Range.Start * 2, Range.End))
7980     if (Predicate(TmpVF) != PredicateAtRangeStart) {
7981       Range.End = TmpVF;
7982       break;
7983     }
7984 
7985   return PredicateAtRangeStart;
7986 }
7987 
7988 /// Build VPlans for the full range of feasible VF's = {\p MinVF, 2 * \p MinVF,
7989 /// 4 * \p MinVF, ..., \p MaxVF} by repeatedly building a VPlan for a sub-range
7990 /// of VF's starting at a given VF and extending it as much as possible. Each
7991 /// vectorization decision can potentially shorten this sub-range during
7992 /// buildVPlan().
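/// For example (illustrative), with MinVF = 1 and MaxVF = 16 this may end up
/// building separate plans for sub-ranges such as [1, 2), [2, 16) and
/// [16, 32), depending on where the per-VF decisions change.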
7993 void LoopVectorizationPlanner::buildVPlans(ElementCount MinVF,
7994                                            ElementCount MaxVF) {
7995   auto MaxVFTimes2 = MaxVF * 2;
7996   for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFTimes2);) {
7997     VFRange SubRange = {VF, MaxVFTimes2};
7998     VPlans.push_back(buildVPlan(SubRange));
7999     VF = SubRange.End;
8000   }
8001 }
8002 
8003 VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst,
8004                                          VPlan &Plan) {
8005   assert(is_contained(predecessors(Dst), Src) && "Invalid edge");
8006 
8007   // Look for cached value.
8008   std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst);
8009   EdgeMaskCacheTy::iterator ECEntryIt = EdgeMaskCache.find(Edge);
8010   if (ECEntryIt != EdgeMaskCache.end())
8011     return ECEntryIt->second;
8012 
8013   VPValue *SrcMask = getBlockInMask(Src);
8014 
8015   // The terminator has to be a branch inst!
8016   BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator());
8017   assert(BI && "Unexpected terminator found");
8018 
8019   if (!BI->isConditional() || BI->getSuccessor(0) == BI->getSuccessor(1))
8020     return EdgeMaskCache[Edge] = SrcMask;
8021 
8022   // If source is an exiting block, we know the exit edge is dynamically dead
8023   // in the vector loop, and thus we don't need to restrict the mask.  Avoid
8024   // adding uses of an otherwise potentially dead instruction.
8025   if (OrigLoop->isLoopExiting(Src))
8026     return EdgeMaskCache[Edge] = SrcMask;
8027 
8028   VPValue *EdgeMask = Plan.getVPValueOrAddLiveIn(BI->getCondition());
8029   assert(EdgeMask && "No Edge Mask found for condition");
8030 
8031   if (BI->getSuccessor(0) != Dst)
8032     EdgeMask = Builder.createNot(EdgeMask, BI->getDebugLoc());
8033 
8034   if (SrcMask) { // Otherwise block in-mask is all-one, no need to AND.
8035     // The condition is 'SrcMask && EdgeMask', which is equivalent to
8036     // 'select i1 SrcMask, i1 EdgeMask, i1 false'.
8037     // The select version does not introduce new UB if SrcMask is false and
8038     // EdgeMask is poison. Using 'and' here introduces undefined behavior.
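    // For example (illustrative), 'select i1 false, i1 poison, i1 false'
    // evaluates to false, while 'and i1 false, i1 poison' is poison.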
8039     VPValue *False = Plan.getVPValueOrAddLiveIn(
8040         ConstantInt::getFalse(BI->getCondition()->getType()));
8041     EdgeMask =
8042         Builder.createSelect(SrcMask, EdgeMask, False, BI->getDebugLoc());
8043   }
8044 
8045   return EdgeMaskCache[Edge] = EdgeMask;
8046 }
8047 
8048 void VPRecipeBuilder::createHeaderMask(VPlan &Plan) {
8049   BasicBlock *Header = OrigLoop->getHeader();
8050 
8051   // When not folding the tail, use nullptr to model all-true mask.
8052   if (!CM.foldTailByMasking()) {
8053     BlockMaskCache[Header] = nullptr;
8054     return;
8055   }
8056 
8057   // Introduce the early-exit compare IV <= BTC to form header block mask.
8058   // This is used instead of IV < TC because TC may wrap, unlike BTC. Start by
8059   // constructing the desired canonical IV in the header block as its first
8060   // non-phi instructions.
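  // For example (illustrative), a loop of 256 iterations with an i8 counter
  // has a trip count that wraps to 0, while its backedge-taken count is 255,
  // so 'IV <= BTC' still forms a correct mask.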
8061 
8062   VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock();
8063   auto NewInsertionPoint = HeaderVPBB->getFirstNonPhi();
8064   auto *IV = new VPWidenCanonicalIVRecipe(Plan.getCanonicalIV());
8065   HeaderVPBB->insert(IV, NewInsertionPoint);
8066 
8067   VPBuilder::InsertPointGuard Guard(Builder);
8068   Builder.setInsertPoint(HeaderVPBB, NewInsertionPoint);
8069   VPValue *BlockMask = nullptr;
8070   VPValue *BTC = Plan.getOrCreateBackedgeTakenCount();
8071   BlockMask = Builder.createICmp(CmpInst::ICMP_ULE, IV, BTC);
8072   BlockMaskCache[Header] = BlockMask;
8073 }
8074 
8075 VPValue *VPRecipeBuilder::getBlockInMask(BasicBlock *BB) const {
8076   // Return the cached value.
8077   BlockMaskCacheTy::const_iterator BCEntryIt = BlockMaskCache.find(BB);
8078   assert(BCEntryIt != BlockMaskCache.end() &&
8079          "Trying to access mask for block without one.");
8080   return BCEntryIt->second;
8081 }
8082 
8083 void VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlan &Plan) {
8084   assert(OrigLoop->contains(BB) && "Block is not a part of a loop");
8085   assert(BlockMaskCache.count(BB) == 0 && "Mask for block already computed");
8086   assert(OrigLoop->getHeader() != BB &&
8087          "Loop header must have cached block mask");
8088 
8089   // All-one mask is modelled as no-mask following the convention for masked
8090   // load/store/gather/scatter. Initialize BlockMask to no-mask.
8091   VPValue *BlockMask = nullptr;
8092   // This is the block mask. We OR all incoming edges.
8093   for (auto *Predecessor : predecessors(BB)) {
8094     VPValue *EdgeMask = createEdgeMask(Predecessor, BB, Plan);
8095     if (!EdgeMask) { // Mask of predecessor is all-one so mask of block is too.
8096       BlockMaskCache[BB] = EdgeMask;
8097       return;
8098     }
8099 
8100     if (!BlockMask) { // BlockMask still has its initial nullptr value.
8101       BlockMask = EdgeMask;
8102       continue;
8103     }
8104 
8105     BlockMask = Builder.createOr(BlockMask, EdgeMask, {});
8106   }
8107 
8108   BlockMaskCache[BB] = BlockMask;
8109 }
8110 
8111 VPRecipeBase *VPRecipeBuilder::tryToWidenMemory(Instruction *I,
8112                                                 ArrayRef<VPValue *> Operands,
8113                                                 VFRange &Range,
8114                                                 VPlanPtr &Plan) {
8115   assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
8116          "Must be called with either a load or store");
8117 
8118   auto willWiden = [&](ElementCount VF) -> bool {
8119     LoopVectorizationCostModel::InstWidening Decision =
8120         CM.getWideningDecision(I, VF);
8121     assert(Decision != LoopVectorizationCostModel::CM_Unknown &&
8122            "CM decision should be taken at this point.");
8123     if (Decision == LoopVectorizationCostModel::CM_Interleave)
8124       return true;
8125     if (CM.isScalarAfterVectorization(I, VF) ||
8126         CM.isProfitableToScalarize(I, VF))
8127       return false;
8128     return Decision != LoopVectorizationCostModel::CM_Scalarize;
8129   };
8130 
8131   if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range))
8132     return nullptr;
8133 
8134   VPValue *Mask = nullptr;
8135   if (Legal->isMaskRequired(I))
8136     Mask = getBlockInMask(I->getParent());
8137 
8138   // Determine if the pointer operand of the access is either consecutive or
8139   // reverse consecutive.
8140   LoopVectorizationCostModel::InstWidening Decision =
8141       CM.getWideningDecision(I, Range.Start);
8142   bool Reverse = Decision == LoopVectorizationCostModel::CM_Widen_Reverse;
8143   bool Consecutive =
8144       Reverse || Decision == LoopVectorizationCostModel::CM_Widen;
8145 
8146   VPValue *Ptr = isa<LoadInst>(I) ? Operands[0] : Operands[1];
8147   if (Consecutive) {
8148     auto *GEP = dyn_cast<GetElementPtrInst>(
8149         Ptr->getUnderlyingValue()->stripPointerCasts());
8150     auto *VectorPtr = new VPVectorPointerRecipe(
8151         Ptr, getLoadStoreType(I), Reverse, GEP ? GEP->isInBounds() : false,
8152         I->getDebugLoc());
8153     Builder.getInsertBlock()->appendRecipe(VectorPtr);
8154     Ptr = VectorPtr;
8155   }
8156   if (LoadInst *Load = dyn_cast<LoadInst>(I))
8157     return new VPWidenMemoryInstructionRecipe(*Load, Ptr, Mask, Consecutive,
8158                                               Reverse);
8159 
8160   StoreInst *Store = cast<StoreInst>(I);
8161   return new VPWidenMemoryInstructionRecipe(*Store, Ptr, Operands[0], Mask,
8162                                             Consecutive, Reverse);
8163 }
8164 
8165 /// Creates a VPWidenIntOrFpInductionRecipe for \p Phi. If needed, it will also
8166 /// insert a recipe to expand the step for the induction recipe.
8167 static VPWidenIntOrFpInductionRecipe *
8168 createWidenInductionRecipes(PHINode *Phi, Instruction *PhiOrTrunc,
8169                             VPValue *Start, const InductionDescriptor &IndDesc,
8170                             VPlan &Plan, ScalarEvolution &SE, Loop &OrigLoop,
8171                             VFRange &Range) {
8172   assert(IndDesc.getStartValue() ==
8173          Phi->getIncomingValueForBlock(OrigLoop.getLoopPreheader()));
8174   assert(SE.isLoopInvariant(IndDesc.getStep(), &OrigLoop) &&
8175          "step must be loop invariant");
8176 
8177   VPValue *Step =
8178       vputils::getOrCreateVPValueForSCEVExpr(Plan, IndDesc.getStep(), SE);
8179   if (auto *TruncI = dyn_cast<TruncInst>(PhiOrTrunc)) {
8180     return new VPWidenIntOrFpInductionRecipe(Phi, Start, Step, IndDesc, TruncI);
8181   }
8182   assert(isa<PHINode>(PhiOrTrunc) && "must be a phi node here");
8183   return new VPWidenIntOrFpInductionRecipe(Phi, Start, Step, IndDesc);
8184 }
8185 
8186 VPRecipeBase *VPRecipeBuilder::tryToOptimizeInductionPHI(
8187     PHINode *Phi, ArrayRef<VPValue *> Operands, VPlan &Plan, VFRange &Range) {
8188 
8189   // Check if this is an integer or fp induction. If so, build the recipe that
8190   // produces its scalar and vector values.
8191   if (auto *II = Legal->getIntOrFpInductionDescriptor(Phi))
8192     return createWidenInductionRecipes(Phi, Phi, Operands[0], *II, Plan,
8193                                        *PSE.getSE(), *OrigLoop, Range);
8194 
8195   // Check if this is pointer induction. If so, build the recipe for it.
8196   if (auto *II = Legal->getPointerInductionDescriptor(Phi)) {
8197     VPValue *Step = vputils::getOrCreateVPValueForSCEVExpr(Plan, II->getStep(),
8198                                                            *PSE.getSE());
8199     return new VPWidenPointerInductionRecipe(
8200         Phi, Operands[0], Step, *II,
8201         LoopVectorizationPlanner::getDecisionAndClampRange(
8202             [&](ElementCount VF) {
8203               return CM.isScalarAfterVectorization(Phi, VF);
8204             },
8205             Range));
8206   }
8207   return nullptr;
8208 }
8209 
8210 VPWidenIntOrFpInductionRecipe *VPRecipeBuilder::tryToOptimizeInductionTruncate(
8211     TruncInst *I, ArrayRef<VPValue *> Operands, VFRange &Range, VPlan &Plan) {
8212   // Optimize the special case where the source is a constant integer
8213   // induction variable. Notice that we can only optimize the 'trunc' case
8214   // because (a) FP conversions lose precision, (b) sext/zext may wrap, and
8215   // (c) other casts depend on pointer size.
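  // For example (a sketch): with an i64 induction %iv and a use
  //   %t = trunc i64 %iv to i32
  // the truncation is folded into the widened induction itself, yielding an
  // i32 step vector directly instead of widening %iv to <VF x i64> and
  // truncating every part.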
8216 
8217   // Determine whether \p K is a truncation based on an induction variable that
8218   // can be optimized.
8219   auto isOptimizableIVTruncate =
8220       [&](Instruction *K) -> std::function<bool(ElementCount)> {
8221     return [=](ElementCount VF) -> bool {
8222       return CM.isOptimizableIVTruncate(K, VF);
8223     };
8224   };
8225 
8226   if (LoopVectorizationPlanner::getDecisionAndClampRange(
8227           isOptimizableIVTruncate(I), Range)) {
8228 
8229     auto *Phi = cast<PHINode>(I->getOperand(0));
8230     const InductionDescriptor &II = *Legal->getIntOrFpInductionDescriptor(Phi);
8231     VPValue *Start = Plan.getVPValueOrAddLiveIn(II.getStartValue());
8232     return createWidenInductionRecipes(Phi, I, Start, II, Plan, *PSE.getSE(),
8233                                        *OrigLoop, Range);
8234   }
8235   return nullptr;
8236 }
8237 
8238 VPRecipeOrVPValueTy VPRecipeBuilder::tryToBlend(PHINode *Phi,
8239                                                 ArrayRef<VPValue *> Operands,
8240                                                 VPlanPtr &Plan) {
8241   // If all incoming values are equal, the incoming VPValue can be used directly
8242   // instead of creating a new VPBlendRecipe.
8243   if (llvm::all_equal(Operands))
8244     return Operands[0];
8245 
8246   unsigned NumIncoming = Phi->getNumIncomingValues();
8247   // For in-loop reductions, we do not need to create an additional select.
8248   VPValue *InLoopVal = nullptr;
8249   for (unsigned In = 0; In < NumIncoming; In++) {
8250     PHINode *PhiOp =
8251         dyn_cast_or_null<PHINode>(Operands[In]->getUnderlyingValue());
8252     if (PhiOp && CM.isInLoopReduction(PhiOp)) {
8253       assert(!InLoopVal && "Found more than one in-loop reduction!");
8254       InLoopVal = Operands[In];
8255     }
8256   }
8257 
8258   assert((!InLoopVal || NumIncoming == 2) &&
8259          "Found an in-loop reduction for PHI with unexpected number of "
8260          "incoming values");
8261   if (InLoopVal)
8262     return Operands[Operands[0] == InLoopVal ? 1 : 0];
8263 
8264   // We know that all PHIs in non-header blocks are converted into selects, so
8265   // we don't have to worry about the insertion order and we can just use the
8266   // builder. At this point we generate the predication tree. There may be
8267   // duplications since this is a simple recursive scan, but future
8268   // optimizations will clean it up.
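  // Illustrative sketch (hypothetical IR): a phi merging two predicated
  // predecessors,
  //   %p = phi i32 [ %a, %then ], [ %b, %else ]
  // becomes a VPBlendRecipe over (%a, mask(then), %b, mask(else)), which later
  // lowers to selects keyed on the edge masks.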
8269   SmallVector<VPValue *, 2> OperandsWithMask;
8270 
8271   for (unsigned In = 0; In < NumIncoming; In++) {
8272     VPValue *EdgeMask =
8273         createEdgeMask(Phi->getIncomingBlock(In), Phi->getParent(), *Plan);
8274     assert((EdgeMask || NumIncoming == 1) &&
8275            "Multiple predecessors with one having a full mask");
8276     OperandsWithMask.push_back(Operands[In]);
8277     if (EdgeMask)
8278       OperandsWithMask.push_back(EdgeMask);
8279   }
8280   return toVPRecipeResult(new VPBlendRecipe(Phi, OperandsWithMask));
8281 }
8282 
8283 VPWidenCallRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI,
8284                                                    ArrayRef<VPValue *> Operands,
8285                                                    VFRange &Range,
8286                                                    VPlanPtr &Plan) {
8287   bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
8288       [this, CI](ElementCount VF) {
8289         return CM.isScalarWithPredication(CI, VF);
8290       },
8291       Range);
8292 
8293   if (IsPredicated)
8294     return nullptr;
8295 
8296   Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
8297   if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end ||
8298              ID == Intrinsic::lifetime_start || ID == Intrinsic::sideeffect ||
8299              ID == Intrinsic::pseudoprobe ||
8300              ID == Intrinsic::experimental_noalias_scope_decl))
8301     return nullptr;
8302 
8303   SmallVector<VPValue *, 4> Ops(Operands.take_front(CI->arg_size()));
8304 
8305   // Is it beneficial to perform an intrinsic call compared to a lib call?
8306   bool ShouldUseVectorIntrinsic =
8307       ID && LoopVectorizationPlanner::getDecisionAndClampRange(
8308                 [&](ElementCount VF) -> bool {
8309                   return CM.getCallWideningDecision(CI, VF).Kind ==
8310                          LoopVectorizationCostModel::CM_IntrinsicCall;
8311                 },
8312                 Range);
8313   if (ShouldUseVectorIntrinsic)
8314     return new VPWidenCallRecipe(*CI, make_range(Ops.begin(), Ops.end()), ID,
8315                                  CI->getDebugLoc());
8316 
8317   Function *Variant = nullptr;
8318   std::optional<unsigned> MaskPos;
8319   // Is it better to call a vectorized version of the function than to scalarize
8320   // the call?
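  // For example (a sketch): a call to llvm.sin.f64 may, at VF=4, be widened
  // either to the llvm.sin.v4f64 intrinsic (decided above) or to a vector
  // library routine recorded by the cost model; because the chosen Variant
  // (and its MaskPos) is only meaningful for the VF that produced it, a
  // separate VPlan ends up being built per VF that finds a valid variant.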
8321   auto ShouldUseVectorCall = LoopVectorizationPlanner::getDecisionAndClampRange(
8322       [&](ElementCount VF) -> bool {
8323         // The following case may be scalarized depending on the VF.
8324         // The flag shows whether we can use a plain call for the vectorized
8325         // version of the instruction.
8326 
8327         // If we've found a variant at a previous VF, then stop looking. A
8328         // vectorized variant of a function expects input in a certain shape
8329         // -- basically the number of input registers, the number of lanes
8330         // per register, and whether there's a mask required.
8331         // We store a pointer to the variant in the VPWidenCallRecipe, so
8332         // once we have an appropriate variant it's only valid for that VF.
8333         // This will force a different vplan to be generated for each VF that
8334         // finds a valid variant.
8335         if (Variant)
8336           return false;
8337         LoopVectorizationCostModel::CallWideningDecision Decision =
8338             CM.getCallWideningDecision(CI, VF);
8339         if (Decision.Kind == LoopVectorizationCostModel::CM_VectorCall) {
8340           Variant = Decision.Variant;
8341           MaskPos = Decision.MaskPos;
8342           return true;
8343         }
8344 
8345         return false;
8346       },
8347       Range);
8348   if (ShouldUseVectorCall) {
8349     if (MaskPos.has_value()) {
8350       // We have 2 cases that would require a mask:
8351       //   1) The block needs to be predicated, either due to a conditional
8352       //      in the scalar loop or use of an active lane mask with
8353       //      tail-folding, and we use the appropriate mask for the block.
8354       //   2) No mask is required for the block, but the only available
8355       //      vector variant at this VF requires a mask, so we synthesize an
8356       //      all-true mask.
8357       VPValue *Mask = nullptr;
8358       if (Legal->isMaskRequired(CI))
8359         Mask = getBlockInMask(CI->getParent());
8360       else
8361         Mask = Plan->getVPValueOrAddLiveIn(ConstantInt::getTrue(
8362             IntegerType::getInt1Ty(Variant->getFunctionType()->getContext())));
8363 
8364       Ops.insert(Ops.begin() + *MaskPos, Mask);
8365     }
8366 
8367     return new VPWidenCallRecipe(*CI, make_range(Ops.begin(), Ops.end()),
8368                                  Intrinsic::not_intrinsic, CI->getDebugLoc(),
8369                                  Variant);
8370   }
8371 
8372   return nullptr;
8373 }
8374 
8375 bool VPRecipeBuilder::shouldWiden(Instruction *I, VFRange &Range) const {
8376   assert(!isa<BranchInst>(I) && !isa<PHINode>(I) && !isa<LoadInst>(I) &&
8377          !isa<StoreInst>(I) && "Instruction should have been handled earlier");
8378   // Instruction should be widened, unless it is scalar after vectorization,
8379   // scalarization is profitable or it is predicated.
8380   auto WillScalarize = [this, I](ElementCount VF) -> bool {
8381     return CM.isScalarAfterVectorization(I, VF) ||
8382            CM.isProfitableToScalarize(I, VF) ||
8383            CM.isScalarWithPredication(I, VF);
8384   };
8385   return !LoopVectorizationPlanner::getDecisionAndClampRange(WillScalarize,
8386                                                              Range);
8387 }
8388 
8389 VPRecipeBase *VPRecipeBuilder::tryToWiden(Instruction *I,
8390                                           ArrayRef<VPValue *> Operands,
8391                                           VPBasicBlock *VPBB, VPlanPtr &Plan) {
8392   switch (I->getOpcode()) {
8393   default:
8394     return nullptr;
8395   case Instruction::SDiv:
8396   case Instruction::UDiv:
8397   case Instruction::SRem:
8398   case Instruction::URem: {
8399     // If not provably safe, use a select to form a safe divisor before widening the
8400     // div/rem operation itself.  Otherwise fall through to general handling below.
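    // Conceptually (a sketch, types elided):
    //   %safe.rhs = select <VF x i1> %block.mask, %rhs, splat(1)
    //   %quot     = udiv %lhs, %safe.rhs
    // so lanes that are masked off divide by 1 rather than by a possibly-zero
    // or otherwise trapping divisor.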
8401     if (CM.isPredicatedInst(I)) {
8402       SmallVector<VPValue *> Ops(Operands.begin(), Operands.end());
8403       VPValue *Mask = getBlockInMask(I->getParent());
8404       VPValue *One = Plan->getVPValueOrAddLiveIn(
8405           ConstantInt::get(I->getType(), 1u, false));
8406       auto *SafeRHS =
8407          new VPInstruction(Instruction::Select, {Mask, Ops[1], One},
8408                            I->getDebugLoc());
8409       VPBB->appendRecipe(SafeRHS);
8410       Ops[1] = SafeRHS;
8411       return new VPWidenRecipe(*I, make_range(Ops.begin(), Ops.end()));
8412     }
8413     [[fallthrough]];
8414   }
8415   case Instruction::Add:
8416   case Instruction::And:
8417   case Instruction::AShr:
8418   case Instruction::FAdd:
8419   case Instruction::FCmp:
8420   case Instruction::FDiv:
8421   case Instruction::FMul:
8422   case Instruction::FNeg:
8423   case Instruction::FRem:
8424   case Instruction::FSub:
8425   case Instruction::ICmp:
8426   case Instruction::LShr:
8427   case Instruction::Mul:
8428   case Instruction::Or:
8429   case Instruction::Select:
8430   case Instruction::Shl:
8431   case Instruction::Sub:
8432   case Instruction::Xor:
8433   case Instruction::Freeze:
8434     return new VPWidenRecipe(*I, make_range(Operands.begin(), Operands.end()));
8435   };
8436 }
8437 
8438 void VPRecipeBuilder::fixHeaderPhis() {
8439   BasicBlock *OrigLatch = OrigLoop->getLoopLatch();
8440   for (VPHeaderPHIRecipe *R : PhisToFix) {
8441     auto *PN = cast<PHINode>(R->getUnderlyingValue());
8442     VPRecipeBase *IncR =
8443         getRecipe(cast<Instruction>(PN->getIncomingValueForBlock(OrigLatch)));
8444     R->addOperand(IncR->getVPSingleValue());
8445   }
8446 }
8447 
8448 VPRecipeOrVPValueTy VPRecipeBuilder::handleReplication(Instruction *I,
8449                                                        VFRange &Range,
8450                                                        VPlan &Plan) {
8451   bool IsUniform = LoopVectorizationPlanner::getDecisionAndClampRange(
8452       [&](ElementCount VF) { return CM.isUniformAfterVectorization(I, VF); },
8453       Range);
8454 
8455   bool IsPredicated = CM.isPredicatedInst(I);
8456 
8457   // Even if the instruction is not marked as uniform, there are certain
8458   // intrinsic calls that can be effectively treated as such, so we check for
8459   // them here. Conservatively, we only do this for scalable vectors, since
8460   // for fixed-width VFs we can always fall back on full scalarization.
8461   if (!IsUniform && Range.Start.isScalable() && isa<IntrinsicInst>(I)) {
8462     switch (cast<IntrinsicInst>(I)->getIntrinsicID()) {
8463     case Intrinsic::assume:
8464     case Intrinsic::lifetime_start:
8465     case Intrinsic::lifetime_end:
8466       // For scalable vectors, if one of the operands is variant, then we still
8467       // want to mark it as uniform, which will generate one instruction for just
8468       // the first lane of the vector. We can't scalarize the call in the same
8469       // way as for fixed-width vectors because we don't know how many lanes
8470       // there are.
8471       //
8472       // The reasons for doing it this way for scalable vectors are:
8473       //   1. For the assume intrinsic generating the instruction for the first
8474       //      lane is still better than not generating any at all. For
8475       //      example, the input may be a splat across all lanes.
8476       //   2. For the lifetime start/end intrinsics the pointer operand only
8477       //      does anything useful when the input comes from a stack object,
8478       //      which suggests it should always be uniform. For non-stack objects
8479       //      the effect is to poison the object, which still allows us to
8480       //      remove the call.
8481       IsUniform = true;
8482       break;
8483     default:
8484       break;
8485     }
8486   }
8487   VPValue *BlockInMask = nullptr;
8488   if (!IsPredicated) {
8489     // Finalize the recipe for Instr, first if it is not predicated.
8490     LLVM_DEBUG(dbgs() << "LV: Scalarizing:" << *I << "\n");
8491   } else {
8492     LLVM_DEBUG(dbgs() << "LV: Scalarizing and predicating:" << *I << "\n");
8493     // Instructions marked for predication are replicated and a mask operand is
8494     // added initially. Masked replicate recipes will later be placed under an
8495     // if-then construct to prevent side-effects. Generate recipes to compute
8496     // the block mask for this region.
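    // Illustrative sketch: a predicated replicate of a scalarized store ends
    // up under a per-lane if-then region, e.g. conceptually
    //   if (mask[lane]) store %val[lane], %addr[lane]
    // whereas a uniform, unpredicated replicate emits a single scalar copy per
    // unrolled part only.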
8497     BlockInMask = getBlockInMask(I->getParent());
8498   }
8499 
8500   auto *Recipe = new VPReplicateRecipe(I, Plan.mapToVPValues(I->operands()),
8501                                        IsUniform, BlockInMask);
8502   return toVPRecipeResult(Recipe);
8503 }
8504 
8505 VPRecipeOrVPValueTy
8506 VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr,
8507                                         ArrayRef<VPValue *> Operands,
8508                                         VFRange &Range, VPBasicBlock *VPBB,
8509                                         VPlanPtr &Plan) {
8510   // First, check for specific widening recipes that deal with inductions, Phi
8511   // nodes, calls and memory operations.
8512   VPRecipeBase *Recipe;
8513   if (auto Phi = dyn_cast<PHINode>(Instr)) {
8514     if (Phi->getParent() != OrigLoop->getHeader())
8515       return tryToBlend(Phi, Operands, Plan);
8516 
8517     // Always record recipes for header phis. Later first-order recurrence phis
8518     // can have earlier phis as incoming values.
8519     recordRecipeOf(Phi);
8520 
8521     if ((Recipe = tryToOptimizeInductionPHI(Phi, Operands, *Plan, Range)))
8522       return toVPRecipeResult(Recipe);
8523 
8524     VPHeaderPHIRecipe *PhiRecipe = nullptr;
8525     assert((Legal->isReductionVariable(Phi) ||
8526             Legal->isFixedOrderRecurrence(Phi)) &&
8527            "can only widen reductions and fixed-order recurrences here");
8528     VPValue *StartV = Operands[0];
8529     if (Legal->isReductionVariable(Phi)) {
8530       const RecurrenceDescriptor &RdxDesc =
8531           Legal->getReductionVars().find(Phi)->second;
8532       assert(RdxDesc.getRecurrenceStartValue() ==
8533              Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader()));
8534       PhiRecipe = new VPReductionPHIRecipe(Phi, RdxDesc, *StartV,
8535                                            CM.isInLoopReduction(Phi),
8536                                            CM.useOrderedReductions(RdxDesc));
8537     } else {
8538       // TODO: Currently fixed-order recurrences are modeled as chains of
8539       // first-order recurrences. If there are no users of the intermediate
8540       // recurrences in the chain, the fixed order recurrence should be modeled
8541       // directly, enabling more efficient codegen.
8542       PhiRecipe = new VPFirstOrderRecurrencePHIRecipe(Phi, *StartV);
8543     }
8544 
8545     // Record the incoming value from the backedge, so we can add the incoming
8546     // value from the backedge after all recipes have been created.
8547     auto *Inc = cast<Instruction>(
8548         Phi->getIncomingValueForBlock(OrigLoop->getLoopLatch()));
8549     auto RecipeIter = Ingredient2Recipe.find(Inc);
8550     if (RecipeIter == Ingredient2Recipe.end())
8551       recordRecipeOf(Inc);
8552 
8553     PhisToFix.push_back(PhiRecipe);
8554     return toVPRecipeResult(PhiRecipe);
8555   }
8556 
8557   if (isa<TruncInst>(Instr) &&
8558       (Recipe = tryToOptimizeInductionTruncate(cast<TruncInst>(Instr), Operands,
8559                                                Range, *Plan)))
8560     return toVPRecipeResult(Recipe);
8561 
8562   // All widen recipes below deal only with VF > 1.
8563   if (LoopVectorizationPlanner::getDecisionAndClampRange(
8564           [&](ElementCount VF) { return VF.isScalar(); }, Range))
8565     return nullptr;
8566 
8567   if (auto *CI = dyn_cast<CallInst>(Instr))
8568     return toVPRecipeResult(tryToWidenCall(CI, Operands, Range, Plan));
8569 
8570   if (isa<LoadInst>(Instr) || isa<StoreInst>(Instr))
8571     return toVPRecipeResult(tryToWidenMemory(Instr, Operands, Range, Plan));
8572 
8573   if (!shouldWiden(Instr, Range))
8574     return nullptr;
8575 
8576   if (auto GEP = dyn_cast<GetElementPtrInst>(Instr))
8577     return toVPRecipeResult(new VPWidenGEPRecipe(
8578         GEP, make_range(Operands.begin(), Operands.end())));
8579 
8580   if (auto *SI = dyn_cast<SelectInst>(Instr)) {
8581     return toVPRecipeResult(new VPWidenSelectRecipe(
8582         *SI, make_range(Operands.begin(), Operands.end())));
8583   }
8584 
8585   if (auto *CI = dyn_cast<CastInst>(Instr)) {
8586     return toVPRecipeResult(new VPWidenCastRecipe(CI->getOpcode(), Operands[0],
8587                                                   CI->getType(), *CI));
8588   }
8589 
8590   return toVPRecipeResult(tryToWiden(Instr, Operands, VPBB, Plan));
8591 }
8592 
8593 void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
8594                                                         ElementCount MaxVF) {
8595   assert(OrigLoop->isInnermost() && "Inner loop expected.");
8596 
8597   auto MaxVFTimes2 = MaxVF * 2;
8598   for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFTimes2);) {
8599     VFRange SubRange = {VF, MaxVFTimes2};
8600     if (auto Plan = tryToBuildVPlanWithVPRecipes(SubRange)) {
8601       // Now optimize the initial VPlan.
8602       if (!Plan->hasVF(ElementCount::getFixed(1)))
8603         VPlanTransforms::truncateToMinimalBitwidths(
8604             *Plan, CM.getMinimalBitwidths(), PSE.getSE()->getContext());
8605       VPlanTransforms::optimize(*Plan, *PSE.getSE());
8606       assert(VPlanVerifier::verifyPlanIsValid(*Plan) && "VPlan is invalid");
8607       VPlans.push_back(std::move(Plan));
8608     }
8609     VF = SubRange.End;
8610   }
8611 }
8612 
8613 // Add the necessary canonical IV and branch recipes required to control the
8614 // loop.
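// The emitted recipes correspond to a scalar loop-control skeleton of roughly
// this shape (a sketch; the nuw flag is only set when HasNUW is true):
//   vector.body:
//     %index      = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
//     ...
//     %index.next = add nuw i64 %index, (VF * UF)
//     %cmp        = icmp eq i64 %index.next, %n.vec
//     br i1 %cmp, label %middle.block, label %vector.body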
8615 static void addCanonicalIVRecipes(VPlan &Plan, Type *IdxTy, bool HasNUW,
8616                                   DebugLoc DL) {
8617   Value *StartIdx = ConstantInt::get(IdxTy, 0);
8618   auto *StartV = Plan.getVPValueOrAddLiveIn(StartIdx);
8619 
8620   // Add a VPCanonicalIVPHIRecipe starting at 0 to the header.
8621   auto *CanonicalIVPHI = new VPCanonicalIVPHIRecipe(StartV, DL);
8622   VPRegionBlock *TopRegion = Plan.getVectorLoopRegion();
8623   VPBasicBlock *Header = TopRegion->getEntryBasicBlock();
8624   Header->insert(CanonicalIVPHI, Header->begin());
8625 
8626   // Add a CanonicalIVIncrement{NUW} VPInstruction to increment the scalar
8627   // IV by VF * UF.
8628   auto *CanonicalIVIncrement =
8629       new VPInstruction(Instruction::Add, {CanonicalIVPHI, &Plan.getVFxUF()},
8630                         {HasNUW, false}, DL, "index.next");
8631   CanonicalIVPHI->addOperand(CanonicalIVIncrement);
8632 
8633   VPBasicBlock *EB = TopRegion->getExitingBasicBlock();
8634   EB->appendRecipe(CanonicalIVIncrement);
8635 
8636   // Add the BranchOnCount VPInstruction to the latch.
8637   VPInstruction *BranchBack =
8638       new VPInstruction(VPInstruction::BranchOnCount,
8639                         {CanonicalIVIncrement, &Plan.getVectorTripCount()}, DL);
8640   EB->appendRecipe(BranchBack);
8641 }
8642 
8643 // Add exit values to \p Plan. VPLiveOuts are added for each LCSSA phi in the
8644 // original exit block.
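// Illustrative sketch (hypothetical IR): for an exit block containing
//   %res.lcssa = phi i32 [ %res, %loop.exiting ]
// the VPValue that computes %res inside the plan is recorded as a live-out,
// so the LCSSA phi later receives the value produced by the vector loop.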
8645 static void addUsersInExitBlock(VPBasicBlock *HeaderVPBB, Loop *OrigLoop,
8646                                 VPlan &Plan) {
8647   BasicBlock *ExitBB = OrigLoop->getUniqueExitBlock();
8648   BasicBlock *ExitingBB = OrigLoop->getExitingBlock();
8649   // Only handle single-exit loops with unique exit blocks for now.
8650   if (!ExitBB || !ExitBB->getSinglePredecessor() || !ExitingBB)
8651     return;
8652 
8653   // Introduce VPUsers modeling the exit values.
8654   for (PHINode &ExitPhi : ExitBB->phis()) {
8655     Value *IncomingValue =
8656         ExitPhi.getIncomingValueForBlock(ExitingBB);
8657     VPValue *V = Plan.getVPValueOrAddLiveIn(IncomingValue);
8658     Plan.addLiveOut(&ExitPhi, V);
8659   }
8660 }
8661 
8662 VPlanPtr
8663 LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
8664 
8665   SmallPtrSet<const InterleaveGroup<Instruction> *, 1> InterleaveGroups;
8666 
8667   VPRecipeBuilder RecipeBuilder(OrigLoop, TLI, Legal, CM, PSE, Builder);
8668 
8669   // ---------------------------------------------------------------------------
8670   // Pre-construction: record ingredients whose recipes we'll need to further
8671   // process after constructing the initial VPlan.
8672   // ---------------------------------------------------------------------------
8673 
8674   // For each interleave group which is relevant for this (possibly trimmed)
8675   // Range, add it to the set of groups to be later applied to the VPlan and add
8676   // placeholders for its members' Recipes which we'll be replacing with a
8677   // single VPInterleaveRecipe.
8678   for (InterleaveGroup<Instruction> *IG : IAI.getInterleaveGroups()) {
8679     auto applyIG = [IG, this](ElementCount VF) -> bool {
8680       bool Result = (VF.isVector() && // Query is illegal for VF == 1
8681                      CM.getWideningDecision(IG->getInsertPos(), VF) ==
8682                          LoopVectorizationCostModel::CM_Interleave);
8683       // For scalable vectors, the only interleave factor currently supported
8684       // is 2 since we require the (de)interleave2 intrinsics instead of
8685       // shufflevectors.
8686       assert((!Result || !VF.isScalable() || IG->getFactor() == 2) &&
8687              "Unsupported interleave factor for scalable vectors");
8688       return Result;
8689     };
8690     if (!getDecisionAndClampRange(applyIG, Range))
8691       continue;
8692     InterleaveGroups.insert(IG);
8693     for (unsigned i = 0; i < IG->getFactor(); i++)
8694       if (Instruction *Member = IG->getMember(i))
8695         RecipeBuilder.recordRecipeOf(Member);
8696   };
8697 
8698   // ---------------------------------------------------------------------------
8699   // Build initial VPlan: Scan the body of the loop in a topological order to
8700   // visit each basic block after having visited its predecessor basic blocks.
8701   // ---------------------------------------------------------------------------
8702 
8703   // Create initial VPlan skeleton, having a basic block for the pre-header
8704   // which contains SCEV expansions that need to happen before the CFG is
8705   // modified; a basic block for the vector pre-header, followed by a region for
8706   // the vector loop, followed by the middle basic block. The skeleton vector
8707   // loop region contains a header and latch basic blocks.
8708   VPlanPtr Plan = VPlan::createInitialVPlan(
8709       createTripCountSCEV(Legal->getWidestInductionType(), PSE, OrigLoop),
8710       *PSE.getSE());
8711   VPBasicBlock *HeaderVPBB = new VPBasicBlock("vector.body");
8712   VPBasicBlock *LatchVPBB = new VPBasicBlock("vector.latch");
8713   VPBlockUtils::insertBlockAfter(LatchVPBB, HeaderVPBB);
8714   Plan->getVectorLoopRegion()->setEntry(HeaderVPBB);
8715   Plan->getVectorLoopRegion()->setExiting(LatchVPBB);
8716 
8717   // Don't use getDecisionAndClampRange here, because we don't know the UF,
8718   // so it is better for this function to be conservative rather than to split
8719   // it up into different VPlans.
8720   // TODO: Consider using getDecisionAndClampRange here to split up VPlans.
8721   bool IVUpdateMayOverflow = false;
8722   for (ElementCount VF : Range)
8723     IVUpdateMayOverflow |= !isIndvarOverflowCheckKnownFalse(&CM, VF);
8724 
8725   DebugLoc DL = getDebugLocFromInstOrOperands(Legal->getPrimaryInduction());
8726   TailFoldingStyle Style = CM.getTailFoldingStyle(IVUpdateMayOverflow);
8727   // When not folding the tail, we know that the induction increment will not
8728   // overflow.
8729   bool HasNUW = Style == TailFoldingStyle::None;
8730   addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(), HasNUW, DL);
8731 
8732   // Scan the body of the loop in a topological order to visit each basic block
8733   // after having visited its predecessor basic blocks.
8734   LoopBlocksDFS DFS(OrigLoop);
8735   DFS.perform(LI);
8736 
8737   VPBasicBlock *VPBB = HeaderVPBB;
8738   bool NeedsMasks = CM.foldTailByMasking() ||
8739                     any_of(OrigLoop->blocks(), [this](BasicBlock *BB) {
8740                       return Legal->blockNeedsPredication(BB);
8741                     });
8742   for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
8743     // Relevant instructions from basic block BB will be grouped into VPRecipe
8744     // ingredients and fill a new VPBasicBlock.
8745     if (VPBB != HeaderVPBB)
8746       VPBB->setName(BB->getName());
8747     Builder.setInsertPoint(VPBB);
8748 
8749     if (VPBB == HeaderVPBB)
8750       RecipeBuilder.createHeaderMask(*Plan);
8751     else if (NeedsMasks)
8752       RecipeBuilder.createBlockInMask(BB, *Plan);
8753 
8754     // Introduce each ingredient into VPlan.
8755     // TODO: Model and preserve debug intrinsics in VPlan.
8756     for (Instruction &I : drop_end(BB->instructionsWithoutDebug(false))) {
8757       Instruction *Instr = &I;
8758       SmallVector<VPValue *, 4> Operands;
8759       auto *Phi = dyn_cast<PHINode>(Instr);
8760       if (Phi && Phi->getParent() == OrigLoop->getHeader()) {
8761         Operands.push_back(Plan->getVPValueOrAddLiveIn(
8762             Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader())));
8763       } else {
8764         auto OpRange = Plan->mapToVPValues(Instr->operands());
8765         Operands = {OpRange.begin(), OpRange.end()};
8766       }
8767 
8768       // Invariant stores inside the loop will be deleted and a single store
8769       // with the final reduction value will be added to the exit block.
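      // For example (a sketch): in
      //   for (...) { sum += a[i]; *p = sum; }   // p loop-invariant
      // the store through p is skipped here; a single store of the final
      // reduced value is emitted after the loop instead.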
8770       StoreInst *SI;
8771       if ((SI = dyn_cast<StoreInst>(&I)) &&
8772           Legal->isInvariantAddressOfReduction(SI->getPointerOperand()))
8773         continue;
8774 
8775       auto RecipeOrValue = RecipeBuilder.tryToCreateWidenRecipe(
8776           Instr, Operands, Range, VPBB, Plan);
8777       if (!RecipeOrValue)
8778         RecipeOrValue = RecipeBuilder.handleReplication(Instr, Range, *Plan);
8779       // If Instr can be simplified to an existing VPValue, use it.
8780       if (isa<VPValue *>(RecipeOrValue)) {
8781         auto *VPV = cast<VPValue *>(RecipeOrValue);
8782         Plan->addVPValue(Instr, VPV);
8783         // If the re-used value is a recipe, register the recipe for the
8784         // instruction, in case the recipe for Instr needs to be recorded.
8785         if (VPRecipeBase *R = VPV->getDefiningRecipe())
8786           RecipeBuilder.setRecipe(Instr, R);
8787         continue;
8788       }
8789       // Otherwise, add the new recipe.
8790       VPRecipeBase *Recipe = cast<VPRecipeBase *>(RecipeOrValue);
8791       for (auto *Def : Recipe->definedValues()) {
8792         auto *UV = Def->getUnderlyingValue();
8793         Plan->addVPValue(UV, Def);
8794       }
8795 
8796       RecipeBuilder.setRecipe(Instr, Recipe);
8797       if (isa<VPHeaderPHIRecipe>(Recipe)) {
8798         // VPHeaderPHIRecipes must be kept in the phi section of HeaderVPBB. In
8799         // the following cases, VPHeaderPHIRecipes may be created after non-phi
8800         // recipes and need to be moved to the phi section of HeaderVPBB:
8801         // * tail-folding (non-phi recipes computing the header mask are
8802         // introduced earlier than regular header phi recipes, and should appear
8803         // after them)
8804         // * Optimizing truncates to VPWidenIntOrFpInductionRecipe.
8805 
8806         assert((HeaderVPBB->getFirstNonPhi() == VPBB->end() ||
8807                 CM.foldTailByMasking() || isa<TruncInst>(Instr)) &&
8808                "unexpected recipe needs moving");
8809         Recipe->insertBefore(*HeaderVPBB, HeaderVPBB->getFirstNonPhi());
8810       } else
8811         VPBB->appendRecipe(Recipe);
8812     }
8813 
8814     VPBlockUtils::insertBlockAfter(new VPBasicBlock(), VPBB);
8815     VPBB = cast<VPBasicBlock>(VPBB->getSingleSuccessor());
8816   }
8817 
8818   // After here, VPBB should not be used.
8819   VPBB = nullptr;
8820 
8821   if (CM.requiresScalarEpilogue(Range)) {
8822     // No edge from the middle block to the unique exit block has been inserted
8823     // and there is nothing to fix from the vector loop; phis should have
8824     // incoming values from the scalar loop only.
8825   } else
8826     addUsersInExitBlock(HeaderVPBB, OrigLoop, *Plan);
8827 
8828   assert(isa<VPRegionBlock>(Plan->getVectorLoopRegion()) &&
8829          !Plan->getVectorLoopRegion()->getEntryBasicBlock()->empty() &&
8830          "entry block must be set to a VPRegionBlock having a non-empty entry "
8831          "VPBasicBlock");
8832   RecipeBuilder.fixHeaderPhis();
8833 
8834   // ---------------------------------------------------------------------------
8835   // Transform initial VPlan: Apply previously taken decisions, in order, to
8836   // bring the VPlan to its final state.
8837   // ---------------------------------------------------------------------------
8838 
8839   // Adjust the recipes for any inloop reductions.
8840   adjustRecipesForReductions(LatchVPBB, Plan, RecipeBuilder, Range.Start);
8841 
8842   // Interleave memory: for each Interleave Group we marked earlier as relevant
8843   // for this VPlan, replace the Recipes widening its memory instructions with a
8844   // single VPInterleaveRecipe at its insertion point.
8845   for (const auto *IG : InterleaveGroups) {
8846     auto *Recipe = cast<VPWidenMemoryInstructionRecipe>(
8847         RecipeBuilder.getRecipe(IG->getInsertPos()));
8848     SmallVector<VPValue *, 4> StoredValues;
8849     for (unsigned i = 0; i < IG->getFactor(); ++i)
8850       if (auto *SI = dyn_cast_or_null<StoreInst>(IG->getMember(i))) {
8851         auto *StoreR =
8852             cast<VPWidenMemoryInstructionRecipe>(RecipeBuilder.getRecipe(SI));
8853         StoredValues.push_back(StoreR->getStoredValue());
8854       }
8855 
8856     bool NeedsMaskForGaps =
8857         IG->requiresScalarEpilogue() && !CM.isScalarEpilogueAllowed();
8858     auto *VPIG = new VPInterleaveRecipe(IG, Recipe->getAddr(), StoredValues,
8859                                         Recipe->getMask(), NeedsMaskForGaps);
8860     VPIG->insertBefore(Recipe);
8861     unsigned J = 0;
8862     for (unsigned i = 0; i < IG->getFactor(); ++i)
8863       if (Instruction *Member = IG->getMember(i)) {
8864         VPRecipeBase *MemberR = RecipeBuilder.getRecipe(Member);
8865         if (!Member->getType()->isVoidTy()) {
8866           VPValue *OriginalV = MemberR->getVPSingleValue();
8867           OriginalV->replaceAllUsesWith(VPIG->getVPValue(J));
8868           J++;
8869         }
8870         MemberR->eraseFromParent();
8871       }
8872   }
8873 
8874   for (ElementCount VF : Range)
8875     Plan->addVF(VF);
8876   Plan->setName("Initial VPlan");
8877 
8878   // Replace VPValues for known constant strides guaranteed by predicate scalar
8879   // evolution.
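  // For example (a sketch): if predicated SCEV guarantees that a symbolic
  // stride parameter %stride is the constant 1, all VPlan uses of the live-in
  // %stride are rewired to the constant 1, letting later simplifications treat
  // the stride as a compile-time constant.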
8880   for (auto [_, Stride] : Legal->getLAI()->getSymbolicStrides()) {
8881     auto *StrideV = cast<SCEVUnknown>(Stride)->getValue();
8882     auto *ScevStride = dyn_cast<SCEVConstant>(PSE.getSCEV(StrideV));
8883     // Only handle constant strides for now.
8884     if (!ScevStride)
8885       continue;
8886     Constant *CI = ConstantInt::get(Stride->getType(), ScevStride->getAPInt());
8887 
8888     auto *ConstVPV = Plan->getVPValueOrAddLiveIn(CI);
8889     // The versioned value may not be used in the loop directly, so just add a
8890     // new live-in in those cases.
8891     Plan->getVPValueOrAddLiveIn(StrideV)->replaceAllUsesWith(ConstVPV);
8892   }
8893 
8894   // From this point onwards, VPlan-to-VPlan transformations may change the plan
8895   // in ways that make accessing values using the original IR values incorrect.
8896   Plan->disableValue2VPValue();
8897 
8898   // Sink users of fixed-order recurrence past the recipe defining the previous
8899   // value and introduce FirstOrderRecurrenceSplice VPInstructions.
8900   if (!VPlanTransforms::adjustFixedOrderRecurrences(*Plan, Builder))
8901     return nullptr;
8902 
8903   if (useActiveLaneMask(Style)) {
8904     // TODO: Move checks to VPlanTransforms::addActiveLaneMask once
8905     // TailFoldingStyle is visible there.
8906     bool ForControlFlow = useActiveLaneMaskForControlFlow(Style);
8907     bool WithoutRuntimeCheck =
8908         Style == TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck;
8909     VPlanTransforms::addActiveLaneMask(*Plan, ForControlFlow,
8910                                        WithoutRuntimeCheck);
8911   }
8912   return Plan;
8913 }
8914 
8915 VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) {
8916   // Outer loop handling: outer loops may require CFG and instruction-level
8917   // transformations before even evaluating whether vectorization is profitable.
8918   // Since we cannot modify the incoming IR, we need to build VPlan upfront in
8919   // the vectorization pipeline.
8920   assert(!OrigLoop->isInnermost());
8921   assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
8922 
8923   // Create new empty VPlan
8924   auto Plan = VPlan::createInitialVPlan(
8925       createTripCountSCEV(Legal->getWidestInductionType(), PSE, OrigLoop),
8926       *PSE.getSE());
8927 
8928   // Build hierarchical CFG
8929   VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI, *Plan);
8930   HCFGBuilder.buildHierarchicalCFG();
8931 
8932   for (ElementCount VF : Range)
8933     Plan->addVF(VF);
8934 
8935   VPlanTransforms::VPInstructionsToVPRecipes(
8936       Plan,
8937       [this](PHINode *P) { return Legal->getIntOrFpInductionDescriptor(P); },
8938       *PSE.getSE(), *TLI);
8939 
8940   // Remove the existing terminator of the exiting block of the top-most region.
8941   // A BranchOnCount will be added instead when adding the canonical IV recipes.
8942   auto *Term =
8943       Plan->getVectorLoopRegion()->getExitingBasicBlock()->getTerminator();
8944   Term->eraseFromParent();
8945 
8946   // Tail folding is not supported for outer loops, so the induction increment
8947   // is guaranteed to not wrap.
8948   bool HasNUW = true;
8949   addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(), HasNUW,
8950                         DebugLoc());
8951   return Plan;
8952 }
8953 
8954 // Adjust the recipes for reductions. For in-loop reductions the chain of
8955 // instructions leading from the loop exit instr to the phi needs to be converted
8956 // to reductions, with one operand being vector and the other being the scalar
8957 // reduction chain. For other reductions, a select is introduced between the phi
8958 // and live-out recipes when folding the tail.
8959 //
8960 // A ComputeReductionResult recipe is added to the middle block, also for
8961 // in-loop reductions which compute their result in-loop, because generating
8962 // the subsequent bc.merge.rdx phi is driven by ComputeReductionResult recipes.
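// Illustrative sketch for an in-loop integer add reduction (conceptual, not
// actual IR):
//   %rdx      = phi i32 [ %start, %ph ], [ %rdx.next, %latch ]
//   %wide     = load <VF x i32>, ...
//   %rdx.next = %rdx + reduce.add(%wide)      ; VPReductionRecipe
// i.e. each link keeps one scalar chain operand and reduces the vector
// operand, instead of widening the arithmetic itself.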
8963 void LoopVectorizationPlanner::adjustRecipesForReductions(
8964     VPBasicBlock *LatchVPBB, VPlanPtr &Plan, VPRecipeBuilder &RecipeBuilder,
8965     ElementCount MinVF) {
8966   VPRegionBlock *VectorLoopRegion = Plan->getVectorLoopRegion();
8967   VPBasicBlock *Header = VectorLoopRegion->getEntryBasicBlock();
8968   // Gather all VPReductionPHIRecipes and sort them so that intermediate stores
8969   // sunk outside of the loop keep the same order as they had in the original
8970   // loop.
8971   SmallVector<VPReductionPHIRecipe *> ReductionPHIList;
8972   for (VPRecipeBase &R : Header->phis()) {
8973     if (auto *ReductionPhi = dyn_cast<VPReductionPHIRecipe>(&R))
8974       ReductionPHIList.emplace_back(ReductionPhi);
8975   }
8976   bool HasIntermediateStore = false;
8977   stable_sort(ReductionPHIList,
8978               [this, &HasIntermediateStore](const VPReductionPHIRecipe *R1,
8979                                             const VPReductionPHIRecipe *R2) {
8980                 auto *IS1 = R1->getRecurrenceDescriptor().IntermediateStore;
8981                 auto *IS2 = R2->getRecurrenceDescriptor().IntermediateStore;
8982                 HasIntermediateStore |= IS1 || IS2;
8983 
8984                 // If neither of the recipes has an intermediate store, keep the
8985                 // order the same.
8986                 if (!IS1 && !IS2)
8987                   return false;
8988 
8989                 // If only one of the recipes has an intermediate store, then
8990                 // move it towards the beginning of the list.
8991                 if (IS1 && !IS2)
8992                   return true;
8993 
8994                 if (!IS1 && IS2)
8995                   return false;
8996 
8997                 // If both recipes have an intermediate store, then the recipe
8998                 // with the later store should be processed earlier. So it
8999                 // should go to the beginning of the list.
9000                 return DT->dominates(IS2, IS1);
9001               });
9002 
9003   if (HasIntermediateStore && ReductionPHIList.size() > 1)
9004     for (VPRecipeBase *R : ReductionPHIList)
9005       R->moveBefore(*Header, Header->getFirstNonPhi());
9006 
9007   for (VPRecipeBase &R : Header->phis()) {
9008     auto *PhiR = dyn_cast<VPReductionPHIRecipe>(&R);
9009     if (!PhiR || !PhiR->isInLoop() || (MinVF.isScalar() && !PhiR->isOrdered()))
9010       continue;
9011 
9012     const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor();
9013     RecurKind Kind = RdxDesc.getRecurrenceKind();
9014     assert(!RecurrenceDescriptor::isAnyOfRecurrenceKind(Kind) &&
9015            "AnyOf reductions are not allowed for in-loop reductions");
9016 
9017     // Collect the chain of "link" recipes for the reduction starting at PhiR.
9018     SetVector<VPSingleDefRecipe *> Worklist;
9019     Worklist.insert(PhiR);
9020     for (unsigned I = 0; I != Worklist.size(); ++I) {
9021       VPSingleDefRecipe *Cur = Worklist[I];
9022       for (VPUser *U : Cur->users()) {
9023         auto *UserRecipe = dyn_cast<VPSingleDefRecipe>(U);
9024         if (!UserRecipe) {
9025           assert(isa<VPLiveOut>(U) &&
9026                  "U must either be a VPSingleDef or VPLiveOut");
9027           continue;
9028         }
9029         Worklist.insert(UserRecipe);
9030       }
9031     }
9032 
9033     // Visit operation "Links" along the reduction chain top-down starting from
9034     // the phi until LoopExitValue. We keep track of the previous item
9035     // (PreviousLink) to tell which of the two operands of a Link will remain
9036     // scalar and which will be reduced. For minmax by select(cmp), Link will be
9037     // the select instruction.
9038     VPSingleDefRecipe *PreviousLink = PhiR; // Aka Worklist[0].
9039     for (VPSingleDefRecipe *CurrentLink : Worklist.getArrayRef().drop_front()) {
9040       Instruction *CurrentLinkI = CurrentLink->getUnderlyingInstr();
9041 
9042       // Index of the first operand which holds a non-mask vector operand.
9043       unsigned IndexOfFirstOperand;
9044       // Recognize a call to the llvm.fmuladd intrinsic.
9045       bool IsFMulAdd = (Kind == RecurKind::FMulAdd);
9046       VPValue *VecOp;
9047       VPBasicBlock *LinkVPBB = CurrentLink->getParent();
9048       if (IsFMulAdd) {
9049         assert(
9050             RecurrenceDescriptor::isFMulAddIntrinsic(CurrentLinkI) &&
9051             "Expected instruction to be a call to the llvm.fmuladd intrinsic");
9052         assert(((MinVF.isScalar() && isa<VPReplicateRecipe>(CurrentLink)) ||
9053                 isa<VPWidenCallRecipe>(CurrentLink)) &&
9054                CurrentLink->getOperand(2) == PreviousLink &&
9055                "expected a call where the previous link is the added operand");
9056 
9057         // If the instruction is a call to the llvm.fmuladd intrinsic then we
9058         // need to create an fmul recipe (multiplying the first two operands of
9059         // the fmuladd together) to use as the vector operand for the fadd
9060         // reduction.
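        // Conceptually (a sketch): acc = llvm.fmuladd(a, b, acc) becomes
        //   %mul = fmul <VF x T> %a, %b     ; reusing the call's fast-math flags
        //   acc  = fadd-reduction(acc, %mul)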
9061         VPInstruction *FMulRecipe = new VPInstruction(
9062             Instruction::FMul,
9063             {CurrentLink->getOperand(0), CurrentLink->getOperand(1)},
9064             CurrentLinkI->getFastMathFlags());
9065         LinkVPBB->insert(FMulRecipe, CurrentLink->getIterator());
9066         VecOp = FMulRecipe;
9067       } else {
9068         if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) {
9069           if (isa<VPWidenRecipe>(CurrentLink)) {
9070             assert(isa<CmpInst>(CurrentLinkI) &&
9071                    "need to have the compare of the select");
9072             continue;
9073           }
9074           assert(isa<VPWidenSelectRecipe>(CurrentLink) &&
9075                  "must be a select recipe");
9076           IndexOfFirstOperand = 1;
9077         } else {
9078           assert((MinVF.isScalar() || isa<VPWidenRecipe>(CurrentLink)) &&
9079                  "Expected to replace a VPWidenSC");
9080           IndexOfFirstOperand = 0;
9081         }
9082         // Note that for non-commutable operands (cmp-selects), the semantics of
9083         // the cmp-select are captured in the recurrence kind.
9084         unsigned VecOpId =
9085             CurrentLink->getOperand(IndexOfFirstOperand) == PreviousLink
9086                 ? IndexOfFirstOperand + 1
9087                 : IndexOfFirstOperand;
9088         VecOp = CurrentLink->getOperand(VecOpId);
9089         assert(VecOp != PreviousLink &&
9090                CurrentLink->getOperand(CurrentLink->getNumOperands() - 1 -
9091                                        (VecOpId - IndexOfFirstOperand)) ==
9092                    PreviousLink &&
9093                "PreviousLink must be the operand other than VecOp");
9094       }
9095 
9096       BasicBlock *BB = CurrentLinkI->getParent();
9097       VPValue *CondOp = nullptr;
9098       if (CM.blockNeedsPredicationForAnyReason(BB)) {
9099         VPBuilder::InsertPointGuard Guard(Builder);
9100         Builder.setInsertPoint(CurrentLink);
9101         CondOp = RecipeBuilder.getBlockInMask(BB);
9102       }
9103 
9104       VPReductionRecipe *RedRecipe = new VPReductionRecipe(
9105           RdxDesc, CurrentLinkI, PreviousLink, VecOp, CondOp);
9106       // Append the recipe to the end of the VPBasicBlock because we need to
9107       // ensure that it comes after all of its inputs, including CondOp.
9108       // Note that this transformation may leave over dead recipes (including
9109       // CurrentLink), which will be cleaned by a later VPlan transform.
9110       LinkVPBB->appendRecipe(RedRecipe);
9111       CurrentLink->replaceAllUsesWith(RedRecipe);
9112       PreviousLink = RedRecipe;
9113     }
9114   }
9115   Builder.setInsertPoint(&*LatchVPBB->begin());
9116   for (VPRecipeBase &R :
9117        Plan->getVectorLoopRegion()->getEntryBasicBlock()->phis()) {
9118     VPReductionPHIRecipe *PhiR = dyn_cast<VPReductionPHIRecipe>(&R);
9119     if (!PhiR)
9120       continue;
9121 
9122     const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor();
9123     // If tail is folded by masking, introduce selects between the phi
9124     // and the live-out instruction of each reduction, at the beginning of the
9125     // dedicated latch block.
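    // Conceptually (a sketch):
    //   %rdx.sel = select <VF x i1> %header.mask, %rdx.next, %rdx.phi
    // so lanes disabled by the folded tail keep the value from the previous
    // iteration.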
9126     auto *OrigExitingVPV = PhiR->getBackedgeValue();
9127     auto *NewExitingVPV = PhiR->getBackedgeValue();
9128     if (!PhiR->isInLoop() && CM.foldTailByMasking()) {
9129       VPValue *Cond = RecipeBuilder.getBlockInMask(OrigLoop->getHeader());
9130       assert(OrigExitingVPV->getDefiningRecipe()->getParent() != LatchVPBB &&
9131              "reduction recipe must be defined before latch");
9132       Type *PhiTy = PhiR->getOperand(0)->getLiveInIRValue()->getType();
9133       std::optional<FastMathFlags> FMFs =
9134           PhiTy->isFloatingPointTy()
9135               ? std::make_optional(RdxDesc.getFastMathFlags())
9136               : std::nullopt;
9137       NewExitingVPV =
9138           Builder.createSelect(Cond, OrigExitingVPV, PhiR, {}, "", FMFs);
9139       OrigExitingVPV->replaceUsesWithIf(NewExitingVPV, [](VPUser &U, unsigned) {
9140         return isa<VPInstruction>(&U) &&
9141                cast<VPInstruction>(&U)->getOpcode() ==
9142                    VPInstruction::ComputeReductionResult;
9143       });
9144       if (PreferPredicatedReductionSelect ||
9145           TTI.preferPredicatedReductionSelect(
9146               PhiR->getRecurrenceDescriptor().getOpcode(), PhiTy,
9147               TargetTransformInfo::ReductionFlags()))
9148         PhiR->setOperand(1, NewExitingVPV);
9149     }
9150 
9151     // If the vector reduction can be performed in a smaller type, we truncate
9152     // then extend the loop exit value to enable InstCombine to evaluate the
9153     // entire expression in the smaller type.
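    // For example (a sketch): an i32 add reduction whose values are known to
    // fit in i8 gets
    //   %narrow = trunc <VF x i32> %rdx.next to <VF x i8>
    //   %wide   = zext/sext <VF x i8> %narrow to <VF x i32>
    // and the extended value is what feeds the final reduction result.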
9154     Type *PhiTy = PhiR->getStartValue()->getLiveInIRValue()->getType();
9155     if (MinVF.isVector() && PhiTy != RdxDesc.getRecurrenceType()) {
9156       assert(!PhiR->isInLoop() && "Unexpected truncated inloop reduction!");
9157       Type *RdxTy = RdxDesc.getRecurrenceType();
9158       auto *Trunc =
9159           new VPWidenCastRecipe(Instruction::Trunc, NewExitingVPV, RdxTy);
9160       auto *Extnd =
9161           RdxDesc.isSigned()
9162               ? new VPWidenCastRecipe(Instruction::SExt, Trunc, PhiTy)
9163               : new VPWidenCastRecipe(Instruction::ZExt, Trunc, PhiTy);
9164 
9165       Trunc->insertAfter(NewExitingVPV->getDefiningRecipe());
9166       Extnd->insertAfter(Trunc);
9167       if (PhiR->getOperand(1) == NewExitingVPV)
9168         PhiR->setOperand(1, Extnd->getVPSingleValue());
9169       NewExitingVPV = Extnd;
9170     }
9171 
9172     // We want code in the middle block to appear to execute on the location of
9173     // the scalar loop's latch terminator because: (a) it is all compiler
9174     // generated, (b) these instructions are always executed after evaluating
9175     // the latch conditional branch, and (c) other passes may add new
9176     // predecessors which terminate on this line. This is the easiest way to
9177     // ensure we don't accidentally cause an extra step back into the loop while
9178     // debugging.
9179     DebugLoc ExitDL = OrigLoop->getLoopLatch()->getTerminator()->getDebugLoc();
9180 
9181     // TODO: At the moment ComputeReductionResult also drives creation of the
9182     // bc.merge.rdx phi nodes, hence it needs to be created unconditionally here
9183     // even for in-loop reductions, until the reduction resume value handling is
9184     // also modeled in VPlan.
9185     auto *FinalReductionResult = new VPInstruction(
9186         VPInstruction::ComputeReductionResult, {PhiR, NewExitingVPV}, ExitDL);
9187     cast<VPBasicBlock>(VectorLoopRegion->getSingleSuccessor())
9188         ->appendRecipe(FinalReductionResult);
9189     OrigExitingVPV->replaceUsesWithIf(
9190         FinalReductionResult,
9191         [](VPUser &User, unsigned) { return isa<VPLiveOut>(&User); });
9192   }
9193 
9194   VPlanTransforms::clearReductionWrapFlags(*Plan);
9195 }
9196 
9197 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
9198 void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent,
9199                                VPSlotTracker &SlotTracker) const {
9200   O << Indent << "INTERLEAVE-GROUP with factor " << IG->getFactor() << " at ";
9201   IG->getInsertPos()->printAsOperand(O, false);
9202   O << ", ";
9203   getAddr()->printAsOperand(O, SlotTracker);
9204   VPValue *Mask = getMask();
9205   if (Mask) {
9206     O << ", ";
9207     Mask->printAsOperand(O, SlotTracker);
9208   }
9209 
9210   unsigned OpIdx = 0;
9211   for (unsigned i = 0; i < IG->getFactor(); ++i) {
9212     if (!IG->getMember(i))
9213       continue;
9214     if (getNumStoreOperands() > 0) {
9215       O << "\n" << Indent << "  store ";
9216       getOperand(1 + OpIdx)->printAsOperand(O, SlotTracker);
9217       O << " to index " << i;
9218     } else {
9219       O << "\n" << Indent << "  ";
9220       getVPValue(OpIdx)->printAsOperand(O, SlotTracker);
9221       O << " = load from index " << i;
9222     }
9223     ++OpIdx;
9224   }
9225 }
9226 #endif
9227 
9228 void VPWidenPointerInductionRecipe::execute(VPTransformState &State) {
9229   assert(IndDesc.getKind() == InductionDescriptor::IK_PtrInduction &&
9230          "Not a pointer induction according to InductionDescriptor!");
9231   assert(cast<PHINode>(getUnderlyingInstr())->getType()->isPointerTy() &&
9232          "Unexpected type.");
9233 
9234   auto *IVR = getParent()->getPlan()->getCanonicalIV();
9235   PHINode *CanonicalIV = cast<PHINode>(State.get(IVR, 0));
9236 
9237   if (onlyScalarsGenerated(State.VF)) {
9238     // This is the normalized GEP that starts counting at zero.
9239     Value *PtrInd = State.Builder.CreateSExtOrTrunc(
9240         CanonicalIV, IndDesc.getStep()->getType());
9241     // Determine the number of scalars we need to generate for each unroll
9242     // iteration. If the instruction is uniform, we only need to generate the
9243     // first lane. Otherwise, we generate all VF values.
9244     bool IsUniform = vputils::onlyFirstLaneUsed(this);
9245     assert((IsUniform || !State.VF.isScalable()) &&
9246            "Cannot scalarize a scalable VF");
9247     unsigned Lanes = IsUniform ? 1 : State.VF.getFixedValue();
9248 
9249     for (unsigned Part = 0; Part < State.UF; ++Part) {
9250       Value *PartStart =
9251           createStepForVF(State.Builder, PtrInd->getType(), State.VF, Part);
9252 
9253       for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
9254         Value *Idx = State.Builder.CreateAdd(
9255             PartStart, ConstantInt::get(PtrInd->getType(), Lane));
9256         Value *GlobalIdx = State.Builder.CreateAdd(PtrInd, Idx);
9257 
9258         Value *Step = State.get(getOperand(1), VPIteration(Part, Lane));
9259         Value *SclrGep = emitTransformedIndex(
9260             State.Builder, GlobalIdx, IndDesc.getStartValue(), Step,
9261             IndDesc.getKind(), IndDesc.getInductionBinOp());
9262         SclrGep->setName("next.gep");
9263         State.set(this, SclrGep, VPIteration(Part, Lane));
9264       }
9265     }
9266     return;
9267   }
9268 
9269   Type *PhiType = IndDesc.getStep()->getType();
9270 
9271   // Build a pointer phi
9272   Value *ScalarStartValue = getStartValue()->getLiveInIRValue();
9273   Type *ScStValueType = ScalarStartValue->getType();
9274   PHINode *NewPointerPhi =
9275       PHINode::Create(ScStValueType, 2, "pointer.phi", CanonicalIV);
9276 
9277   BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);
9278   NewPointerPhi->addIncoming(ScalarStartValue, VectorPH);
9279 
9280   // A pointer induction, performed by using a gep
9281   Instruction *InductionLoc = &*State.Builder.GetInsertPoint();
9282 
9283   Value *ScalarStepValue = State.get(getOperand(1), VPIteration(0, 0));
9284   Value *RuntimeVF = getRuntimeVF(State.Builder, PhiType, State.VF);
9285   Value *NumUnrolledElems =
9286       State.Builder.CreateMul(RuntimeVF, ConstantInt::get(PhiType, State.UF));
9287   Value *InductionGEP = GetElementPtrInst::Create(
9288       State.Builder.getInt8Ty(), NewPointerPhi,
9289       State.Builder.CreateMul(ScalarStepValue, NumUnrolledElems), "ptr.ind",
9290       InductionLoc);
9291   // Add induction update using an incorrect block temporarily. The phi node
9292   // will be fixed after VPlan execution. Note that at this point the latch
9293   // block cannot be used, as it does not exist yet.
9294   // TODO: Model increment value in VPlan, by turning the recipe into a
9295   // multi-def and a subclass of VPHeaderPHIRecipe.
9296   NewPointerPhi->addIncoming(InductionGEP, VectorPH);
9297 
9298   // Create UF many actual address geps that use the pointer
9299   // phi as base and a vectorized version of the step value
9300   // (<step*0, ..., step*N>) as offset.
9301   for (unsigned Part = 0; Part < State.UF; ++Part) {
9302     Type *VecPhiType = VectorType::get(PhiType, State.VF);
9303     Value *StartOffsetScalar =
9304         State.Builder.CreateMul(RuntimeVF, ConstantInt::get(PhiType, Part));
9305     Value *StartOffset =
9306         State.Builder.CreateVectorSplat(State.VF, StartOffsetScalar);
9307     // Create a vector of consecutive numbers from zero to VF.
9308     StartOffset = State.Builder.CreateAdd(
9309         StartOffset, State.Builder.CreateStepVector(VecPhiType));
9310 
9311     assert(ScalarStepValue == State.get(getOperand(1), VPIteration(Part, 0)) &&
9312            "scalar step must be the same across all parts");
9313     Value *GEP = State.Builder.CreateGEP(
9314         State.Builder.getInt8Ty(), NewPointerPhi,
9315         State.Builder.CreateMul(
9316             StartOffset,
9317             State.Builder.CreateVectorSplat(State.VF, ScalarStepValue),
9318             "vector.gep"));
9319     State.set(this, GEP, Part);
9320   }
9321 }
9322 
9323 void VPDerivedIVRecipe::execute(VPTransformState &State) {
9324   assert(!State.Instance && "VPDerivedIVRecipe being replicated.");
9325 
9326   // Fast-math-flags propagate from the original induction instruction.
9327   IRBuilder<>::FastMathFlagGuard FMFG(State.Builder);
9328   if (FPBinOp)
9329     State.Builder.setFastMathFlags(FPBinOp->getFastMathFlags());
9330 
9331   Value *Step = State.get(getStepValue(), VPIteration(0, 0));
9332   Value *CanonicalIV = State.get(getCanonicalIV(), VPIteration(0, 0));
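  // Illustrative note: for a plain integer induction this computes roughly
  // StartValue + CanonicalIV * Step; the exact expression produced by
  // emitTransformedIndex depends on the induction kind.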
9333   Value *DerivedIV = emitTransformedIndex(
9334       State.Builder, CanonicalIV, getStartValue()->getLiveInIRValue(), Step,
9335       Kind, cast_if_present<BinaryOperator>(FPBinOp));
9336   DerivedIV->setName("offset.idx");
9337   if (TruncResultTy) {
9338     assert(TruncResultTy != DerivedIV->getType() &&
9339            Step->getType()->isIntegerTy() &&
9340            "Truncation requires an integer step");
9341     DerivedIV = State.Builder.CreateTrunc(DerivedIV, TruncResultTy);
9342   }
9343   assert(DerivedIV != CanonicalIV && "IV didn't need transforming?");
9344 
9345   State.set(this, DerivedIV, VPIteration(0, 0));
9346 }
9347 
9348 void VPInterleaveRecipe::execute(VPTransformState &State) {
9349   assert(!State.Instance && "Interleave group being replicated.");
9350   State.ILV->vectorizeInterleaveGroup(IG, definedValues(), State, getAddr(),
9351                                       getStoredValues(), getMask(),
9352                                       NeedsMaskForGaps);
9353 }
9354 
9355 void VPReductionRecipe::execute(VPTransformState &State) {
9356   assert(!State.Instance && "Reduction being replicated.");
9357   Value *PrevInChain = State.get(getChainOp(), 0);
9358   RecurKind Kind = RdxDesc.getRecurrenceKind();
9359   bool IsOrdered = State.ILV->useOrderedReductions(RdxDesc);
9360   // Propagate the fast-math flags carried by the underlying instruction.
9361   IRBuilderBase::FastMathFlagGuard FMFGuard(State.Builder);
9362   State.Builder.setFastMathFlags(RdxDesc.getFastMathFlags());
9363   for (unsigned Part = 0; Part < State.UF; ++Part) {
9364     Value *NewVecOp = State.get(getVecOp(), Part);
9365     if (VPValue *Cond = getCondOp()) {
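      // Illustrative note: lanes where the condition is false are blended
      // with the identity of the recurrence (e.g. 0 for an integer add, 1 for
      // a multiply), so they do not affect the reduced value.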
9366       Value *NewCond = State.VF.isVector() ? State.get(Cond, Part)
9367                                            : State.get(Cond, {Part, 0});
9368       VectorType *VecTy = dyn_cast<VectorType>(NewVecOp->getType());
9369       Type *ElementTy = VecTy ? VecTy->getElementType() : NewVecOp->getType();
9370       Value *Iden = RdxDesc.getRecurrenceIdentity(Kind, ElementTy,
9371                                                   RdxDesc.getFastMathFlags());
9372       if (State.VF.isVector()) {
9373         Iden =
9374             State.Builder.CreateVectorSplat(VecTy->getElementCount(), Iden);
9375       }
9376 
9377       Value *Select = State.Builder.CreateSelect(NewCond, NewVecOp, Iden);
9378       NewVecOp = Select;
9379     }
9380     Value *NewRed;
9381     Value *NextInChain;
9382     if (IsOrdered) {
9383       if (State.VF.isVector())
9384         NewRed = createOrderedReduction(State.Builder, RdxDesc, NewVecOp,
9385                                         PrevInChain);
9386       else
9387         NewRed = State.Builder.CreateBinOp(
9388             (Instruction::BinaryOps)RdxDesc.getOpcode(Kind), PrevInChain,
9389             NewVecOp);
9390       PrevInChain = NewRed;
9391     } else {
9392       PrevInChain = State.get(getChainOp(), Part);
9393       NewRed = createTargetReduction(State.Builder, RdxDesc, NewVecOp);
9394     }
9395     if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) {
9396       NextInChain = createMinMaxOp(State.Builder, RdxDesc.getRecurrenceKind(),
9397                                    NewRed, PrevInChain);
9398     } else if (IsOrdered)
9399       NextInChain = NewRed;
9400     else
9401       NextInChain = State.Builder.CreateBinOp(
9402           (Instruction::BinaryOps)RdxDesc.getOpcode(Kind), NewRed, PrevInChain);
9403     State.set(this, NextInChain, Part);
9404   }
9405 }
9406 
9407 void VPReplicateRecipe::execute(VPTransformState &State) {
9408   Instruction *UI = getUnderlyingInstr();
9409   if (State.Instance) { // Generate a single instance.
9410     assert(!State.VF.isScalable() && "Can't scalarize a scalable vector");
9411     State.ILV->scalarizeInstruction(UI, this, *State.Instance, State);
9412     // Insert scalar instance packing it into a vector.
9413     if (State.VF.isVector() && shouldPack()) {
9414       // If we're constructing lane 0, initialize to start from poison.
9415       if (State.Instance->Lane.isFirstLane()) {
9416         assert(!State.VF.isScalable() && "VF is assumed to be non scalable.");
9417         Value *Poison = PoisonValue::get(
9418             VectorType::get(UI->getType(), State.VF));
9419         State.set(this, Poison, State.Instance->Part);
9420       }
9421       State.packScalarIntoVectorValue(this, *State.Instance);
9422     }
9423     return;
9424   }
9425 
9426   if (IsUniform) {
9427     // If the recipe is uniform across all parts (instead of just per VF), only
9428     // generate a single instance.
9429     if ((isa<LoadInst>(UI) || isa<StoreInst>(UI)) &&
9430         all_of(operands(), [](VPValue *Op) {
9431           return Op->isDefinedOutsideVectorRegions();
9432         })) {
9433       State.ILV->scalarizeInstruction(UI, this, VPIteration(0, 0), State);
9434       if (user_begin() != user_end()) {
9435         for (unsigned Part = 1; Part < State.UF; ++Part)
9436           State.set(this, State.get(this, VPIteration(0, 0)),
9437                     VPIteration(Part, 0));
9438       }
9439       return;
9440     }
9441 
9442     // Uniform within VL means we need to generate lane 0 only for each
9443     // unrolled copy.
9444     for (unsigned Part = 0; Part < State.UF; ++Part)
9445       State.ILV->scalarizeInstruction(UI, this, VPIteration(Part, 0), State);
9446     return;
9447   }
9448 
9449   // A store of a loop varying value to a uniform address only needs the last
9450   // copy of the store.
9451   if (isa<StoreInst>(UI) &&
9452       vputils::isUniformAfterVectorization(getOperand(1))) {
9453     auto Lane = VPLane::getLastLaneForVF(State.VF);
9454     State.ILV->scalarizeInstruction(UI, this, VPIteration(State.UF - 1, Lane),
9455                                     State);
9456     return;
9457   }
9458 
9459   // Generate scalar instances for all VF lanes of all UF parts.
9460   assert(!State.VF.isScalable() && "Can't scalarize a scalable vector");
9461   const unsigned EndLane = State.VF.getKnownMinValue();
9462   for (unsigned Part = 0; Part < State.UF; ++Part)
9463     for (unsigned Lane = 0; Lane < EndLane; ++Lane)
9464       State.ILV->scalarizeInstruction(UI, this, VPIteration(Part, Lane), State);
9465 }
9466 
9467 void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) {
9468   VPValue *StoredValue = isStore() ? getStoredValue() : nullptr;
9469 
9470   // Attempt to issue a wide load.
9471   LoadInst *LI = dyn_cast<LoadInst>(&Ingredient);
9472   StoreInst *SI = dyn_cast<StoreInst>(&Ingredient);
9473 
9474   assert((LI || SI) && "Invalid Load/Store instruction");
9475   assert((!SI || StoredValue) && "No stored value provided for widened store");
9476   assert((!LI || !StoredValue) && "Stored value provided for widened load");
9477 
9478   Type *ScalarDataTy = getLoadStoreType(&Ingredient);
9479 
9480   auto *DataTy = VectorType::get(ScalarDataTy, State.VF);
9481   const Align Alignment = getLoadStoreAlignment(&Ingredient);
9482   bool CreateGatherScatter = !isConsecutive();
9483 
9484   auto &Builder = State.Builder;
9485   InnerLoopVectorizer::VectorParts BlockInMaskParts(State.UF);
9486   bool isMaskRequired = getMask();
9487   if (isMaskRequired) {
9488     // Mask reversal is only needed for non-all-one masks; a null mask stands
9489     // for an all-one mask, and its reverse is again a null (all-one) mask.
9490     for (unsigned Part = 0; Part < State.UF; ++Part) {
9491       Value *Mask = State.get(getMask(), Part);
9492       if (isReverse())
9493         Mask = Builder.CreateVectorReverse(Mask, "reverse");
9494       BlockInMaskParts[Part] = Mask;
9495     }
9496   }
9497 
9498   // Handle Stores:
9499   if (SI) {
9500     State.setDebugLocFrom(SI->getDebugLoc());
9501 
9502     for (unsigned Part = 0; Part < State.UF; ++Part) {
9503       Instruction *NewSI = nullptr;
9504       Value *StoredVal = State.get(StoredValue, Part);
9505       if (CreateGatherScatter) {
9506         Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr;
9507         Value *VectorGep = State.get(getAddr(), Part);
9508         NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep, Alignment,
9509                                             MaskPart);
9510       } else {
9511         if (isReverse()) {
9512           // If we store to reverse consecutive memory locations, then we need
9513           // to reverse the order of elements in the stored value.
9514           StoredVal = Builder.CreateVectorReverse(StoredVal, "reverse");
9515           // We don't want to update the value in the map as it might be used in
9516           // another expression. So don't call resetVectorValue(StoredVal).
9517         }
9518         auto *VecPtr = State.get(getAddr(), Part);
9519         if (isMaskRequired)
9520           NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr, Alignment,
9521                                             BlockInMaskParts[Part]);
9522         else
9523           NewSI = Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment);
9524       }
9525       State.addMetadata(NewSI, SI);
9526     }
9527     return;
9528   }
9529 
9530   // Handle loads.
9531   assert(LI && "Must have a load instruction");
9532   State.setDebugLocFrom(LI->getDebugLoc());
9533   for (unsigned Part = 0; Part < State.UF; ++Part) {
9534     Value *NewLI;
9535     if (CreateGatherScatter) {
9536       Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr;
9537       Value *VectorGep = State.get(getAddr(), Part);
9538       NewLI = Builder.CreateMaskedGather(DataTy, VectorGep, Alignment, MaskPart,
9539                                          nullptr, "wide.masked.gather");
9540       State.addMetadata(NewLI, LI);
9541     } else {
9542       auto *VecPtr = State.get(getAddr(), Part);
9543       if (isMaskRequired)
9544         NewLI = Builder.CreateMaskedLoad(
9545             DataTy, VecPtr, Alignment, BlockInMaskParts[Part],
9546             PoisonValue::get(DataTy), "wide.masked.load");
9547       else
9548         NewLI =
9549             Builder.CreateAlignedLoad(DataTy, VecPtr, Alignment, "wide.load");
9550 
9551       // Add metadata to the load, but setVectorValue to the reverse shuffle.
9552       State.addMetadata(NewLI, LI);
9553       if (Reverse)
9554         NewLI = Builder.CreateVectorReverse(NewLI, "reverse");
9555     }
9556 
9557     State.set(getVPSingleValue(), NewLI, Part);
9558   }
9559 }
9560 
9561 // Determine how to lower the scalar epilogue, which depends on 1) optimising
9562 // for minimum code-size, 2) predicate compiler options, 3) loop hints forcing
9563 // predication, and 4) a TTI hook that analyses whether the loop is suitable
9564 // for predication.
9565 static ScalarEpilogueLowering getScalarEpilogueLowering(
9566     Function *F, Loop *L, LoopVectorizeHints &Hints, ProfileSummaryInfo *PSI,
9567     BlockFrequencyInfo *BFI, TargetTransformInfo *TTI, TargetLibraryInfo *TLI,
9568     LoopVectorizationLegality &LVL, InterleavedAccessInfo *IAI) {
9569   // 1) OptSize takes precedence over all other options, i.e. if this is set,
9570   // don't look at hints or options, and don't request a scalar epilogue.
9571   // (For PGSO, as shouldOptimizeForSize isn't currently accessible from
9572   // LoopAccessInfo (due to code dependency and not being able to reliably get
9573   // PSI/BFI from a loop analysis under NPM), we cannot suppress the collection
9574   // of strides in LoopAccessInfo::analyzeLoop() and vectorize without
9575   // versioning when the vectorization is forced, unlike hasOptSize. So revert
9576   // back to the old way and vectorize with versioning when forced. See D81345.)
9577   if (F->hasOptSize() || (llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI,
9578                                                       PGSOQueryType::IRPass) &&
9579                           Hints.getForce() != LoopVectorizeHints::FK_Enabled))
9580     return CM_ScalarEpilogueNotAllowedOptSize;
9581 
9582   // 2) If set, obey the directives
9583   if (PreferPredicateOverEpilogue.getNumOccurrences()) {
9584     switch (PreferPredicateOverEpilogue) {
9585     case PreferPredicateTy::ScalarEpilogue:
9586       return CM_ScalarEpilogueAllowed;
9587     case PreferPredicateTy::PredicateElseScalarEpilogue:
9588       return CM_ScalarEpilogueNotNeededUsePredicate;
9589     case PreferPredicateTy::PredicateOrDontVectorize:
9590       return CM_ScalarEpilogueNotAllowedUsePredicate;
9591     };
9592   }
9593 
9594   // 3) If set, obey the hints
9595   switch (Hints.getPredicate()) {
9596   case LoopVectorizeHints::FK_Enabled:
9597     return CM_ScalarEpilogueNotNeededUsePredicate;
9598   case LoopVectorizeHints::FK_Disabled:
9599     return CM_ScalarEpilogueAllowed;
9600   };
9601 
9602   // 4) If the TTI hook indicates this is profitable, request predication.
9603   TailFoldingInfo TFI(TLI, &LVL, IAI);
9604   if (TTI->preferPredicateOverEpilogue(&TFI))
9605     return CM_ScalarEpilogueNotNeededUsePredicate;
9606 
9607   return CM_ScalarEpilogueAllowed;
9608 }
9609 
9610 // Process the loop in the VPlan-native vectorization path. This path builds
9611 // VPlan upfront in the vectorization pipeline, which allows VPlan-to-VPlan
9612 // transformations to be applied from the very beginning without modifying the
9613 // input LLVM IR.
9614 static bool processLoopInVPlanNativePath(
9615     Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT,
9616     LoopVectorizationLegality *LVL, TargetTransformInfo *TTI,
9617     TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC,
9618     OptimizationRemarkEmitter *ORE, BlockFrequencyInfo *BFI,
9619     ProfileSummaryInfo *PSI, LoopVectorizeHints &Hints,
9620     LoopVectorizationRequirements &Requirements) {
9621 
9622   if (isa<SCEVCouldNotCompute>(PSE.getBackedgeTakenCount())) {
9623     LLVM_DEBUG(dbgs() << "LV: cannot compute the outer-loop trip count\n");
9624     return false;
9625   }
9626   assert(EnableVPlanNativePath && "VPlan-native path is disabled.");
9627   Function *F = L->getHeader()->getParent();
9628   InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI());
9629 
9630   ScalarEpilogueLowering SEL =
9631       getScalarEpilogueLowering(F, L, Hints, PSI, BFI, TTI, TLI, *LVL, &IAI);
9632 
9633   LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F,
9634                                 &Hints, IAI);
9635   // Use the planner for outer loop vectorization.
9636   // TODO: CM is not used at this point inside the planner. Turn CM into an
9637   // optional argument if we don't need it in the future.
9638   LoopVectorizationPlanner LVP(L, LI, DT, TLI, *TTI, LVL, CM, IAI, PSE, Hints,
9639                                ORE);
9640 
9641   // Get user vectorization factor.
9642   ElementCount UserVF = Hints.getWidth();
9643 
9644   CM.collectElementTypesForWidening();
9645 
9646   // Plan how to best vectorize, return the best VF and its cost.
9647   const VectorizationFactor VF = LVP.planInVPlanNativePath(UserVF);
9648 
9649   // If we are stress testing VPlan builds, do not attempt to generate vector
9650   // code. Masked vector code generation support will follow soon.
9651   // Also, do not attempt to vectorize if no vector code will be produced.
9652   if (VPlanBuildStressTest || VectorizationFactor::Disabled() == VF)
9653     return false;
9654 
9655   VPlan &BestPlan = LVP.getBestPlanFor(VF.Width);
9656 
9657   {
9658     bool AddBranchWeights =
9659         hasBranchWeightMD(*L->getLoopLatch()->getTerminator());
9660     GeneratedRTChecks Checks(*PSE.getSE(), DT, LI, TTI,
9661                              F->getParent()->getDataLayout(), AddBranchWeights);
9662     InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width,
9663                            VF.Width, 1, LVL, &CM, BFI, PSI, Checks);
9664     LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \""
9665                       << L->getHeader()->getParent()->getName() << "\"\n");
9666     LVP.executePlan(VF.Width, 1, BestPlan, LB, DT, false);
9667   }
9668 
9669   reportVectorization(ORE, L, VF, 1);
9670 
9671   // Mark the loop as already vectorized to avoid vectorizing again.
9672   Hints.setAlreadyVectorized();
9673   assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs()));
9674   return true;
9675 }
9676 
9677 // Emit a remark if there are stores to floats that required a floating point
9678 // extension. If the vectorized loop performs its computation in a wider
9679 // floating point type, there will be a performance penalty from the
9680 // conversion overhead and the change in the vector width.
9681 static void checkMixedPrecision(Loop *L, OptimizationRemarkEmitter *ORE) {
9682   SmallVector<Instruction *, 4> Worklist;
9683   for (BasicBlock *BB : L->getBlocks()) {
9684     for (Instruction &Inst : *BB) {
9685       if (auto *S = dyn_cast<StoreInst>(&Inst)) {
9686         if (S->getValueOperand()->getType()->isFloatTy())
9687           Worklist.push_back(S);
9688       }
9689     }
9690   }
9691 
9692   // Traverse upwards from the floating point stores, searching for floating
9693   // point conversions.
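  // Illustrative example: a float store whose stored value is (transitively)
  // computed from an fpext, as when arithmetic is done in double and the
  // result stored as float, will reach the remark below.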
9694   SmallPtrSet<const Instruction *, 4> Visited;
9695   SmallPtrSet<const Instruction *, 4> EmittedRemark;
9696   while (!Worklist.empty()) {
9697     auto *I = Worklist.pop_back_val();
9698     if (!L->contains(I))
9699       continue;
9700     if (!Visited.insert(I).second)
9701       continue;
9702 
9703     // Emit a remark if the floating point store required a floating
9704     // point conversion.
9705     // TODO: More work could be done to identify the root cause such as a
9706     // constant or a function return type and point the user to it.
9707     if (isa<FPExtInst>(I) && EmittedRemark.insert(I).second)
9708       ORE->emit([&]() {
9709         return OptimizationRemarkAnalysis(LV_NAME, "VectorMixedPrecision",
9710                                           I->getDebugLoc(), L->getHeader())
9711                << "floating point conversion changes vector width. "
9712                << "Mixed floating point precision requires an up/down "
9713                << "cast that will negatively impact performance.";
9714       });
9715 
9716     for (Use &Op : I->operands())
9717       if (auto *OpI = dyn_cast<Instruction>(Op))
9718         Worklist.push_back(OpI);
9719   }
9720 }
9721 
9722 static bool areRuntimeChecksProfitable(GeneratedRTChecks &Checks,
9723                                        VectorizationFactor &VF,
9724                                        std::optional<unsigned> VScale, Loop *L,
9725                                        ScalarEvolution &SE,
9726                                        ScalarEpilogueLowering SEL) {
9727   InstructionCost CheckCost = Checks.getCost();
9728   if (!CheckCost.isValid())
9729     return false;
9730 
9731   // When only interleaving, the scalar and vector costs will be equal, which
9732   // in turn would lead to a divide by 0. Fall back to a hard threshold.
9733   if (VF.Width.isScalar()) {
9734     if (CheckCost > VectorizeMemoryCheckThreshold) {
9735       LLVM_DEBUG(
9736           dbgs()
9737           << "LV: Interleaving only is not profitable due to runtime checks\n");
9738       return false;
9739     }
9740     return true;
9741   }
9742 
9743   // The scalar cost should only be 0 when vectorizing with a user specified
  // VF/IC. In those cases, runtime checks should always be generated.
9744   double ScalarC = *VF.ScalarCost.getValue();
9745   if (ScalarC == 0)
9746     return true;
9747 
9748   // First, compute the minimum iteration count required so that the vector
9749   // loop outperforms the scalar loop.
9750   //  The total cost of the scalar loop is
9751   //   ScalarC * TC
9752   //  where
9753   //  * TC is the actual trip count of the loop.
9754   //  * ScalarC is the cost of a single scalar iteration.
9755   //
9756   //  The total cost of the vector loop is
9757   //    RtC + VecC * (TC / VF) + EpiC
9758   //  where
9759   //  * RtC is the cost of the generated runtime checks
9760   //  * VecC is the cost of a single vector iteration.
9761   //  * TC is the actual trip count of the loop
9762   //  * VF is the vectorization factor
9763   //  * EpiC is the cost of the generated epilogue, including the cost
9764   //    of the remaining scalar operations.
9765   //
9766   // Vectorization is profitable once the total vector cost is less than the
9767   // total scalar cost:
9768   //   RtC + VecC * (TC / VF) + EpiC <  ScalarC * TC
9769   //
9770   // Now we can compute the minimum required trip count TC as
9771   //   (RtC + EpiC) / (ScalarC - (VecC / VF)) < TC
9772   //
9773   // For now we assume the epilogue cost EpiC = 0 for simplicity. Note that
9774   // the computations are performed on doubles, not integers and the result
9775   // is rounded up, hence we get an upper estimate of the TC.
9776   unsigned IntVF = VF.Width.getKnownMinValue();
9777   if (VF.Width.isScalable()) {
9778     unsigned AssumedMinimumVscale = 1;
9779     if (VScale)
9780       AssumedMinimumVscale = *VScale;
9781     IntVF *= AssumedMinimumVscale;
9782   }
9783   double VecCOverVF = double(*VF.Cost.getValue()) / IntVF;
9784   double RtC = *CheckCost.getValue();
9785   double MinTC1 = RtC / (ScalarC - VecCOverVF);
9786 
9787   // Second, compute a minimum iteration count so that the cost of the
9788   // runtime checks is only a fraction of the total scalar loop cost. This
9789   // adds a loop-dependent bound on the overhead incurred if the runtime
9790   // checks fail. In case the runtime checks fail, the cost is RtC + ScalarC
9791   // * TC. To bound the runtime check to be a fraction 1/X of the scalar
9792   // cost, compute
9793   //   RtC < ScalarC * TC * (1 / X)  ==>  RtC * X / ScalarC < TC
9794   double MinTC2 = RtC * 10 / ScalarC;
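  // Illustrative (hypothetical) numbers: with ScalarC = 8, VecCOverVF = 5 and
  // RtC = 30, MinTC1 = 30 / (8 - 5) = 10 and MinTC2 = 30 * 10 / 8 = 37.5, so
  // the combined minimum below becomes ceil(37.5) = 38 (rounded up to the
  // next multiple of the VF when a scalar epilogue is allowed).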
9795 
9796   // Now pick the larger minimum. If it is not a multiple of VF and a scalar
9797   // epilogue is allowed, choose the next closest multiple of VF. This should
9798   // partly compensate for ignoring the epilogue cost.
9799   uint64_t MinTC = std::ceil(std::max(MinTC1, MinTC2));
9800   if (SEL == CM_ScalarEpilogueAllowed)
9801     MinTC = alignTo(MinTC, IntVF);
9802   VF.MinProfitableTripCount = ElementCount::getFixed(MinTC);
9803 
9804   LLVM_DEBUG(
9805       dbgs() << "LV: Minimum required TC for runtime checks to be profitable:"
9806              << VF.MinProfitableTripCount << "\n");
9807 
9808   // Skip vectorization if the expected trip count is less than the minimum
9809   // required trip count.
9810   if (auto ExpectedTC = getSmallBestKnownTC(SE, L)) {
9811     if (ElementCount::isKnownLT(ElementCount::getFixed(*ExpectedTC),
9812                                 VF.MinProfitableTripCount)) {
9813       LLVM_DEBUG(dbgs() << "LV: Vectorization is not beneficial: expected "
9814                            "trip count < minimum profitable VF ("
9815                         << *ExpectedTC << " < " << VF.MinProfitableTripCount
9816                         << ")\n");
9817 
9818       return false;
9819     }
9820   }
9821   return true;
9822 }
9823 
9824 LoopVectorizePass::LoopVectorizePass(LoopVectorizeOptions Opts)
9825     : InterleaveOnlyWhenForced(Opts.InterleaveOnlyWhenForced ||
9826                                !EnableLoopInterleaving),
9827       VectorizeOnlyWhenForced(Opts.VectorizeOnlyWhenForced ||
9828                               !EnableLoopVectorization) {}
9829 
9830 bool LoopVectorizePass::processLoop(Loop *L) {
9831   assert((EnableVPlanNativePath || L->isInnermost()) &&
9832          "VPlan-native path is not enabled. Only process inner loops.");
9833 
9834 #ifndef NDEBUG
9835   const std::string DebugLocStr = getDebugLocString(L);
9836 #endif /* NDEBUG */
9837 
9838   LLVM_DEBUG(dbgs() << "\nLV: Checking a loop in '"
9839                     << L->getHeader()->getParent()->getName() << "' from "
9840                     << DebugLocStr << "\n");
9841 
9842   LoopVectorizeHints Hints(L, InterleaveOnlyWhenForced, *ORE, TTI);
9843 
9844   LLVM_DEBUG(
9845       dbgs() << "LV: Loop hints:"
9846              << " force="
9847              << (Hints.getForce() == LoopVectorizeHints::FK_Disabled
9848                      ? "disabled"
9849                      : (Hints.getForce() == LoopVectorizeHints::FK_Enabled
9850                             ? "enabled"
9851                             : "?"))
9852              << " width=" << Hints.getWidth()
9853              << " interleave=" << Hints.getInterleave() << "\n");
9854 
9855   // Function containing loop
9856   Function *F = L->getHeader()->getParent();
9857 
9858   // Looking at the diagnostic output is the only way to determine if a loop
9859   // was vectorized (other than looking at the IR or machine code), so it
9860   // is important to generate an optimization remark for each loop. Most of
9861   // these messages are generated as OptimizationRemarkAnalysis. Remarks
9862   // generated as OptimizationRemark and OptimizationRemarkMissed are
9863   // less verbose reporting vectorized loops and unvectorized loops that may
9864   // benefit from vectorization, respectively.
9865 
9866   if (!Hints.allowVectorization(F, L, VectorizeOnlyWhenForced)) {
9867     LLVM_DEBUG(dbgs() << "LV: Loop hints prevent vectorization.\n");
9868     return false;
9869   }
9870 
9871   PredicatedScalarEvolution PSE(*SE, *L);
9872 
9873   // Check if it is legal to vectorize the loop.
9874   LoopVectorizationRequirements Requirements;
9875   LoopVectorizationLegality LVL(L, PSE, DT, TTI, TLI, F, *LAIs, LI, ORE,
9876                                 &Requirements, &Hints, DB, AC, BFI, PSI);
9877   if (!LVL.canVectorize(EnableVPlanNativePath)) {
9878     LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n");
9879     Hints.emitRemarkWithHints();
9880     return false;
9881   }
9882 
9883   // Entrance to the VPlan-native vectorization path. Outer loops are processed
9884   // here. They may require CFG and instruction level transformations before
9885   // even evaluating whether vectorization is profitable. Since we cannot modify
9886   // the incoming IR, we need to build VPlan upfront in the vectorization
9887   // pipeline.
9888   if (!L->isInnermost())
9889     return processLoopInVPlanNativePath(L, PSE, LI, DT, &LVL, TTI, TLI, DB, AC,
9890                                         ORE, BFI, PSI, Hints, Requirements);
9891 
9892   assert(L->isInnermost() && "Inner loop expected.");
9893 
9894   InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL.getLAI());
9895   bool UseInterleaved = TTI->enableInterleavedAccessVectorization();
9896 
9897   // If an override option has been passed in for interleaved accesses, use it.
9898   if (EnableInterleavedMemAccesses.getNumOccurrences() > 0)
9899     UseInterleaved = EnableInterleavedMemAccesses;
9900 
9901   // Analyze interleaved memory accesses.
9902   if (UseInterleaved)
9903     IAI.analyzeInterleaving(useMaskedInterleavedAccesses(*TTI));
9904 
9905   // Check the function attributes and profiles to find out if this function
9906   // should be optimized for size.
9907   ScalarEpilogueLowering SEL =
9908       getScalarEpilogueLowering(F, L, Hints, PSI, BFI, TTI, TLI, LVL, &IAI);
9909 
9910   // Check the loop for a trip count threshold: vectorize loops with a tiny trip
9911   // count by optimizing for size, to minimize overheads.
9912   auto ExpectedTC = getSmallBestKnownTC(*SE, L);
9913   if (ExpectedTC && *ExpectedTC < TinyTripCountVectorThreshold) {
9914     LLVM_DEBUG(dbgs() << "LV: Found a loop with a very small trip count. "
9915                       << "This loop is worth vectorizing only if no scalar "
9916                       << "iteration overheads are incurred.");
9917     if (Hints.getForce() == LoopVectorizeHints::FK_Enabled)
9918       LLVM_DEBUG(dbgs() << " But vectorizing was explicitly forced.\n");
9919     else {
9920       if (*ExpectedTC > TTI->getMinTripCountTailFoldingThreshold()) {
9921         LLVM_DEBUG(dbgs() << "\n");
9922         // Predicate tail-folded loops are efficient even when the loop
9923         // iteration count is low. However, setting the epilogue policy to
9924         // `CM_ScalarEpilogueNotAllowedLowTripLoop` prevents vectorizing loops
9925         // with runtime checks. It's more effective to let
9926         // `areRuntimeChecksProfitable` determine if vectorization is beneficial
9927         // for the loop.
9928         if (SEL != CM_ScalarEpilogueNotNeededUsePredicate)
9929           SEL = CM_ScalarEpilogueNotAllowedLowTripLoop;
9930       } else {
9931         LLVM_DEBUG(dbgs() << " But the target considers the trip count too "
9932                              "small to consider vectorizing.\n");
9933         reportVectorizationFailure(
9934             "The trip count is below the minial threshold value.",
9935             "loop trip count is too low, avoiding vectorization",
9936             "LowTripCount", ORE, L);
9937         Hints.emitRemarkWithHints();
9938         return false;
9939       }
9940     }
9941   }
9942 
9943   // Check the function attributes to see if implicit floats or vectors are
9944   // allowed.
9945   if (F->hasFnAttribute(Attribute::NoImplicitFloat)) {
9946     reportVectorizationFailure(
9947         "Can't vectorize when the NoImplicitFloat attribute is used",
9948         "loop not vectorized due to NoImplicitFloat attribute",
9949         "NoImplicitFloat", ORE, L);
9950     Hints.emitRemarkWithHints();
9951     return false;
9952   }
9953 
9954   // Check if the target supports potentially unsafe FP vectorization.
9955   // FIXME: Add a check for the type of safety issue (denormal, signaling)
9956   // for the target we're vectorizing for, to make sure none of the
9957   // additional fp-math flags can help.
9958   if (Hints.isPotentiallyUnsafe() &&
9959       TTI->isFPVectorizationPotentiallyUnsafe()) {
9960     reportVectorizationFailure(
9961         "Potentially unsafe FP op prevents vectorization",
9962         "loop not vectorized due to unsafe FP support.",
9963         "UnsafeFP", ORE, L);
9964     Hints.emitRemarkWithHints();
9965     return false;
9966   }
9967 
9968   bool AllowOrderedReductions;
9969   // If the flag is set, use that instead and override the TTI behaviour.
9970   if (ForceOrderedReductions.getNumOccurrences() > 0)
9971     AllowOrderedReductions = ForceOrderedReductions;
9972   else
9973     AllowOrderedReductions = TTI->enableOrderedReductions();
9974   if (!LVL.canVectorizeFPMath(AllowOrderedReductions)) {
9975     ORE->emit([&]() {
9976       auto *ExactFPMathInst = Requirements.getExactFPInst();
9977       return OptimizationRemarkAnalysisFPCommute(DEBUG_TYPE, "CantReorderFPOps",
9978                                                  ExactFPMathInst->getDebugLoc(),
9979                                                  ExactFPMathInst->getParent())
9980              << "loop not vectorized: cannot prove it is safe to reorder "
9981                 "floating-point operations";
9982     });
9983     LLVM_DEBUG(dbgs() << "LV: loop not vectorized: cannot prove it is safe to "
9984                          "reorder floating-point operations\n");
9985     Hints.emitRemarkWithHints();
9986     return false;
9987   }
9988 
9989   // Use the cost model.
9990   LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE,
9991                                 F, &Hints, IAI);
9992   // Use the planner for vectorization.
9993   LoopVectorizationPlanner LVP(L, LI, DT, TLI, *TTI, &LVL, CM, IAI, PSE, Hints,
9994                                ORE);
9995 
9996   // Get user vectorization factor and interleave count.
9997   ElementCount UserVF = Hints.getWidth();
9998   unsigned UserIC = Hints.getInterleave();
9999 
10000   // Plan how to best vectorize, return the best VF and its cost.
10001   std::optional<VectorizationFactor> MaybeVF = LVP.plan(UserVF, UserIC);
10002 
10003   VectorizationFactor VF = VectorizationFactor::Disabled();
10004   unsigned IC = 1;
10005 
10006   bool AddBranchWeights =
10007       hasBranchWeightMD(*L->getLoopLatch()->getTerminator());
10008   GeneratedRTChecks Checks(*PSE.getSE(), DT, LI, TTI,
10009                            F->getParent()->getDataLayout(), AddBranchWeights);
10010   if (MaybeVF) {
10011     VF = *MaybeVF;
10012     // Select the interleave count.
10013     IC = CM.selectInterleaveCount(VF.Width, VF.Cost);
10014 
10015     unsigned SelectedIC = std::max(IC, UserIC);
10016     //  Optimistically generate runtime checks if they are needed. Drop them if
10017     //  they turn out to not be profitable.
10018     if (VF.Width.isVector() || SelectedIC > 1)
10019       Checks.Create(L, *LVL.getLAI(), PSE.getPredicate(), VF.Width, SelectedIC);
10020 
10021     // Check if it is profitable to vectorize with runtime checks.
10022     bool ForceVectorization =
10023         Hints.getForce() == LoopVectorizeHints::FK_Enabled;
10024     if (!ForceVectorization &&
10025         !areRuntimeChecksProfitable(Checks, VF, getVScaleForTuning(L, *TTI), L,
10026                                     *PSE.getSE(), SEL)) {
10027       ORE->emit([&]() {
10028         return OptimizationRemarkAnalysisAliasing(
10029                    DEBUG_TYPE, "CantReorderMemOps", L->getStartLoc(),
10030                    L->getHeader())
10031                << "loop not vectorized: cannot prove it is safe to reorder "
10032                   "memory operations";
10033       });
10034       LLVM_DEBUG(dbgs() << "LV: Too many memory checks needed.\n");
10035       Hints.emitRemarkWithHints();
10036       return false;
10037     }
10038   }
10039 
10040   // Identify the diagnostic messages that should be produced.
10041   std::pair<StringRef, std::string> VecDiagMsg, IntDiagMsg;
10042   bool VectorizeLoop = true, InterleaveLoop = true;
10043   if (VF.Width.isScalar()) {
10044     LLVM_DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n");
10045     VecDiagMsg = std::make_pair(
10046         "VectorizationNotBeneficial",
10047         "the cost-model indicates that vectorization is not beneficial");
10048     VectorizeLoop = false;
10049   }
10050 
10051   if (!MaybeVF && UserIC > 1) {
10052     // Tell the user interleaving was avoided up-front, despite being explicitly
10053     // requested.
10054     LLVM_DEBUG(dbgs() << "LV: Ignoring UserIC, because vectorization and "
10055                          "interleaving should be avoided up front\n");
10056     IntDiagMsg = std::make_pair(
10057         "InterleavingAvoided",
10058         "Ignoring UserIC, because interleaving was avoided up front");
10059     InterleaveLoop = false;
10060   } else if (IC == 1 && UserIC <= 1) {
10061     // Tell the user interleaving is not beneficial.
10062     LLVM_DEBUG(dbgs() << "LV: Interleaving is not beneficial.\n");
10063     IntDiagMsg = std::make_pair(
10064         "InterleavingNotBeneficial",
10065         "the cost-model indicates that interleaving is not beneficial");
10066     InterleaveLoop = false;
10067     if (UserIC == 1) {
10068       IntDiagMsg.first = "InterleavingNotBeneficialAndDisabled";
10069       IntDiagMsg.second +=
10070           " and is explicitly disabled or interleave count is set to 1";
10071     }
10072   } else if (IC > 1 && UserIC == 1) {
10073     // Tell the user interleaving is beneficial, but it is explicitly disabled.
10074     LLVM_DEBUG(
10075         dbgs() << "LV: Interleaving is beneficial but is explicitly disabled.");
10076     IntDiagMsg = std::make_pair(
10077         "InterleavingBeneficialButDisabled",
10078         "the cost-model indicates that interleaving is beneficial "
10079         "but is explicitly disabled or interleave count is set to 1");
10080     InterleaveLoop = false;
10081   }
10082 
10083   // Override IC if user provided an interleave count.
10084   IC = UserIC > 0 ? UserIC : IC;
10085 
10086   // Emit diagnostic messages, if any.
10087   const char *VAPassName = Hints.vectorizeAnalysisPassName();
10088   if (!VectorizeLoop && !InterleaveLoop) {
10089     // Do not vectorize or interleave the loop.
10090     ORE->emit([&]() {
10091       return OptimizationRemarkMissed(VAPassName, VecDiagMsg.first,
10092                                       L->getStartLoc(), L->getHeader())
10093              << VecDiagMsg.second;
10094     });
10095     ORE->emit([&]() {
10096       return OptimizationRemarkMissed(LV_NAME, IntDiagMsg.first,
10097                                       L->getStartLoc(), L->getHeader())
10098              << IntDiagMsg.second;
10099     });
10100     return false;
10101   } else if (!VectorizeLoop && InterleaveLoop) {
10102     LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
10103     ORE->emit([&]() {
10104       return OptimizationRemarkAnalysis(VAPassName, VecDiagMsg.first,
10105                                         L->getStartLoc(), L->getHeader())
10106              << VecDiagMsg.second;
10107     });
10108   } else if (VectorizeLoop && !InterleaveLoop) {
10109     LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
10110                       << ") in " << DebugLocStr << '\n');
10111     ORE->emit([&]() {
10112       return OptimizationRemarkAnalysis(LV_NAME, IntDiagMsg.first,
10113                                         L->getStartLoc(), L->getHeader())
10114              << IntDiagMsg.second;
10115     });
10116   } else if (VectorizeLoop && InterleaveLoop) {
10117     LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
10118                       << ") in " << DebugLocStr << '\n');
10119     LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
10120   }
10121 
10122   bool DisableRuntimeUnroll = false;
10123   MDNode *OrigLoopID = L->getLoopID();
10124   {
10125     using namespace ore;
10126     if (!VectorizeLoop) {
10127       assert(IC > 1 && "interleave count should not be 1 or 0");
10128       // If we decided not to vectorize the loop because it is not
10129       // beneficial, then interleave it.
10130       InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL,
10131                                  &CM, BFI, PSI, Checks);
10132 
10133       VPlan &BestPlan = LVP.getBestPlanFor(VF.Width);
10134       LVP.executePlan(VF.Width, IC, BestPlan, Unroller, DT, false);
10135 
10136       ORE->emit([&]() {
10137         return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(),
10138                                   L->getHeader())
10139                << "interleaved loop (interleaved count: "
10140                << NV("InterleaveCount", IC) << ")";
10141       });
10142     } else {
10143       // If we decided that it is *legal* to vectorize the loop, then do it.
10144 
10145       // Consider vectorizing the epilogue too if it's profitable.
10146       VectorizationFactor EpilogueVF =
10147           LVP.selectEpilogueVectorizationFactor(VF.Width, IC);
10148       if (EpilogueVF.Width.isVector()) {
10149 
10150         // The first pass vectorizes the main loop and creates a scalar epilogue
10151         // to be vectorized by executing the plan (potentially with a different
10152         // factor) again shortly afterwards.
10153         EpilogueLoopVectorizationInfo EPI(VF.Width, IC, EpilogueVF.Width, 1);
10154         EpilogueVectorizerMainLoop MainILV(L, PSE, LI, DT, TLI, TTI, AC, ORE,
10155                                            EPI, &LVL, &CM, BFI, PSI, Checks);
10156 
10157         VPlan &BestMainPlan = LVP.getBestPlanFor(EPI.MainLoopVF);
10158         const auto &[ExpandedSCEVs, ReductionResumeValues] = LVP.executePlan(
10159             EPI.MainLoopVF, EPI.MainLoopUF, BestMainPlan, MainILV, DT, true);
10160         ++LoopsVectorized;
10161 
10162         // Second pass vectorizes the epilogue and adjusts the control flow
10163         // edges from the first pass.
10164         EPI.MainLoopVF = EPI.EpilogueVF;
10165         EPI.MainLoopUF = EPI.EpilogueUF;
10166         EpilogueVectorizerEpilogueLoop EpilogILV(L, PSE, LI, DT, TLI, TTI, AC,
10167                                                  ORE, EPI, &LVL, &CM, BFI, PSI,
10168                                                  Checks);
10169 
10170         VPlan &BestEpiPlan = LVP.getBestPlanFor(EPI.EpilogueVF);
10171         VPRegionBlock *VectorLoop = BestEpiPlan.getVectorLoopRegion();
10172         VPBasicBlock *Header = VectorLoop->getEntryBasicBlock();
10173         Header->setName("vec.epilog.vector.body");
10174 
10175         // Re-use the trip count and steps expanded for the main loop, as
10176         // skeleton creation needs them as values that dominate both the scalar
10177         // and vector epilogue loops.
10178         // TODO: This is a workaround needed for epilogue vectorization and it
10179         // should be removed once induction resume value creation is done
10180         // directly in VPlan.
10181         EpilogILV.setTripCount(MainILV.getTripCount());
10182         for (auto &R : make_early_inc_range(*BestEpiPlan.getPreheader())) {
10183           auto *ExpandR = cast<VPExpandSCEVRecipe>(&R);
10184           auto *ExpandedVal = BestEpiPlan.getVPValueOrAddLiveIn(
10185               ExpandedSCEVs.find(ExpandR->getSCEV())->second);
10186           ExpandR->replaceAllUsesWith(ExpandedVal);
10187           ExpandR->eraseFromParent();
10188         }
10189 
10190         // Ensure that the start values for any VPWidenIntOrFpInductionRecipe,
10191         // VPWidenPointerInductionRecipe and VPReductionPHIRecipes are updated
10192         // before vectorizing the epilogue loop.
10193         for (VPRecipeBase &R : Header->phis()) {
10194           if (isa<VPCanonicalIVPHIRecipe>(&R))
10195             continue;
10196 
10197           Value *ResumeV = nullptr;
10198           // TODO: Move setting of resume values to prepareToExecute.
10199           if (auto *ReductionPhi = dyn_cast<VPReductionPHIRecipe>(&R)) {
10200             ResumeV = ReductionResumeValues
10201                           .find(&ReductionPhi->getRecurrenceDescriptor())
10202                           ->second;
10203           } else {
10204             // Create induction resume values for both widened pointer and
10205             // integer/fp inductions and update the start value of the induction
10206             // recipes to use the resume value.
10207             PHINode *IndPhi = nullptr;
10208             const InductionDescriptor *ID;
10209             if (auto *Ind = dyn_cast<VPWidenPointerInductionRecipe>(&R)) {
10210               IndPhi = cast<PHINode>(Ind->getUnderlyingValue());
10211               ID = &Ind->getInductionDescriptor();
10212             } else {
10213               auto *WidenInd = cast<VPWidenIntOrFpInductionRecipe>(&R);
10214               IndPhi = WidenInd->getPHINode();
10215               ID = &WidenInd->getInductionDescriptor();
10216             }
10217 
10218             ResumeV = MainILV.createInductionResumeValue(
10219                 IndPhi, *ID, getExpandedStep(*ID, ExpandedSCEVs),
10220                 {EPI.MainLoopIterationCountCheck});
10221           }
10222           assert(ResumeV && "Must have a resume value");
10223           VPValue *StartVal = BestEpiPlan.getVPValueOrAddLiveIn(ResumeV);
10224           cast<VPHeaderPHIRecipe>(&R)->setStartValue(StartVal);
10225         }
10226 
10227         LVP.executePlan(EPI.EpilogueVF, EPI.EpilogueUF, BestEpiPlan, EpilogILV,
10228                         DT, true, &ExpandedSCEVs);
10229         ++LoopsEpilogueVectorized;
10230 
10231         if (!MainILV.areSafetyChecksAdded())
10232           DisableRuntimeUnroll = true;
10233       } else {
10234         InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width,
10235                                VF.MinProfitableTripCount, IC, &LVL, &CM, BFI,
10236                                PSI, Checks);
10237 
10238         VPlan &BestPlan = LVP.getBestPlanFor(VF.Width);
10239         LVP.executePlan(VF.Width, IC, BestPlan, LB, DT, false);
10240         ++LoopsVectorized;
10241 
10242         // Add metadata to disable runtime unrolling a scalar loop when there
10243         // are no runtime checks about strides and memory. A scalar loop that is
10244         // rarely used is not worth unrolling.
10245         if (!LB.areSafetyChecksAdded())
10246           DisableRuntimeUnroll = true;
10247       }
10248       // Report the vectorization decision.
10249       reportVectorization(ORE, L, VF, IC);
10250     }
10251 
10252     if (ORE->allowExtraAnalysis(LV_NAME))
10253       checkMixedPrecision(L, ORE);
10254   }
10255 
10256   std::optional<MDNode *> RemainderLoopID =
10257       makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
10258                                       LLVMLoopVectorizeFollowupEpilogue});
10259   if (RemainderLoopID) {
10260     L->setLoopID(*RemainderLoopID);
10261   } else {
10262     if (DisableRuntimeUnroll)
10263       AddRuntimeUnrollDisableMetaData(L);
10264 
10265     // Mark the loop as already vectorized to avoid vectorizing again.
10266     Hints.setAlreadyVectorized();
10267   }
10268 
10269   assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs()));
10270   return true;
10271 }
10272 
10273 LoopVectorizeResult LoopVectorizePass::runImpl(
10274     Function &F, ScalarEvolution &SE_, LoopInfo &LI_, TargetTransformInfo &TTI_,
10275     DominatorTree &DT_, BlockFrequencyInfo *BFI_, TargetLibraryInfo *TLI_,
10276     DemandedBits &DB_, AssumptionCache &AC_, LoopAccessInfoManager &LAIs_,
10277     OptimizationRemarkEmitter &ORE_, ProfileSummaryInfo *PSI_) {
10278   SE = &SE_;
10279   LI = &LI_;
10280   TTI = &TTI_;
10281   DT = &DT_;
10282   BFI = BFI_;
10283   TLI = TLI_;
10284   AC = &AC_;
10285   LAIs = &LAIs_;
10286   DB = &DB_;
10287   ORE = &ORE_;
10288   PSI = PSI_;
10289 
10290   // Don't attempt if
10291   // 1. the target claims to have no vector registers, and
10292   // 2. interleaving won't help ILP.
10293   //
10294   // The second condition is necessary because, even if the target has no
10295   // vector registers, loop vectorization may still enable scalar
10296   // interleaving.
10297   if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true)) &&
10298       TTI->getMaxInterleaveFactor(ElementCount::getFixed(1)) < 2)
10299     return LoopVectorizeResult(false, false);
10300 
10301   bool Changed = false, CFGChanged = false;
10302 
10303   // The vectorizer requires loops to be in simplified form.
10304   // Since simplification may add new inner loops, it has to run before the
10305   // legality and profitability checks. This means running the loop vectorizer
10306   // will simplify all loops, regardless of whether anything ends up being
10307   // vectorized.
10308   for (const auto &L : *LI)
10309     Changed |= CFGChanged |=
10310         simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */);
10311 
10312   // Build up a worklist of inner-loops to vectorize. This is necessary as
10313   // the act of vectorizing or partially unrolling a loop creates new loops
10314   // and can invalidate iterators across the loops.
10315   SmallVector<Loop *, 8> Worklist;
10316 
10317   for (Loop *L : *LI)
10318     collectSupportedLoops(*L, LI, ORE, Worklist);
10319 
10320   LoopsAnalyzed += Worklist.size();
10321 
10322   // Now walk the identified inner loops.
10323   while (!Worklist.empty()) {
10324     Loop *L = Worklist.pop_back_val();
10325 
10326     // For the inner loops we actually process, form LCSSA to simplify the
10327     // transform.
10328     Changed |= formLCSSARecursively(*L, *DT, LI, SE);
10329 
10330     Changed |= CFGChanged |= processLoop(L);
10331 
10332     if (Changed) {
10333       LAIs->clear();
10334 
10335 #ifndef NDEBUG
10336       if (VerifySCEV)
10337         SE->verify();
10338 #endif
10339     }
10340   }
10341 
10342   // Process each loop nest in the function.
10343   return LoopVectorizeResult(Changed, CFGChanged);
10344 }
10345 
10346 PreservedAnalyses LoopVectorizePass::run(Function &F,
10347                                          FunctionAnalysisManager &AM) {
10348   auto &LI = AM.getResult<LoopAnalysis>(F);
10349   // There are no loops in the function. Return before computing other
10350   // expensive analyses.
10351   if (LI.empty())
10352     return PreservedAnalyses::all();
10353   auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
10354   auto &TTI = AM.getResult<TargetIRAnalysis>(F);
10355   auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
10356   auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
10357   auto &AC = AM.getResult<AssumptionAnalysis>(F);
10358   auto &DB = AM.getResult<DemandedBitsAnalysis>(F);
10359   auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
10360 
10361   LoopAccessInfoManager &LAIs = AM.getResult<LoopAccessAnalysis>(F);
10362   auto &MAMProxy = AM.getResult<ModuleAnalysisManagerFunctionProxy>(F);
10363   ProfileSummaryInfo *PSI =
10364       MAMProxy.getCachedResult<ProfileSummaryAnalysis>(*F.getParent());
10365   BlockFrequencyInfo *BFI = nullptr;
10366   if (PSI && PSI->hasProfileSummary())
10367     BFI = &AM.getResult<BlockFrequencyAnalysis>(F);
10368   LoopVectorizeResult Result =
10369       runImpl(F, SE, LI, TTI, DT, BFI, &TLI, DB, AC, LAIs, ORE, PSI);
10370   if (!Result.MadeAnyChange)
10371     return PreservedAnalyses::all();
10372   PreservedAnalyses PA;
10373 
10374   if (isAssignmentTrackingEnabled(*F.getParent())) {
10375     for (auto &BB : F)
10376       RemoveRedundantDbgInstrs(&BB);
10377   }
10378 
10379   // We currently do not preserve loopinfo/dominator analyses with outer loop
10380   // vectorization. Until this is addressed, mark these analyses as preserved
10381   // only for non-VPlan-native path.
10382   // TODO: Preserve Loop and Dominator analyses for VPlan-native path.
10383   if (!EnableVPlanNativePath) {
10384     PA.preserve<LoopAnalysis>();
10385     PA.preserve<DominatorTreeAnalysis>();
10386     PA.preserve<ScalarEvolutionAnalysis>();
10387   }
10388 
10389   if (Result.MadeCFGChange) {
10390     // Making CFG changes likely means a loop got vectorized. Indicate that
10391     // extra simplification passes should be run.
10392     // TODO: MadeCFGChange is not a perfect proxy. Extra passes should only
10393     // be run if runtime checks have been added.
10394     AM.getResult<ShouldRunExtraVectorPasses>(F);
10395     PA.preserve<ShouldRunExtraVectorPasses>();
10396   } else {
10397     PA.preserveSet<CFGAnalyses>();
10398   }
10399   return PA;
10400 }
10401 
10402 void LoopVectorizePass::printPipeline(
10403     raw_ostream &OS, function_ref<StringRef(StringRef)> MapClassName2PassName) {
10404   static_cast<PassInfoMixin<LoopVectorizePass> *>(this)->printPipeline(
10405       OS, MapClassName2PassName);
10406 
10407   OS << '<';
10408   OS << (InterleaveOnlyWhenForced ? "" : "no-") << "interleave-forced-only;";
10409   OS << (VectorizeOnlyWhenForced ? "" : "no-") << "vectorize-forced-only;";
10410   OS << '>';
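  // Illustrative note: with the default options this appends something like
  // "<no-interleave-forced-only;no-vectorize-forced-only>" to the pass name
  // printed by the call above.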
10411 }
10412